diff --git "a/checkpoint-25354/trainer_state.json" "b/checkpoint-25354/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-25354/trainer_state.json" @@ -0,0 +1,188718 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.01259085673432, + "global_step": 25354, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-07, + "loss": 11.007, + "theoretical_loss": 20.81281780154715, + "tokens_seen": 65536 + }, + { + "epoch": 0.0, + "learning_rate": 1.984126984126984e-06, + "loss": 11.0078, + "theoretical_loss": 17.566201104328645, + "tokens_seen": 131072 + }, + { + "epoch": 0.0, + "learning_rate": 2.9761904761904763e-06, + "loss": 10.9535, + "theoretical_loss": 15.939477092836569, + "tokens_seen": 196608 + }, + { + "epoch": 0.0, + "learning_rate": 3.968253968253968e-06, + "loss": 10.8414, + "theoretical_loss": 14.89231675598857, + "tokens_seen": 262144 + }, + { + "epoch": 0.0, + "learning_rate": 4.96031746031746e-06, + "loss": 10.676, + "theoretical_loss": 14.136216937762974, + "tokens_seen": 327680 + }, + { + "epoch": 0.0, + "learning_rate": 5.9523809523809525e-06, + "loss": 10.5265, + "theoretical_loss": 13.552561472550224, + "tokens_seen": 393216 + }, + { + "epoch": 0.0, + "learning_rate": 6.944444444444444e-06, + "loss": 10.3438, + "theoretical_loss": 13.08180900140119, + "tokens_seen": 458752 + }, + { + "epoch": 0.0, + "learning_rate": 7.936507936507936e-06, + "loss": 10.2176, + "theoretical_loss": 12.690129625483323, + "tokens_seen": 524288 + }, + { + "epoch": 0.0, + "learning_rate": 8.928571428571428e-06, + "loss": 10.0803, + "theoretical_loss": 12.356592463873625, + "tokens_seen": 589824 + }, + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-06, + "loss": 9.9186, + "theoretical_loss": 12.067412607035077, + "tokens_seen": 655360 + }, + { + "epoch": 0.0, + "learning_rate": 1.0912698412698412e-05, + "loss": 9.8679, + "theoretical_loss": 11.813066231101676, + "tokens_seen": 720896 + }, + { + "epoch": 0.0, + "learning_rate": 1.1904761904761905e-05, + "loss": 9.7063, + "theoretical_loss": 11.586719208706729, + "tokens_seen": 786432 + }, + { + "epoch": 0.0, + "learning_rate": 1.2896825396825396e-05, + "loss": 9.6386, + "theoretical_loss": 11.383314140186787, + "tokens_seen": 851968 + }, + { + "epoch": 0.0, + "learning_rate": 1.3888888888888888e-05, + "loss": 9.6769, + "theoretical_loss": 11.199011702111871, + "tokens_seen": 917504 + }, + { + "epoch": 0.0, + "learning_rate": 1.4880952380952381e-05, + "loss": 9.6361, + "theoretical_loss": 11.030833917977912, + "tokens_seen": 983040 + }, + { + "epoch": 0.0, + "learning_rate": 1.5873015873015872e-05, + "loss": 9.5896, + "theoretical_loss": 10.87642808645695, + "tokens_seen": 1048576 + }, + { + "epoch": 0.0, + "learning_rate": 1.6865079365079364e-05, + "loss": 9.5241, + "theoretical_loss": 10.733905740062724, + "tokens_seen": 1114112 + }, + { + "epoch": 0.0, + "learning_rate": 1.7857142857142855e-05, + "loss": 9.4489, + "theoretical_loss": 10.60172987623028, + "tokens_seen": 1179648 + }, + { + "epoch": 0.0, + "learning_rate": 1.884920634920635e-05, + "loss": 9.2667, + "theoretical_loss": 10.478634172356642, + "tokens_seen": 1245184 + }, + { + "epoch": 0.0, + "learning_rate": 1.984126984126984e-05, + "loss": 9.4929, + "theoretical_loss": 10.36356394376333, + "tokens_seen": 1310720 + }, + { + "epoch": 0.0, + "learning_rate": 2.0833333333333333e-05, + "loss": 9.4, + "theoretical_loss": 10.255632220896747, + "tokens_seen": 1376256 + }, + { + "epoch": 0.0, + "learning_rate": 2.1825396825396824e-05, + "loss": 9.2513, + "theoretical_loss": 10.15408655327002, + "tokens_seen": 1441792 + }, + { + "epoch": 0.0, + "learning_rate": 2.2817460317460315e-05, + "loss": 9.1985, + "theoretical_loss": 10.058283561732598, + "tokens_seen": 1507328 + }, + { + "epoch": 0.0, + "learning_rate": 2.380952380952381e-05, + "loss": 9.3032, + "theoretical_loss": 9.967669178840278, + "tokens_seen": 1572864 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 52330, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 8.940132141113281, + "objective/train/theoretical_loss": 9.881763126393109, + "objective/train/tokens_used": 22098400, + "theoretical_loss": 9.881763126393109, + "tokens_seen": 1638400 + }, + { + "epoch": 0.0, + "learning_rate": 2.48015873015873e-05, + "loss": 9.1201, + "theoretical_loss": 9.881763126393109, + "tokens_seen": 1638400 + }, + { + "epoch": 0.0, + "learning_rate": 2.5793650793650793e-05, + "loss": 9.173, + "theoretical_loss": 9.80014659154056, + "tokens_seen": 1703936 + }, + { + "epoch": 0.0, + "learning_rate": 2.6785714285714284e-05, + "loss": 9.0445, + "theoretical_loss": 9.722452346907446, + "tokens_seen": 1769472 + }, + { + "epoch": 0.0, + "learning_rate": 2.7777777777777776e-05, + "loss": 9.0669, + "theoretical_loss": 9.648356759081546, + "tokens_seen": 1835008 + }, + { + "epoch": 0.0, + "learning_rate": 2.876984126984127e-05, + "loss": 8.8043, + "theoretical_loss": 9.577573271145639, + "tokens_seen": 1900544 + }, + { + "epoch": 0.0, + "learning_rate": 2.9761904761904762e-05, + "loss": 8.9435, + "theoretical_loss": 9.509847046764852, + "tokens_seen": 1966080 + }, + { + "epoch": 0.0, + "learning_rate": 3.075396825396825e-05, + "loss": 9.0159, + "theoretical_loss": 9.444950537631936, + "tokens_seen": 2031616 + }, + { + "epoch": 0.0, + "learning_rate": 3.1746031746031745e-05, + "loss": 8.9218, + "theoretical_loss": 9.382679790910457, + "tokens_seen": 2097152 + }, + { + "epoch": 0.0, + "learning_rate": 3.273809523809524e-05, + "loss": 8.9068, + "theoretical_loss": 9.32285135423398, + "tokens_seen": 2162688 + }, + { + "epoch": 0.0, + "learning_rate": 3.373015873015873e-05, + "loss": 8.8447, + "theoretical_loss": 9.265299666660276, + "tokens_seen": 2228224 + }, + { + "epoch": 0.0, + "learning_rate": 3.472222222222222e-05, + "loss": 8.7826, + "theoretical_loss": 9.209874847444755, + "tokens_seen": 2293760 + }, + { + "epoch": 0.0, + "learning_rate": 3.571428571428571e-05, + "loss": 8.8651, + "theoretical_loss": 9.156440812508292, + "tokens_seen": 2359296 + }, + { + "epoch": 0.0, + "learning_rate": 3.670634920634921e-05, + "loss": 8.7615, + "theoretical_loss": 9.10487366241335, + "tokens_seen": 2424832 + }, + { + "epoch": 0.0, + "learning_rate": 3.76984126984127e-05, + "loss": 8.7987, + "theoretical_loss": 9.055060296533734, + "tokens_seen": 2490368 + }, + { + "epoch": 0.0, + "learning_rate": 3.8690476190476195e-05, + "loss": 8.7592, + "theoretical_loss": 9.006897216643829, + "tokens_seen": 2555904 + }, + { + "epoch": 0.0, + "learning_rate": 3.968253968253968e-05, + "loss": 8.703, + "theoretical_loss": 8.960289489909357, + "tokens_seen": 2621440 + }, + { + "epoch": 0.0, + "learning_rate": 4.067460317460318e-05, + "loss": 8.7101, + "theoretical_loss": 8.915149846640611, + "tokens_seen": 2686976 + }, + { + "epoch": 0.0, + "learning_rate": 4.1666666666666665e-05, + "loss": 8.4769, + "theoretical_loss": 8.871397892478225, + "tokens_seen": 2752512 + }, + { + "epoch": 0.0, + "learning_rate": 4.265873015873016e-05, + "loss": 8.3144, + "theoretical_loss": 8.828959418153499, + "tokens_seen": 2818048 + }, + { + "epoch": 0.0, + "learning_rate": 4.365079365079365e-05, + "loss": 8.5212, + "theoretical_loss": 8.787765792778412, + "tokens_seen": 2883584 + }, + { + "epoch": 0.0, + "learning_rate": 4.464285714285714e-05, + "loss": 8.5071, + "theoretical_loss": 8.747753428911455, + "tokens_seen": 2949120 + }, + { + "epoch": 0.0, + "learning_rate": 4.563492063492063e-05, + "loss": 8.4267, + "theoretical_loss": 8.708863309520833, + "tokens_seen": 3014656 + }, + { + "epoch": 0.0, + "learning_rate": 4.6626984126984126e-05, + "loss": 8.3429, + "theoretical_loss": 8.671040568508847, + "tokens_seen": 3080192 + }, + { + "epoch": 0.0, + "learning_rate": 4.761904761904762e-05, + "loss": 8.2506, + "theoretical_loss": 8.634234117735474, + "tokens_seen": 3145728 + }, + { + "epoch": 0.0, + "learning_rate": 4.8611111111111115e-05, + "loss": 8.2569, + "theoretical_loss": 8.598396314536323, + "tokens_seen": 3211264 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 55409, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 8.003948211669922, + "objective/train/theoretical_loss": 8.563482664611069, + "objective/train/tokens_used": 23736800, + "theoretical_loss": 8.563482664611069, + "tokens_seen": 3276800 + }, + { + "epoch": 0.0, + "learning_rate": 4.96031746031746e-05, + "loss": 8.058, + "theoretical_loss": 8.563482664611069, + "tokens_seen": 3276800 + }, + { + "epoch": 0.0, + "learning_rate": 5.05952380952381e-05, + "loss": 8.1338, + "theoretical_loss": 8.529451555895115, + "tokens_seen": 3342336 + }, + { + "epoch": 0.0, + "learning_rate": 5.1587301587301586e-05, + "loss": 8.2502, + "theoretical_loss": 8.496264019646002, + "tokens_seen": 3407872 + }, + { + "epoch": 0.0, + "learning_rate": 5.257936507936508e-05, + "loss": 8.105, + "theoretical_loss": 8.463883515497187, + "tokens_seen": 3473408 + }, + { + "epoch": 0.0, + "learning_rate": 5.357142857142857e-05, + "loss": 8.1543, + "theoretical_loss": 8.432275737672779, + "tokens_seen": 3538944 + }, + { + "epoch": 0.0, + "learning_rate": 5.4563492063492063e-05, + "loss": 7.8709, + "theoretical_loss": 8.401408439930716, + "tokens_seen": 3604480 + }, + { + "epoch": 0.0, + "learning_rate": 5.555555555555555e-05, + "loss": 8.1116, + "theoretical_loss": 8.371251277120209, + "tokens_seen": 3670016 + }, + { + "epoch": 0.0, + "learning_rate": 5.6547619047619046e-05, + "loss": 7.9275, + "theoretical_loss": 8.341775661511075, + "tokens_seen": 3735552 + }, + { + "epoch": 0.0, + "learning_rate": 5.753968253968254e-05, + "loss": 8.0108, + "theoretical_loss": 8.31295463228533, + "tokens_seen": 3801088 + }, + { + "epoch": 0.0, + "learning_rate": 5.8531746031746036e-05, + "loss": 8.0239, + "theoretical_loss": 8.284762736781182, + "tokens_seen": 3866624 + }, + { + "epoch": 0.0, + "learning_rate": 5.9523809523809524e-05, + "loss": 7.6739, + "theoretical_loss": 8.257175922251864, + "tokens_seen": 3932160 + }, + { + "epoch": 0.0, + "learning_rate": 6.051587301587302e-05, + "loss": 7.7548, + "theoretical_loss": 8.230171437050114, + "tokens_seen": 3997696 + }, + { + "epoch": 0.0, + "learning_rate": 6.15079365079365e-05, + "loss": 7.8545, + "theoretical_loss": 8.20372774027797, + "tokens_seen": 4063232 + }, + { + "epoch": 0.0, + "learning_rate": 6.25e-05, + "loss": 7.8221, + "theoretical_loss": 8.177824419053046, + "tokens_seen": 4128768 + }, + { + "epoch": 0.0, + "learning_rate": 6.349206349206349e-05, + "loss": 7.6526, + "theoretical_loss": 8.152442112639616, + "tokens_seen": 4194304 + }, + { + "epoch": 0.0, + "learning_rate": 6.448412698412699e-05, + "loss": 7.7083, + "theoretical_loss": 8.1275624427775, + "tokens_seen": 4259840 + }, + { + "epoch": 0.0, + "learning_rate": 6.547619047619048e-05, + "loss": 7.7012, + "theoretical_loss": 8.10316794961571, + "tokens_seen": 4325376 + }, + { + "epoch": 0.0, + "learning_rate": 6.646825396825397e-05, + "loss": 7.6582, + "theoretical_loss": 8.07924203272264, + "tokens_seen": 4390912 + }, + { + "epoch": 0.0, + "learning_rate": 6.746031746031745e-05, + "loss": 7.7205, + "theoretical_loss": 8.055768896701416, + "tokens_seen": 4456448 + }, + { + "epoch": 0.0, + "learning_rate": 6.845238095238096e-05, + "loss": 7.3764, + "theoretical_loss": 8.032733500989007, + "tokens_seen": 4521984 + }, + { + "epoch": 0.0, + "learning_rate": 6.944444444444444e-05, + "loss": 7.3905, + "theoretical_loss": 8.010121513461836, + "tokens_seen": 4587520 + }, + { + "epoch": 0.0, + "learning_rate": 7.043650793650793e-05, + "loss": 7.3749, + "theoretical_loss": 7.987919267509379, + "tokens_seen": 4653056 + }, + { + "epoch": 0.0, + "learning_rate": 7.142857142857142e-05, + "loss": 7.5265, + "theoretical_loss": 7.966113722271801, + "tokens_seen": 4718592 + }, + { + "epoch": 0.0, + "learning_rate": 7.242063492063492e-05, + "loss": 7.3315, + "theoretical_loss": 7.944692425767988, + "tokens_seen": 4784128 + }, + { + "epoch": 0.0, + "learning_rate": 7.341269841269842e-05, + "loss": 7.4875, + "theoretical_loss": 7.9236434806675184, + "tokens_seen": 4849664 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 60447, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 7.313963413238525, + "objective/train/theoretical_loss": 7.902955512484067, + "objective/train/tokens_used": 25375200, + "theoretical_loss": 7.902955512484067, + "tokens_seen": 4915200 + }, + { + "epoch": 0.0, + "learning_rate": 7.440476190476191e-05, + "loss": 7.4231, + "theoretical_loss": 7.902955512484067, + "tokens_seen": 4915200 + }, + { + "epoch": 0.0, + "learning_rate": 7.53968253968254e-05, + "loss": 7.3822, + "theoretical_loss": 7.882617639989203, + "tokens_seen": 4980736 + }, + { + "epoch": 0.0, + "learning_rate": 7.63888888888889e-05, + "loss": 7.3111, + "theoretical_loss": 7.862619447664628, + "tokens_seen": 5046272 + }, + { + "epoch": 0.0, + "learning_rate": 7.738095238095239e-05, + "loss": 7.3316, + "theoretical_loss": 7.842950960027937, + "tokens_seen": 5111808 + }, + { + "epoch": 0.0, + "learning_rate": 7.837301587301588e-05, + "loss": 7.0431, + "theoretical_loss": 7.823602617682313, + "tokens_seen": 5177344 + }, + { + "epoch": 0.0, + "learning_rate": 7.936507936507937e-05, + "loss": 7.166, + "theoretical_loss": 7.804565254954165, + "tokens_seen": 5242880 + }, + { + "epoch": 0.0, + "learning_rate": 8.035714285714287e-05, + "loss": 7.2017, + "theoretical_loss": 7.7858300789950725, + "tokens_seen": 5308416 + }, + { + "epoch": 0.0, + "learning_rate": 8.134920634920635e-05, + "loss": 7.184, + "theoretical_loss": 7.767388650235364, + "tokens_seen": 5373952 + }, + { + "epoch": 0.0, + "learning_rate": 8.234126984126984e-05, + "loss": 7.1994, + "theoretical_loss": 7.749232864086619, + "tokens_seen": 5439488 + }, + { + "epoch": 0.0, + "learning_rate": 8.333333333333333e-05, + "loss": 7.2288, + "theoretical_loss": 7.731354933799318, + "tokens_seen": 5505024 + }, + { + "epoch": 0.0, + "learning_rate": 8.432539682539683e-05, + "loss": 7.1285, + "theoretical_loss": 7.71374737438992, + "tokens_seen": 5570560 + }, + { + "epoch": 0.0, + "learning_rate": 8.531746031746032e-05, + "loss": 7.0346, + "theoretical_loss": 7.696402987558934, + "tokens_seen": 5636096 + }, + { + "epoch": 0.0, + "learning_rate": 8.630952380952381e-05, + "loss": 7.1917, + "theoretical_loss": 7.679314847528181, + "tokens_seen": 5701632 + }, + { + "epoch": 0.0, + "learning_rate": 8.73015873015873e-05, + "loss": 6.914, + "theoretical_loss": 7.662476287731328, + "tokens_seen": 5767168 + }, + { + "epoch": 0.0, + "learning_rate": 8.82936507936508e-05, + "loss": 6.9171, + "theoretical_loss": 7.645880888297279, + "tokens_seen": 5832704 + }, + { + "epoch": 0.0, + "learning_rate": 8.928571428571429e-05, + "loss": 7.0525, + "theoretical_loss": 7.629522464270861, + "tokens_seen": 5898240 + }, + { + "epoch": 0.0, + "learning_rate": 9.027777777777777e-05, + "loss": 7.1465, + "theoretical_loss": 7.613395054519696, + "tokens_seen": 5963776 + }, + { + "epoch": 0.0, + "learning_rate": 9.126984126984126e-05, + "loss": 7.1364, + "theoretical_loss": 7.59749291128028, + "tokens_seen": 6029312 + }, + { + "epoch": 0.0, + "learning_rate": 9.226190476190476e-05, + "loss": 6.9717, + "theoretical_loss": 7.581810490299888, + "tokens_seen": 6094848 + }, + { + "epoch": 0.0, + "learning_rate": 9.325396825396825e-05, + "loss": 7.0877, + "theoretical_loss": 7.5663424415343705, + "tokens_seen": 6160384 + }, + { + "epoch": 0.0, + "learning_rate": 9.424603174603175e-05, + "loss": 7.0323, + "theoretical_loss": 7.551083600364949, + "tokens_seen": 6225920 + }, + { + "epoch": 0.0, + "learning_rate": 9.523809523809524e-05, + "loss": 7.0848, + "theoretical_loss": 7.536028979299919, + "tokens_seen": 6291456 + }, + { + "epoch": 0.0, + "learning_rate": 9.623015873015874e-05, + "loss": 6.9632, + "theoretical_loss": 7.521173760129762, + "tokens_seen": 6356992 + }, + { + "epoch": 0.0, + "learning_rate": 9.722222222222223e-05, + "loss": 6.8749, + "theoretical_loss": 7.506513286506497, + "tokens_seen": 6422528 + }, + { + "epoch": 0.0, + "learning_rate": 9.821428571428572e-05, + "loss": 6.8077, + "theoretical_loss": 7.492043056920249, + "tokens_seen": 6488064 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 63315, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 7.100229740142822, + "objective/train/theoretical_loss": 7.4777587180480305, + "objective/train/tokens_used": 27013600, + "theoretical_loss": 7.4777587180480305, + "tokens_seen": 6553600 + }, + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-05, + "loss": 6.9234, + "theoretical_loss": 7.4777587180480305, + "tokens_seen": 6553600 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010019841269841271, + "loss": 7.0441, + "theoretical_loss": 7.463656058451462, + "tokens_seen": 6619136 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001011904761904762, + "loss": 6.9863, + "theoretical_loss": 7.449731002601916, + "tokens_seen": 6684672 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010218253968253968, + "loss": 7.0142, + "theoretical_loss": 7.435979605213019, + "tokens_seen": 6750208 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010317460317460317, + "loss": 6.9167, + "theoretical_loss": 7.422398045861905, + "tokens_seen": 6815744 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010416666666666667, + "loss": 6.8157, + "theoretical_loss": 7.408982623881875, + "tokens_seen": 6881280 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010515873015873016, + "loss": 7.0067, + "theoretical_loss": 7.395729753510345, + "tokens_seen": 6946816 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010615079365079365, + "loss": 6.8714, + "theoretical_loss": 7.3826359592770325, + "tokens_seen": 7012352 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010714285714285714, + "loss": 6.7887, + "theoretical_loss": 7.369697871618373, + "tokens_seen": 7077888 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010813492063492064, + "loss": 6.9363, + "theoretical_loss": 7.3569122227050885, + "tokens_seen": 7143424 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010912698412698413, + "loss": 6.8036, + "theoretical_loss": 7.3442758424706875, + "tokens_seen": 7208960 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011011904761904761, + "loss": 6.8798, + "theoretical_loss": 7.331785654829519, + "tokens_seen": 7274496 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001111111111111111, + "loss": 6.8694, + "theoretical_loss": 7.319438674073677, + "tokens_seen": 7340032 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001121031746031746, + "loss": 6.9059, + "theoretical_loss": 7.307232001438824, + "tokens_seen": 7405568 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011309523809523809, + "loss": 6.8585, + "theoretical_loss": 7.295162821829564, + "tokens_seen": 7471104 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011408730158730158, + "loss": 6.8588, + "theoretical_loss": 7.283228400695652, + "tokens_seen": 7536640 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011507936507936508, + "loss": 6.7621, + "theoretical_loss": 7.271426081050832, + "tokens_seen": 7602176 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011607142857142858, + "loss": 6.7199, + "theoretical_loss": 7.259753280626623, + "tokens_seen": 7667712 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011706349206349207, + "loss": 6.8372, + "theoretical_loss": 7.24820748915387, + "tokens_seen": 7733248 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011805555555555556, + "loss": 6.7524, + "theoretical_loss": 7.236786265765262, + "tokens_seen": 7798784 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011904761904761905, + "loss": 6.7953, + "theoretical_loss": 7.225487236512497, + "tokens_seen": 7864320 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012003968253968255, + "loss": 6.7105, + "theoretical_loss": 7.21430809199212, + "tokens_seen": 7929856 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012103174603174604, + "loss": 6.7129, + "theoretical_loss": 7.2032465850744005, + "tokens_seen": 7995392 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012202380952380953, + "loss": 6.6794, + "theoretical_loss": 7.192300528730015, + "tokens_seen": 8060928 + }, + { + "epoch": 0.0, + "learning_rate": 0.000123015873015873, + "loss": 6.7831, + "theoretical_loss": 7.1814677939495155, + "tokens_seen": 8126464 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 66983, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.768932819366455, + "objective/train/theoretical_loss": 7.1707463077509646, + "objective/train/tokens_used": 28652000, + "theoretical_loss": 7.1707463077509646, + "tokens_seen": 8192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001240079365079365, + "loss": 6.7122, + "theoretical_loss": 7.1707463077509646, + "tokens_seen": 8192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.000125, + "loss": 6.6307, + "theoretical_loss": 7.160134051271272, + "tokens_seen": 8257536 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001259920634920635, + "loss": 6.6963, + "theoretical_loss": 7.149629057937138, + "tokens_seen": 8323072 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012698412698412698, + "loss": 6.762, + "theoretical_loss": 7.139229411711638, + "tokens_seen": 8388608 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012797619047619048, + "loss": 6.7124, + "theoretical_loss": 7.128933245412794, + "tokens_seen": 8454144 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012896825396825398, + "loss": 6.6816, + "theoretical_loss": 7.118738739100616, + "tokens_seen": 8519680 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012996031746031748, + "loss": 6.6033, + "theoretical_loss": 7.1086441185293445, + "tokens_seen": 8585216 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013095238095238096, + "loss": 6.7304, + "theoretical_loss": 7.09864765366177, + "tokens_seen": 8650752 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013194444444444446, + "loss": 6.7157, + "theoretical_loss": 7.088747657242693, + "tokens_seen": 8716288 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013293650793650793, + "loss": 6.4753, + "theoretical_loss": 7.078942483428749, + "tokens_seen": 8781824 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013392857142857144, + "loss": 6.6485, + "theoretical_loss": 7.069230526471966, + "tokens_seen": 8847360 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001349206349206349, + "loss": 6.5616, + "theoretical_loss": 7.059610219454568, + "tokens_seen": 8912896 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001359126984126984, + "loss": 6.6093, + "theoretical_loss": 7.0500800330726685, + "tokens_seen": 8978432 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001369047619047619, + "loss": 6.7297, + "theoretical_loss": 7.040638474466625, + "tokens_seen": 9043968 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013789682539682541, + "loss": 6.4937, + "theoretical_loss": 7.031284086095933, + "tokens_seen": 9109504 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001388888888888889, + "loss": 6.6056, + "theoretical_loss": 7.022015444656678, + "tokens_seen": 9175040 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001398809523809524, + "loss": 6.6079, + "theoretical_loss": 7.012831160039609, + "tokens_seen": 9240576 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014087301587301586, + "loss": 6.5921, + "theoretical_loss": 7.003729874327071, + "tokens_seen": 9306112 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014186507936507937, + "loss": 6.4466, + "theoretical_loss": 6.994710260827057, + "tokens_seen": 9371648 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014285714285714284, + "loss": 6.5997, + "theoretical_loss": 6.98577102314278, + "tokens_seen": 9437184 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014384920634920634, + "loss": 6.6293, + "theoretical_loss": 6.976910894276189, + "tokens_seen": 9502720 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014484126984126984, + "loss": 6.3903, + "theoretical_loss": 6.968128635764015, + "tokens_seen": 9568256 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014583333333333335, + "loss": 6.3525, + "theoretical_loss": 6.959423036844894, + "tokens_seen": 9633792 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014682539682539685, + "loss": 6.4582, + "theoretical_loss": 6.950792913656309, + "tokens_seen": 9699328 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014781746031746032, + "loss": 6.5639, + "theoretical_loss": 6.942237108460029, + "tokens_seen": 9764864 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 72059, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.56679105758667, + "objective/train/theoretical_loss": 6.9337544888949, + "objective/train/tokens_used": 30290400, + "theoretical_loss": 6.9337544888949, + "tokens_seen": 9830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014880952380952382, + "loss": 6.4872, + "theoretical_loss": 6.9337544888949, + "tokens_seen": 9830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001498015873015873, + "loss": 6.3905, + "theoretical_loss": 6.925343947255817, + "tokens_seen": 9895936 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001507936507936508, + "loss": 6.5979, + "theoretical_loss": 6.917004399797798, + "tokens_seen": 9961472 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015178571428571427, + "loss": 6.5659, + "theoretical_loss": 6.908734786064147, + "tokens_seen": 10027008 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001527777777777778, + "loss": 6.569, + "theoretical_loss": 6.900534068237688, + "tokens_seen": 10092544 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015376984126984128, + "loss": 6.4757, + "theoretical_loss": 6.89240123051416, + "tokens_seen": 10158080 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015476190476190478, + "loss": 6.4488, + "theoretical_loss": 6.884335278496871, + "tokens_seen": 10223616 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015575396825396825, + "loss": 6.5295, + "theoretical_loss": 6.87633523861175, + "tokens_seen": 10289152 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015674603174603175, + "loss": 6.5051, + "theoretical_loss": 6.868400157541997, + "tokens_seen": 10354688 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015773809523809523, + "loss": 6.5445, + "theoretical_loss": 6.860529101681551, + "tokens_seen": 10420224 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015873015873015873, + "loss": 6.3942, + "theoretical_loss": 6.85272115660663, + "tokens_seen": 10485760 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001597222222222222, + "loss": 6.4808, + "theoretical_loss": 6.844975426564642, + "tokens_seen": 10551296 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016071428571428573, + "loss": 6.486, + "theoretical_loss": 6.8372910339797945, + "tokens_seen": 10616832 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001617063492063492, + "loss": 6.4813, + "theoretical_loss": 6.829667118974749, + "tokens_seen": 10682368 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001626984126984127, + "loss": 6.5293, + "theoretical_loss": 6.8221028389077185, + "tokens_seen": 10747904 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016369047619047618, + "loss": 6.4264, + "theoretical_loss": 6.814597367924395, + "tokens_seen": 10813440 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016468253968253969, + "loss": 6.502, + "theoretical_loss": 6.807149896524181, + "tokens_seen": 10878976 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016567460317460316, + "loss": 6.4787, + "theoretical_loss": 6.799759631140145, + "tokens_seen": 10944512 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016666666666666666, + "loss": 6.4579, + "theoretical_loss": 6.7924257937322245, + "tokens_seen": 11010048 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016765873015873016, + "loss": 6.3582, + "theoretical_loss": 6.785147621393148, + "tokens_seen": 11075584 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016865079365079366, + "loss": 6.4468, + "theoretical_loss": 6.777924365966638, + "tokens_seen": 11141120 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016964285714285717, + "loss": 6.5197, + "theoretical_loss": 6.770755293677423, + "tokens_seen": 11206656 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017063492063492064, + "loss": 6.3758, + "theoretical_loss": 6.763639684772625, + "tokens_seen": 11272192 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017162698412698414, + "loss": 6.3577, + "theoretical_loss": 6.756576833174123, + "tokens_seen": 11337728 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017261904761904762, + "loss": 6.478, + "theoretical_loss": 6.749566046141486, + "tokens_seen": 11403264 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 74859, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.458885192871094, + "objective/train/theoretical_loss": 6.7426066439450905, + "objective/train/tokens_used": 31928800, + "theoretical_loss": 6.7426066439450905, + "tokens_seen": 11468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017361111111111112, + "loss": 6.3761, + "theoretical_loss": 6.7426066439450905, + "tokens_seen": 11468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001746031746031746, + "loss": 6.3486, + "theoretical_loss": 6.735697959549075, + "tokens_seen": 11534336 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001755952380952381, + "loss": 6.0777, + "theoretical_loss": 6.728839338303761, + "tokens_seen": 11599872 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001765873015873016, + "loss": 6.2819, + "theoretical_loss": 6.722030137647226, + "tokens_seen": 11665408 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001775793650793651, + "loss": 6.4933, + "theoretical_loss": 6.715269726815689, + "tokens_seen": 11730944 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017857142857142857, + "loss": 6.2914, + "theoretical_loss": 6.7085574865624125, + "tokens_seen": 11796480 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017956349206349207, + "loss": 6.365, + "theoretical_loss": 6.701892808884824, + "tokens_seen": 11862016 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018055555555555555, + "loss": 6.3504, + "theoretical_loss": 6.695275096759559, + "tokens_seen": 11927552 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018154761904761905, + "loss": 6.269, + "theoretical_loss": 6.68870376388518, + "tokens_seen": 11993088 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018253968253968252, + "loss": 6.3495, + "theoretical_loss": 6.682178234432274, + "tokens_seen": 12058624 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018353174603174602, + "loss": 6.3432, + "theoretical_loss": 6.675697942800715, + "tokens_seen": 12124160 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018452380952380953, + "loss": 6.3026, + "theoretical_loss": 6.669262333383815, + "tokens_seen": 12189696 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018551587301587303, + "loss": 6.2453, + "theoretical_loss": 6.662870860339158, + "tokens_seen": 12255232 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001865079365079365, + "loss": 6.439, + "theoretical_loss": 6.656522987365879, + "tokens_seen": 12320768 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001875, + "loss": 6.2713, + "theoretical_loss": 6.6502181874881705, + "tokens_seen": 12386304 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001884920634920635, + "loss": 6.3356, + "theoretical_loss": 6.643955942844831, + "tokens_seen": 12451840 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018948412698412698, + "loss": 6.3321, + "theoretical_loss": 6.637735744484626, + "tokens_seen": 12517376 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019047619047619048, + "loss": 6.4067, + "theoretical_loss": 6.631557092167304, + "tokens_seen": 12582912 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019146825396825398, + "loss": 6.2663, + "theoretical_loss": 6.625419494170049, + "tokens_seen": 12648448 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019246031746031748, + "loss": 6.2515, + "theoretical_loss": 6.619322467099223, + "tokens_seen": 12713984 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019345238095238096, + "loss": 6.3843, + "theoretical_loss": 6.613265535707211, + "tokens_seen": 12779520 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019444444444444446, + "loss": 6.3408, + "theoretical_loss": 6.607248232714213, + "tokens_seen": 12845056 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019543650793650793, + "loss": 6.3919, + "theoretical_loss": 6.60127009863481, + "tokens_seen": 12910592 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019642857142857144, + "loss": 6.1324, + "theoretical_loss": 6.59533068160918, + "tokens_seen": 12976128 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001974206349206349, + "loss": 6.3136, + "theoretical_loss": 6.589429537238785, + "tokens_seen": 13041664 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 79781, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.997321128845215, + "objective/train/theoretical_loss": 6.583566228426414, + "objective/train/tokens_used": 33567200, + "theoretical_loss": 6.583566228426414, + "tokens_seen": 13107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001984126984126984, + "loss": 6.0948, + "theoretical_loss": 6.583566228426414, + "tokens_seen": 13107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019940476190476191, + "loss": 6.2331, + "theoretical_loss": 6.5777403252204305, + "tokens_seen": 13172736 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020039682539682542, + "loss": 6.1609, + "theoretical_loss": 6.571951404663098, + "tokens_seen": 13238272 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002013888888888889, + "loss": 6.2912, + "theoretical_loss": 6.566199050642863, + "tokens_seen": 13303808 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002023809523809524, + "loss": 6.2575, + "theoretical_loss": 6.560482853750463, + "tokens_seen": 13369344 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020337301587301587, + "loss": 6.2837, + "theoretical_loss": 6.554802411138745, + "tokens_seen": 13434880 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020436507936507937, + "loss": 6.2622, + "theoretical_loss": 6.549157326386091, + "tokens_seen": 13500416 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020535714285714284, + "loss": 6.121, + "theoretical_loss": 6.54354720936333, + "tokens_seen": 13565952 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020634920634920634, + "loss": 6.2553, + "theoretical_loss": 6.537971676104026, + "tokens_seen": 13631488 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020734126984126985, + "loss": 6.3165, + "theoretical_loss": 6.532430348678068, + "tokens_seen": 13697024 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020833333333333335, + "loss": 6.2749, + "theoretical_loss": 6.5269228550684195, + "tokens_seen": 13762560 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020932539682539685, + "loss": 6.2514, + "theoretical_loss": 6.521448829050978, + "tokens_seen": 13828096 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021031746031746032, + "loss": 6.1101, + "theoretical_loss": 6.516007910077416, + "tokens_seen": 13893632 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021130952380952382, + "loss": 6.0964, + "theoretical_loss": 6.51059974316095, + "tokens_seen": 13959168 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002123015873015873, + "loss": 6.2298, + "theoretical_loss": 6.50522397876491, + "tokens_seen": 14024704 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002132936507936508, + "loss": 6.1403, + "theoretical_loss": 6.499880272694068, + "tokens_seen": 14090240 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021428571428571427, + "loss": 6.1319, + "theoretical_loss": 6.494568285988618, + "tokens_seen": 14155776 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002152777777777778, + "loss": 6.2058, + "theoretical_loss": 6.489287684820745, + "tokens_seen": 14221312 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021626984126984128, + "loss": 6.1177, + "theoretical_loss": 6.484038140393699, + "tokens_seen": 14286848 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021726190476190478, + "loss": 6.2828, + "theoretical_loss": 6.4788193288433105, + "tokens_seen": 14352384 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021825396825396825, + "loss": 6.2142, + "theoretical_loss": 6.473630931141869, + "tokens_seen": 14417920 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021924603174603176, + "loss": 6.2654, + "theoretical_loss": 6.468472633004308, + "tokens_seen": 14483456 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022023809523809523, + "loss": 6.0843, + "theoretical_loss": 6.463344124796616, + "tokens_seen": 14548992 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022123015873015873, + "loss": 6.1608, + "theoretical_loss": 6.45824510144643, + "tokens_seen": 14614528 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002222222222222222, + "loss": 6.2468, + "theoretical_loss": 6.45317526235573, + "tokens_seen": 14680064 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 82599, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.243563175201416, + "objective/train/theoretical_loss": 6.448134311315593, + "objective/train/tokens_used": 35205600, + "theoretical_loss": 6.448134311315593, + "tokens_seen": 14745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022321428571428573, + "loss": 6.0886, + "theoretical_loss": 6.448134311315593, + "tokens_seen": 14745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002242063492063492, + "loss": 6.1151, + "theoretical_loss": 6.443121956422939, + "tokens_seen": 14811136 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002251984126984127, + "loss": 6.2154, + "theoretical_loss": 6.438137909999214, + "tokens_seen": 14876672 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022619047619047618, + "loss": 6.0697, + "theoretical_loss": 6.433181888510964, + "tokens_seen": 14942208 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022718253968253969, + "loss": 6.1978, + "theoretical_loss": 6.428253612492239, + "tokens_seen": 15007744 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022817460317460316, + "loss": 6.1885, + "theoretical_loss": 6.4233528064687855, + "tokens_seen": 15073280 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022916666666666666, + "loss": 6.1685, + "theoretical_loss": 6.418479198883969, + "tokens_seen": 15138816 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023015873015873016, + "loss": 6.1792, + "theoretical_loss": 6.413632522026391, + "tokens_seen": 15204352 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023115079365079367, + "loss": 6.005, + "theoretical_loss": 6.40881251195914, + "tokens_seen": 15269888 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023214285714285717, + "loss": 6.1587, + "theoretical_loss": 6.404018908450656, + "tokens_seen": 15335424 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023313492063492064, + "loss": 6.2659, + "theoretical_loss": 6.399251454907132, + "tokens_seen": 15400960 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023412698412698414, + "loss": 6.1367, + "theoretical_loss": 6.394509898306452, + "tokens_seen": 15466496 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023511904761904762, + "loss": 6.1637, + "theoretical_loss": 6.389793989133574, + "tokens_seen": 15532032 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023611111111111112, + "loss": 6.0976, + "theoretical_loss": 6.385103481317387, + "tokens_seen": 15597568 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002371031746031746, + "loss": 6.0732, + "theoretical_loss": 6.380438132168923, + "tokens_seen": 15663104 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002380952380952381, + "loss": 6.0357, + "theoretical_loss": 6.375797702320966, + "tokens_seen": 15728640 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002390873015873016, + "loss": 6.0405, + "theoretical_loss": 6.371181955668966, + "tokens_seen": 15794176 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002400793650793651, + "loss": 6.1324, + "theoretical_loss": 6.366590659313248, + "tokens_seen": 15859712 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024107142857142857, + "loss": 5.9872, + "theoretical_loss": 6.36202358350248, + "tokens_seen": 15925248 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024206349206349207, + "loss": 6.1216, + "theoretical_loss": 6.357480501578371, + "tokens_seen": 15990784 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024305555555555555, + "loss": 6.0038, + "theoretical_loss": 6.352961189921553, + "tokens_seen": 16056320 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024404761904761905, + "loss": 6.0629, + "theoretical_loss": 6.348465427898629, + "tokens_seen": 16121856 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024503968253968255, + "loss": 5.9, + "theoretical_loss": 6.343992997810366, + "tokens_seen": 16187392 + }, + { + "epoch": 0.0, + "learning_rate": 0.000246031746031746, + "loss": 5.9241, + "theoretical_loss": 6.33954368484097, + "tokens_seen": 16252928 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024702380952380955, + "loss": 6.0887, + "theoretical_loss": 6.33511727700846, + "tokens_seen": 16318464 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 86378, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.125805854797363, + "objective/train/theoretical_loss": 6.330713565116083, + "objective/train/tokens_used": 36844000, + "theoretical_loss": 6.330713565116083, + "tokens_seen": 16384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.000248015873015873, + "loss": 6.1167, + "theoretical_loss": 6.330713565116083, + "tokens_seen": 16384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002490079365079365, + "loss": 6.2031, + "theoretical_loss": 6.326332342704751, + "tokens_seen": 16449536 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025, + "loss": 6.1574, + "theoretical_loss": 6.32197340600647, + "tokens_seen": 16515072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002509920634920635, + "loss": 6.1541, + "theoretical_loss": 6.3176365538987636, + "tokens_seen": 16580608 + }, + { + "epoch": 0.01, + "learning_rate": 0.000251984126984127, + "loss": 6.0582, + "theoretical_loss": 6.313321587860021, + "tokens_seen": 16646144 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025297619047619046, + "loss": 6.0493, + "theoretical_loss": 6.309028311925785, + "tokens_seen": 16711680 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025396825396825396, + "loss": 6.1732, + "theoretical_loss": 6.304756532645939, + "tokens_seen": 16777216 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025496031746031746, + "loss": 6.1079, + "theoretical_loss": 6.300506059042775, + "tokens_seen": 16842752 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025595238095238096, + "loss": 5.9529, + "theoretical_loss": 6.296276702569918, + "tokens_seen": 16908288 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002569444444444444, + "loss": 6.1879, + "theoretical_loss": 6.292068277072099, + "tokens_seen": 16973824 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025793650793650796, + "loss": 5.9456, + "theoretical_loss": 6.28788059874573, + "tokens_seen": 17039360 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025892857142857146, + "loss": 6.0203, + "theoretical_loss": 6.283713486100297, + "tokens_seen": 17104896 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025992063492063497, + "loss": 6.0847, + "theoretical_loss": 6.279566759920507, + "tokens_seen": 17170432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002609126984126984, + "loss": 6.096, + "theoretical_loss": 6.275440243229228, + "tokens_seen": 17235968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002619047619047619, + "loss": 6.031, + "theoretical_loss": 6.271333761251142, + "tokens_seen": 17301504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002628968253968254, + "loss": 6.0532, + "theoretical_loss": 6.267247141377137, + "tokens_seen": 17367040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002638888888888889, + "loss": 6.0181, + "theoretical_loss": 6.2631802131294085, + "tokens_seen": 17432576 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026488095238095237, + "loss": 6.0524, + "theoretical_loss": 6.259132808127246, + "tokens_seen": 17498112 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026587301587301587, + "loss": 6.0789, + "theoretical_loss": 6.255104760053497, + "tokens_seen": 17563648 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026686507936507937, + "loss": 6.0715, + "theoretical_loss": 6.251095904621689, + "tokens_seen": 17629184 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026785714285714287, + "loss": 6.1404, + "theoretical_loss": 6.247106079543801, + "tokens_seen": 17694720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002688492063492063, + "loss": 6.0391, + "theoretical_loss": 6.243135124498652, + "tokens_seen": 17760256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002698412698412698, + "loss": 5.9817, + "theoretical_loss": 6.239182881100916, + "tokens_seen": 17825792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002708333333333333, + "loss": 5.8515, + "theoretical_loss": 6.235249192870732, + "tokens_seen": 17891328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002718253968253968, + "loss": 6.0359, + "theoretical_loss": 6.231333905203899, + "tokens_seen": 17956864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 91138, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.83248233795166, + "objective/train/theoretical_loss": 6.227436865342643, + "objective/train/tokens_used": 38482400, + "theoretical_loss": 6.227436865342643, + "tokens_seen": 18022400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002728174603174603, + "loss": 5.9423, + "theoretical_loss": 6.227436865342643, + "tokens_seen": 18022400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002738095238095238, + "loss": 5.9662, + "theoretical_loss": 6.223557922346955, + "tokens_seen": 18087936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002748015873015873, + "loss": 5.941, + "theoretical_loss": 6.219696927066456, + "tokens_seen": 18153472 + }, + { + "epoch": 0.01, + "learning_rate": 0.00027579365079365083, + "loss": 6.0812, + "theoretical_loss": 6.215853732112821, + "tokens_seen": 18219008 + }, + { + "epoch": 0.01, + "learning_rate": 0.00027678571428571433, + "loss": 6.0511, + "theoretical_loss": 6.212028191832702, + "tokens_seen": 18284544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002777777777777778, + "loss": 5.9984, + "theoretical_loss": 6.208220162281178, + "tokens_seen": 18350080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002787698412698413, + "loss": 6.0931, + "theoretical_loss": 6.204429501195701, + "tokens_seen": 18415616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002797619047619048, + "loss": 5.9687, + "theoretical_loss": 6.20065606797053, + "tokens_seen": 18481152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002807539682539683, + "loss": 5.8368, + "theoretical_loss": 6.19689972363164, + "tokens_seen": 18546688 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028174603174603173, + "loss": 6.0321, + "theoretical_loss": 6.1931603308120975, + "tokens_seen": 18612224 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028273809523809523, + "loss": 6.0383, + "theoretical_loss": 6.189437753727901, + "tokens_seen": 18677760 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028373015873015873, + "loss": 5.9356, + "theoretical_loss": 6.185731858154261, + "tokens_seen": 18743296 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028472222222222223, + "loss": 6.0565, + "theoretical_loss": 6.182042511402313, + "tokens_seen": 18808832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002857142857142857, + "loss": 5.9098, + "theoretical_loss": 6.17836958229627, + "tokens_seen": 18874368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002867063492063492, + "loss": 5.8507, + "theoretical_loss": 6.1747129411509825, + "tokens_seen": 18939904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002876984126984127, + "loss": 5.8801, + "theoretical_loss": 6.171072459749913, + "tokens_seen": 19005440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002886904761904762, + "loss": 5.9178, + "theoretical_loss": 6.1674480113235095, + "tokens_seen": 19070976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002896825396825397, + "loss": 6.0824, + "theoretical_loss": 6.163839470527964, + "tokens_seen": 19136512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002906746031746032, + "loss": 5.9712, + "theoretical_loss": 6.160246713424372, + "tokens_seen": 19202048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002916666666666667, + "loss": 6.0553, + "theoretical_loss": 6.156669617458243, + "tokens_seen": 19267584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002926587301587302, + "loss": 5.9162, + "theoretical_loss": 6.153108061439397, + "tokens_seen": 19333120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002936507936507937, + "loss": 5.923, + "theoretical_loss": 6.149561925522211, + "tokens_seen": 19398656 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029464285714285714, + "loss": 5.9831, + "theoretical_loss": 6.146031091186222, + "tokens_seen": 19464192 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029563492063492064, + "loss": 5.979, + "theoretical_loss": 6.142515441217064, + "tokens_seen": 19529728 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029662698412698414, + "loss": 5.9701, + "theoretical_loss": 6.1390148596877605, + "tokens_seen": 19595264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 94128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.065183162689209, + "objective/train/theoretical_loss": 6.135529231940326, + "objective/train/tokens_used": 40120800, + "theoretical_loss": 6.135529231940326, + "tokens_seen": 19660800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029761904761904765, + "loss": 5.9817, + "theoretical_loss": 6.135529231940326, + "tokens_seen": 19660800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002986111111111111, + "loss": 5.9488, + "theoretical_loss": 6.132058444567705, + "tokens_seen": 19726336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002996031746031746, + "loss": 6.0179, + "theoretical_loss": 6.128602385396022, + "tokens_seen": 19791872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003005952380952381, + "loss": 5.918, + "theoretical_loss": 6.125160943467138, + "tokens_seen": 19857408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003015873015873016, + "loss": 5.9657, + "theoretical_loss": 6.121734009021521, + "tokens_seen": 19922944 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030257936507936505, + "loss": 5.8696, + "theoretical_loss": 6.118321473481398, + "tokens_seen": 19988480 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030357142857142855, + "loss": 5.9389, + "theoretical_loss": 6.114923229434213, + "tokens_seen": 20054016 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030456349206349205, + "loss": 5.8942, + "theoretical_loss": 6.111539170616359, + "tokens_seen": 20119552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003055555555555556, + "loss": 5.8891, + "theoretical_loss": 6.108169191897195, + "tokens_seen": 20185088 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030654761904761905, + "loss": 5.9234, + "theoretical_loss": 6.104813189263336, + "tokens_seen": 20250624 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030753968253968255, + "loss": 5.8841, + "theoretical_loss": 6.101471059803204, + "tokens_seen": 20316160 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030853174603174605, + "loss": 5.8354, + "theoretical_loss": 6.098142701691856, + "tokens_seen": 20381696 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030952380952380956, + "loss": 5.9279, + "theoretical_loss": 6.094828014176053, + "tokens_seen": 20447232 + }, + { + "epoch": 0.01, + "learning_rate": 0.000310515873015873, + "loss": 5.8435, + "theoretical_loss": 6.091526897559593, + "tokens_seen": 20512768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003115079365079365, + "loss": 5.7539, + "theoretical_loss": 6.088239253188885, + "tokens_seen": 20578304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003125, + "loss": 5.8578, + "theoretical_loss": 6.084964983438763, + "tokens_seen": 20643840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003134920634920635, + "loss": 5.92, + "theoretical_loss": 6.0817039916985465, + "tokens_seen": 20709376 + }, + { + "epoch": 0.01, + "learning_rate": 0.000314484126984127, + "loss": 5.9103, + "theoretical_loss": 6.078456182358325, + "tokens_seen": 20774912 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031547619047619046, + "loss": 5.8717, + "theoretical_loss": 6.075221460795472, + "tokens_seen": 20840448 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031646825396825396, + "loss": 5.9514, + "theoretical_loss": 6.071999733361386, + "tokens_seen": 20905984 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031746031746031746, + "loss": 5.9242, + "theoretical_loss": 6.068790907368448, + "tokens_seen": 20971520 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031845238095238096, + "loss": 5.8597, + "theoretical_loss": 6.0655948910771915, + "tokens_seen": 21037056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003194444444444444, + "loss": 5.8659, + "theoretical_loss": 6.062411593683687, + "tokens_seen": 21102592 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032043650793650796, + "loss": 5.8273, + "theoretical_loss": 6.059240925307134, + "tokens_seen": 21168128 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032142857142857147, + "loss": 5.8089, + "theoretical_loss": 6.056082796977648, + "tokens_seen": 21233664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 99030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.924115180969238, + "objective/train/theoretical_loss": 6.052937120624258, + "objective/train/tokens_used": 41759200, + "theoretical_loss": 6.052937120624258, + "tokens_seen": 21299200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032242063492063497, + "loss": 5.8191, + "theoretical_loss": 6.052937120624258, + "tokens_seen": 21299200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003234126984126984, + "loss": 5.8766, + "theoretical_loss": 6.049803809063083, + "tokens_seen": 21364736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003244047619047619, + "loss": 5.8766, + "theoretical_loss": 6.0466827759857145, + "tokens_seen": 21430272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003253968253968254, + "loss": 5.8165, + "theoretical_loss": 6.04357393594778, + "tokens_seen": 21495808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003263888888888889, + "loss": 5.8748, + "theoretical_loss": 6.040477204357686, + "tokens_seen": 21561344 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032738095238095237, + "loss": 5.8956, + "theoretical_loss": 6.037392497465552, + "tokens_seen": 21626880 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032837301587301587, + "loss": 5.7652, + "theoretical_loss": 6.034319732352309, + "tokens_seen": 21692416 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032936507936507937, + "loss": 5.8292, + "theoretical_loss": 6.031258826918979, + "tokens_seen": 21757952 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033035714285714287, + "loss": 5.7749, + "theoretical_loss": 6.0282096998761245, + "tokens_seen": 21823488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003313492063492063, + "loss": 5.8285, + "theoretical_loss": 6.025172270733464, + "tokens_seen": 21889024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003323412698412698, + "loss": 5.7802, + "theoretical_loss": 6.0221464597896475, + "tokens_seen": 21954560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003333333333333333, + "loss": 5.7303, + "theoretical_loss": 6.0191321881221995, + "tokens_seen": 22020096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003343253968253968, + "loss": 5.8749, + "theoretical_loss": 6.016129377577614, + "tokens_seen": 22085632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003353174603174603, + "loss": 5.748, + "theoretical_loss": 6.01313795076161, + "tokens_seen": 22151168 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003363095238095238, + "loss": 5.8951, + "theoretical_loss": 6.010157831029533, + "tokens_seen": 22216704 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033730158730158733, + "loss": 5.7507, + "theoretical_loss": 6.007188942476907, + "tokens_seen": 22282240 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033829365079365083, + "loss": 5.8147, + "theoretical_loss": 6.0042312099301425, + "tokens_seen": 22347776 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033928571428571433, + "loss": 5.8645, + "theoretical_loss": 6.001284558937368, + "tokens_seen": 22413312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003402777777777778, + "loss": 5.8086, + "theoretical_loss": 5.998348915759426, + "tokens_seen": 22478848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003412698412698413, + "loss": 5.6491, + "theoretical_loss": 5.995424207360987, + "tokens_seen": 22544384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003422619047619048, + "loss": 5.7877, + "theoretical_loss": 5.992510361401818, + "tokens_seen": 22609920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003432539682539683, + "loss": 5.8159, + "theoretical_loss": 5.989607306228168, + "tokens_seen": 22675456 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034424603174603173, + "loss": 5.798, + "theoretical_loss": 5.986714970864292, + "tokens_seen": 22740992 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034523809523809523, + "loss": 5.7716, + "theoretical_loss": 5.983833285004112, + "tokens_seen": 22806528 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034623015873015873, + "loss": 5.7508, + "theoretical_loss": 5.980962179002983, + "tokens_seen": 22872064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 101925, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.711303234100342, + "objective/train/theoretical_loss": 5.978101583869607, + "objective/train/tokens_used": 43397600, + "theoretical_loss": 5.978101583869607, + "tokens_seen": 22937600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034722222222222224, + "loss": 5.8797, + "theoretical_loss": 5.978101583869607, + "tokens_seen": 22937600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003482142857142857, + "loss": 5.7415, + "theoretical_loss": 5.975251431258057, + "tokens_seen": 23003136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003492063492063492, + "loss": 5.5518, + "theoretical_loss": 5.972411653459913, + "tokens_seen": 23068672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003501984126984127, + "loss": 5.8311, + "theoretical_loss": 5.9695821833965335, + "tokens_seen": 23134208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003511904761904762, + "loss": 5.6972, + "theoretical_loss": 5.966762954611432, + "tokens_seen": 23199744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003521825396825397, + "loss": 5.7567, + "theoretical_loss": 5.963953901262764, + "tokens_seen": 23265280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003531746031746032, + "loss": 5.8476, + "theoretical_loss": 5.961154958115937, + "tokens_seen": 23330816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003541666666666667, + "loss": 5.7693, + "theoretical_loss": 5.958366060536315, + "tokens_seen": 23396352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003551587301587302, + "loss": 5.8335, + "theoretical_loss": 5.955587144482044, + "tokens_seen": 23461888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003561507936507937, + "loss": 5.7313, + "theoretical_loss": 5.952818146496978, + "tokens_seen": 23527424 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035714285714285714, + "loss": 5.6252, + "theoretical_loss": 5.950059003703704, + "tokens_seen": 23592960 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035813492063492064, + "loss": 5.7927, + "theoretical_loss": 5.94730965379668, + "tokens_seen": 23658496 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035912698412698415, + "loss": 5.7894, + "theoretical_loss": 5.944570035035458, + "tokens_seen": 23724032 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036011904761904765, + "loss": 5.8101, + "theoretical_loss": 5.941840086238027, + "tokens_seen": 23789568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003611111111111111, + "loss": 5.6609, + "theoretical_loss": 5.939119746774228, + "tokens_seen": 23855104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003621031746031746, + "loss": 5.8044, + "theoretical_loss": 5.936408956559284, + "tokens_seen": 23920640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003630952380952381, + "loss": 5.7211, + "theoretical_loss": 5.933707656047414, + "tokens_seen": 23986176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003640873015873016, + "loss": 5.7469, + "theoretical_loss": 5.93101578622554, + "tokens_seen": 24051712 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036507936507936505, + "loss": 5.7008, + "theoretical_loss": 5.928333288607086, + "tokens_seen": 24117248 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036607142857142855, + "loss": 5.7237, + "theoretical_loss": 5.925660105225867, + "tokens_seen": 24182784 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036706349206349205, + "loss": 5.7901, + "theoretical_loss": 5.92299617863006, + "tokens_seen": 24248320 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003680555555555556, + "loss": 5.7617, + "theoretical_loss": 5.920341451876267, + "tokens_seen": 24313856 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036904761904761905, + "loss": 5.581, + "theoretical_loss": 5.9176958685236585, + "tokens_seen": 24379392 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037003968253968255, + "loss": 5.6535, + "theoretical_loss": 5.9150593726282015, + "tokens_seen": 24444928 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037103174603174606, + "loss": 5.7338, + "theoretical_loss": 5.912431908736972, + "tokens_seen": 24510464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 105716, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.7692060470581055, + "objective/train/theoretical_loss": 5.909813421882534, + "objective/train/tokens_used": 45036000, + "theoretical_loss": 5.909813421882534, + "tokens_seen": 24576000 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037202380952380956, + "loss": 5.6169, + "theoretical_loss": 5.909813421882534, + "tokens_seen": 24576000 + }, + { + "epoch": 0.01, + "learning_rate": 0.000373015873015873, + "loss": 5.6953, + "theoretical_loss": 5.907203857577422, + "tokens_seen": 24641536 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003740079365079365, + "loss": 5.6461, + "theoretical_loss": 5.9046031618086765, + "tokens_seen": 24707072 + }, + { + "epoch": 0.01, + "learning_rate": 0.000375, + "loss": 5.7207, + "theoretical_loss": 5.902011281032472, + "tokens_seen": 24772608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003759920634920635, + "loss": 5.6687, + "theoretical_loss": 5.899428162168808, + "tokens_seen": 24838144 + }, + { + "epoch": 0.01, + "learning_rate": 0.000376984126984127, + "loss": 5.711, + "theoretical_loss": 5.896853752596286, + "tokens_seen": 24903680 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037797619047619046, + "loss": 5.7208, + "theoretical_loss": 5.894288000146949, + "tokens_seen": 24969216 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037896825396825396, + "loss": 5.7187, + "theoretical_loss": 5.891730853101199, + "tokens_seen": 25034752 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037996031746031746, + "loss": 5.5468, + "theoretical_loss": 5.88918226018278, + "tokens_seen": 25100288 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038095238095238096, + "loss": 5.6112, + "theoretical_loss": 5.8866421705538325, + "tokens_seen": 25165824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003819444444444444, + "loss": 5.7421, + "theoretical_loss": 5.8841105338100155, + "tokens_seen": 25231360 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038293650793650797, + "loss": 5.7122, + "theoretical_loss": 5.881587299975694, + "tokens_seen": 25296896 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038392857142857147, + "loss": 5.6413, + "theoretical_loss": 5.8790724194991935, + "tokens_seen": 25362432 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038492063492063497, + "loss": 5.6528, + "theoretical_loss": 5.876565843248124, + "tokens_seen": 25427968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003859126984126984, + "loss": 5.4832, + "theoretical_loss": 5.8740675225047525, + "tokens_seen": 25493504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003869047619047619, + "loss": 5.483, + "theoretical_loss": 5.871577408961457, + "tokens_seen": 25559040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003878968253968254, + "loss": 5.7954, + "theoretical_loss": 5.869095454716231, + "tokens_seen": 25624576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003888888888888889, + "loss": 5.7052, + "theoretical_loss": 5.866621612268246, + "tokens_seen": 25690112 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038988095238095237, + "loss": 5.6162, + "theoretical_loss": 5.864155834513486, + "tokens_seen": 25755648 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039087301587301587, + "loss": 5.6197, + "theoretical_loss": 5.8616980747404295, + "tokens_seen": 25821184 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039186507936507937, + "loss": 5.6594, + "theoretical_loss": 5.859248286625787, + "tokens_seen": 25886720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003928571428571429, + "loss": 5.6119, + "theoretical_loss": 5.856806424230314, + "tokens_seen": 25952256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003938492063492063, + "loss": 5.6573, + "theoretical_loss": 5.854372441994654, + "tokens_seen": 26017792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003948412698412698, + "loss": 5.64, + "theoretical_loss": 5.851946294735258, + "tokens_seen": 26083328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003958333333333333, + "loss": 5.6432, + "theoretical_loss": 5.849527937640345, + "tokens_seen": 26148864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 110635, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.454937934875488, + "objective/train/theoretical_loss": 5.8471173262659235, + "objective/train/tokens_used": 46674400, + "theoretical_loss": 5.8471173262659235, + "tokens_seen": 26214400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003968253968253968, + "loss": 5.6936, + "theoretical_loss": 5.8471173262659235, + "tokens_seen": 26214400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003978174603174603, + "loss": 5.5767, + "theoretical_loss": 5.84471441653186, + "tokens_seen": 26279936 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039880952380952383, + "loss": 5.4744, + "theoretical_loss": 5.842319164718004, + "tokens_seen": 26345472 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039980158730158733, + "loss": 5.7648, + "theoretical_loss": 5.83993152746036, + "tokens_seen": 26411008 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040079365079365083, + "loss": 5.6679, + "theoretical_loss": 5.83755146174731, + "tokens_seen": 26476544 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040178571428571433, + "loss": 5.7353, + "theoretical_loss": 5.835178924915889, + "tokens_seen": 26542080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004027777777777778, + "loss": 5.6518, + "theoretical_loss": 5.832813874648102, + "tokens_seen": 26607616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004037698412698413, + "loss": 5.6277, + "theoretical_loss": 5.8304562689673, + "tokens_seen": 26673152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004047619047619048, + "loss": 5.616, + "theoretical_loss": 5.828106066234588, + "tokens_seen": 26738688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004057539682539683, + "loss": 5.6403, + "theoretical_loss": 5.825763225145295, + "tokens_seen": 26804224 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040674603174603173, + "loss": 5.5958, + "theoretical_loss": 5.823427704725473, + "tokens_seen": 26869760 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040773809523809523, + "loss": 5.592, + "theoretical_loss": 5.82109946432846, + "tokens_seen": 26935296 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040873015873015874, + "loss": 5.5542, + "theoretical_loss": 5.818778463631473, + "tokens_seen": 27000832 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040972222222222224, + "loss": 5.5567, + "theoretical_loss": 5.816464662632243, + "tokens_seen": 27066368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004107142857142857, + "loss": 5.6813, + "theoretical_loss": 5.8141580216457065, + "tokens_seen": 27131904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004117063492063492, + "loss": 5.5969, + "theoretical_loss": 5.811858501300729, + "tokens_seen": 27197440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004126984126984127, + "loss": 5.3853, + "theoretical_loss": 5.809566062536868, + "tokens_seen": 27262976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004136904761904762, + "loss": 5.6034, + "theoretical_loss": 5.807280666601191, + "tokens_seen": 27328512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004146825396825397, + "loss": 5.4179, + "theoretical_loss": 5.805002275045111, + "tokens_seen": 27394048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004156746031746032, + "loss": 5.5728, + "theoretical_loss": 5.8027308497212875, + "tokens_seen": 27459584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004166666666666667, + "loss": 5.5165, + "theoretical_loss": 5.800466352780546, + "tokens_seen": 27525120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004176587301587302, + "loss": 5.5769, + "theoretical_loss": 5.798208746668847, + "tokens_seen": 27590656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004186507936507937, + "loss": 5.4902, + "theoretical_loss": 5.795957994124291, + "tokens_seen": 27656192 + }, + { + "epoch": 0.01, + "learning_rate": 0.00041964285714285714, + "loss": 5.5816, + "theoretical_loss": 5.7937140581741575, + "tokens_seen": 27721728 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042063492063492065, + "loss": 5.5786, + "theoretical_loss": 5.791476902131985, + "tokens_seen": 27787264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 113598, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.648964881896973, + "objective/train/theoretical_loss": 5.789246489594688, + "objective/train/tokens_used": 48312800, + "theoretical_loss": 5.789246489594688, + "tokens_seen": 27852800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042162698412698415, + "loss": 5.4282, + "theoretical_loss": 5.789246489594688, + "tokens_seen": 27852800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042261904761904765, + "loss": 5.5614, + "theoretical_loss": 5.787022784439701, + "tokens_seen": 27918336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004236111111111111, + "loss": 5.5603, + "theoretical_loss": 5.784805750822171, + "tokens_seen": 27983872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004246031746031746, + "loss": 5.5407, + "theoretical_loss": 5.782595353172176, + "tokens_seen": 28049408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004255952380952381, + "loss": 5.502, + "theoretical_loss": 5.780391556191977, + "tokens_seen": 28114944 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004265873015873016, + "loss": 5.5964, + "theoretical_loss": 5.778194324853311, + "tokens_seen": 28180480 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042757936507936505, + "loss": 5.5111, + "theoretical_loss": 5.776003624394711, + "tokens_seen": 28246016 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042857142857142855, + "loss": 5.5779, + "theoretical_loss": 5.773819420318858, + "tokens_seen": 28311552 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042956349206349205, + "loss": 5.4489, + "theoretical_loss": 5.771641678389971, + "tokens_seen": 28377088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004305555555555556, + "loss": 5.5662, + "theoretical_loss": 5.769470364631225, + "tokens_seen": 28442624 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043154761904761905, + "loss": 5.422, + "theoretical_loss": 5.767305445322201, + "tokens_seen": 28508160 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043253968253968256, + "loss": 5.5771, + "theoretical_loss": 5.765146886996363, + "tokens_seen": 28573696 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043353174603174606, + "loss": 5.5915, + "theoretical_loss": 5.762994656438579, + "tokens_seen": 28639232 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043452380952380956, + "loss": 5.5499, + "theoretical_loss": 5.760848720682651, + "tokens_seen": 28704768 + }, + { + "epoch": 0.01, + "learning_rate": 0.000435515873015873, + "loss": 5.4539, + "theoretical_loss": 5.758709047008894, + "tokens_seen": 28770304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004365079365079365, + "loss": 5.5218, + "theoretical_loss": 5.756575602941732, + "tokens_seen": 28835840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004375, + "loss": 5.4988, + "theoretical_loss": 5.75444835624733, + "tokens_seen": 28901376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004384920634920635, + "loss": 5.375, + "theoretical_loss": 5.752327274931249, + "tokens_seen": 28966912 + }, + { + "epoch": 0.01, + "learning_rate": 0.000439484126984127, + "loss": 5.4416, + "theoretical_loss": 5.750212327236129, + "tokens_seen": 29032448 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044047619047619046, + "loss": 5.5363, + "theoretical_loss": 5.7481034816394105, + "tokens_seen": 29097984 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044146825396825396, + "loss": 5.4746, + "theoretical_loss": 5.7460007068510635, + "tokens_seen": 29163520 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044246031746031746, + "loss": 5.4861, + "theoretical_loss": 5.74390397181136, + "tokens_seen": 29229056 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044345238095238096, + "loss": 5.4093, + "theoretical_loss": 5.741813245688668, + "tokens_seen": 29294592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004444444444444444, + "loss": 5.5459, + "theoretical_loss": 5.739728497877267, + "tokens_seen": 29360128 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044543650793650797, + "loss": 5.5303, + "theoretical_loss": 5.737649697995197, + "tokens_seen": 29425664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 118398, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.534779071807861, + "objective/train/theoretical_loss": 5.7355768158821245, + "objective/train/tokens_used": 49951200, + "theoretical_loss": 5.7355768158821245, + "tokens_seen": 29491200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044642857142857147, + "loss": 5.5086, + "theoretical_loss": 5.7355768158821245, + "tokens_seen": 29491200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044742063492063497, + "loss": 5.4718, + "theoretical_loss": 5.73350982159724, + "tokens_seen": 29556736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004484126984126984, + "loss": 5.4009, + "theoretical_loss": 5.731448685417178, + "tokens_seen": 29622272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004494047619047619, + "loss": 5.4885, + "theoretical_loss": 5.729393377833956, + "tokens_seen": 29687808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004503968253968254, + "loss": 5.5496, + "theoretical_loss": 5.7273438695529535, + "tokens_seen": 29753344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004513888888888889, + "loss": 5.4559, + "theoretical_loss": 5.725300131490888, + "tokens_seen": 29818880 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045238095238095237, + "loss": 5.4535, + "theoretical_loss": 5.7232621347738455, + "tokens_seen": 29884416 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045337301587301587, + "loss": 5.5204, + "theoretical_loss": 5.721229850735305, + "tokens_seen": 29949952 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045436507936507937, + "loss": 5.5272, + "theoretical_loss": 5.719203250914208, + "tokens_seen": 30015488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004553571428571429, + "loss": 5.4962, + "theoretical_loss": 5.717182307053037, + "tokens_seen": 30081024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004563492063492063, + "loss": 5.466, + "theoretical_loss": 5.715166991095922, + "tokens_seen": 30146560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004573412698412698, + "loss": 5.4846, + "theoretical_loss": 5.713157275186761, + "tokens_seen": 30212096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004583333333333333, + "loss": 5.3369, + "theoretical_loss": 5.71115313166738, + "tokens_seen": 30277632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004593253968253968, + "loss": 5.5429, + "theoretical_loss": 5.709154533075688, + "tokens_seen": 30343168 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046031746031746033, + "loss": 5.4942, + "theoretical_loss": 5.707161452143879, + "tokens_seen": 30408704 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046130952380952383, + "loss": 5.3399, + "theoretical_loss": 5.7051738617966326, + "tokens_seen": 30474240 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046230158730158733, + "loss": 5.511, + "theoretical_loss": 5.7031917351493515, + "tokens_seen": 30539776 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046329365079365083, + "loss": 5.4333, + "theoretical_loss": 5.701215045506411, + "tokens_seen": 30605312 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046428571428571433, + "loss": 5.4144, + "theoretical_loss": 5.699243766359421, + "tokens_seen": 30670848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004652777777777778, + "loss": 5.4537, + "theoretical_loss": 5.697277871385534, + "tokens_seen": 30736384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004662698412698413, + "loss": 5.494, + "theoretical_loss": 5.695317334445736, + "tokens_seen": 30801920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004672619047619048, + "loss": 5.3601, + "theoretical_loss": 5.693362129583184, + "tokens_seen": 30867456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004682539682539683, + "loss": 5.3823, + "theoretical_loss": 5.691412231021549, + "tokens_seen": 30932992 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046924603174603173, + "loss": 5.395, + "theoretical_loss": 5.689467613163388, + "tokens_seen": 30998528 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047023809523809523, + "loss": 5.3819, + "theoretical_loss": 5.687528250588518, + "tokens_seen": 31064064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 121468, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.157979965209961, + "objective/train/theoretical_loss": 5.6855941180524265, + "objective/train/tokens_used": 51589600, + "theoretical_loss": 5.6855941180524265, + "tokens_seen": 31129600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047123015873015874, + "loss": 5.3373, + "theoretical_loss": 5.6855941180524265, + "tokens_seen": 31129600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047222222222222224, + "loss": 5.313, + "theoretical_loss": 5.683665190484683, + "tokens_seen": 31195136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004732142857142857, + "loss": 5.3682, + "theoretical_loss": 5.681741442987381, + "tokens_seen": 31260672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004742063492063492, + "loss": 5.2739, + "theoretical_loss": 5.679822850833591, + "tokens_seen": 31326208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004751984126984127, + "loss": 5.4301, + "theoretical_loss": 5.677909389465831, + "tokens_seen": 31391744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004761904761904762, + "loss": 5.3267, + "theoretical_loss": 5.676001034494554, + "tokens_seen": 31457280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004771825396825397, + "loss": 5.4425, + "theoretical_loss": 5.674097761696653, + "tokens_seen": 31522816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004781746031746032, + "loss": 5.3106, + "theoretical_loss": 5.672199547013983, + "tokens_seen": 31588352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004791666666666667, + "loss": 5.3464, + "theoretical_loss": 5.670306366551898, + "tokens_seen": 31653888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004801587301587302, + "loss": 5.4162, + "theoretical_loss": 5.6684181965778, + "tokens_seen": 31719424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004811507936507937, + "loss": 5.424, + "theoretical_loss": 5.666535013519715, + "tokens_seen": 31784960 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048214285714285715, + "loss": 5.4053, + "theoretical_loss": 5.6646567939648715, + "tokens_seen": 31850496 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048313492063492065, + "loss": 5.4443, + "theoretical_loss": 5.6627835146583045, + "tokens_seen": 31916032 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048412698412698415, + "loss": 5.3122, + "theoretical_loss": 5.660915152501465, + "tokens_seen": 31981568 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048511904761904765, + "loss": 5.243, + "theoretical_loss": 5.659051684550857, + "tokens_seen": 32047104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004861111111111111, + "loss": 5.4154, + "theoretical_loss": 5.657193088016677, + "tokens_seen": 32112640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004871031746031746, + "loss": 5.3613, + "theoretical_loss": 5.655339340261474, + "tokens_seen": 32178176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004880952380952381, + "loss": 5.3432, + "theoretical_loss": 5.653490418798825, + "tokens_seen": 32243712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004890873015873016, + "loss": 5.402, + "theoretical_loss": 5.651646301292022, + "tokens_seen": 32309248 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004900793650793651, + "loss": 5.4419, + "theoretical_loss": 5.649806965552774, + "tokens_seen": 32374784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004910714285714286, + "loss": 5.3914, + "theoretical_loss": 5.6479723895399205, + "tokens_seen": 32440320 + }, + { + "epoch": 0.01, + "learning_rate": 0.000492063492063492, + "loss": 5.3547, + "theoretical_loss": 5.6461425513581665, + "tokens_seen": 32505856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004930555555555556, + "loss": 5.3101, + "theoretical_loss": 5.6443174292568195, + "tokens_seen": 32571392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004940476190476191, + "loss": 5.4439, + "theoretical_loss": 5.6424970016285485, + "tokens_seen": 32636928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004950396825396826, + "loss": 5.3199, + "theoretical_loss": 5.640681247008156, + "tokens_seen": 32702464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 125388, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.230047225952148, + "objective/train/theoretical_loss": 5.638870144071353, + "objective/train/tokens_used": 53228000, + "theoretical_loss": 5.638870144071353, + "tokens_seen": 32768000 + }, + { + "epoch": 0.01, + "learning_rate": 0.000496031746031746, + "loss": 5.2621, + "theoretical_loss": 5.638870144071353, + "tokens_seen": 32768000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004970238095238095, + "loss": 5.2634, + "theoretical_loss": 5.637063671633564, + "tokens_seen": 32833536 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498015873015873, + "loss": 5.361, + "theoretical_loss": 5.635261808648728, + "tokens_seen": 32899072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990079365079365, + "loss": 5.295, + "theoretical_loss": 5.6334645342081195, + "tokens_seen": 32964608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0005, + "loss": 5.3021, + "theoretical_loss": 5.631671827539186, + "tokens_seen": 33030144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999899699097292, + "loss": 5.4006, + "theoretical_loss": 5.629883668004389, + "tokens_seen": 33095680 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999799398194584, + "loss": 5.2592, + "theoretical_loss": 5.628100035100061, + "tokens_seen": 33161216 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999699097291876, + "loss": 5.4167, + "theoretical_loss": 5.626320908455279, + "tokens_seen": 33226752 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999598796389167, + "loss": 5.341, + "theoretical_loss": 5.6245462678307385, + "tokens_seen": 33292288 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499949849548646, + "loss": 5.3891, + "theoretical_loss": 5.622776093117652, + "tokens_seen": 33357824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999398194583751, + "loss": 5.3496, + "theoretical_loss": 5.621010364336651, + "tokens_seen": 33423360 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999297893681044, + "loss": 5.357, + "theoretical_loss": 5.619249061636698, + "tokens_seen": 33488896 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999197592778335, + "loss": 5.1972, + "theoretical_loss": 5.61749216529402, + "tokens_seen": 33554432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999097291875627, + "loss": 5.1447, + "theoretical_loss": 5.615739655711037, + "tokens_seen": 33619968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998996990972919, + "loss": 5.2689, + "theoretical_loss": 5.61399151341532, + "tokens_seen": 33685504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998896690070211, + "loss": 5.2205, + "theoretical_loss": 5.6122477190585425, + "tokens_seen": 33751040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998796389167503, + "loss": 5.2017, + "theoretical_loss": 5.610508253415453, + "tokens_seen": 33816576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998696088264795, + "loss": 5.2831, + "theoretical_loss": 5.6087730973828585, + "tokens_seen": 33882112 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998595787362087, + "loss": 5.3221, + "theoretical_loss": 5.6070422319786095, + "tokens_seen": 33947648 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998495486459378, + "loss": 5.3296, + "theoretical_loss": 5.605315638340606, + "tokens_seen": 34013184 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499839518555667, + "loss": 5.3342, + "theoretical_loss": 5.603593297725807, + "tokens_seen": 34078720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998294884653962, + "loss": 5.2039, + "theoretical_loss": 5.601875191509249, + "tokens_seen": 34144256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998194583751254, + "loss": 5.2283, + "theoretical_loss": 5.600161301183084, + "tokens_seen": 34209792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998094282848546, + "loss": 5.2802, + "theoretical_loss": 5.598451608355614, + "tokens_seen": 34275328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997993981945837, + "loss": 5.3015, + "theoretical_loss": 5.596746094750342, + "tokens_seen": 34340864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 129871, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.297621250152588, + "objective/train/theoretical_loss": 5.595044742205037, + "objective/train/tokens_used": 54866400, + "theoretical_loss": 5.595044742205037, + "tokens_seen": 34406400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997893681043129, + "loss": 5.2338, + "theoretical_loss": 5.595044742205037, + "tokens_seen": 34406400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997793380140421, + "loss": 5.2656, + "theoretical_loss": 5.5933475326707995, + "tokens_seen": 34471936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997693079237714, + "loss": 5.2797, + "theoretical_loss": 5.591654448211143, + "tokens_seen": 34537472 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997592778335005, + "loss": 5.25, + "theoretical_loss": 5.589965471001077, + "tokens_seen": 34603008 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997492477432298, + "loss": 5.2665, + "theoretical_loss": 5.5882805833262115, + "tokens_seen": 34668544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997392176529588, + "loss": 5.3501, + "theoretical_loss": 5.586599767581859, + "tokens_seen": 34734080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997291875626881, + "loss": 5.2371, + "theoretical_loss": 5.584923006272151, + "tokens_seen": 34799616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997191574724173, + "loss": 5.1854, + "theoretical_loss": 5.583250282009159, + "tokens_seen": 34865152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997091273821465, + "loss": 5.2736, + "theoretical_loss": 5.581581577512031, + "tokens_seen": 34930688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996990972918757, + "loss": 5.2715, + "theoretical_loss": 5.579916875606134, + "tokens_seen": 34996224 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996890672016048, + "loss": 5.1803, + "theoretical_loss": 5.578256159222196, + "tokens_seen": 35061760 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499679037111334, + "loss": 5.1744, + "theoretical_loss": 5.576599411395472, + "tokens_seen": 35127296 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996690070210632, + "loss": 5.2362, + "theoretical_loss": 5.574946615264906, + "tokens_seen": 35192832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996589769307924, + "loss": 5.1904, + "theoretical_loss": 5.5732977540723105, + "tokens_seen": 35258368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996489468405216, + "loss": 5.152, + "theoretical_loss": 5.571652811161542, + "tokens_seen": 35323904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996389167502507, + "loss": 5.2052, + "theoretical_loss": 5.570011769977693, + "tokens_seen": 35389440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996288866599799, + "loss": 5.2434, + "theoretical_loss": 5.568374614066299, + "tokens_seen": 35454976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996188565697091, + "loss": 5.299, + "theoretical_loss": 5.566741327072535, + "tokens_seen": 35520512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996088264794383, + "loss": 5.2863, + "theoretical_loss": 5.565111892740433, + "tokens_seen": 35586048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995987963891675, + "loss": 5.2562, + "theoretical_loss": 5.563486294912105, + "tokens_seen": 35651584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995887662988968, + "loss": 5.1109, + "theoretical_loss": 5.56186451752697, + "tokens_seen": 35717120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995787362086258, + "loss": 5.1261, + "theoretical_loss": 5.560246544620993, + "tokens_seen": 35782656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995687061183551, + "loss": 5.1447, + "theoretical_loss": 5.558632360325929, + "tokens_seen": 35848192 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995586760280842, + "loss": 5.207, + "theoretical_loss": 5.557021948868571, + "tokens_seen": 35913728 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995486459378135, + "loss": 5.0805, + "theoretical_loss": 5.555415294570011, + "tokens_seen": 35979264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 133279, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.201353549957275, + "objective/train/theoretical_loss": 5.553812381844907, + "objective/train/tokens_used": 56504800, + "theoretical_loss": 5.553812381844907, + "tokens_seen": 36044800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995386158475427, + "loss": 5.2018, + "theoretical_loss": 5.553812381844907, + "tokens_seen": 36044800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995285857572718, + "loss": 5.2167, + "theoretical_loss": 5.552213195200755, + "tokens_seen": 36110336 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499518555667001, + "loss": 5.1571, + "theoretical_loss": 5.550617719237167, + "tokens_seen": 36175872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995085255767302, + "loss": 4.988, + "theoretical_loss": 5.549025938645155, + "tokens_seen": 36241408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994984954864594, + "loss": 5.1538, + "theoretical_loss": 5.547437838206435, + "tokens_seen": 36306944 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994884653961886, + "loss": 5.1898, + "theoretical_loss": 5.545853402792717, + "tokens_seen": 36372480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994784353059178, + "loss": 5.1026, + "theoretical_loss": 5.544272617365014, + "tokens_seen": 36438016 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994684052156469, + "loss": 5.1055, + "theoretical_loss": 5.542695466972956, + "tokens_seen": 36503552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994583751253761, + "loss": 5.1314, + "theoretical_loss": 5.541121936754111, + "tokens_seen": 36569088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994483450351053, + "loss": 5.2148, + "theoretical_loss": 5.539552011933312, + "tokens_seen": 36634624 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994383149448345, + "loss": 5.1095, + "theoretical_loss": 5.537985677821986, + "tokens_seen": 36700160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994282848545637, + "loss": 5.1574, + "theoretical_loss": 5.536422919817495, + "tokens_seen": 36765696 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994182547642928, + "loss": 5.0786, + "theoretical_loss": 5.5348637234024824, + "tokens_seen": 36831232 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994082246740221, + "loss": 5.1143, + "theoretical_loss": 5.53330807414422, + "tokens_seen": 36896768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993981945837512, + "loss": 5.0789, + "theoretical_loss": 5.5317559576939725, + "tokens_seen": 36962304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993881644934805, + "loss": 4.9922, + "theoretical_loss": 5.530207359786353, + "tokens_seen": 37027840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993781344032096, + "loss": 5.2217, + "theoretical_loss": 5.5286622662386975, + "tokens_seen": 37093376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993681043129389, + "loss": 5.1455, + "theoretical_loss": 5.52712066295044, + "tokens_seen": 37158912 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499358074222668, + "loss": 5.1498, + "theoretical_loss": 5.525582535902489, + "tokens_seen": 37224448 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993480441323972, + "loss": 5.1892, + "theoretical_loss": 5.524047871156618, + "tokens_seen": 37289984 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993380140421264, + "loss": 5.1692, + "theoretical_loss": 5.52251665485486, + "tokens_seen": 37355520 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993279839518556, + "loss": 5.1239, + "theoretical_loss": 5.520988873218897, + "tokens_seen": 37421056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993179538615848, + "loss": 5.074, + "theoretical_loss": 5.519464512549478, + "tokens_seen": 37486592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993079237713139, + "loss": 5.0963, + "theoretical_loss": 5.5179435592258095, + "tokens_seen": 37552128 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992978936810431, + "loss": 5.1509, + "theoretical_loss": 5.516425999704987, + "tokens_seen": 37617664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 136157, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.125978946685791, + "objective/train/theoretical_loss": 5.514911820521407, + "objective/train/tokens_used": 58143200, + "theoretical_loss": 5.514911820521407, + "tokens_seen": 37683200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992878635907723, + "loss": 5.1517, + "theoretical_loss": 5.514911820521407, + "tokens_seen": 37683200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992778335005015, + "loss": 5.076, + "theoretical_loss": 5.5134010082861895, + "tokens_seen": 37748736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992678034102307, + "loss": 5.146, + "theoretical_loss": 5.511893549686616, + "tokens_seen": 37814272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992577733199598, + "loss": 5.1829, + "theoretical_loss": 5.51038943148556, + "tokens_seen": 37879808 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499247743229689, + "loss": 5.2205, + "theoretical_loss": 5.508888640520928, + "tokens_seen": 37945344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992377131394183, + "loss": 5.002, + "theoretical_loss": 5.50739116370511, + "tokens_seen": 38010880 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992276830491475, + "loss": 5.2065, + "theoretical_loss": 5.505896988024423, + "tokens_seen": 38076416 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992176529588767, + "loss": 5.1859, + "theoretical_loss": 5.5044061005385725, + "tokens_seen": 38141952 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992076228686059, + "loss": 5.1453, + "theoretical_loss": 5.502918488380116, + "tokens_seen": 38207488 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499197592778335, + "loss": 5.0665, + "theoretical_loss": 5.501434138753918, + "tokens_seen": 38273024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991875626880642, + "loss": 5.0175, + "theoretical_loss": 5.499953038936635, + "tokens_seen": 38338560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991775325977934, + "loss": 4.9964, + "theoretical_loss": 5.498475176276176, + "tokens_seen": 38404096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991675025075226, + "loss": 5.0337, + "theoretical_loss": 5.497000538191195, + "tokens_seen": 38469632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991574724172518, + "loss": 5.1417, + "theoretical_loss": 5.495529112170568, + "tokens_seen": 38535168 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499147442326981, + "loss": 5.0655, + "theoretical_loss": 5.494060885772887, + "tokens_seen": 38600704 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991374122367101, + "loss": 5.0774, + "theoretical_loss": 5.492595846625951, + "tokens_seen": 38666240 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991273821464393, + "loss": 5.1069, + "theoretical_loss": 5.491133982426266, + "tokens_seen": 38731776 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991173520561685, + "loss": 5.083, + "theoretical_loss": 5.489675280938547, + "tokens_seen": 38797312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991073219658977, + "loss": 5.0716, + "theoretical_loss": 5.488219729995227, + "tokens_seen": 38862848 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499097291875627, + "loss": 5.0837, + "theoretical_loss": 5.486767317495966, + "tokens_seen": 38928384 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499087261785356, + "loss": 5.0556, + "theoretical_loss": 5.48531803140717, + "tokens_seen": 38993920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990772316950853, + "loss": 5.1244, + "theoretical_loss": 5.483871859761511, + "tokens_seen": 39059456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990672016048144, + "loss": 5.0696, + "theoretical_loss": 5.482428790657449, + "tokens_seen": 39124992 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990571715145437, + "loss": 5.1238, + "theoretical_loss": 5.480988812258763, + "tokens_seen": 39190528 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990471414242729, + "loss": 5.0737, + "theoretical_loss": 5.479551912794086, + "tokens_seen": 39256064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 140929, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.0706868171691895, + "objective/train/theoretical_loss": 5.478118080556438, + "objective/train/tokens_used": 59781600, + "theoretical_loss": 5.478118080556438, + "tokens_seen": 39321600 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499037111334002, + "loss": 5.0808, + "theoretical_loss": 5.478118080556438, + "tokens_seen": 39321600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990270812437312, + "loss": 4.8909, + "theoretical_loss": 5.476687303902768, + "tokens_seen": 39387136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990170511534604, + "loss": 5.1081, + "theoretical_loss": 5.475259571253502, + "tokens_seen": 39452672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990070210631896, + "loss": 5.1546, + "theoretical_loss": 5.473834871092089, + "tokens_seen": 39518208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989969909729188, + "loss": 4.9641, + "theoretical_loss": 5.4724131919645576, + "tokens_seen": 39583744 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498986960882648, + "loss": 4.97, + "theoretical_loss": 5.470994522479069, + "tokens_seen": 39649280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989769307923771, + "loss": 5.0672, + "theoretical_loss": 5.4695788513054815, + "tokens_seen": 39714816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989669007021063, + "loss": 5.0077, + "theoretical_loss": 5.468166167174912, + "tokens_seen": 39780352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989568706118355, + "loss": 4.9971, + "theoretical_loss": 5.466756458879306, + "tokens_seen": 39845888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989468405215647, + "loss": 5.0227, + "theoretical_loss": 5.465349715271013, + "tokens_seen": 39911424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989368104312939, + "loss": 4.9819, + "theoretical_loss": 5.463945925262355, + "tokens_seen": 39976960 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498926780341023, + "loss": 5.0182, + "theoretical_loss": 5.462545077825214, + "tokens_seen": 40042496 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989167502507523, + "loss": 4.9084, + "theoretical_loss": 5.461147161990611, + "tokens_seen": 40108032 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989067201604814, + "loss": 5.0463, + "theoretical_loss": 5.459752166848292, + "tokens_seen": 40173568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988966900702107, + "loss": 5.0472, + "theoretical_loss": 5.458360081546321, + "tokens_seen": 40239104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988866599799398, + "loss": 5.0402, + "theoretical_loss": 5.456970895290674, + "tokens_seen": 40304640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988766298896691, + "loss": 5.0373, + "theoretical_loss": 5.455584597344835, + "tokens_seen": 40370176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988665997993982, + "loss": 4.9383, + "theoretical_loss": 5.454201177029395, + "tokens_seen": 40435712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988565697091274, + "loss": 5.0924, + "theoretical_loss": 5.452820623721662, + "tokens_seen": 40501248 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988465396188566, + "loss": 5.1169, + "theoretical_loss": 5.45144292685526, + "tokens_seen": 40566784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988365095285858, + "loss": 5.0519, + "theoretical_loss": 5.450068075919752, + "tokens_seen": 40632320 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498826479438315, + "loss": 4.9982, + "theoretical_loss": 5.44869606046024, + "tokens_seen": 40697856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988164493480441, + "loss": 4.9847, + "theoretical_loss": 5.447326870076996, + "tokens_seen": 40763392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988064192577733, + "loss": 4.9257, + "theoretical_loss": 5.445960494425072, + "tokens_seen": 40828928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987963891675025, + "loss": 5.0428, + "theoretical_loss": 5.444596923213931, + "tokens_seen": 40894464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 144562, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.1864800453186035, + "objective/train/theoretical_loss": 5.443236146207074, + "objective/train/tokens_used": 61420000, + "theoretical_loss": 5.443236146207074, + "tokens_seen": 40960000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987863590772317, + "loss": 5.0335, + "theoretical_loss": 5.443236146207074, + "tokens_seen": 40960000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987763289869609, + "loss": 4.9408, + "theoretical_loss": 5.441878153221662, + "tokens_seen": 41025536 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049876629889669, + "loss": 5.0268, + "theoretical_loss": 5.440522934128164, + "tokens_seen": 41091072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987562688064192, + "loss": 4.9282, + "theoretical_loss": 5.439170478849976, + "tokens_seen": 41156608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987462387161484, + "loss": 4.9223, + "theoretical_loss": 5.437820777363078, + "tokens_seen": 41222144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987362086258777, + "loss": 4.9247, + "theoretical_loss": 5.4364738196956655, + "tokens_seen": 41287680 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987261785356068, + "loss": 4.9027, + "theoretical_loss": 5.435129595927794, + "tokens_seen": 41353216 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987161484453361, + "loss": 4.9038, + "theoretical_loss": 5.433788096191039, + "tokens_seen": 41418752 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987061183550651, + "loss": 4.9935, + "theoretical_loss": 5.432449310668134, + "tokens_seen": 41484288 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986960882647944, + "loss": 5.0582, + "theoretical_loss": 5.4311132295926345, + "tokens_seen": 41549824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986860581745236, + "loss": 5.0057, + "theoretical_loss": 5.42977984324857, + "tokens_seen": 41615360 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986760280842528, + "loss": 4.9362, + "theoretical_loss": 5.428449141970107, + "tokens_seen": 41680896 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498665997993982, + "loss": 4.9497, + "theoretical_loss": 5.427121116141212, + "tokens_seen": 41746432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986559679037111, + "loss": 4.9611, + "theoretical_loss": 5.42579575619531, + "tokens_seen": 41811968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986459378134403, + "loss": 4.993, + "theoretical_loss": 5.424473052614967, + "tokens_seen": 41877504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986359077231695, + "loss": 4.9695, + "theoretical_loss": 5.423152995931552, + "tokens_seen": 41943040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986258776328987, + "loss": 5.0474, + "theoretical_loss": 5.421835576724906, + "tokens_seen": 42008576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986158475426279, + "loss": 4.9263, + "theoretical_loss": 5.420520785623031, + "tokens_seen": 42074112 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498605817452357, + "loss": 4.9627, + "theoretical_loss": 5.4192086133017625, + "tokens_seen": 42139648 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985957873620862, + "loss": 4.9688, + "theoretical_loss": 5.417899050484451, + "tokens_seen": 42205184 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985857572718154, + "loss": 4.9594, + "theoretical_loss": 5.416592087941646, + "tokens_seen": 42270720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985757271815446, + "loss": 4.914, + "theoretical_loss": 5.415287716490787, + "tokens_seen": 42336256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985656970912738, + "loss": 4.884, + "theoretical_loss": 5.413985926995892, + "tokens_seen": 42401792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985556670010031, + "loss": 4.933, + "theoretical_loss": 5.412686710367245, + "tokens_seen": 42467328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985456369107321, + "loss": 4.859, + "theoretical_loss": 5.411390057561097, + "tokens_seen": 42532864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 147605, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.961941242218018, + "objective/train/theoretical_loss": 5.410095959579362, + "objective/train/tokens_used": 63058400, + "theoretical_loss": 5.410095959579362, + "tokens_seen": 42598400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985356068204614, + "loss": 4.9436, + "theoretical_loss": 5.410095959579362, + "tokens_seen": 42598400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985255767301905, + "loss": 5.0078, + "theoretical_loss": 5.408804407469308, + "tokens_seen": 42663936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985155466399198, + "loss": 4.9442, + "theoretical_loss": 5.407515392323276, + "tokens_seen": 42729472 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498505516549649, + "loss": 4.9535, + "theoretical_loss": 5.406228905278368, + "tokens_seen": 42795008 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984954864593782, + "loss": 4.91, + "theoretical_loss": 5.404944937516161, + "tokens_seen": 42860544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984854563691073, + "loss": 4.9872, + "theoretical_loss": 5.403663480262418, + "tokens_seen": 42926080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984754262788365, + "loss": 4.9409, + "theoretical_loss": 5.402384524786797, + "tokens_seen": 42991616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984653961885657, + "loss": 4.9064, + "theoretical_loss": 5.401108062402562, + "tokens_seen": 43057152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984553660982949, + "loss": 4.8755, + "theoretical_loss": 5.399834084466306, + "tokens_seen": 43122688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984453360080241, + "loss": 4.9289, + "theoretical_loss": 5.398562582377666, + "tokens_seen": 43188224 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984353059177532, + "loss": 4.8312, + "theoretical_loss": 5.397293547579041, + "tokens_seen": 43253760 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984252758274825, + "loss": 4.9232, + "theoretical_loss": 5.396026971555319, + "tokens_seen": 43319296 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984152457372116, + "loss": 4.9173, + "theoretical_loss": 5.394762845833601, + "tokens_seen": 43384832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984052156469409, + "loss": 4.8911, + "theoretical_loss": 5.393501161982926, + "tokens_seen": 43450368 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049839518555667, + "loss": 4.9085, + "theoretical_loss": 5.392241911614005, + "tokens_seen": 43515904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983851554663993, + "loss": 4.9038, + "theoretical_loss": 5.390985086378949, + "tokens_seen": 43581440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983751253761284, + "loss": 4.8963, + "theoretical_loss": 5.389730677971002, + "tokens_seen": 43646976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983650952858576, + "loss": 4.8313, + "theoretical_loss": 5.388478678124285, + "tokens_seen": 43712512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983550651955868, + "loss": 4.9585, + "theoretical_loss": 5.387229078613521, + "tokens_seen": 43778048 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498345035105316, + "loss": 4.9063, + "theoretical_loss": 5.385981871253785, + "tokens_seen": 43843584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983350050150452, + "loss": 4.7834, + "theoretical_loss": 5.384737047900243, + "tokens_seen": 43909120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983249749247743, + "loss": 4.9174, + "theoretical_loss": 5.3834946004478965, + "tokens_seen": 43974656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983149448345035, + "loss": 4.9098, + "theoretical_loss": 5.382254520831328, + "tokens_seen": 44040192 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983049147442327, + "loss": 4.8257, + "theoretical_loss": 5.381016801024449, + "tokens_seen": 44105728 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982948846539619, + "loss": 4.9429, + "theoretical_loss": 5.379781433040252, + "tokens_seen": 44171264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 152556, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.84815788269043, + "objective/train/theoretical_loss": 5.378548408930558, + "objective/train/tokens_used": 64696800, + "theoretical_loss": 5.378548408930558, + "tokens_seen": 44236800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982848545636911, + "loss": 4.8622, + "theoretical_loss": 5.378548408930558, + "tokens_seen": 44236800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982748244734202, + "loss": 4.8836, + "theoretical_loss": 5.377317720785777, + "tokens_seen": 44302336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982647943831494, + "loss": 4.9127, + "theoretical_loss": 5.37608936073466, + "tokens_seen": 44367872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982547642928786, + "loss": 4.8605, + "theoretical_loss": 5.374863320944057, + "tokens_seen": 44433408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982447342026079, + "loss": 4.7671, + "theoretical_loss": 5.373639593618675, + "tokens_seen": 44498944 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498234704112337, + "loss": 4.9045, + "theoretical_loss": 5.372418171000847, + "tokens_seen": 44564480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982246740220663, + "loss": 4.8433, + "theoretical_loss": 5.371199045370283, + "tokens_seen": 44630016 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982146439317953, + "loss": 4.9034, + "theoretical_loss": 5.369982209043851, + "tokens_seen": 44695552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982046138415246, + "loss": 4.9305, + "theoretical_loss": 5.368767654375327, + "tokens_seen": 44761088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981945837512538, + "loss": 4.9046, + "theoretical_loss": 5.367555373755179, + "tokens_seen": 44826624 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498184553660983, + "loss": 4.872, + "theoretical_loss": 5.366345359610327, + "tokens_seen": 44892160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981745235707122, + "loss": 5.016, + "theoretical_loss": 5.365137604403923, + "tokens_seen": 44957696 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981644934804413, + "loss": 4.9065, + "theoretical_loss": 5.363932100635117, + "tokens_seen": 45023232 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981544633901705, + "loss": 4.8304, + "theoretical_loss": 5.362728840838843, + "tokens_seen": 45088768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981444332998997, + "loss": 4.9154, + "theoretical_loss": 5.361527817585586, + "tokens_seen": 45154304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981344032096289, + "loss": 4.8804, + "theoretical_loss": 5.360329023481169, + "tokens_seen": 45219840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981243731193581, + "loss": 4.8716, + "theoretical_loss": 5.359132451166534, + "tokens_seen": 45285376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981143430290873, + "loss": 4.8306, + "theoretical_loss": 5.357938093317518, + "tokens_seen": 45350912 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981043129388164, + "loss": 4.779, + "theoretical_loss": 5.356745942644645, + "tokens_seen": 45416448 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980942828485456, + "loss": 4.9063, + "theoretical_loss": 5.355555991892905, + "tokens_seen": 45481984 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980842527582748, + "loss": 4.8626, + "theoretical_loss": 5.35436823384155, + "tokens_seen": 45547520 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498074222668004, + "loss": 4.7971, + "theoretical_loss": 5.353182661303873, + "tokens_seen": 45613056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980641925777333, + "loss": 4.8087, + "theoretical_loss": 5.35199926712701, + "tokens_seen": 45678592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980541624874623, + "loss": 4.8797, + "theoretical_loss": 5.350818044191721, + "tokens_seen": 45744128 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980441323971916, + "loss": 4.8033, + "theoretical_loss": 5.349638985412193, + "tokens_seen": 45809664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 155500, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.722625255584717, + "objective/train/theoretical_loss": 5.348462083735834, + "objective/train/tokens_used": 66335200, + "theoretical_loss": 5.348462083735834, + "tokens_seen": 45875200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980341023069207, + "loss": 4.8402, + "theoretical_loss": 5.348462083735834, + "tokens_seen": 45875200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049802407221665, + "loss": 4.8472, + "theoretical_loss": 5.347287332143064, + "tokens_seen": 45940736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980140421263792, + "loss": 4.8456, + "theoretical_loss": 5.346114723647119, + "tokens_seen": 46006272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980040120361084, + "loss": 4.8908, + "theoretical_loss": 5.344944251293852, + "tokens_seen": 46071808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979939819458375, + "loss": 4.8046, + "theoretical_loss": 5.343775908161532, + "tokens_seen": 46137344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979839518555667, + "loss": 4.8745, + "theoretical_loss": 5.342609687360644, + "tokens_seen": 46202880 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979739217652959, + "loss": 4.8665, + "theoretical_loss": 5.341445582033705, + "tokens_seen": 46268416 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979638916750251, + "loss": 4.7444, + "theoretical_loss": 5.3402835853550545, + "tokens_seen": 46333952 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979538615847543, + "loss": 4.8877, + "theoretical_loss": 5.339123690530673, + "tokens_seen": 46399488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979438314944834, + "loss": 4.8812, + "theoretical_loss": 5.337965890797989, + "tokens_seen": 46465024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979338014042126, + "loss": 4.801, + "theoretical_loss": 5.336810179425685, + "tokens_seen": 46530560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979237713139418, + "loss": 4.7882, + "theoretical_loss": 5.335656549713516, + "tokens_seen": 46596096 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497913741223671, + "loss": 4.8571, + "theoretical_loss": 5.334504994992115, + "tokens_seen": 46661632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979037111334002, + "loss": 4.7664, + "theoretical_loss": 5.333355508622814, + "tokens_seen": 46727168 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978936810431293, + "loss": 4.7522, + "theoretical_loss": 5.332208083997459, + "tokens_seen": 46792704 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978836509528586, + "loss": 4.8191, + "theoretical_loss": 5.33106271453822, + "tokens_seen": 46858240 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978736208625877, + "loss": 4.8856, + "theoretical_loss": 5.329919393697422, + "tokens_seen": 46923776 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497863590772317, + "loss": 4.8403, + "theoretical_loss": 5.328778114957351, + "tokens_seen": 46989312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978535606820461, + "loss": 4.6984, + "theoretical_loss": 5.327638871830089, + "tokens_seen": 47054848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978435305917754, + "loss": 4.775, + "theoretical_loss": 5.326501657857326, + "tokens_seen": 47120384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978335005015045, + "loss": 4.8654, + "theoretical_loss": 5.32536646661019, + "tokens_seen": 47185920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978234704112337, + "loss": 4.6961, + "theoretical_loss": 5.324233291689069, + "tokens_seen": 47251456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978134403209629, + "loss": 4.8173, + "theoretical_loss": 5.323102126723439, + "tokens_seen": 47316992 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978034102306921, + "loss": 4.8502, + "theoretical_loss": 5.321972965371691, + "tokens_seen": 47382528 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977933801404213, + "loss": 4.7509, + "theoretical_loss": 5.320845801320959, + "tokens_seen": 47448064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 159194, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.888257026672363, + "objective/train/theoretical_loss": 5.319720628286955, + "objective/train/tokens_used": 67973600, + "theoretical_loss": 5.319720628286955, + "tokens_seen": 47513600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977833500501504, + "loss": 4.835, + "theoretical_loss": 5.319720628286955, + "tokens_seen": 47513600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977733199598796, + "loss": 4.7449, + "theoretical_loss": 5.318597440013795, + "tokens_seen": 47579136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977632898696088, + "loss": 4.8623, + "theoretical_loss": 5.317476230273831, + "tokens_seen": 47644672 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497753259779338, + "loss": 4.8029, + "theoretical_loss": 5.316356992867491, + "tokens_seen": 47710208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977432296890672, + "loss": 4.876, + "theoretical_loss": 5.31523972162311, + "tokens_seen": 47775744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977331995987965, + "loss": 4.8232, + "theoretical_loss": 5.314124410396767, + "tokens_seen": 47841280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977231695085255, + "loss": 4.8547, + "theoretical_loss": 5.31301105307212, + "tokens_seen": 47906816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977131394182548, + "loss": 4.6761, + "theoretical_loss": 5.311899643560251, + "tokens_seen": 47972352 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497703109327984, + "loss": 4.7273, + "theoretical_loss": 5.310790175799497, + "tokens_seen": 48037888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976930792377132, + "loss": 4.752, + "theoretical_loss": 5.3096826437553, + "tokens_seen": 48103424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976830491474424, + "loss": 4.8813, + "theoretical_loss": 5.308577041420046, + "tokens_seen": 48168960 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976730190571715, + "loss": 4.638, + "theoretical_loss": 5.3074733628129005, + "tokens_seen": 48234496 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976629889669007, + "loss": 4.6773, + "theoretical_loss": 5.3063716019796665, + "tokens_seen": 48300032 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976529588766299, + "loss": 4.8116, + "theoretical_loss": 5.305271752992619, + "tokens_seen": 48365568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976429287863591, + "loss": 4.7088, + "theoretical_loss": 5.304173809950358, + "tokens_seen": 48431104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976328986960883, + "loss": 4.6842, + "theoretical_loss": 5.303077766977653, + "tokens_seen": 48496640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976228686058175, + "loss": 4.8037, + "theoretical_loss": 5.3019836182252895, + "tokens_seen": 48562176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976128385155466, + "loss": 4.8628, + "theoretical_loss": 5.300891357869929, + "tokens_seen": 48627712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976028084252758, + "loss": 4.7758, + "theoretical_loss": 5.299800980113945, + "tokens_seen": 48693248 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497592778335005, + "loss": 4.6108, + "theoretical_loss": 5.298712479185288, + "tokens_seen": 48758784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975827482447342, + "loss": 4.8135, + "theoretical_loss": 5.297625849337331, + "tokens_seen": 48824320 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975727181544635, + "loss": 4.7497, + "theoretical_loss": 5.296541084848727, + "tokens_seen": 48889856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975626880641925, + "loss": 4.7418, + "theoretical_loss": 5.295458180023262, + "tokens_seen": 48955392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975526579739218, + "loss": 4.6621, + "theoretical_loss": 5.294377129189715, + "tokens_seen": 49020928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975426278836509, + "loss": 4.7849, + "theoretical_loss": 5.293297926701706, + "tokens_seen": 49086464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 164182, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.570253849029541, + "objective/train/theoretical_loss": 5.292220566937567, + "objective/train/tokens_used": 69612000, + "theoretical_loss": 5.292220566937567, + "tokens_seen": 49152000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975325977933802, + "loss": 4.6274, + "theoretical_loss": 5.292220566937567, + "tokens_seen": 49152000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975225677031094, + "loss": 4.7887, + "theoretical_loss": 5.29114504430019, + "tokens_seen": 49217536 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975125376128386, + "loss": 4.6764, + "theoretical_loss": 5.290071353216895, + "tokens_seen": 49283072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975025075225677, + "loss": 4.6283, + "theoretical_loss": 5.288999488139284, + "tokens_seen": 49348608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004974924774322969, + "loss": 4.6825, + "theoretical_loss": 5.28792944354311, + "tokens_seen": 49414144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004974824473420261, + "loss": 4.7326, + "theoretical_loss": 5.286861213928137, + "tokens_seen": 49479680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974724172517553, + "loss": 4.6581, + "theoretical_loss": 5.285794793817999, + "tokens_seen": 49545216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974623871614845, + "loss": 4.6843, + "theoretical_loss": 5.284730177760077, + "tokens_seen": 49610752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974523570712136, + "loss": 4.5414, + "theoretical_loss": 5.283667360325351, + "tokens_seen": 49676288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974423269809428, + "loss": 4.6875, + "theoretical_loss": 5.2826063361082785, + "tokens_seen": 49741824 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497432296890672, + "loss": 4.7088, + "theoretical_loss": 5.281547099726654, + "tokens_seen": 49807360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974222668004012, + "loss": 4.6908, + "theoretical_loss": 5.280489645821483, + "tokens_seen": 49872896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974122367101304, + "loss": 4.7484, + "theoretical_loss": 5.279433969056848, + "tokens_seen": 49938432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974022066198595, + "loss": 4.6255, + "theoretical_loss": 5.278380064119782, + "tokens_seen": 50003968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973921765295888, + "loss": 4.657, + "theoretical_loss": 5.277327925720137, + "tokens_seen": 50069504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973821464393179, + "loss": 4.5506, + "theoretical_loss": 5.276277548590457, + "tokens_seen": 50135040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973721163490472, + "loss": 4.6456, + "theoretical_loss": 5.275228927485855, + "tokens_seen": 50200576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973620862587763, + "loss": 4.6601, + "theoretical_loss": 5.2741820571838804, + "tokens_seen": 50266112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973520561685056, + "loss": 4.6918, + "theoretical_loss": 5.273136932484399, + "tokens_seen": 50331648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973420260782347, + "loss": 4.5541, + "theoretical_loss": 5.272093548209467, + "tokens_seen": 50397184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973319959879639, + "loss": 4.6842, + "theoretical_loss": 5.271051899203207, + "tokens_seen": 50462720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973219658976931, + "loss": 4.6565, + "theoretical_loss": 5.270011980331685, + "tokens_seen": 50528256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973119358074223, + "loss": 4.7256, + "theoretical_loss": 5.268973786482794, + "tokens_seen": 50593792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973019057171515, + "loss": 4.691, + "theoretical_loss": 5.267937312566123, + "tokens_seen": 50659328 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972918756268806, + "loss": 4.7499, + "theoretical_loss": 5.266902553512847, + "tokens_seen": 50724864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 166938, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.751072883605957, + "objective/train/theoretical_loss": 5.265869504275602, + "objective/train/tokens_used": 71250400, + "theoretical_loss": 5.265869504275602, + "tokens_seen": 50790400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972818455366098, + "loss": 4.6891, + "theoretical_loss": 5.265869504275602, + "tokens_seen": 50790400 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497271815446339, + "loss": 4.695, + "theoretical_loss": 5.264838159828369, + "tokens_seen": 50855936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972617853560682, + "loss": 4.6247, + "theoretical_loss": 5.263808515166355, + "tokens_seen": 50921472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972517552657974, + "loss": 4.6805, + "theoretical_loss": 5.262780565305875, + "tokens_seen": 50987008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972417251755266, + "loss": 4.647, + "theoretical_loss": 5.261754305284241, + "tokens_seen": 51052544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972316950852557, + "loss": 4.554, + "theoretical_loss": 5.260729730159641, + "tokens_seen": 51118080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972216649949849, + "loss": 4.7313, + "theoretical_loss": 5.259706835011027, + "tokens_seen": 51183616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972116349047142, + "loss": 4.6867, + "theoretical_loss": 5.2586856149380035, + "tokens_seen": 51249152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972016048144433, + "loss": 4.601, + "theoretical_loss": 5.257666065060709, + "tokens_seen": 51314688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971915747241726, + "loss": 4.6235, + "theoretical_loss": 5.256648180519708, + "tokens_seen": 51380224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971815446339017, + "loss": 4.6658, + "theoretical_loss": 5.255631956475881, + "tokens_seen": 51445760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971715145436309, + "loss": 4.5963, + "theoretical_loss": 5.25461738811031, + "tokens_seen": 51511296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971614844533601, + "loss": 4.6084, + "theoretical_loss": 5.25360447062417, + "tokens_seen": 51576832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971514543630893, + "loss": 4.6452, + "theoretical_loss": 5.252593199238619, + "tokens_seen": 51642368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971414242728185, + "loss": 4.7294, + "theoretical_loss": 5.2515835691946915, + "tokens_seen": 51707904 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971313941825477, + "loss": 4.5275, + "theoretical_loss": 5.2505755757531904, + "tokens_seen": 51773440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971213640922768, + "loss": 4.6753, + "theoretical_loss": 5.24956921419458, + "tokens_seen": 51838976 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497111334002006, + "loss": 4.5996, + "theoretical_loss": 5.248564479818876, + "tokens_seen": 51904512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971013039117352, + "loss": 4.6336, + "theoretical_loss": 5.247561367945544, + "tokens_seen": 51970048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970912738214644, + "loss": 4.5342, + "theoretical_loss": 5.246559873913396, + "tokens_seen": 52035584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970812437311936, + "loss": 4.6035, + "theoretical_loss": 5.245559993080484, + "tokens_seen": 52101120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970712136409227, + "loss": 4.6588, + "theoretical_loss": 5.24456172082399, + "tokens_seen": 52166656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970611835506519, + "loss": 4.5565, + "theoretical_loss": 5.243565052540136, + "tokens_seen": 52232192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970511534603811, + "loss": 4.7121, + "theoretical_loss": 5.242569983644074, + "tokens_seen": 52297728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970411233701103, + "loss": 4.6251, + "theoretical_loss": 5.241576509569784, + "tokens_seen": 52363264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 171959, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.387710094451904, + "objective/train/theoretical_loss": 5.240584625769978, + "objective/train/tokens_used": 72888800, + "theoretical_loss": 5.240584625769978, + "tokens_seen": 52428800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970310932798396, + "loss": 4.5609, + "theoretical_loss": 5.240584625769978, + "tokens_seen": 52428800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970210631895686, + "loss": 4.6113, + "theoretical_loss": 5.239594327715992, + "tokens_seen": 52494336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970110330992979, + "loss": 4.6099, + "theoretical_loss": 5.238605610897698, + "tokens_seen": 52559872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970010030090271, + "loss": 4.5979, + "theoretical_loss": 5.237618470823394, + "tokens_seen": 52625408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969909729187563, + "loss": 4.5285, + "theoretical_loss": 5.2366329030197125, + "tokens_seen": 52690944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969809428284855, + "loss": 4.6408, + "theoretical_loss": 5.235648903031521, + "tokens_seen": 52756480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969709127382147, + "loss": 4.5222, + "theoretical_loss": 5.2346664664218245, + "tokens_seen": 52822016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969608826479438, + "loss": 4.7192, + "theoretical_loss": 5.233685588771669, + "tokens_seen": 52887552 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496950852557673, + "loss": 4.6249, + "theoretical_loss": 5.232706265680049, + "tokens_seen": 52953088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969408224674022, + "loss": 4.6264, + "theoretical_loss": 5.231728492763811, + "tokens_seen": 53018624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969307923771314, + "loss": 4.549, + "theoretical_loss": 5.230752265657554, + "tokens_seen": 53084160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969207622868606, + "loss": 4.5995, + "theoretical_loss": 5.229777580013545, + "tokens_seen": 53149696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969107321965897, + "loss": 4.5681, + "theoretical_loss": 5.228804431501619, + "tokens_seen": 53215232 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496900702106319, + "loss": 4.5599, + "theoretical_loss": 5.227832815809087, + "tokens_seen": 53280768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968906720160481, + "loss": 4.4797, + "theoretical_loss": 5.226862728640651, + "tokens_seen": 53346304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968806419257774, + "loss": 4.6105, + "theoretical_loss": 5.2258941657183, + "tokens_seen": 53411840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968706118355065, + "loss": 4.5085, + "theoretical_loss": 5.2249271227812315, + "tokens_seen": 53477376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968605817452358, + "loss": 4.5419, + "theoretical_loss": 5.223961595585755, + "tokens_seen": 53542912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968505516549649, + "loss": 4.561, + "theoretical_loss": 5.222997579905204, + "tokens_seen": 53608448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968405215646941, + "loss": 4.4733, + "theoretical_loss": 5.222035071529845, + "tokens_seen": 53673984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968304914744233, + "loss": 4.6407, + "theoretical_loss": 5.2210740662667945, + "tokens_seen": 53739520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968204613841525, + "loss": 4.601, + "theoretical_loss": 5.220114559939923, + "tokens_seen": 53805056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968104312938817, + "loss": 4.568, + "theoretical_loss": 5.219156548389775, + "tokens_seen": 53870592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968004012036108, + "loss": 4.5971, + "theoretical_loss": 5.218200027473481, + "tokens_seen": 53936128 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049679037111334, + "loss": 4.5618, + "theoretical_loss": 5.217244993064664, + "tokens_seen": 54001664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 174882, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.400208473205566, + "objective/train/theoretical_loss": 5.216291441053366, + "objective/train/tokens_used": 74527200, + "theoretical_loss": 5.216291441053366, + "tokens_seen": 54067200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967803410230692, + "loss": 4.4811, + "theoretical_loss": 5.216291441053366, + "tokens_seen": 54067200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967703109327984, + "loss": 4.5616, + "theoretical_loss": 5.215339367345955, + "tokens_seen": 54132736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967602808425276, + "loss": 4.4965, + "theoretical_loss": 5.214388767865036, + "tokens_seen": 54198272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967502507522568, + "loss": 4.5037, + "theoretical_loss": 5.2134396385493815, + "tokens_seen": 54263808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967402206619859, + "loss": 4.5722, + "theoretical_loss": 5.212491975353835, + "tokens_seen": 54329344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967301905717151, + "loss": 4.5315, + "theoretical_loss": 5.211545774249233, + "tokens_seen": 54394880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967201604814444, + "loss": 4.4384, + "theoretical_loss": 5.210601031222324, + "tokens_seen": 54460416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967101303911735, + "loss": 4.5785, + "theoretical_loss": 5.209657742275683, + "tokens_seen": 54525952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967001003009028, + "loss": 4.391, + "theoretical_loss": 5.208715903427631, + "tokens_seen": 54591488 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496690070210632, + "loss": 4.5022, + "theoretical_loss": 5.207775510712159, + "tokens_seen": 54657024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966800401203611, + "loss": 4.5287, + "theoretical_loss": 5.2068365601788384, + "tokens_seen": 54722560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966700100300903, + "loss": 4.5569, + "theoretical_loss": 5.205899047892753, + "tokens_seen": 54788096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966599799398195, + "loss": 4.5661, + "theoretical_loss": 5.2049629699344075, + "tokens_seen": 54853632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966499498495487, + "loss": 4.5648, + "theoretical_loss": 5.204028322399658, + "tokens_seen": 54919168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966399197592779, + "loss": 4.6525, + "theoretical_loss": 5.203095101399628, + "tokens_seen": 54984704 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496629889669007, + "loss": 4.4029, + "theoretical_loss": 5.202163303060633, + "tokens_seen": 55050240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966198595787362, + "loss": 4.4427, + "theoretical_loss": 5.201232923524104, + "tokens_seen": 55115776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966098294884654, + "loss": 4.3917, + "theoretical_loss": 5.20030395894651, + "tokens_seen": 55181312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965997993981946, + "loss": 4.5708, + "theoretical_loss": 5.199376405499277, + "tokens_seen": 55246848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965897693079238, + "loss": 4.4216, + "theoretical_loss": 5.198450259368721, + "tokens_seen": 55312384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965797392176529, + "loss": 4.3407, + "theoretical_loss": 5.197525516755965, + "tokens_seen": 55377920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965697091273821, + "loss": 4.6291, + "theoretical_loss": 5.196602173876867, + "tokens_seen": 55443456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965596790371113, + "loss": 4.5092, + "theoretical_loss": 5.195680226961947, + "tokens_seen": 55508992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965496489468405, + "loss": 4.5488, + "theoretical_loss": 5.194759672256309, + "tokens_seen": 55574528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965396188565698, + "loss": 4.5177, + "theoretical_loss": 5.19384050601957, + "tokens_seen": 55640064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 178742, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.567445278167725, + "objective/train/theoretical_loss": 5.192922724525789, + "objective/train/tokens_used": 76165600, + "theoretical_loss": 5.192922724525789, + "tokens_seen": 55705600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965295887662988, + "loss": 4.5324, + "theoretical_loss": 5.192922724525789, + "tokens_seen": 55705600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965195586760281, + "loss": 4.5158, + "theoretical_loss": 5.19200632406339, + "tokens_seen": 55771136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965095285857573, + "loss": 4.361, + "theoretical_loss": 5.19109130093509, + "tokens_seen": 55836672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964994984954865, + "loss": 4.4467, + "theoretical_loss": 5.190177651457833, + "tokens_seen": 55902208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964894684052157, + "loss": 4.3854, + "theoretical_loss": 5.189265371962712, + "tokens_seen": 55967744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964794383149449, + "loss": 4.4949, + "theoretical_loss": 5.188354458794902, + "tokens_seen": 56033280 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496469408224674, + "loss": 4.5525, + "theoretical_loss": 5.187444908313586, + "tokens_seen": 56098816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964593781344032, + "loss": 4.5491, + "theoretical_loss": 5.186536716891892, + "tokens_seen": 56164352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964493480441324, + "loss": 4.5449, + "theoretical_loss": 5.185629880916814, + "tokens_seen": 56229888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964393179538616, + "loss": 4.4341, + "theoretical_loss": 5.18472439678915, + "tokens_seen": 56295424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964292878635908, + "loss": 4.4862, + "theoretical_loss": 5.18382026092343, + "tokens_seen": 56360960 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049641925777332, + "loss": 4.5076, + "theoretical_loss": 5.182917469747851, + "tokens_seen": 56426496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964092276830491, + "loss": 4.3652, + "theoretical_loss": 5.182016019704204, + "tokens_seen": 56492032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963991975927783, + "loss": 4.5333, + "theoretical_loss": 5.1811159072478095, + "tokens_seen": 56557568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963891675025075, + "loss": 4.5474, + "theoretical_loss": 5.180217128847451, + "tokens_seen": 56623104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963791374122367, + "loss": 4.5041, + "theoretical_loss": 5.17931968098531, + "tokens_seen": 56688640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963691073219659, + "loss": 4.5362, + "theoretical_loss": 5.178423560156894, + "tokens_seen": 56754176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963590772316951, + "loss": 4.4355, + "theoretical_loss": 5.177528762870973, + "tokens_seen": 56819712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963490471414242, + "loss": 4.4247, + "theoretical_loss": 5.176635285649521, + "tokens_seen": 56885248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963390170511535, + "loss": 4.4013, + "theoretical_loss": 5.175743125027638, + "tokens_seen": 56950784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963289869608827, + "loss": 4.434, + "theoretical_loss": 5.174852277553498, + "tokens_seen": 57016320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963189568706119, + "loss": 4.5131, + "theoretical_loss": 5.173962739788276, + "tokens_seen": 57081856 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496308926780341, + "loss": 4.4027, + "theoretical_loss": 5.17307450830609, + "tokens_seen": 57147392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962988966900702, + "loss": 4.4665, + "theoretical_loss": 5.172187579693933, + "tokens_seen": 57212928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962888665997994, + "loss": 4.331, + "theoretical_loss": 5.1713019505516105, + "tokens_seen": 57278464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 183397, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.273628234863281, + "objective/train/theoretical_loss": 5.170417617491682, + "objective/train/tokens_used": 77804000, + "theoretical_loss": 5.170417617491682, + "tokens_seen": 57344000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962788365095286, + "loss": 4.364, + "theoretical_loss": 5.170417617491682, + "tokens_seen": 57344000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962688064192578, + "loss": 4.4353, + "theoretical_loss": 5.169534577139395, + "tokens_seen": 57409536 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496258776328987, + "loss": 4.4439, + "theoretical_loss": 5.168652826132623, + "tokens_seen": 57475072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962487462387161, + "loss": 4.4183, + "theoretical_loss": 5.167772361121805, + "tokens_seen": 57540608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962387161484453, + "loss": 4.5093, + "theoretical_loss": 5.166893178769884, + "tokens_seen": 57606144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962286860581746, + "loss": 4.4409, + "theoretical_loss": 5.1660152757522475, + "tokens_seen": 57671680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962186559679037, + "loss": 4.4782, + "theoretical_loss": 5.165138648756665, + "tokens_seen": 57737216 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496208625877633, + "loss": 4.502, + "theoretical_loss": 5.164263294483226, + "tokens_seen": 57802752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961985957873621, + "loss": 4.5053, + "theoretical_loss": 5.163389209644287, + "tokens_seen": 57868288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961885656970913, + "loss": 4.3989, + "theoretical_loss": 5.162516390964408, + "tokens_seen": 57933824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961785356068205, + "loss": 4.4705, + "theoretical_loss": 5.1616448351802875, + "tokens_seen": 57999360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961685055165497, + "loss": 4.5144, + "theoretical_loss": 5.160774539040716, + "tokens_seen": 58064896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961584754262789, + "loss": 4.316, + "theoretical_loss": 5.159905499306511, + "tokens_seen": 58130432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961484453360081, + "loss": 4.4013, + "theoretical_loss": 5.159037712750455, + "tokens_seen": 58195968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961384152457372, + "loss": 4.5296, + "theoretical_loss": 5.158171176157245, + "tokens_seen": 58261504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961283851554664, + "loss": 4.3922, + "theoretical_loss": 5.157305886323435, + "tokens_seen": 58327040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961183550651956, + "loss": 4.3327, + "theoretical_loss": 5.156441840057371, + "tokens_seen": 58392576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961083249749248, + "loss": 4.457, + "theoretical_loss": 5.155579034179144, + "tokens_seen": 58458112 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496098294884654, + "loss": 4.3448, + "theoretical_loss": 5.15471746552053, + "tokens_seen": 58523648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960882647943831, + "loss": 4.3962, + "theoretical_loss": 5.153857130924929, + "tokens_seen": 58589184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960782347041123, + "loss": 4.4167, + "theoretical_loss": 5.1529980272473175, + "tokens_seen": 58654720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960682046138415, + "loss": 4.3858, + "theoretical_loss": 5.152140151354191, + "tokens_seen": 58720256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960581745235707, + "loss": 4.4539, + "theoretical_loss": 5.151283500123505, + "tokens_seen": 58785792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960481444333, + "loss": 4.2706, + "theoretical_loss": 5.150428070444621, + "tokens_seen": 58851328 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496038114343029, + "loss": 4.3648, + "theoretical_loss": 5.149573859218261, + "tokens_seen": 58916864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 186163, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.522251605987549, + "objective/train/theoretical_loss": 5.1487208633564405, + "objective/train/tokens_used": 79442400, + "theoretical_loss": 5.1487208633564405, + "tokens_seen": 58982400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960280842527583, + "loss": 4.4738, + "theoretical_loss": 5.1487208633564405, + "tokens_seen": 58982400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960180541624875, + "loss": 4.4259, + "theoretical_loss": 5.147869079782423, + "tokens_seen": 59047936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960080240722167, + "loss": 4.3434, + "theoretical_loss": 5.147018505430666, + "tokens_seen": 59113472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959979939819459, + "loss": 4.3584, + "theoretical_loss": 5.146169137246765, + "tokens_seen": 59179008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959879638916751, + "loss": 4.296, + "theoretical_loss": 5.145320972187402, + "tokens_seen": 59244544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959779338014042, + "loss": 4.2608, + "theoretical_loss": 5.144474007220293, + "tokens_seen": 59310080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959679037111334, + "loss": 4.3342, + "theoretical_loss": 5.143628239324139, + "tokens_seen": 59375616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959578736208626, + "loss": 4.4682, + "theoretical_loss": 5.142783665488567, + "tokens_seen": 59441152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959478435305918, + "loss": 4.3792, + "theoretical_loss": 5.1419402827140885, + "tokens_seen": 59506688 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495937813440321, + "loss": 4.4156, + "theoretical_loss": 5.141098088012036, + "tokens_seen": 59572224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959277833500501, + "loss": 4.3873, + "theoretical_loss": 5.140257078404524, + "tokens_seen": 59637760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959177532597793, + "loss": 4.3697, + "theoretical_loss": 5.13941725092439, + "tokens_seen": 59703296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959077231695085, + "loss": 4.4197, + "theoretical_loss": 5.138578602615146, + "tokens_seen": 59768832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958976930792377, + "loss": 4.38, + "theoretical_loss": 5.137741130530934, + "tokens_seen": 59834368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958876629889669, + "loss": 4.367, + "theoretical_loss": 5.1369048317364685, + "tokens_seen": 59899904 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495877632898696, + "loss": 4.3504, + "theoretical_loss": 5.13606970330699, + "tokens_seen": 59965440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958676028084253, + "loss": 4.2946, + "theoretical_loss": 5.135235742328217, + "tokens_seen": 60030976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958575727181544, + "loss": 4.3574, + "theoretical_loss": 5.134402945896297, + "tokens_seen": 60096512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958475426278837, + "loss": 4.3678, + "theoretical_loss": 5.133571311117755, + "tokens_seen": 60162048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958375125376129, + "loss": 4.4617, + "theoretical_loss": 5.132740835109448, + "tokens_seen": 60227584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958274824473421, + "loss": 4.4509, + "theoretical_loss": 5.131911514998518, + "tokens_seen": 60293120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958174523570712, + "loss": 4.4328, + "theoretical_loss": 5.131083347922338, + "tokens_seen": 60358656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958074222668004, + "loss": 4.4193, + "theoretical_loss": 5.130256331028474, + "tokens_seen": 60424192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957973921765296, + "loss": 4.3417, + "theoretical_loss": 5.129430461474628, + "tokens_seen": 60489728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957873620862588, + "loss": 4.3446, + "theoretical_loss": 5.128605736428597, + "tokens_seen": 60555264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.308119773864746, + "objective/train/theoretical_loss": 5.127782153068225, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 5.127782153068225, + "tokens_seen": 60620800 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495777331995988, + "loss": 4.3466, + "theoretical_loss": 5.127782153068225, + "tokens_seen": 60620800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957673019057172, + "loss": 4.3043, + "theoretical_loss": 5.126959708581356, + "tokens_seen": 60686336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957572718154463, + "loss": 4.3334, + "theoretical_loss": 5.1261384001657895, + "tokens_seen": 60751872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957472417251755, + "loss": 4.3052, + "theoretical_loss": 5.125318225029231, + "tokens_seen": 60817408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957372116349047, + "loss": 4.2635, + "theoretical_loss": 5.124499180389249, + "tokens_seen": 60882944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957271815446339, + "loss": 4.424, + "theoretical_loss": 5.12368126347323, + "tokens_seen": 60948480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957171514543631, + "loss": 4.3177, + "theoretical_loss": 5.122864471518334, + "tokens_seen": 61014016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957071213640923, + "loss": 4.3414, + "theoretical_loss": 5.122048801771443, + "tokens_seen": 61079552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956970912738214, + "loss": 4.3959, + "theoretical_loss": 5.121234251489128, + "tokens_seen": 61145088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956870611835507, + "loss": 4.3814, + "theoretical_loss": 5.120420817937591, + "tokens_seen": 61210624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956770310932798, + "loss": 4.4037, + "theoretical_loss": 5.119608498392633, + "tokens_seen": 61276160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956670010030091, + "loss": 4.5061, + "theoretical_loss": 5.118797290139605, + "tokens_seen": 61341696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956569709127383, + "loss": 4.4422, + "theoretical_loss": 5.117987190473361, + "tokens_seen": 61407232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956469408224674, + "loss": 4.319, + "theoretical_loss": 5.1171781966982195, + "tokens_seen": 61472768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956369107321966, + "loss": 4.1722, + "theoretical_loss": 5.116370306127921, + "tokens_seen": 61538304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956268806419258, + "loss": 4.2671, + "theoretical_loss": 5.11556351608558, + "tokens_seen": 61603840 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495616850551655, + "loss": 4.3878, + "theoretical_loss": 5.114757823903647, + "tokens_seen": 61669376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956068204613842, + "loss": 4.2525, + "theoretical_loss": 5.113953226923864, + "tokens_seen": 61734912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955967903711133, + "loss": 4.2229, + "theoretical_loss": 5.113149722497221, + "tokens_seen": 61800448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955867602808425, + "loss": 4.2915, + "theoretical_loss": 5.112347307983919, + "tokens_seen": 61865984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955767301905717, + "loss": 4.4037, + "theoretical_loss": 5.111545980753322, + "tokens_seen": 61931520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955667001003009, + "loss": 4.3471, + "theoretical_loss": 5.110745738183919, + "tokens_seen": 61997056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955566700100301, + "loss": 4.352, + "theoretical_loss": 5.109946577663284, + "tokens_seen": 62062592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955466399197592, + "loss": 4.3584, + "theoretical_loss": 5.109148496588032, + "tokens_seen": 62128128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955366098294884, + "loss": 4.3856, + "theoretical_loss": 5.108351492363779, + "tokens_seen": 62193664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.481258392333984, + "objective/train/theoretical_loss": 5.107555562405102, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 5.107555562405102, + "tokens_seen": 62259200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955265797392177, + "loss": 4.3206, + "theoretical_loss": 5.107555562405102, + "tokens_seen": 62259200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955165496489468, + "loss": 4.3443, + "theoretical_loss": 5.106760704135499, + "tokens_seen": 62324736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955065195586761, + "loss": 4.2783, + "theoretical_loss": 5.105966914987349, + "tokens_seen": 62390272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954964894684052, + "loss": 4.3055, + "theoretical_loss": 5.1051741924018685, + "tokens_seen": 62455808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954864593781344, + "loss": 4.2903, + "theoretical_loss": 5.10438253382908, + "tokens_seen": 62521344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954764292878636, + "loss": 4.3735, + "theoretical_loss": 5.103591936727762, + "tokens_seen": 62586880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954663991975928, + "loss": 4.3138, + "theoretical_loss": 5.102802398565418, + "tokens_seen": 62652416 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495456369107322, + "loss": 4.3459, + "theoretical_loss": 5.102013916818235, + "tokens_seen": 62717952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954463390170512, + "loss": 4.3303, + "theoretical_loss": 5.101226488971042, + "tokens_seen": 62783488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954363089267803, + "loss": 4.3441, + "theoretical_loss": 5.100440112517276, + "tokens_seen": 62849024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954262788365095, + "loss": 4.284, + "theoretical_loss": 5.09965478495894, + "tokens_seen": 62914560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954162487462387, + "loss": 4.238, + "theoretical_loss": 5.098870503806567, + "tokens_seen": 62980096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954062186559679, + "loss": 4.4365, + "theoretical_loss": 5.09808726657918, + "tokens_seen": 63045632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953961885656971, + "loss": 4.3002, + "theoretical_loss": 5.097305070804255, + "tokens_seen": 63111168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953861584754263, + "loss": 4.317, + "theoretical_loss": 5.096523914017688, + "tokens_seen": 63176704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953761283851555, + "loss": 4.3246, + "theoretical_loss": 5.095743793763747, + "tokens_seen": 63242240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953660982948846, + "loss": 4.338, + "theoretical_loss": 5.094964707595047, + "tokens_seen": 63307776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953560682046139, + "loss": 4.3278, + "theoretical_loss": 5.094186653072505, + "tokens_seen": 63373312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953460381143431, + "loss": 4.373, + "theoretical_loss": 5.093409627765306, + "tokens_seen": 63438848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953360080240723, + "loss": 4.1992, + "theoretical_loss": 5.092633629250866, + "tokens_seen": 63504384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953259779338014, + "loss": 4.2758, + "theoretical_loss": 5.091858655114796, + "tokens_seen": 63569920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953159478435306, + "loss": 4.2784, + "theoretical_loss": 5.091084702950868, + "tokens_seen": 63635456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953059177532598, + "loss": 4.3738, + "theoretical_loss": 5.090311770360971, + "tokens_seen": 63700992 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495295887662989, + "loss": 4.4038, + "theoretical_loss": 5.089539854955088, + "tokens_seen": 63766528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952858575727182, + "loss": 4.3564, + "theoretical_loss": 5.088768954351249, + "tokens_seen": 63832064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.3900861740112305, + "objective/train/theoretical_loss": 5.087999066175502, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 5.087999066175502, + "tokens_seen": 63897600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952758274824474, + "loss": 4.3116, + "theoretical_loss": 5.087999066175502, + "tokens_seen": 63897600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952657973921765, + "loss": 4.3008, + "theoretical_loss": 5.0872301880618735, + "tokens_seen": 63963136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952557673019057, + "loss": 4.3599, + "theoretical_loss": 5.086462317652341, + "tokens_seen": 64028672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952457372116349, + "loss": 4.3105, + "theoretical_loss": 5.085695452596788, + "tokens_seen": 64094208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952357071213641, + "loss": 4.3182, + "theoretical_loss": 5.084929590552976, + "tokens_seen": 64159744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952256770310933, + "loss": 4.2261, + "theoretical_loss": 5.0841647291865115, + "tokens_seen": 64225280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952156469408225, + "loss": 4.2838, + "theoretical_loss": 5.083400866170806, + "tokens_seen": 64290816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952056168505516, + "loss": 4.3078, + "theoretical_loss": 5.082637999187046, + "tokens_seen": 64356352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951955867602809, + "loss": 4.1771, + "theoretical_loss": 5.081876125924159, + "tokens_seen": 64421888 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049518555667001, + "loss": 4.4188, + "theoretical_loss": 5.0811152440787755, + "tokens_seen": 64487424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951755265797393, + "loss": 4.292, + "theoretical_loss": 5.0803553513552036, + "tokens_seen": 64552960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951654964894685, + "loss": 4.3209, + "theoretical_loss": 5.079596445465386, + "tokens_seen": 64618496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951554663991976, + "loss": 4.1464, + "theoretical_loss": 5.078838524128878, + "tokens_seen": 64684032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951454363089268, + "loss": 4.3801, + "theoretical_loss": 5.078081585072802, + "tokens_seen": 64749568 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495135406218656, + "loss": 4.2986, + "theoretical_loss": 5.077325626031826, + "tokens_seen": 64815104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951253761283852, + "loss": 4.2016, + "theoretical_loss": 5.076570644748123, + "tokens_seen": 64880640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951153460381144, + "loss": 4.3051, + "theoretical_loss": 5.075816638971341, + "tokens_seen": 64946176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951053159478435, + "loss": 4.1873, + "theoretical_loss": 5.075063606458576, + "tokens_seen": 65011712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950952858575727, + "loss": 4.1995, + "theoretical_loss": 5.074311544974331, + "tokens_seen": 65077248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950852557673019, + "loss": 4.3205, + "theoretical_loss": 5.07356045229049, + "tokens_seen": 65142784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950752256770311, + "loss": 4.2847, + "theoretical_loss": 5.072810326186285, + "tokens_seen": 65208320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950651955867603, + "loss": 4.1807, + "theoretical_loss": 5.072061164448261, + "tokens_seen": 65273856 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950551654964894, + "loss": 4.2798, + "theoretical_loss": 5.071312964870252, + "tokens_seen": 65339392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950451354062186, + "loss": 4.32, + "theoretical_loss": 5.070565725253344, + "tokens_seen": 65404928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950351053159479, + "loss": 4.2627, + "theoretical_loss": 5.069819443405842, + "tokens_seen": 65470464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.215516567230225, + "objective/train/theoretical_loss": 5.069074117143246, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 5.069074117143246, + "tokens_seen": 65536000 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495025075225677, + "loss": 4.2181, + "theoretical_loss": 5.069074117143246, + "tokens_seen": 65536000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950150451354063, + "loss": 4.1912, + "theoretical_loss": 5.068329744288216, + "tokens_seen": 65601536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950050150451354, + "loss": 4.3116, + "theoretical_loss": 5.067586322670541, + "tokens_seen": 65667072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949949849548646, + "loss": 4.2353, + "theoretical_loss": 5.0668438501271105, + "tokens_seen": 65732608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949849548645938, + "loss": 4.2448, + "theoretical_loss": 5.066102324501883, + "tokens_seen": 65798144 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494974924774323, + "loss": 4.2644, + "theoretical_loss": 5.065361743645855, + "tokens_seen": 65863680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949648946840522, + "loss": 4.2341, + "theoretical_loss": 5.064622105417033, + "tokens_seen": 65929216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949548645937814, + "loss": 4.2625, + "theoretical_loss": 5.063883407680405, + "tokens_seen": 65994752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949448345035105, + "loss": 4.2429, + "theoretical_loss": 5.063145648307904, + "tokens_seen": 66060288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949348044132397, + "loss": 4.1738, + "theoretical_loss": 5.062408825178388, + "tokens_seen": 66125824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949247743229689, + "loss": 4.1093, + "theoretical_loss": 5.061672936177604, + "tokens_seen": 66191360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949147442326981, + "loss": 4.2322, + "theoretical_loss": 5.06093797919816, + "tokens_seen": 66256896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949047141424273, + "loss": 4.1303, + "theoretical_loss": 5.060203952139497, + "tokens_seen": 66322432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948946840521565, + "loss": 4.2649, + "theoretical_loss": 5.059470852907861, + "tokens_seen": 66387968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948846539618856, + "loss": 4.0836, + "theoretical_loss": 5.0587386794162725, + "tokens_seen": 66453504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948746238716148, + "loss": 4.2793, + "theoretical_loss": 5.058007429584498, + "tokens_seen": 66519040 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494864593781344, + "loss": 4.2177, + "theoretical_loss": 5.057277101339023, + "tokens_seen": 66584576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948545636910733, + "loss": 4.2812, + "theoretical_loss": 5.056547692613021, + "tokens_seen": 66650112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948445336008024, + "loss": 4.3317, + "theoretical_loss": 5.055819201346331, + "tokens_seen": 66715648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948345035105316, + "loss": 4.3069, + "theoretical_loss": 5.055091625485421, + "tokens_seen": 66781184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948244734202607, + "loss": 4.288, + "theoretical_loss": 5.054364962983367, + "tokens_seen": 66846720 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049481444332999, + "loss": 4.1207, + "theoretical_loss": 5.053639211799824, + "tokens_seen": 66912256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948044132397192, + "loss": 4.2379, + "theoretical_loss": 5.052914369900997, + "tokens_seen": 66977792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947943831494484, + "loss": 4.2218, + "theoretical_loss": 5.052190435259614, + "tokens_seen": 67043328 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947843530591776, + "loss": 4.1456, + "theoretical_loss": 5.051467405854897, + "tokens_seen": 67108864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.165023326873779, + "objective/train/theoretical_loss": 5.05074527967254, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 5.05074527967254, + "tokens_seen": 67174400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947743229689067, + "loss": 4.2281, + "theoretical_loss": 5.05074527967254, + "tokens_seen": 67174400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947642928786359, + "loss": 4.3011, + "theoretical_loss": 5.050024054704677, + "tokens_seen": 67239936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947542627883651, + "loss": 4.2792, + "theoretical_loss": 5.049303728949859, + "tokens_seen": 67305472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947442326980943, + "loss": 4.2554, + "theoretical_loss": 5.048584300413019, + "tokens_seen": 67371008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947342026078235, + "loss": 4.2487, + "theoretical_loss": 5.04786576710546, + "tokens_seen": 67436544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947241725175527, + "loss": 4.3437, + "theoretical_loss": 5.0471481270448155, + "tokens_seen": 67502080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947141424272818, + "loss": 4.1409, + "theoretical_loss": 5.046431378255027, + "tokens_seen": 67567616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947041123370111, + "loss": 4.0621, + "theoretical_loss": 5.045715518766322, + "tokens_seen": 67633152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946940822467402, + "loss": 4.3199, + "theoretical_loss": 5.0450005466151815, + "tokens_seen": 67698688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946840521564695, + "loss": 4.1969, + "theoretical_loss": 5.044286459844319, + "tokens_seen": 67764224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946740220661987, + "loss": 4.2833, + "theoretical_loss": 5.043573256502652, + "tokens_seen": 67829760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946639919759278, + "loss": 4.3762, + "theoretical_loss": 5.0428609346452795, + "tokens_seen": 67895296 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494653961885657, + "loss": 4.2701, + "theoretical_loss": 5.042149492333452, + "tokens_seen": 67960832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946439317953862, + "loss": 4.2414, + "theoretical_loss": 5.041438927634549, + "tokens_seen": 68026368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946339017051154, + "loss": 4.2595, + "theoretical_loss": 5.040729238622053, + "tokens_seen": 68091904 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946238716148446, + "loss": 4.2995, + "theoretical_loss": 5.040020423375525, + "tokens_seen": 68157440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946138415245737, + "loss": 4.1367, + "theoretical_loss": 5.039312479980579, + "tokens_seen": 68222976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946038114343029, + "loss": 4.2829, + "theoretical_loss": 5.038605406528857, + "tokens_seen": 68288512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945937813440321, + "loss": 4.1998, + "theoretical_loss": 5.037899201118005, + "tokens_seen": 68354048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945837512537613, + "loss": 4.2251, + "theoretical_loss": 5.037193861851646, + "tokens_seen": 68419584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945737211634905, + "loss": 4.2813, + "theoretical_loss": 5.03648938683936, + "tokens_seen": 68485120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945636910732196, + "loss": 4.2451, + "theoretical_loss": 5.035785774196654, + "tokens_seen": 68550656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945536609829488, + "loss": 4.1649, + "theoretical_loss": 5.035083022044944, + "tokens_seen": 68616192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945436308926781, + "loss": 4.3007, + "theoretical_loss": 5.034381128511525, + "tokens_seen": 68681728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945336008024072, + "loss": 4.2296, + "theoretical_loss": 5.0336800917295506, + "tokens_seen": 68747264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.21781063079834, + "objective/train/theoretical_loss": 5.032979909838007, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 5.032979909838007, + "tokens_seen": 68812800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945235707121365, + "loss": 4.2399, + "theoretical_loss": 5.032979909838007, + "tokens_seen": 68812800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945135406218656, + "loss": 4.1754, + "theoretical_loss": 5.032280580981691, + "tokens_seen": 68878336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945035105315948, + "loss": 4.2439, + "theoretical_loss": 5.031582103311187, + "tokens_seen": 68943872 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494493480441324, + "loss": 4.228, + "theoretical_loss": 5.030884474982842, + "tokens_seen": 69009408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944834503510532, + "loss": 4.1396, + "theoretical_loss": 5.030187694158739, + "tokens_seen": 69074944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944734202607824, + "loss": 4.2747, + "theoretical_loss": 5.02949175900668, + "tokens_seen": 69140480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944633901705116, + "loss": 4.3222, + "theoretical_loss": 5.028796667700159, + "tokens_seen": 69206016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944533600802407, + "loss": 3.9614, + "theoretical_loss": 5.0281024184183405, + "tokens_seen": 69271552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944433299899699, + "loss": 4.1021, + "theoretical_loss": 5.0274090093460355, + "tokens_seen": 69337088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944332998996991, + "loss": 4.0959, + "theoretical_loss": 5.026716438673677, + "tokens_seen": 69402624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944232698094283, + "loss": 4.1969, + "theoretical_loss": 5.0260247045973045, + "tokens_seen": 69468160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944132397191575, + "loss": 4.159, + "theoretical_loss": 5.02533380531853, + "tokens_seen": 69533696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944032096288867, + "loss": 4.0626, + "theoretical_loss": 5.024643739044526, + "tokens_seen": 69599232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943931795386158, + "loss": 4.2746, + "theoretical_loss": 5.023954503987998, + "tokens_seen": 69664768 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494383149448345, + "loss": 4.3004, + "theoretical_loss": 5.023266098367161, + "tokens_seen": 69730304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943731193580742, + "loss": 4.0764, + "theoretical_loss": 5.022578520405721, + "tokens_seen": 69795840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943630892678035, + "loss": 4.1454, + "theoretical_loss": 5.0218917683328534, + "tokens_seen": 69861376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943530591775326, + "loss": 4.2406, + "theoretical_loss": 5.021205840383175, + "tokens_seen": 69926912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943430290872618, + "loss": 4.1691, + "theoretical_loss": 5.020520734796728, + "tokens_seen": 69992448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943329989969909, + "loss": 4.173, + "theoretical_loss": 5.019836449818957, + "tokens_seen": 70057984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943229689067202, + "loss": 4.2154, + "theoretical_loss": 5.019152983700687, + "tokens_seen": 70123520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943129388164494, + "loss": 4.1014, + "theoretical_loss": 5.018470334698101, + "tokens_seen": 70189056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943029087261786, + "loss": 4.152, + "theoretical_loss": 5.01778850107272, + "tokens_seen": 70254592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942928786359078, + "loss": 4.16, + "theoretical_loss": 5.017107481091379, + "tokens_seen": 70320128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942828485456369, + "loss": 4.2324, + "theoretical_loss": 5.016427273026212, + "tokens_seen": 70385664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.014771938323975, + "objective/train/theoretical_loss": 5.015747875154622, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 5.015747875154622, + "tokens_seen": 70451200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942728184553661, + "loss": 4.2399, + "theoretical_loss": 5.015747875154622, + "tokens_seen": 70451200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942627883650953, + "loss": 4.2139, + "theoretical_loss": 5.015069285759269, + "tokens_seen": 70516736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942527582748245, + "loss": 4.2581, + "theoretical_loss": 5.01439150312804, + "tokens_seen": 70582272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942427281845537, + "loss": 4.1951, + "theoretical_loss": 5.0137145255540405, + "tokens_seen": 70647808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942326980942828, + "loss": 4.1533, + "theoretical_loss": 5.013038351335559, + "tokens_seen": 70713344 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494222668004012, + "loss": 4.2263, + "theoretical_loss": 5.012362978776057, + "tokens_seen": 70778880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942126379137412, + "loss": 4.261, + "theoretical_loss": 5.011688406184147, + "tokens_seen": 70844416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942026078234704, + "loss": 4.2612, + "theoretical_loss": 5.011014631873566, + "tokens_seen": 70909952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941925777331996, + "loss": 4.316, + "theoretical_loss": 5.010341654163167, + "tokens_seen": 70975488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941825476429289, + "loss": 4.3099, + "theoretical_loss": 5.009669471376882, + "tokens_seen": 71041024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941725175526579, + "loss": 4.2783, + "theoretical_loss": 5.008998081843721, + "tokens_seen": 71106560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941624874623872, + "loss": 4.2541, + "theoretical_loss": 5.008327483897736, + "tokens_seen": 71172096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941524573721163, + "loss": 4.2455, + "theoretical_loss": 5.00765767587801, + "tokens_seen": 71237632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941424272818456, + "loss": 4.2236, + "theoretical_loss": 5.006988656128635, + "tokens_seen": 71303168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941323971915748, + "loss": 4.2589, + "theoretical_loss": 5.006320422998691, + "tokens_seen": 71368704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941223671013039, + "loss": 4.2177, + "theoretical_loss": 5.00565297484223, + "tokens_seen": 71434240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941123370110331, + "loss": 4.1941, + "theoretical_loss": 5.004986310018252, + "tokens_seen": 71499776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941023069207623, + "loss": 4.2152, + "theoretical_loss": 5.004320426890686, + "tokens_seen": 71565312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940922768304915, + "loss": 4.1527, + "theoretical_loss": 5.003655323828376, + "tokens_seen": 71630848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940822467402207, + "loss": 4.2329, + "theoretical_loss": 5.002990999205057, + "tokens_seen": 71696384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940722166499498, + "loss": 4.163, + "theoretical_loss": 5.002327451399335, + "tokens_seen": 71761920 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494062186559679, + "loss": 4.2309, + "theoretical_loss": 5.001664678794671, + "tokens_seen": 71827456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940521564694082, + "loss": 4.0704, + "theoretical_loss": 5.001002679779363, + "tokens_seen": 71892992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940421263791374, + "loss": 4.159, + "theoretical_loss": 5.0003414527465235, + "tokens_seen": 71958528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940320962888666, + "loss": 4.1777, + "theoretical_loss": 4.99968099609406, + "tokens_seen": 72024064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.16330099105835, + "objective/train/theoretical_loss": 4.999021308224664, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 4.999021308224664, + "tokens_seen": 72089600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940220661985958, + "loss": 4.1074, + "theoretical_loss": 4.999021308224664, + "tokens_seen": 72089600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940120361083249, + "loss": 4.2526, + "theoretical_loss": 4.998362387545782, + "tokens_seen": 72155136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940020060180542, + "loss": 4.2048, + "theoretical_loss": 4.997704232469606, + "tokens_seen": 72220672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939919759277834, + "loss": 4.1254, + "theoretical_loss": 4.997046841413049, + "tokens_seen": 72286208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939819458375126, + "loss": 4.1864, + "theoretical_loss": 4.996390212797728, + "tokens_seen": 72351744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939719157472418, + "loss": 4.1978, + "theoretical_loss": 4.995734345049949, + "tokens_seen": 72417280 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493961885656971, + "loss": 4.1756, + "theoretical_loss": 4.995079236600686, + "tokens_seen": 72482816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939518555667001, + "loss": 4.2247, + "theoretical_loss": 4.994424885885564, + "tokens_seen": 72548352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939418254764293, + "loss": 4.1707, + "theoretical_loss": 4.993771291344839, + "tokens_seen": 72613888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939317953861585, + "loss": 4.1154, + "theoretical_loss": 4.993118451423381, + "tokens_seen": 72679424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939217652958877, + "loss": 4.2098, + "theoretical_loss": 4.992466364570659, + "tokens_seen": 72744960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939117352056169, + "loss": 4.1042, + "theoretical_loss": 4.991815029240721, + "tokens_seen": 72810496 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493901705115346, + "loss": 4.0273, + "theoretical_loss": 4.991164443892175, + "tokens_seen": 72876032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938916750250752, + "loss": 4.1082, + "theoretical_loss": 4.990514606988173, + "tokens_seen": 72941568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938816449348044, + "loss": 4.1414, + "theoretical_loss": 4.989865516996396, + "tokens_seen": 73007104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938716148445337, + "loss": 4.04, + "theoretical_loss": 4.98921717238903, + "tokens_seen": 73072640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938615847542628, + "loss": 4.1151, + "theoretical_loss": 4.988569571642756, + "tokens_seen": 73138176 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493851554663992, + "loss": 4.1587, + "theoretical_loss": 4.98792271323873, + "tokens_seen": 73203712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938415245737211, + "loss": 4.1451, + "theoretical_loss": 4.9872765956625615, + "tokens_seen": 73269248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938314944834504, + "loss": 4.1767, + "theoretical_loss": 4.9866312174043035, + "tokens_seen": 73334784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938214643931796, + "loss": 4.1401, + "theoretical_loss": 4.9859865769584335, + "tokens_seen": 73400320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938114343029088, + "loss": 4.1012, + "theoretical_loss": 4.9853426728238315, + "tokens_seen": 73465856 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493801404212638, + "loss": 4.2419, + "theoretical_loss": 4.984699503503771, + "tokens_seen": 73531392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937913741223671, + "loss": 4.0809, + "theoretical_loss": 4.984057067505898, + "tokens_seen": 73596928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937813440320963, + "loss": 4.0844, + "theoretical_loss": 4.9834153633422105, + "tokens_seen": 73662464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.242827415466309, + "objective/train/theoretical_loss": 4.982774389529053, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 4.982774389529053, + "tokens_seen": 73728000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937713139418255, + "loss": 4.1766, + "theoretical_loss": 4.982774389529053, + "tokens_seen": 73728000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937612838515547, + "loss": 4.1756, + "theoretical_loss": 4.9821341445870875, + "tokens_seen": 73793536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937512537612839, + "loss": 4.1197, + "theoretical_loss": 4.981494627041286, + "tokens_seen": 73859072 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493741223671013, + "loss": 4.1979, + "theoretical_loss": 4.98085583542091, + "tokens_seen": 73924608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937311935807422, + "loss": 4.1983, + "theoretical_loss": 4.980217768259496, + "tokens_seen": 73990144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937211634904714, + "loss": 4.0897, + "theoretical_loss": 4.979580424094836, + "tokens_seen": 74055680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937111334002006, + "loss": 4.1213, + "theoretical_loss": 4.978943801468967, + "tokens_seen": 74121216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937011033099298, + "loss": 4.0891, + "theoretical_loss": 4.978307898928149, + "tokens_seen": 74186752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936910732196591, + "loss": 4.1455, + "theoretical_loss": 4.977672715022855, + "tokens_seen": 74252288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936810431293881, + "loss": 4.0851, + "theoretical_loss": 4.97703824830775, + "tokens_seen": 74317824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936710130391174, + "loss": 4.066, + "theoretical_loss": 4.976404497341676, + "tokens_seen": 74383360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936609829488465, + "loss": 4.1917, + "theoretical_loss": 4.975771460687641, + "tokens_seen": 74448896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936509528585758, + "loss": 4.1945, + "theoretical_loss": 4.975139136912794, + "tokens_seen": 74514432 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493640922768305, + "loss": 4.1049, + "theoretical_loss": 4.974507524588424, + "tokens_seen": 74579968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936308926780341, + "loss": 4.0651, + "theoretical_loss": 4.973876622289927, + "tokens_seen": 74645504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936208625877633, + "loss": 4.0799, + "theoretical_loss": 4.973246428596802, + "tokens_seen": 74711040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936108324974925, + "loss": 4.1921, + "theoretical_loss": 4.972616942092634, + "tokens_seen": 74776576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936008024072217, + "loss": 4.0607, + "theoretical_loss": 4.971988161365077, + "tokens_seen": 74842112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935907723169509, + "loss": 4.0117, + "theoretical_loss": 4.9713600850058395, + "tokens_seen": 74907648 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049358074222668, + "loss": 4.2029, + "theoretical_loss": 4.970732711610667, + "tokens_seen": 74973184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935707121364092, + "loss": 4.2257, + "theoretical_loss": 4.97010603977933, + "tokens_seen": 75038720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935606820461384, + "loss": 4.1904, + "theoretical_loss": 4.96948006811561, + "tokens_seen": 75104256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935506519558676, + "loss": 4.2068, + "theoretical_loss": 4.968854795227281, + "tokens_seen": 75169792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935406218655968, + "loss": 4.1193, + "theoretical_loss": 4.968230219726093, + "tokens_seen": 75235328 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493530591775326, + "loss": 4.1102, + "theoretical_loss": 4.967606340227765, + "tokens_seen": 75300864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.183708190917969, + "objective/train/theoretical_loss": 4.966983155351962, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 4.966983155351962, + "tokens_seen": 75366400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935205616850551, + "loss": 4.1882, + "theoretical_loss": 4.966983155351962, + "tokens_seen": 75366400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935105315947844, + "loss": 4.0553, + "theoretical_loss": 4.966360663722287, + "tokens_seen": 75431936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935005015045135, + "loss": 4.1441, + "theoretical_loss": 4.96573886396626, + "tokens_seen": 75497472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934904714142428, + "loss": 4.2158, + "theoretical_loss": 4.965117754715307, + "tokens_seen": 75563008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934804413239719, + "loss": 4.0298, + "theoretical_loss": 4.964497334604748, + "tokens_seen": 75628544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934704112337011, + "loss": 3.9946, + "theoretical_loss": 4.963877602273776, + "tokens_seen": 75694080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934603811434303, + "loss": 4.2013, + "theoretical_loss": 4.963258556365449, + "tokens_seen": 75759616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934503510531595, + "loss": 4.1773, + "theoretical_loss": 4.962640195526673, + "tokens_seen": 75825152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934403209628887, + "loss": 4.1526, + "theoretical_loss": 4.962022518408183, + "tokens_seen": 75890688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934302908726179, + "loss": 4.1474, + "theoretical_loss": 4.96140552366454, + "tokens_seen": 75956224 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493420260782347, + "loss": 4.1837, + "theoretical_loss": 4.9607892099541075, + "tokens_seen": 76021760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934102306920762, + "loss": 4.1463, + "theoretical_loss": 4.9601735759390415, + "tokens_seen": 76087296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934002006018054, + "loss": 4.0957, + "theoretical_loss": 4.959558620285274, + "tokens_seen": 76152832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933901705115346, + "loss": 4.094, + "theoretical_loss": 4.958944341662502, + "tokens_seen": 76218368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933801404212638, + "loss": 4.089, + "theoretical_loss": 4.958330738744172, + "tokens_seen": 76283904 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493370110330993, + "loss": 4.1699, + "theoretical_loss": 4.957717810207466, + "tokens_seen": 76349440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933600802407221, + "loss": 4.1714, + "theoretical_loss": 4.957105554733289, + "tokens_seen": 76414976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933500501504513, + "loss": 4.1132, + "theoretical_loss": 4.956493971006253, + "tokens_seen": 76480512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933400200601805, + "loss": 4.1602, + "theoretical_loss": 4.955883057714669, + "tokens_seen": 76546048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933299899699098, + "loss": 4.1732, + "theoretical_loss": 4.955272813550524, + "tokens_seen": 76611584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933199598796389, + "loss": 4.1381, + "theoretical_loss": 4.954663237209477, + "tokens_seen": 76677120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933099297893682, + "loss": 4.0654, + "theoretical_loss": 4.954054327390841, + "tokens_seen": 76742656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932998996990972, + "loss": 4.1875, + "theoretical_loss": 4.9534460827975675, + "tokens_seen": 76808192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932898696088265, + "loss": 4.1217, + "theoretical_loss": 4.952838502136241, + "tokens_seen": 76873728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932798395185557, + "loss": 4.214, + "theoretical_loss": 4.952231584117056, + "tokens_seen": 76939264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.07984733581543, + "objective/train/theoretical_loss": 4.951625327453812, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 4.951625327453812, + "tokens_seen": 77004800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932698094282849, + "loss": 4.1098, + "theoretical_loss": 4.951625327453812, + "tokens_seen": 77004800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932597793380141, + "loss": 4.1437, + "theoretical_loss": 4.951019730863894, + "tokens_seen": 77070336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932497492477432, + "loss": 4.0637, + "theoretical_loss": 4.950414793068266, + "tokens_seen": 77135872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932397191574724, + "loss": 3.9279, + "theoretical_loss": 4.94981051279145, + "tokens_seen": 77201408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932296890672016, + "loss": 4.1175, + "theoretical_loss": 4.94920688876152, + "tokens_seen": 77266944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932196589769308, + "loss": 4.1365, + "theoretical_loss": 4.948603919710088, + "tokens_seen": 77332480 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049320962888666, + "loss": 4.1799, + "theoretical_loss": 4.948001604372287, + "tokens_seen": 77398016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931995987963893, + "loss": 4.1572, + "theoretical_loss": 4.947399941486762, + "tokens_seen": 77463552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931895687061183, + "loss": 4.1502, + "theoretical_loss": 4.946798929795658, + "tokens_seen": 77529088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931795386158476, + "loss": 4.1076, + "theoretical_loss": 4.946198568044602, + "tokens_seen": 77594624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931695085255767, + "loss": 4.1184, + "theoretical_loss": 4.945598854982698, + "tokens_seen": 77660160 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493159478435306, + "loss": 4.1189, + "theoretical_loss": 4.944999789362508, + "tokens_seen": 77725696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931494483450352, + "loss": 4.0689, + "theoretical_loss": 4.944401369940043, + "tokens_seen": 77791232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931394182547643, + "loss": 3.985, + "theoretical_loss": 4.9438035954747495, + "tokens_seen": 77856768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931293881644935, + "loss": 4.1133, + "theoretical_loss": 4.9432064647294975, + "tokens_seen": 77922304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931193580742227, + "loss": 4.1206, + "theoretical_loss": 4.942609976470566, + "tokens_seen": 77987840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931093279839519, + "loss": 4.2123, + "theoretical_loss": 4.942014129467637, + "tokens_seen": 78053376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930992978936811, + "loss": 4.0982, + "theoretical_loss": 4.941418922493774, + "tokens_seen": 78118912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930892678034102, + "loss": 4.1152, + "theoretical_loss": 4.940824354325419, + "tokens_seen": 78184448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930792377131394, + "loss": 4.1925, + "theoretical_loss": 4.940230423742372, + "tokens_seen": 78249984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930692076228686, + "loss": 3.9771, + "theoretical_loss": 4.939637129527789, + "tokens_seen": 78315520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930591775325978, + "loss": 4.0862, + "theoretical_loss": 4.939044470468156, + "tokens_seen": 78381056 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493049147442327, + "loss": 3.9812, + "theoretical_loss": 4.938452445353294, + "tokens_seen": 78446592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930391173520562, + "loss": 4.0852, + "theoretical_loss": 4.937861052976332, + "tokens_seen": 78512128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930290872617853, + "loss": 4.1354, + "theoretical_loss": 4.937270292133704, + "tokens_seen": 78577664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.984527349472046, + "objective/train/theoretical_loss": 4.9366801616251355, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 4.9366801616251355, + "tokens_seen": 78643200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930190571715146, + "loss": 4.0252, + "theoretical_loss": 4.9366801616251355, + "tokens_seen": 78643200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930090270812437, + "loss": 4.0784, + "theoretical_loss": 4.93609066025363, + "tokens_seen": 78708736 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492998996990973, + "loss": 4.1603, + "theoretical_loss": 4.935501786825457, + "tokens_seen": 78774272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929889669007021, + "loss": 4.0425, + "theoretical_loss": 4.934913540150143, + "tokens_seen": 78839808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929789368104313, + "loss": 4.0677, + "theoretical_loss": 4.934325919040461, + "tokens_seen": 78905344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929689067201605, + "loss": 4.1841, + "theoretical_loss": 4.933738922312413, + "tokens_seen": 78970880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929588766298897, + "loss": 4.0316, + "theoretical_loss": 4.933152548785222, + "tokens_seen": 79036416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929488465396189, + "loss": 4.0733, + "theoretical_loss": 4.932566797281324, + "tokens_seen": 79101952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929388164493481, + "loss": 4.1637, + "theoretical_loss": 4.931981666626351, + "tokens_seen": 79167488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929287863590773, + "loss": 4.0612, + "theoretical_loss": 4.931397155649121, + "tokens_seen": 79233024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929187562688064, + "loss": 4.1908, + "theoretical_loss": 4.930813263181631, + "tokens_seen": 79298560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929087261785356, + "loss": 4.1307, + "theoretical_loss": 4.93022998805904, + "tokens_seen": 79364096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928986960882648, + "loss": 4.0671, + "theoretical_loss": 4.929647329119659, + "tokens_seen": 79429632 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492888665997994, + "loss": 4.124, + "theoretical_loss": 4.9290652852049455, + "tokens_seen": 79495168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928786359077232, + "loss": 4.0418, + "theoretical_loss": 4.928483855159485, + "tokens_seen": 79560704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928686058174523, + "loss": 4.0567, + "theoretical_loss": 4.927903037830983, + "tokens_seen": 79626240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928585757271815, + "loss": 4.1626, + "theoretical_loss": 4.9273228320702565, + "tokens_seen": 79691776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928485456369107, + "loss": 4.124, + "theoretical_loss": 4.926743236731218, + "tokens_seen": 79757312 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049283851554664, + "loss": 4.087, + "theoretical_loss": 4.926164250670868, + "tokens_seen": 79822848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928284854563691, + "loss": 4.1156, + "theoretical_loss": 4.925585872749284, + "tokens_seen": 79888384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928184553660984, + "loss": 3.9405, + "theoretical_loss": 4.925008101829608, + "tokens_seen": 79953920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928084252758274, + "loss": 4.1672, + "theoretical_loss": 4.9244309367780374, + "tokens_seen": 80019456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927983951855567, + "loss": 3.8423, + "theoretical_loss": 4.923854376463816, + "tokens_seen": 80084992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927883650952859, + "loss": 3.9469, + "theoretical_loss": 4.923278419759217, + "tokens_seen": 80150528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927783350050151, + "loss": 4.1869, + "theoretical_loss": 4.92270306553954, + "tokens_seen": 80216064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 189088, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.988189220428467, + "objective/train/theoretical_loss": 4.922128312683096, + "objective/train/tokens_used": 80975328, + "theoretical_loss": 4.922128312683096, + "tokens_seen": 80281600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927683049147443, + "loss": 4.0798, + "theoretical_loss": 4.922128312683096, + "tokens_seen": 80281600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927582748244734, + "loss": 4.1211, + "theoretical_loss": 4.921554160071194, + "tokens_seen": 80347136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927482447342026, + "loss": 3.9753, + "theoretical_loss": 4.920980606588142, + "tokens_seen": 80412672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927382146439318, + "loss": 4.1182, + "theoretical_loss": 4.920407651121222, + "tokens_seen": 80478208 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492728184553661, + "loss": 4.087, + "theoretical_loss": 4.919835292560689, + "tokens_seen": 80543744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927181544633902, + "loss": 3.9387, + "theoretical_loss": 4.919263529799759, + "tokens_seen": 80609280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927081243731193, + "loss": 4.0184, + "theoretical_loss": 4.918692361734598, + "tokens_seen": 80674816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926980942828485, + "loss": 4.1203, + "theoretical_loss": 4.91812178726431, + "tokens_seen": 80740352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926880641925777, + "loss": 4.109, + "theoretical_loss": 4.917551805290929, + "tokens_seen": 80805888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926780341023069, + "loss": 4.056, + "theoretical_loss": 4.916982414719408, + "tokens_seen": 80871424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926680040120361, + "loss": 4.1324, + "theoretical_loss": 4.9164136144576105, + "tokens_seen": 80936960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926579739217654, + "loss": 4.1693, + "theoretical_loss": 4.915907522833708, + "tokens_seen": 80995328 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004926479438314944, + "loss": 3.9343, + "theoretical_loss": 4.915339835652183, + "tokens_seen": 81060864 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004926379137412237, + "loss": 4.1204, + "theoretical_loss": 4.914772735639691, + "tokens_seen": 81126400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004926278836509528, + "loss": 4.0271, + "theoretical_loss": 4.914206221715315, + "tokens_seen": 81191936 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004926178535606821, + "loss": 4.0426, + "theoretical_loss": 4.913640292800999, + "tokens_seen": 81257472 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004926078234704113, + "loss": 4.0662, + "theoretical_loss": 4.913074947821542, + "tokens_seen": 81323008 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004925977933801404, + "loss": 4.1579, + "theoretical_loss": 4.912510185704582, + "tokens_seen": 81388544 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004925877632898696, + "loss": 4.1622, + "theoretical_loss": 4.911946005380583, + "tokens_seen": 81454080 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004925777331995988, + "loss": 4.0513, + "theoretical_loss": 4.911382405782836, + "tokens_seen": 81519616 + }, + { + "epoch": 1.0, + "learning_rate": 0.000492567703109328, + "loss": 4.0156, + "theoretical_loss": 4.910819385847441, + "tokens_seen": 81585152 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004925576730190572, + "loss": 4.02, + "theoretical_loss": 4.910256944513299, + "tokens_seen": 81650688 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004925476429287864, + "loss": 3.9947, + "theoretical_loss": 4.909695080722105, + "tokens_seen": 81716224 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004925376128385155, + "loss": 3.8804, + "theoretical_loss": 4.909133793418334, + "tokens_seen": 81781760 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004925275827482447, + "loss": 4.0527, + "theoretical_loss": 4.908573081549237, + "tokens_seen": 81847296 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 239532, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.146024227142334, + "objective/train/theoretical_loss": 4.908012944064825, + "objective/train/tokens_used": 102372832, + "theoretical_loss": 4.908012944064825, + "tokens_seen": 81912832 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004925175526579739, + "loss": 4.1579, + "theoretical_loss": 4.908012944064825, + "tokens_seen": 81912832 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004925075225677031, + "loss": 3.909, + "theoretical_loss": 4.907453379917865, + "tokens_seen": 81978368 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004924974924774323, + "loss": 3.9209, + "theoretical_loss": 4.90689438806387, + "tokens_seen": 82043904 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004924874623871615, + "loss": 4.1313, + "theoretical_loss": 4.906335967461084, + "tokens_seen": 82109440 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004924774322968907, + "loss": 4.0911, + "theoretical_loss": 4.905778117070482, + "tokens_seen": 82174976 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004924674022066199, + "loss": 4.1084, + "theoretical_loss": 4.905220835855752, + "tokens_seen": 82240512 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004924573721163491, + "loss": 3.9675, + "theoretical_loss": 4.904664122783289, + "tokens_seen": 82306048 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004924473420260783, + "loss": 4.0398, + "theoretical_loss": 4.904107976822188, + "tokens_seen": 82371584 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004924373119358075, + "loss": 4.0657, + "theoretical_loss": 4.903552396944233, + "tokens_seen": 82437120 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004924272818455366, + "loss": 4.0674, + "theoretical_loss": 4.902997382123887, + "tokens_seen": 82502656 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004924172517552658, + "loss": 4.0661, + "theoretical_loss": 4.902442931338283, + "tokens_seen": 82568192 + }, + { + "epoch": 1.0, + "learning_rate": 0.000492407221664995, + "loss": 4.1057, + "theoretical_loss": 4.901889043567216, + "tokens_seen": 82633728 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923971915747242, + "loss": 3.9454, + "theoretical_loss": 4.901335717793134, + "tokens_seen": 82699264 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923871614844534, + "loss": 4.0741, + "theoretical_loss": 4.900782953001128, + "tokens_seen": 82764800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923771313941825, + "loss": 4.0076, + "theoretical_loss": 4.900230748178926, + "tokens_seen": 82830336 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923671013039117, + "loss": 3.9248, + "theoretical_loss": 4.8996791023168775, + "tokens_seen": 82895872 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923570712136409, + "loss": 4.0229, + "theoretical_loss": 4.899128014407951, + "tokens_seen": 82961408 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923470411233702, + "loss": 4.0567, + "theoretical_loss": 4.898577483447724, + "tokens_seen": 83026944 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923370110330993, + "loss": 3.9387, + "theoretical_loss": 4.898027508434371, + "tokens_seen": 83092480 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923269809428286, + "loss": 3.9172, + "theoretical_loss": 4.897478088368658, + "tokens_seen": 83158016 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923169508525576, + "loss": 4.0995, + "theoretical_loss": 4.896929222253933, + "tokens_seen": 83223552 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004923069207622869, + "loss": 4.0439, + "theoretical_loss": 4.896380909096116, + "tokens_seen": 83289088 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004922968906720161, + "loss": 3.9987, + "theoretical_loss": 4.8958331479036925, + "tokens_seen": 83354624 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004922868605817453, + "loss": 4.0779, + "theoretical_loss": 4.895285937687703, + "tokens_seen": 83420160 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004922768304914745, + "loss": 4.0418, + "theoretical_loss": 4.894739277461735, + "tokens_seen": 83485696 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 244497, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.016027450561523, + "objective/train/theoretical_loss": 4.894193166241914, + "objective/train/tokens_used": 104011232, + "theoretical_loss": 4.894193166241914, + "tokens_seen": 83551232 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004922668004012036, + "loss": 4.024, + "theoretical_loss": 4.894193166241914, + "tokens_seen": 83551232 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004922567703109328, + "loss": 4.0535, + "theoretical_loss": 4.8936476030468965, + "tokens_seen": 83616768 + }, + { + "epoch": 1.0, + "learning_rate": 0.000492246740220662, + "loss": 4.0401, + "theoretical_loss": 4.893102586897861, + "tokens_seen": 83682304 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004922367101303912, + "loss": 4.0006, + "theoretical_loss": 4.892558116818496, + "tokens_seen": 83747840 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004922266800401204, + "loss": 3.9029, + "theoretical_loss": 4.892014191834999, + "tokens_seen": 83813376 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004922166499498495, + "loss": 4.0269, + "theoretical_loss": 4.891470810976061, + "tokens_seen": 83878912 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004922066198595787, + "loss": 4.068, + "theoretical_loss": 4.890927973272863, + "tokens_seen": 83944448 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004921965897693079, + "loss": 3.9539, + "theoretical_loss": 4.8903856777590615, + "tokens_seen": 84009984 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004921865596790371, + "loss": 4.0123, + "theoretical_loss": 4.889843923470789, + "tokens_seen": 84075520 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004921765295887663, + "loss": 3.9251, + "theoretical_loss": 4.889302709446637, + "tokens_seen": 84141056 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004921664994984956, + "loss": 3.9716, + "theoretical_loss": 4.888762034727659, + "tokens_seen": 84206592 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004921564694082246, + "loss": 4.1171, + "theoretical_loss": 4.8882218983573456, + "tokens_seen": 84272128 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004921464393179539, + "loss": 4.1962, + "theoretical_loss": 4.887682299381632, + "tokens_seen": 84337664 + }, + { + "epoch": 1.0, + "learning_rate": 0.000492136409227683, + "loss": 3.9961, + "theoretical_loss": 4.887143236848885, + "tokens_seen": 84403200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004921263791374123, + "loss": 4.0593, + "theoretical_loss": 4.886604709809889, + "tokens_seen": 84468736 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004921163490471415, + "loss": 3.9361, + "theoretical_loss": 4.886066717317847, + "tokens_seen": 84534272 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004921063189568706, + "loss": 4.1666, + "theoretical_loss": 4.885529258428365, + "tokens_seen": 84599808 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004920962888665998, + "loss": 4.0883, + "theoretical_loss": 4.884992332199451, + "tokens_seen": 84665344 + }, + { + "epoch": 1.0, + "learning_rate": 0.000492086258776329, + "loss": 3.9572, + "theoretical_loss": 4.8844559376914995, + "tokens_seen": 84730880 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004920762286860582, + "loss": 3.9166, + "theoretical_loss": 4.883920073967291, + "tokens_seen": 84796416 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004920661985957874, + "loss": 4.0128, + "theoretical_loss": 4.883384740091979, + "tokens_seen": 84861952 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004920561685055166, + "loss": 4.0844, + "theoretical_loss": 4.882849935133084, + "tokens_seen": 84927488 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004920461384152457, + "loss": 4.0001, + "theoretical_loss": 4.882315658160485, + "tokens_seen": 84993024 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004920361083249749, + "loss": 3.9255, + "theoretical_loss": 4.881781908246414, + "tokens_seen": 85058560 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004920260782347041, + "loss": 3.9537, + "theoretical_loss": 4.8812486844654455, + "tokens_seen": 85124096 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 247615, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.79223370552063, + "objective/train/theoretical_loss": 4.880715985894488, + "objective/train/tokens_used": 105649632, + "theoretical_loss": 4.880715985894488, + "tokens_seen": 85189632 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004920160481444333, + "loss": 4.0262, + "theoretical_loss": 4.880715985894488, + "tokens_seen": 85189632 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004920060180541625, + "loss": 4.0026, + "theoretical_loss": 4.880183811612779, + "tokens_seen": 85255168 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004919959879638916, + "loss": 3.9988, + "theoretical_loss": 4.879652160701881, + "tokens_seen": 85320704 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004919859578736209, + "loss": 4.0454, + "theoretical_loss": 4.879121032245662, + "tokens_seen": 85386240 + }, + { + "epoch": 1.0, + "learning_rate": 0.00049197592778335, + "loss": 4.0146, + "theoretical_loss": 4.8785904253303, + "tokens_seen": 85451776 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004919658976930793, + "loss": 3.9823, + "theoretical_loss": 4.878060339044269, + "tokens_seen": 85517312 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004919558676028084, + "loss": 3.7818, + "theoretical_loss": 4.877530772478332, + "tokens_seen": 85582848 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004919458375125377, + "loss": 4.0108, + "theoretical_loss": 4.877001724725538, + "tokens_seen": 85648384 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004919358074222668, + "loss": 3.9917, + "theoretical_loss": 4.876473194881207, + "tokens_seen": 85713920 + }, + { + "epoch": 1.0, + "learning_rate": 0.000491925777331996, + "loss": 3.9795, + "theoretical_loss": 4.875945182042932, + "tokens_seen": 85779456 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004919157472417252, + "loss": 3.8657, + "theoretical_loss": 4.875417685310559, + "tokens_seen": 85844992 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004919057171514544, + "loss": 4.0146, + "theoretical_loss": 4.8748907037861935, + "tokens_seen": 85910528 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004918956870611836, + "loss": 4.0241, + "theoretical_loss": 4.874364236574183, + "tokens_seen": 85976064 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004918856569709127, + "loss": 3.9355, + "theoretical_loss": 4.873838282781115, + "tokens_seen": 86041600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004918756268806419, + "loss": 3.9681, + "theoretical_loss": 4.873312841515807, + "tokens_seen": 86107136 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004918655967903711, + "loss": 3.9138, + "theoretical_loss": 4.872787911889299, + "tokens_seen": 86172672 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004918555667001003, + "loss": 3.9805, + "theoretical_loss": 4.8722634930148505, + "tokens_seen": 86238208 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004918455366098295, + "loss": 3.9333, + "theoretical_loss": 4.871739584007928, + "tokens_seen": 86303744 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004918355065195586, + "loss": 3.9805, + "theoretical_loss": 4.871216183986199, + "tokens_seen": 86369280 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004918254764292878, + "loss": 3.9778, + "theoretical_loss": 4.8706932920695305, + "tokens_seen": 86434816 + }, + { + "epoch": 1.0, + "learning_rate": 0.000491815446339017, + "loss": 4.0759, + "theoretical_loss": 4.87017090737997, + "tokens_seen": 86500352 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004918054162487463, + "loss": 4.1109, + "theoretical_loss": 4.869649029041755, + "tokens_seen": 86565888 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004917953861584754, + "loss": 3.9231, + "theoretical_loss": 4.869127656181286, + "tokens_seen": 86631424 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004917853560682047, + "loss": 3.9588, + "theoretical_loss": 4.86860678792714, + "tokens_seen": 86696960 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004917753259779337, + "loss": 4.0293, + "theoretical_loss": 4.868086423410047, + "tokens_seen": 86762496 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 251498, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8594000339508057, + "objective/train/theoretical_loss": 4.867566561762893, + "objective/train/tokens_used": 107288032, + "theoretical_loss": 4.867566561762893, + "tokens_seen": 86828032 + }, + { + "epoch": 1.0, + "learning_rate": 0.000491765295887663, + "loss": 3.9306, + "theoretical_loss": 4.867566561762893, + "tokens_seen": 86828032 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004917552657973922, + "loss": 4.0392, + "theoretical_loss": 4.867047202120708, + "tokens_seen": 86893568 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004917452357071214, + "loss": 3.9995, + "theoretical_loss": 4.8665283436206614, + "tokens_seen": 86959104 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004917352056168506, + "loss": 3.9126, + "theoretical_loss": 4.866009985402053, + "tokens_seen": 87024640 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004917251755265797, + "loss": 3.9559, + "theoretical_loss": 4.86549212660631, + "tokens_seen": 87090176 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004917151454363089, + "loss": 4.0595, + "theoretical_loss": 4.864974766376976, + "tokens_seen": 87155712 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004917051153460381, + "loss": 4.0961, + "theoretical_loss": 4.864457903859709, + "tokens_seen": 87221248 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004916950852557673, + "loss": 4.168, + "theoretical_loss": 4.863941538202265, + "tokens_seen": 87286784 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004916850551654965, + "loss": 3.9581, + "theoretical_loss": 4.863425668554504, + "tokens_seen": 87352320 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004916750250752258, + "loss": 4.0163, + "theoretical_loss": 4.862910294068374, + "tokens_seen": 87417856 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004916649949849548, + "loss": 4.0502, + "theoretical_loss": 4.862395413897909, + "tokens_seen": 87483392 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004916549648946841, + "loss": 4.0644, + "theoretical_loss": 4.86188102719922, + "tokens_seen": 87548928 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004916449348044132, + "loss": 3.9939, + "theoretical_loss": 4.861367133130488, + "tokens_seen": 87614464 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004916349047141425, + "loss": 3.9202, + "theoretical_loss": 4.86085373085196, + "tokens_seen": 87680000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004916248746238717, + "loss": 3.9134, + "theoretical_loss": 4.860340819525939, + "tokens_seen": 87745536 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004916148445336008, + "loss": 4.0175, + "theoretical_loss": 4.859828398316781, + "tokens_seen": 87811072 + }, + { + "epoch": 1.0, + "learning_rate": 0.00049160481444333, + "loss": 3.9901, + "theoretical_loss": 4.859316466390885, + "tokens_seen": 87876608 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915947843530592, + "loss": 4.0879, + "theoretical_loss": 4.858805022916689, + "tokens_seen": 87942144 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915847542627884, + "loss": 3.9995, + "theoretical_loss": 4.858294067064665, + "tokens_seen": 88007680 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915747241725176, + "loss": 3.9646, + "theoretical_loss": 4.857783598007304, + "tokens_seen": 88073216 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915646940822468, + "loss": 4.0047, + "theoretical_loss": 4.857273614919122, + "tokens_seen": 88138752 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915546639919759, + "loss": 4.0669, + "theoretical_loss": 4.856764116976644, + "tokens_seen": 88204288 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915446339017051, + "loss": 4.0402, + "theoretical_loss": 4.8562551033584, + "tokens_seen": 88269824 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915346038114343, + "loss": 3.8891, + "theoretical_loss": 4.855746573244925, + "tokens_seen": 88335360 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915245737211635, + "loss": 3.9851, + "theoretical_loss": 4.85523852581874, + "tokens_seen": 88400896 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 254195, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.126548767089844, + "objective/train/theoretical_loss": 4.854730960264357, + "objective/train/tokens_used": 108926432, + "theoretical_loss": 4.854730960264357, + "tokens_seen": 88466432 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915145436308927, + "loss": 4.0326, + "theoretical_loss": 4.854730960264357, + "tokens_seen": 88466432 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004915045135406218, + "loss": 3.9349, + "theoretical_loss": 4.854223875768268, + "tokens_seen": 88531968 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004914944834503511, + "loss": 3.9017, + "theoretical_loss": 4.8537172715189385, + "tokens_seen": 88597504 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004914844533600802, + "loss": 4.0921, + "theoretical_loss": 4.853211146706803, + "tokens_seen": 88663040 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004914744232698095, + "loss": 3.8345, + "theoretical_loss": 4.852705500524259, + "tokens_seen": 88728576 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004914643931795386, + "loss": 3.9944, + "theoretical_loss": 4.852200332165655, + "tokens_seen": 88794112 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004914543630892679, + "loss": 3.9565, + "theoretical_loss": 4.851695640827293, + "tokens_seen": 88859648 + }, + { + "epoch": 1.0, + "learning_rate": 0.000491444332998997, + "loss": 4.0236, + "theoretical_loss": 4.851191425707419, + "tokens_seen": 88925184 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004914343029087262, + "loss": 4.055, + "theoretical_loss": 4.850687686006212, + "tokens_seen": 88990720 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004914242728184554, + "loss": 4.0792, + "theoretical_loss": 4.8501844209257845, + "tokens_seen": 89056256 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004914142427281846, + "loss": 3.9179, + "theoretical_loss": 4.849681629670176, + "tokens_seen": 89121792 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004914042126379138, + "loss": 3.9338, + "theoretical_loss": 4.849179311445341, + "tokens_seen": 89187328 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004913941825476429, + "loss": 3.9421, + "theoretical_loss": 4.84867746545915, + "tokens_seen": 89252864 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004913841524573721, + "loss": 3.9337, + "theoretical_loss": 4.848176090921378, + "tokens_seen": 89318400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004913741223671013, + "loss": 4.1499, + "theoretical_loss": 4.847675187043702, + "tokens_seen": 89383936 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004913640922768305, + "loss": 4.0275, + "theoretical_loss": 4.847174753039695, + "tokens_seen": 89449472 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004913540621865597, + "loss": 4.0568, + "theoretical_loss": 4.846674788124817, + "tokens_seen": 89515008 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004913440320962888, + "loss": 4.022, + "theoretical_loss": 4.846175291516413, + "tokens_seen": 89580544 + }, + { + "epoch": 1.0, + "learning_rate": 0.000491334002006018, + "loss": 3.9836, + "theoretical_loss": 4.845676262433704, + "tokens_seen": 89646080 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004913239719157472, + "loss": 3.9063, + "theoretical_loss": 4.845177700097783, + "tokens_seen": 89711616 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004913139418254765, + "loss": 4.1461, + "theoretical_loss": 4.844679603731606, + "tokens_seen": 89777152 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004913039117352056, + "loss": 3.9702, + "theoretical_loss": 4.844181972559993, + "tokens_seen": 89842688 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004912938816449349, + "loss": 3.9433, + "theoretical_loss": 4.843684805809614, + "tokens_seen": 89908224 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004912838515546639, + "loss": 4.0026, + "theoretical_loss": 4.843188102708989, + "tokens_seen": 89973760 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004912738214643932, + "loss": 3.891, + "theoretical_loss": 4.84269186248848, + "tokens_seen": 90039296 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 259256, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.106069564819336, + "objective/train/theoretical_loss": 4.842196084380285, + "objective/train/tokens_used": 110564832, + "theoretical_loss": 4.842196084380285, + "tokens_seen": 90104832 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004912637913741224, + "loss": 4.0099, + "theoretical_loss": 4.842196084380285, + "tokens_seen": 90104832 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004912537612838516, + "loss": 4.0629, + "theoretical_loss": 4.841700767618434, + "tokens_seen": 90170368 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004912437311935808, + "loss": 3.959, + "theoretical_loss": 4.841205911438781, + "tokens_seen": 90235904 + }, + { + "epoch": 1.0, + "learning_rate": 0.00049123370110331, + "loss": 3.8832, + "theoretical_loss": 4.8407115150790005, + "tokens_seen": 90301440 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004912236710130391, + "loss": 4.0147, + "theoretical_loss": 4.840217577778579, + "tokens_seen": 90366976 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004912136409227683, + "loss": 3.9575, + "theoretical_loss": 4.839724098778813, + "tokens_seen": 90432512 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004912036108324975, + "loss": 3.9639, + "theoretical_loss": 4.839231077322802, + "tokens_seen": 90498048 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004911935807422267, + "loss": 3.8956, + "theoretical_loss": 4.838738512655441, + "tokens_seen": 90563584 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004911835506519559, + "loss": 4.0533, + "theoretical_loss": 4.838246404023415, + "tokens_seen": 90629120 + }, + { + "epoch": 1.0, + "learning_rate": 0.000491173520561685, + "loss": 4.0426, + "theoretical_loss": 4.837754750675201, + "tokens_seen": 90694656 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004911634904714142, + "loss": 3.9726, + "theoretical_loss": 4.837263551861049, + "tokens_seen": 90760192 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004911534603811434, + "loss": 3.9872, + "theoretical_loss": 4.83677280683299, + "tokens_seen": 90825728 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004911434302908726, + "loss": 3.9478, + "theoretical_loss": 4.836282514844819, + "tokens_seen": 90891264 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004911334002006019, + "loss": 4.0245, + "theoretical_loss": 4.835792675152099, + "tokens_seen": 90956800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004911233701103309, + "loss": 3.8705, + "theoretical_loss": 4.83530328701215, + "tokens_seen": 91022336 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004911133400200602, + "loss": 3.9019, + "theoretical_loss": 4.834814349684045, + "tokens_seen": 91087872 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004911033099297893, + "loss": 4.0045, + "theoretical_loss": 4.834325862428605, + "tokens_seen": 91153408 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004910932798395186, + "loss": 3.9924, + "theoretical_loss": 4.833837824508393, + "tokens_seen": 91218944 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004910832497492478, + "loss": 3.9253, + "theoretical_loss": 4.8333502351877105, + "tokens_seen": 91284480 + }, + { + "epoch": 1.0, + "learning_rate": 0.000491073219658977, + "loss": 4.0505, + "theoretical_loss": 4.832863093732588, + "tokens_seen": 91350016 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004910631895687061, + "loss": 3.8049, + "theoretical_loss": 4.8323763994107845, + "tokens_seen": 91415552 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004910531594784353, + "loss": 3.9416, + "theoretical_loss": 4.831890151491779, + "tokens_seen": 91481088 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004910431293881645, + "loss": 3.9675, + "theoretical_loss": 4.831404349246766, + "tokens_seen": 91546624 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004910330992978937, + "loss": 4.0591, + "theoretical_loss": 4.830918991948652, + "tokens_seen": 91612160 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004910230692076229, + "loss": 3.7114, + "theoretical_loss": 4.830434078872047, + "tokens_seen": 91677696 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 262093, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.077275276184082, + "objective/train/theoretical_loss": 4.829949609293262, + "objective/train/tokens_used": 112203232, + "theoretical_loss": 4.829949609293262, + "tokens_seen": 91743232 + }, + { + "epoch": 1.0, + "learning_rate": 0.000491013039117352, + "loss": 3.9833, + "theoretical_loss": 4.829949609293262, + "tokens_seen": 91743232 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004910030090270812, + "loss": 3.8065, + "theoretical_loss": 4.829465582490304, + "tokens_seen": 91808768 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004909929789368104, + "loss": 3.9873, + "theoretical_loss": 4.828981997742865, + "tokens_seen": 91874304 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004909829488465397, + "loss": 3.9512, + "theoretical_loss": 4.828498854332329, + "tokens_seen": 91939840 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004909729187562688, + "loss": 3.9106, + "theoretical_loss": 4.828016151541753, + "tokens_seen": 92005376 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004909628886659981, + "loss": 3.9173, + "theoretical_loss": 4.827533888655871, + "tokens_seen": 92070912 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004909528585757272, + "loss": 3.8462, + "theoretical_loss": 4.8270520649610855, + "tokens_seen": 92136448 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004909428284854564, + "loss": 4.0262, + "theoretical_loss": 4.826570679745464, + "tokens_seen": 92201984 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004909327983951856, + "loss": 3.7411, + "theoretical_loss": 4.8260897322987315, + "tokens_seen": 92267520 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004909227683049148, + "loss": 4.026, + "theoretical_loss": 4.825609221912269, + "tokens_seen": 92333056 + }, + { + "epoch": 1.0, + "learning_rate": 0.000490912738214644, + "loss": 3.9166, + "theoretical_loss": 4.825129147879104, + "tokens_seen": 92398592 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004909027081243731, + "loss": 4.0215, + "theoretical_loss": 4.82464950949391, + "tokens_seen": 92464128 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004908926780341023, + "loss": 3.9754, + "theoretical_loss": 4.824170306052999, + "tokens_seen": 92529664 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004908826479438315, + "loss": 3.9285, + "theoretical_loss": 4.823691536854316, + "tokens_seen": 92595200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004908726178535607, + "loss": 4.0352, + "theoretical_loss": 4.8232132011974365, + "tokens_seen": 92660736 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004908625877632899, + "loss": 4.0835, + "theoretical_loss": 4.82273529838356, + "tokens_seen": 92726272 + }, + { + "epoch": 1.0, + "learning_rate": 0.000490852557673019, + "loss": 3.9783, + "theoretical_loss": 4.822257827715504, + "tokens_seen": 92791808 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004908425275827482, + "loss": 4.006, + "theoretical_loss": 4.8217807884977, + "tokens_seen": 92857344 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004908324974924774, + "loss": 3.787, + "theoretical_loss": 4.821304180036193, + "tokens_seen": 92922880 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004908224674022067, + "loss": 3.8132, + "theoretical_loss": 4.820828001638626, + "tokens_seen": 92988416 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004908124373119358, + "loss": 4.0177, + "theoretical_loss": 4.820352252614249, + "tokens_seen": 93053952 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004908024072216651, + "loss": 4.0406, + "theoretical_loss": 4.8198769322739, + "tokens_seen": 93119488 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004907923771313941, + "loss": 3.7879, + "theoretical_loss": 4.819402039930013, + "tokens_seen": 93185024 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004907823470411234, + "loss": 3.9557, + "theoretical_loss": 4.818927574896604, + "tokens_seen": 93250560 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004907723169508526, + "loss": 3.9608, + "theoretical_loss": 4.818453536489271, + "tokens_seen": 93316096 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 265747, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.664768695831299, + "objective/train/theoretical_loss": 4.8179799240251855, + "objective/train/tokens_used": 113841632, + "theoretical_loss": 4.8179799240251855, + "tokens_seen": 93381632 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004907622868605818, + "loss": 3.9764, + "theoretical_loss": 4.8179799240251855, + "tokens_seen": 93381632 + }, + { + "epoch": 1.0, + "learning_rate": 0.000490752256770311, + "loss": 3.9202, + "theoretical_loss": 4.817506736823094, + "tokens_seen": 93447168 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004907422266800401, + "loss": 3.8601, + "theoretical_loss": 4.817033974203305, + "tokens_seen": 93512704 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004907321965897693, + "loss": 3.889, + "theoretical_loss": 4.816561635487695, + "tokens_seen": 93578240 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004907221664994985, + "loss": 4.0704, + "theoretical_loss": 4.816089719999688, + "tokens_seen": 93643776 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004907121364092277, + "loss": 3.9202, + "theoretical_loss": 4.815618227064271, + "tokens_seen": 93709312 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004907021063189569, + "loss": 3.9391, + "theoretical_loss": 4.815147156007971, + "tokens_seen": 93774848 + }, + { + "epoch": 1.0, + "learning_rate": 0.000490692076228686, + "loss": 3.9288, + "theoretical_loss": 4.814676506158861, + "tokens_seen": 93840384 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004906820461384152, + "loss": 3.9556, + "theoretical_loss": 4.814206276846553, + "tokens_seen": 93905920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004906720160481444, + "loss": 3.9209, + "theoretical_loss": 4.813736467402192, + "tokens_seen": 93971456 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004906619859578736, + "loss": 3.9125, + "theoretical_loss": 4.81326707715845, + "tokens_seen": 94036992 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004906519558676028, + "loss": 4.1021, + "theoretical_loss": 4.81279810544953, + "tokens_seen": 94102528 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004906419257773321, + "loss": 3.9303, + "theoretical_loss": 4.812329551611149, + "tokens_seen": 94168064 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004906318956870611, + "loss": 3.9975, + "theoretical_loss": 4.811861414980543, + "tokens_seen": 94233600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004906218655967904, + "loss": 3.98, + "theoretical_loss": 4.811393694896456, + "tokens_seen": 94299136 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004906118355065195, + "loss": 3.9938, + "theoretical_loss": 4.810926390699144, + "tokens_seen": 94364672 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004906018054162488, + "loss": 3.8447, + "theoretical_loss": 4.81045950173036, + "tokens_seen": 94430208 + }, + { + "epoch": 1.0, + "learning_rate": 0.000490591775325978, + "loss": 3.9281, + "theoretical_loss": 4.809993027333358, + "tokens_seen": 94495744 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004905817452357072, + "loss": 3.9872, + "theoretical_loss": 4.809526966852884, + "tokens_seen": 94561280 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004905717151454363, + "loss": 4.0247, + "theoretical_loss": 4.809061319635172, + "tokens_seen": 94626816 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004905616850551655, + "loss": 3.9396, + "theoretical_loss": 4.8085960850279434, + "tokens_seen": 94692352 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004905516549648947, + "loss": 3.9686, + "theoretical_loss": 4.808131262380396, + "tokens_seen": 94757888 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004905416248746239, + "loss": 3.9546, + "theoretical_loss": 4.807666851043205, + "tokens_seen": 94823424 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004905315947843531, + "loss": 4.006, + "theoretical_loss": 4.807202850368518, + "tokens_seen": 94888960 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004905215646940822, + "loss": 3.7635, + "theoretical_loss": 4.806739259709948, + "tokens_seen": 94954496 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 270606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.04507303237915, + "objective/train/theoretical_loss": 4.806276078422572, + "objective/train/tokens_used": 115480032, + "theoretical_loss": 4.806276078422572, + "tokens_seen": 95020032 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004905115346038114, + "loss": 3.9503, + "theoretical_loss": 4.806276078422572, + "tokens_seen": 95020032 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004905015045135406, + "loss": 3.8835, + "theoretical_loss": 4.805813305862923, + "tokens_seen": 95085568 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004904914744232698, + "loss": 3.8912, + "theoretical_loss": 4.8053509413889905, + "tokens_seen": 95151104 + }, + { + "epoch": 1.0, + "learning_rate": 0.000490481444332999, + "loss": 3.8026, + "theoretical_loss": 4.804888984360212, + "tokens_seen": 95216640 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004904714142427281, + "loss": 3.9113, + "theoretical_loss": 4.804427434137472, + "tokens_seen": 95282176 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004904613841524574, + "loss": 3.9001, + "theoretical_loss": 4.803966290083096, + "tokens_seen": 95347712 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004904513540621865, + "loss": 3.9818, + "theoretical_loss": 4.8035055515608445, + "tokens_seen": 95413248 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004904413239719158, + "loss": 3.9336, + "theoretical_loss": 4.803045217935914, + "tokens_seen": 95478784 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004904312938816449, + "loss": 3.979, + "theoretical_loss": 4.802585288574925, + "tokens_seen": 95544320 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004904212637913742, + "loss": 3.948, + "theoretical_loss": 4.802125762845929, + "tokens_seen": 95609856 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004904112337011033, + "loss": 3.7291, + "theoretical_loss": 4.801666640118389, + "tokens_seen": 95675392 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004904012036108325, + "loss": 4.0453, + "theoretical_loss": 4.801207919763194, + "tokens_seen": 95740928 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004903911735205617, + "loss": 4.0101, + "theoretical_loss": 4.800749601152637, + "tokens_seen": 95806464 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004903811434302909, + "loss": 3.8854, + "theoretical_loss": 4.80029168366042, + "tokens_seen": 95872000 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004903711133400201, + "loss": 3.9119, + "theoretical_loss": 4.799834166661654, + "tokens_seen": 95937536 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004903610832497492, + "loss": 3.9935, + "theoretical_loss": 4.7993770495328425, + "tokens_seen": 96003072 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004903510531594784, + "loss": 3.971, + "theoretical_loss": 4.798920331651889, + "tokens_seen": 96068608 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004903410230692076, + "loss": 3.9336, + "theoretical_loss": 4.798464012398085, + "tokens_seen": 96134144 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004903309929789368, + "loss": 3.7684, + "theoretical_loss": 4.798008091152115, + "tokens_seen": 96199680 + }, + { + "epoch": 1.0, + "learning_rate": 0.000490320962888666, + "loss": 3.8629, + "theoretical_loss": 4.797552567296039, + "tokens_seen": 96265216 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004903109327983952, + "loss": 3.8151, + "theoretical_loss": 4.797097440213302, + "tokens_seen": 96330752 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004903009027081243, + "loss": 3.9059, + "theoretical_loss": 4.796642709288722, + "tokens_seen": 96396288 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902908726178535, + "loss": 3.8766, + "theoretical_loss": 4.796188373908491, + "tokens_seen": 96461824 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902808425275828, + "loss": 3.9174, + "theoretical_loss": 4.795734433460162, + "tokens_seen": 96527360 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902708124373119, + "loss": 4.0004, + "theoretical_loss": 4.795280887332658, + "tokens_seen": 96592896 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 273510, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.817716598510742, + "objective/train/theoretical_loss": 4.794827734916257, + "objective/train/tokens_used": 117118432, + "theoretical_loss": 4.794827734916257, + "tokens_seen": 96658432 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902607823470412, + "loss": 3.9256, + "theoretical_loss": 4.794827734916257, + "tokens_seen": 96658432 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902507522567703, + "loss": 3.9607, + "theoretical_loss": 4.7943749756025955, + "tokens_seen": 96723968 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902407221664995, + "loss": 3.863, + "theoretical_loss": 4.7939226087846585, + "tokens_seen": 96789504 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902306920762287, + "loss": 3.8933, + "theoretical_loss": 4.79347063385678, + "tokens_seen": 96855040 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902206619859579, + "loss": 3.9209, + "theoretical_loss": 4.793019050214637, + "tokens_seen": 96920576 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902106318956871, + "loss": 3.8297, + "theoretical_loss": 4.792567857255248, + "tokens_seen": 96986112 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004902006018054163, + "loss": 3.9628, + "theoretical_loss": 4.792117054376966, + "tokens_seen": 97051648 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004901905717151454, + "loss": 3.9465, + "theoretical_loss": 4.791666640979473, + "tokens_seen": 97117184 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004901805416248746, + "loss": 3.9194, + "theoretical_loss": 4.791216616463787, + "tokens_seen": 97182720 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004901705115346038, + "loss": 3.9622, + "theoretical_loss": 4.790766980232242, + "tokens_seen": 97248256 + }, + { + "epoch": 1.0, + "learning_rate": 0.000490160481444333, + "loss": 3.9555, + "theoretical_loss": 4.790317731688496, + "tokens_seen": 97313792 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004901504513540623, + "loss": 3.876, + "theoretical_loss": 4.789868870237527, + "tokens_seen": 97379328 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004901404212637913, + "loss": 3.8952, + "theoretical_loss": 4.789420395285619, + "tokens_seen": 97444864 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004901303911735206, + "loss": 3.8012, + "theoretical_loss": 4.78897230624037, + "tokens_seen": 97510400 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004901203610832497, + "loss": 3.8849, + "theoretical_loss": 4.788524602510683, + "tokens_seen": 97575936 + }, + { + "epoch": 1.01, + "learning_rate": 0.000490110330992979, + "loss": 3.936, + "theoretical_loss": 4.788077283506761, + "tokens_seen": 97641472 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004901003009027082, + "loss": 4.0009, + "theoretical_loss": 4.787630348640105, + "tokens_seen": 97707008 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004900902708124374, + "loss": 3.8527, + "theoretical_loss": 4.787183797323513, + "tokens_seen": 97772544 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004900802407221665, + "loss": 3.9784, + "theoretical_loss": 4.786737628971069, + "tokens_seen": 97838080 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004900702106318957, + "loss": 3.9346, + "theoretical_loss": 4.786291842998146, + "tokens_seen": 97903616 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004900601805416249, + "loss": 3.907, + "theoretical_loss": 4.785846438821402, + "tokens_seen": 97969152 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004900501504513541, + "loss": 3.8764, + "theoretical_loss": 4.785401415858771, + "tokens_seen": 98034688 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004900401203610833, + "loss": 3.9491, + "theoretical_loss": 4.784956773529464, + "tokens_seen": 98100224 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004900300902708124, + "loss": 3.9028, + "theoretical_loss": 4.784512511253965, + "tokens_seen": 98165760 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004900200601805416, + "loss": 3.8762, + "theoretical_loss": 4.784068628454026, + "tokens_seen": 98231296 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 278385, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.786750316619873, + "objective/train/theoretical_loss": 4.783625124552662, + "objective/train/tokens_used": 118756832, + "theoretical_loss": 4.783625124552662, + "tokens_seen": 98296832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004900100300902708, + "loss": 3.9435, + "theoretical_loss": 4.783625124552662, + "tokens_seen": 98296832 + }, + { + "epoch": 1.01, + "learning_rate": 0.00049, + "loss": 3.8627, + "theoretical_loss": 4.783181998974152, + "tokens_seen": 98362368 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004899899699097292, + "loss": 3.9375, + "theoretical_loss": 4.782739251144032, + "tokens_seen": 98427904 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004899799398194583, + "loss": 3.9008, + "theoretical_loss": 4.78229688048909, + "tokens_seen": 98493440 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004899699097291876, + "loss": 3.8599, + "theoretical_loss": 4.781854886437367, + "tokens_seen": 98558976 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004899598796389167, + "loss": 3.8549, + "theoretical_loss": 4.78141326841815, + "tokens_seen": 98624512 + }, + { + "epoch": 1.01, + "learning_rate": 0.000489949849548646, + "loss": 3.727, + "theoretical_loss": 4.780972025861967, + "tokens_seen": 98690048 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004899398194583751, + "loss": 3.9222, + "theoretical_loss": 4.78053115820059, + "tokens_seen": 98755584 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004899297893681044, + "loss": 3.9413, + "theoretical_loss": 4.780090664867027, + "tokens_seen": 98821120 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004899197592778335, + "loss": 3.8337, + "theoretical_loss": 4.779650545295514, + "tokens_seen": 98886656 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004899097291875627, + "loss": 3.9516, + "theoretical_loss": 4.77921079892152, + "tokens_seen": 98952192 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004898996990972919, + "loss": 3.9043, + "theoretical_loss": 4.778771425181741, + "tokens_seen": 99017728 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004898896690070211, + "loss": 3.9787, + "theoretical_loss": 4.778332423514094, + "tokens_seen": 99083264 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004898796389167503, + "loss": 3.7945, + "theoretical_loss": 4.777893793357713, + "tokens_seen": 99148800 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004898696088264794, + "loss": 3.748, + "theoretical_loss": 4.777455534152948, + "tokens_seen": 99214336 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004898595787362086, + "loss": 3.8625, + "theoretical_loss": 4.777017645341365, + "tokens_seen": 99279872 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004898495486459378, + "loss": 3.9995, + "theoretical_loss": 4.776580126365735, + "tokens_seen": 99345408 + }, + { + "epoch": 1.01, + "learning_rate": 0.000489839518555667, + "loss": 3.9043, + "theoretical_loss": 4.776142976670034, + "tokens_seen": 99410944 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004898294884653962, + "loss": 3.9163, + "theoretical_loss": 4.77570619569944, + "tokens_seen": 99476480 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004898194583751254, + "loss": 3.8828, + "theoretical_loss": 4.775269782900333, + "tokens_seen": 99542016 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004898094282848545, + "loss": 3.9991, + "theoretical_loss": 4.774833737720282, + "tokens_seen": 99607552 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004897993981945837, + "loss": 3.8591, + "theoretical_loss": 4.774398059608053, + "tokens_seen": 99673088 + }, + { + "epoch": 1.01, + "learning_rate": 0.000489789368104313, + "loss": 3.8361, + "theoretical_loss": 4.773962748013597, + "tokens_seen": 99738624 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004897793380140421, + "loss": 3.7105, + "theoretical_loss": 4.773527802388051, + "tokens_seen": 99804160 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004897693079237714, + "loss": 3.8674, + "theoretical_loss": 4.773093222183734, + "tokens_seen": 99869696 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 281325, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.977241277694702, + "objective/train/theoretical_loss": 4.772659006854143, + "objective/train/tokens_used": 120395232, + "theoretical_loss": 4.772659006854143, + "tokens_seen": 99935232 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004897592778335005, + "loss": 3.81, + "theoretical_loss": 4.772659006854143, + "tokens_seen": 99935232 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004897492477432297, + "loss": 3.945, + "theoretical_loss": 4.772225155853952, + "tokens_seen": 100000768 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004897392176529589, + "loss": 3.9172, + "theoretical_loss": 4.7717916686390005, + "tokens_seen": 100066304 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004897291875626881, + "loss": 3.8579, + "theoretical_loss": 4.771358544666305, + "tokens_seen": 100131840 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004897191574724173, + "loss": 3.8978, + "theoretical_loss": 4.770925783394041, + "tokens_seen": 100197376 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004897091273821465, + "loss": 3.8267, + "theoretical_loss": 4.770493384281547, + "tokens_seen": 100262912 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004896990972918756, + "loss": 3.8734, + "theoretical_loss": 4.770061346789323, + "tokens_seen": 100328448 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004896890672016048, + "loss": 3.908, + "theoretical_loss": 4.769629670379021, + "tokens_seen": 100393984 + }, + { + "epoch": 1.01, + "learning_rate": 0.000489679037111334, + "loss": 3.8749, + "theoretical_loss": 4.769198354513447, + "tokens_seen": 100459520 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004896690070210632, + "loss": 3.8629, + "theoretical_loss": 4.768767398656555, + "tokens_seen": 100525056 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004896589769307924, + "loss": 3.9093, + "theoretical_loss": 4.768336802273447, + "tokens_seen": 100590592 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004896489468405215, + "loss": 3.9369, + "theoretical_loss": 4.767906564830366, + "tokens_seen": 100656128 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004896389167502507, + "loss": 3.9189, + "theoretical_loss": 4.7674766857946915, + "tokens_seen": 100721664 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004896288866599799, + "loss": 3.7921, + "theoretical_loss": 4.767047164634944, + "tokens_seen": 100787200 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004896188565697091, + "loss": 3.937, + "theoretical_loss": 4.766618000820776, + "tokens_seen": 100852736 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004896088264794384, + "loss": 3.8925, + "theoretical_loss": 4.766189193822967, + "tokens_seen": 100918272 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004895987963891674, + "loss": 3.9295, + "theoretical_loss": 4.765760743113427, + "tokens_seen": 100983808 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004895887662988967, + "loss": 3.7712, + "theoretical_loss": 4.765332648165186, + "tokens_seen": 101049344 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004895787362086259, + "loss": 3.8931, + "theoretical_loss": 4.764904908452401, + "tokens_seen": 101114880 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004895687061183551, + "loss": 3.9138, + "theoretical_loss": 4.764477523450337, + "tokens_seen": 101180416 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004895586760280843, + "loss": 3.9605, + "theoretical_loss": 4.764050492635379, + "tokens_seen": 101245952 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004895486459378135, + "loss": 3.8636, + "theoretical_loss": 4.763623815485026, + "tokens_seen": 101311488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004895386158475426, + "loss": 3.9329, + "theoretical_loss": 4.763197491477879, + "tokens_seen": 101377024 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004895285857572718, + "loss": 3.8612, + "theoretical_loss": 4.762771520093645, + "tokens_seen": 101442560 + }, + { + "epoch": 1.01, + "learning_rate": 0.000489518555667001, + "loss": 3.8717, + "theoretical_loss": 4.7623459008131395, + "tokens_seen": 101508096 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 285136, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.874786853790283, + "objective/train/theoretical_loss": 4.761920633118269, + "objective/train/tokens_used": 122033632, + "theoretical_loss": 4.761920633118269, + "tokens_seen": 101573632 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004895085255767302, + "loss": 3.8225, + "theoretical_loss": 4.761920633118269, + "tokens_seen": 101573632 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894984954864594, + "loss": 3.9237, + "theoretical_loss": 4.76149571649204, + "tokens_seen": 101639168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894884653961885, + "loss": 3.8966, + "theoretical_loss": 4.761071150418552, + "tokens_seen": 101704704 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894784353059178, + "loss": 3.8936, + "theoretical_loss": 4.760646934382992, + "tokens_seen": 101770240 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894684052156469, + "loss": 3.9775, + "theoretical_loss": 4.760223067871638, + "tokens_seen": 101835776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894583751253762, + "loss": 4.0339, + "theoretical_loss": 4.759799550371848, + "tokens_seen": 101901312 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894483450351053, + "loss": 3.7347, + "theoretical_loss": 4.759376381372062, + "tokens_seen": 101966848 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894383149448346, + "loss": 3.9586, + "theoretical_loss": 4.758953560361801, + "tokens_seen": 102032384 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894282848545637, + "loss": 3.9412, + "theoretical_loss": 4.758531086831656, + "tokens_seen": 102097920 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894182547642929, + "loss": 3.8407, + "theoretical_loss": 4.758108960273294, + "tokens_seen": 102163456 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004894082246740221, + "loss": 3.8864, + "theoretical_loss": 4.757687180179449, + "tokens_seen": 102228992 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004893981945837513, + "loss": 3.8299, + "theoretical_loss": 4.757265746043922, + "tokens_seen": 102294528 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004893881644934805, + "loss": 3.9267, + "theoretical_loss": 4.756844657361577, + "tokens_seen": 102360064 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004893781344032096, + "loss": 3.9061, + "theoretical_loss": 4.756423913628339, + "tokens_seen": 102425600 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004893681043129388, + "loss": 3.9543, + "theoretical_loss": 4.75600351434119, + "tokens_seen": 102491136 + }, + { + "epoch": 1.01, + "learning_rate": 0.000489358074222668, + "loss": 3.8048, + "theoretical_loss": 4.755583458998167, + "tokens_seen": 102556672 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004893480441323972, + "loss": 3.8676, + "theoretical_loss": 4.755163747098358, + "tokens_seen": 102622208 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004893380140421264, + "loss": 3.8126, + "theoretical_loss": 4.754744378141899, + "tokens_seen": 102687744 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004893279839518556, + "loss": 3.8601, + "theoretical_loss": 4.754325351629975, + "tokens_seen": 102753280 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004893179538615847, + "loss": 3.7927, + "theoretical_loss": 4.753906667064809, + "tokens_seen": 102818816 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004893079237713139, + "loss": 3.9031, + "theoretical_loss": 4.753488323949671, + "tokens_seen": 102884352 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892978936810432, + "loss": 3.9104, + "theoretical_loss": 4.7530703217888615, + "tokens_seen": 102949888 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892878635907723, + "loss": 3.8452, + "theoretical_loss": 4.752652660087723, + "tokens_seen": 103015424 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892778335005016, + "loss": 3.8021, + "theoretical_loss": 4.75223533835262, + "tokens_seen": 103080960 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892678034102307, + "loss": 4.0007, + "theoretical_loss": 4.751818356090956, + "tokens_seen": 103146496 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 290051, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.75856876373291, + "objective/train/theoretical_loss": 4.751401712811155, + "objective/train/tokens_used": 123672032, + "theoretical_loss": 4.751401712811155, + "tokens_seen": 103212032 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892577733199599, + "loss": 3.8704, + "theoretical_loss": 4.751401712811155, + "tokens_seen": 103212032 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892477432296891, + "loss": 3.7788, + "theoretical_loss": 4.750985408022666, + "tokens_seen": 103277568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892377131394183, + "loss": 3.7416, + "theoretical_loss": 4.750569441235957, + "tokens_seen": 103343104 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892276830491475, + "loss": 3.9178, + "theoretical_loss": 4.750153811962516, + "tokens_seen": 103408640 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892176529588767, + "loss": 3.8593, + "theoretical_loss": 4.749738519714844, + "tokens_seen": 103474176 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004892076228686058, + "loss": 3.8182, + "theoretical_loss": 4.749323564006458, + "tokens_seen": 103539712 + }, + { + "epoch": 1.01, + "learning_rate": 0.000489197592778335, + "loss": 3.9466, + "theoretical_loss": 4.748908944351879, + "tokens_seen": 103605248 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004891875626880642, + "loss": 3.8848, + "theoretical_loss": 4.7484946602666405, + "tokens_seen": 103670784 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004891775325977934, + "loss": 3.7739, + "theoretical_loss": 4.748080711267275, + "tokens_seen": 103736320 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004891675025075226, + "loss": 3.8486, + "theoretical_loss": 4.7476670968713215, + "tokens_seen": 103801856 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004891574724172517, + "loss": 3.8552, + "theoretical_loss": 4.747253816597313, + "tokens_seen": 103867392 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004891474423269809, + "loss": 3.8403, + "theoretical_loss": 4.746840869964782, + "tokens_seen": 103932928 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004891374122367101, + "loss": 3.7124, + "theoretical_loss": 4.7464282564942515, + "tokens_seen": 103998464 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004891273821464393, + "loss": 3.7801, + "theoretical_loss": 4.7460159757072375, + "tokens_seen": 104064000 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004891173520561686, + "loss": 3.8051, + "theoretical_loss": 4.7456040271262445, + "tokens_seen": 104129536 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004891073219658976, + "loss": 3.7371, + "theoretical_loss": 4.745192410274759, + "tokens_seen": 104195072 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004890972918756269, + "loss": 3.8029, + "theoretical_loss": 4.744781124677252, + "tokens_seen": 104260608 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004890872617853561, + "loss": 3.9454, + "theoretical_loss": 4.744370169859176, + "tokens_seen": 104326144 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004890772316950853, + "loss": 3.9565, + "theoretical_loss": 4.743959545346958, + "tokens_seen": 104391680 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004890672016048145, + "loss": 3.8318, + "theoretical_loss": 4.743549250668002, + "tokens_seen": 104457216 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004890571715145437, + "loss": 3.9307, + "theoretical_loss": 4.743139285350684, + "tokens_seen": 104522752 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004890471414242728, + "loss": 3.956, + "theoretical_loss": 4.742729648924348, + "tokens_seen": 104588288 + }, + { + "epoch": 1.01, + "learning_rate": 0.000489037111334002, + "loss": 3.8876, + "theoretical_loss": 4.742320340919305, + "tokens_seen": 104653824 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004890270812437312, + "loss": 3.9146, + "theoretical_loss": 4.741911360866833, + "tokens_seen": 104719360 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004890170511534604, + "loss": 3.8901, + "theoretical_loss": 4.741502708299169, + "tokens_seen": 104784896 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 293028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.033762454986572, + "objective/train/theoretical_loss": 4.741094382749511, + "objective/train/tokens_used": 125310432, + "theoretical_loss": 4.741094382749511, + "tokens_seen": 104850432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004890070210631896, + "loss": 3.9855, + "theoretical_loss": 4.741094382749511, + "tokens_seen": 104850432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004889969909729187, + "loss": 3.6704, + "theoretical_loss": 4.740686383752012, + "tokens_seen": 104915968 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004889869608826479, + "loss": 3.8493, + "theoretical_loss": 4.74027871084178, + "tokens_seen": 104981504 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004889769307923771, + "loss": 3.811, + "theoretical_loss": 4.739871363554874, + "tokens_seen": 105047040 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004889669007021063, + "loss": 3.8792, + "theoretical_loss": 4.739464341428303, + "tokens_seen": 105112576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004889568706118355, + "loss": 3.8843, + "theoretical_loss": 4.739057644000022, + "tokens_seen": 105178112 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004889468405215647, + "loss": 3.8342, + "theoretical_loss": 4.73865127080893, + "tokens_seen": 105243648 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004889368104312939, + "loss": 3.9635, + "theoretical_loss": 4.738245221394866, + "tokens_seen": 105309184 + }, + { + "epoch": 1.01, + "learning_rate": 0.000488926780341023, + "loss": 3.844, + "theoretical_loss": 4.737839495298609, + "tokens_seen": 105374720 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004889167502507523, + "loss": 3.8157, + "theoretical_loss": 4.737434092061877, + "tokens_seen": 105440256 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004889067201604815, + "loss": 3.8693, + "theoretical_loss": 4.737029011227316, + "tokens_seen": 105505792 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004888966900702107, + "loss": 3.8775, + "theoretical_loss": 4.736624252338509, + "tokens_seen": 105571328 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004888866599799398, + "loss": 3.8937, + "theoretical_loss": 4.736219814939966, + "tokens_seen": 105636864 + }, + { + "epoch": 1.01, + "learning_rate": 0.000488876629889669, + "loss": 3.7476, + "theoretical_loss": 4.735815698577122, + "tokens_seen": 105702400 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004888665997993982, + "loss": 3.922, + "theoretical_loss": 4.735411902796338, + "tokens_seen": 105767936 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004888565697091274, + "loss": 3.8928, + "theoretical_loss": 4.735008427144896, + "tokens_seen": 105833472 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004888465396188566, + "loss": 3.9295, + "theoretical_loss": 4.734605271170997, + "tokens_seen": 105899008 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004888365095285858, + "loss": 3.9472, + "theoretical_loss": 4.734202434423759, + "tokens_seen": 105964544 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004888264794383149, + "loss": 3.8539, + "theoretical_loss": 4.733799916453214, + "tokens_seen": 106030080 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004888164493480441, + "loss": 3.8909, + "theoretical_loss": 4.733397716810306, + "tokens_seen": 106095616 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004888064192577733, + "loss": 3.9033, + "theoretical_loss": 4.732995835046888, + "tokens_seen": 106161152 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004887963891675025, + "loss": 3.8606, + "theoretical_loss": 4.732594270715722, + "tokens_seen": 106226688 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004887863590772317, + "loss": 3.947, + "theoretical_loss": 4.7321930233704705, + "tokens_seen": 106292224 + }, + { + "epoch": 1.01, + "learning_rate": 0.000488776328986961, + "loss": 4.004, + "theoretical_loss": 4.731792092565704, + "tokens_seen": 106357760 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048876629889669, + "loss": 3.8486, + "theoretical_loss": 4.731391477856888, + "tokens_seen": 106423296 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 297884, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9518229961395264, + "objective/train/theoretical_loss": 4.730991178800387, + "objective/train/tokens_used": 126948832, + "theoretical_loss": 4.730991178800387, + "tokens_seen": 106488832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004887562688064193, + "loss": 3.8906, + "theoretical_loss": 4.730991178800387, + "tokens_seen": 106488832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004887462387161484, + "loss": 3.8545, + "theoretical_loss": 4.730591194953464, + "tokens_seen": 106554368 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004887362086258777, + "loss": 3.8008, + "theoretical_loss": 4.730191525874266, + "tokens_seen": 106619904 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004887261785356069, + "loss": 3.8579, + "theoretical_loss": 4.729792171121844, + "tokens_seen": 106685440 + }, + { + "epoch": 1.01, + "learning_rate": 0.000488716148445336, + "loss": 3.9029, + "theoretical_loss": 4.729393130256124, + "tokens_seen": 106750976 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004887061183550652, + "loss": 3.8077, + "theoretical_loss": 4.728994402837924, + "tokens_seen": 106816512 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886960882647944, + "loss": 3.8239, + "theoretical_loss": 4.728595988428946, + "tokens_seen": 106882048 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886860581745236, + "loss": 3.7656, + "theoretical_loss": 4.728197886591771, + "tokens_seen": 106947584 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886760280842528, + "loss": 3.8995, + "theoretical_loss": 4.727800096889862, + "tokens_seen": 107013120 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886659979939819, + "loss": 3.7463, + "theoretical_loss": 4.727402618887552, + "tokens_seen": 107078656 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886559679037111, + "loss": 3.8088, + "theoretical_loss": 4.727005452150056, + "tokens_seen": 107144192 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886459378134403, + "loss": 3.9348, + "theoretical_loss": 4.726608596243455, + "tokens_seen": 107209728 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886359077231695, + "loss": 3.9517, + "theoretical_loss": 4.726212050734705, + "tokens_seen": 107275264 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886258776328988, + "loss": 3.9144, + "theoretical_loss": 4.725815815191624, + "tokens_seen": 107340800 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886158475426278, + "loss": 3.882, + "theoretical_loss": 4.7254198891828985, + "tokens_seen": 107406336 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004886058174523571, + "loss": 3.9286, + "theoretical_loss": 4.725024272278078, + "tokens_seen": 107471872 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004885957873620863, + "loss": 3.9461, + "theoretical_loss": 4.72462896404757, + "tokens_seen": 107537408 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004885857572718155, + "loss": 3.8237, + "theoretical_loss": 4.724233964062641, + "tokens_seen": 107602944 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004885757271815447, + "loss": 3.7508, + "theoretical_loss": 4.723839271895419, + "tokens_seen": 107668480 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004885656970912739, + "loss": 3.8195, + "theoretical_loss": 4.723444887118878, + "tokens_seen": 107734016 + }, + { + "epoch": 1.01, + "learning_rate": 0.000488555667001003, + "loss": 3.8222, + "theoretical_loss": 4.723050809306848, + "tokens_seen": 107799552 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004885456369107322, + "loss": 3.9041, + "theoretical_loss": 4.722657038034008, + "tokens_seen": 107865088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004885356068204614, + "loss": 3.8556, + "theoretical_loss": 4.7222635728758835, + "tokens_seen": 107930624 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004885255767301906, + "loss": 3.9022, + "theoretical_loss": 4.721870413408848, + "tokens_seen": 107996160 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004885155466399198, + "loss": 3.9476, + "theoretical_loss": 4.721477559210112, + "tokens_seen": 108061696 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 300770, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7595181465148926, + "objective/train/theoretical_loss": 4.721085009857732, + "objective/train/tokens_used": 128587232, + "theoretical_loss": 4.721085009857732, + "tokens_seen": 108127232 + }, + { + "epoch": 1.01, + "learning_rate": 0.000488505516549649, + "loss": 3.9288, + "theoretical_loss": 4.721085009857732, + "tokens_seen": 108127232 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884954864593781, + "loss": 3.8169, + "theoretical_loss": 4.7206927649306, + "tokens_seen": 108192768 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884854563691073, + "loss": 3.7995, + "theoretical_loss": 4.720300824008448, + "tokens_seen": 108258304 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884754262788365, + "loss": 3.7927, + "theoretical_loss": 4.7199091866718375, + "tokens_seen": 108323840 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884653961885657, + "loss": 3.9595, + "theoretical_loss": 4.719517852502165, + "tokens_seen": 108389376 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884553660982949, + "loss": 3.859, + "theoretical_loss": 4.719126821081655, + "tokens_seen": 108454912 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884453360080241, + "loss": 3.8458, + "theoretical_loss": 4.718736091993362, + "tokens_seen": 108520448 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884353059177532, + "loss": 3.8162, + "theoretical_loss": 4.718345664821163, + "tokens_seen": 108585984 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884252758274825, + "loss": 3.8777, + "theoretical_loss": 4.717955539149763, + "tokens_seen": 108651520 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884152457372117, + "loss": 3.8284, + "theoretical_loss": 4.717565714564683, + "tokens_seen": 108717056 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004884052156469409, + "loss": 3.8332, + "theoretical_loss": 4.717176190652268, + "tokens_seen": 108782592 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048839518555667, + "loss": 3.7666, + "theoretical_loss": 4.716786966999676, + "tokens_seen": 108848128 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004883851554663992, + "loss": 3.8795, + "theoretical_loss": 4.716398043194881, + "tokens_seen": 108913664 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004883751253761284, + "loss": 3.915, + "theoretical_loss": 4.716009418826674, + "tokens_seen": 108979200 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004883650952858576, + "loss": 3.8759, + "theoretical_loss": 4.715621093484652, + "tokens_seen": 109044736 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004883550651955868, + "loss": 3.877, + "theoretical_loss": 4.71523306675922, + "tokens_seen": 109110272 + }, + { + "epoch": 1.01, + "learning_rate": 0.000488345035105316, + "loss": 3.7772, + "theoretical_loss": 4.714845338241595, + "tokens_seen": 109175808 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004883350050150451, + "loss": 3.9056, + "theoretical_loss": 4.714457907523794, + "tokens_seen": 109241344 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004883249749247743, + "loss": 3.825, + "theoretical_loss": 4.714070774198639, + "tokens_seen": 109306880 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004883149448345035, + "loss": 3.7294, + "theoretical_loss": 4.713683937859749, + "tokens_seen": 109372416 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004883049147442327, + "loss": 3.8612, + "theoretical_loss": 4.713297398101545, + "tokens_seen": 109437952 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004882948846539619, + "loss": 3.8987, + "theoretical_loss": 4.712911154519244, + "tokens_seen": 109503488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004882848545636911, + "loss": 3.9021, + "theoretical_loss": 4.712525206708854, + "tokens_seen": 109569024 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004882748244734203, + "loss": 3.7764, + "theoretical_loss": 4.71213955426718, + "tokens_seen": 109634560 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048826479438314946, + "loss": 3.8949, + "theoretical_loss": 4.711754196791814, + "tokens_seen": 109700096 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 304518, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7989871501922607, + "objective/train/theoretical_loss": 4.711369133881137, + "objective/train/tokens_used": 130225632, + "theoretical_loss": 4.711369133881137, + "tokens_seen": 109765632 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048825476429287864, + "loss": 3.8576, + "theoretical_loss": 4.711369133881137, + "tokens_seen": 109765632 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004882447342026078, + "loss": 3.8614, + "theoretical_loss": 4.710984365134315, + "tokens_seen": 109831168 + }, + { + "epoch": 1.01, + "learning_rate": 0.000488234704112337, + "loss": 3.7865, + "theoretical_loss": 4.7105998901513, + "tokens_seen": 109896704 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048822467402206624, + "loss": 3.8341, + "theoretical_loss": 4.710215708532828, + "tokens_seen": 109962240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048821464393179536, + "loss": 3.7732, + "theoretical_loss": 4.70983181988041, + "tokens_seen": 110027776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004882046138415246, + "loss": 3.8397, + "theoretical_loss": 4.70944822379634, + "tokens_seen": 110093312 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004881945837512537, + "loss": 3.8024, + "theoretical_loss": 4.709064919883685, + "tokens_seen": 110158848 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048818455366098296, + "loss": 3.955, + "theoretical_loss": 4.708681907746288, + "tokens_seen": 110224384 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048817452357071214, + "loss": 3.8467, + "theoretical_loss": 4.708299186988764, + "tokens_seen": 110289920 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004881644934804413, + "loss": 3.9468, + "theoretical_loss": 4.7079167572164975, + "tokens_seen": 110355456 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004881544633901705, + "loss": 3.6804, + "theoretical_loss": 4.707534618035643, + "tokens_seen": 110420992 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048814443329989974, + "loss": 3.7062, + "theoretical_loss": 4.7071527690531205, + "tokens_seen": 110486528 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048813440320962887, + "loss": 3.8513, + "theoretical_loss": 4.706771209876613, + "tokens_seen": 110552064 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004881243731193581, + "loss": 3.9044, + "theoretical_loss": 4.706389940114569, + "tokens_seen": 110617600 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048811434302908723, + "loss": 3.743, + "theoretical_loss": 4.706008959376195, + "tokens_seen": 110683136 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048810431293881646, + "loss": 3.9563, + "theoretical_loss": 4.705628267271457, + "tokens_seen": 110748672 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048809428284854564, + "loss": 3.8371, + "theoretical_loss": 4.7052478634110795, + "tokens_seen": 110814208 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004880842527582748, + "loss": 3.77, + "theoretical_loss": 4.7048677474065395, + "tokens_seen": 110879744 + }, + { + "epoch": 1.01, + "learning_rate": 0.000488074222668004, + "loss": 3.8789, + "theoretical_loss": 4.7044879188700675, + "tokens_seen": 110945280 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004880641925777332, + "loss": 3.8903, + "theoretical_loss": 4.704108377414645, + "tokens_seen": 111010816 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048805416248746237, + "loss": 3.9428, + "theoretical_loss": 4.7037291226540034, + "tokens_seen": 111076352 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004880441323971916, + "loss": 3.7705, + "theoretical_loss": 4.703350154202621, + "tokens_seen": 111141888 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048803410230692073, + "loss": 3.8648, + "theoretical_loss": 4.702971471675721, + "tokens_seen": 111207424 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048802407221664997, + "loss": 3.9187, + "theoretical_loss": 4.70259307468927, + "tokens_seen": 111272960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048801404212637915, + "loss": 3.8708, + "theoretical_loss": 4.702214962859978, + "tokens_seen": 111338496 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 309328, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8928475379943848, + "objective/train/theoretical_loss": 4.701837135805292, + "objective/train/tokens_used": 131864032, + "theoretical_loss": 4.701837135805292, + "tokens_seen": 111404032 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048800401203610833, + "loss": 3.8353, + "theoretical_loss": 4.701837135805292, + "tokens_seen": 111404032 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004879939819458375, + "loss": 3.7706, + "theoretical_loss": 4.701459593143397, + "tokens_seen": 111469568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004879839518555667, + "loss": 3.8086, + "theoretical_loss": 4.701082334493217, + "tokens_seen": 111535104 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004879739217652959, + "loss": 3.9351, + "theoretical_loss": 4.700705359474409, + "tokens_seen": 111600640 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004879638916750251, + "loss": 3.891, + "theoretical_loss": 4.700328667707359, + "tokens_seen": 111666176 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004879538615847543, + "loss": 3.7482, + "theoretical_loss": 4.6999522588131875, + "tokens_seen": 111731712 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048794383149448347, + "loss": 3.828, + "theoretical_loss": 4.699576132413743, + "tokens_seen": 111797248 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048793380140421265, + "loss": 3.7973, + "theoretical_loss": 4.6992002881316, + "tokens_seen": 111862784 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048792377131394183, + "loss": 3.8156, + "theoretical_loss": 4.698824725590056, + "tokens_seen": 111928320 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048791374122367107, + "loss": 3.7747, + "theoretical_loss": 4.698449444413137, + "tokens_seen": 111993856 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004879037111334002, + "loss": 3.8534, + "theoretical_loss": 4.698074444225584, + "tokens_seen": 112059392 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048789368104312943, + "loss": 3.8266, + "theoretical_loss": 4.697699724652862, + "tokens_seen": 112124928 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048788365095285856, + "loss": 3.7805, + "theoretical_loss": 4.6973252853211545, + "tokens_seen": 112190464 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004878736208625878, + "loss": 3.9046, + "theoretical_loss": 4.696951125857355, + "tokens_seen": 112256000 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048786359077231697, + "loss": 3.6987, + "theoretical_loss": 4.696577245889079, + "tokens_seen": 112321536 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048785356068204615, + "loss": 3.7059, + "theoretical_loss": 4.696203645044648, + "tokens_seen": 112387072 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048784353059177533, + "loss": 3.8154, + "theoretical_loss": 4.695830322953098, + "tokens_seen": 112452608 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048783350050150457, + "loss": 3.8812, + "theoretical_loss": 4.69545727924417, + "tokens_seen": 112518144 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004878234704112337, + "loss": 3.883, + "theoretical_loss": 4.695084513548318, + "tokens_seen": 112583680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048781344032096293, + "loss": 3.6161, + "theoretical_loss": 4.694712025496696, + "tokens_seen": 112649216 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048780341023069206, + "loss": 3.8544, + "theoretical_loss": 4.694339814721165, + "tokens_seen": 112714752 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004877933801404213, + "loss": 3.785, + "theoretical_loss": 4.6939678808542835, + "tokens_seen": 112780288 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004877833500501505, + "loss": 3.8565, + "theoretical_loss": 4.693596223529315, + "tokens_seen": 112845824 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048777331995987966, + "loss": 3.8563, + "theoretical_loss": 4.69322484238022, + "tokens_seen": 112911360 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048776328986960884, + "loss": 3.9167, + "theoretical_loss": 4.692853737041652, + "tokens_seen": 112976896 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 312532, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.829629421234131, + "objective/train/theoretical_loss": 4.692482907148964, + "objective/train/tokens_used": 133502432, + "theoretical_loss": 4.692482907148964, + "tokens_seen": 113042432 + }, + { + "epoch": 1.01, + "learning_rate": 0.000487753259779338, + "loss": 3.856, + "theoretical_loss": 4.692482907148964, + "tokens_seen": 113042432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004877432296890672, + "loss": 3.9176, + "theoretical_loss": 4.692112352338201, + "tokens_seen": 113107968 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048773319959879644, + "loss": 3.9561, + "theoretical_loss": 4.691742072246098, + "tokens_seen": 113173504 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048772316950852556, + "loss": 3.8348, + "theoretical_loss": 4.691372066510079, + "tokens_seen": 113239040 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004877131394182548, + "loss": 3.8252, + "theoretical_loss": 4.69100233476826, + "tokens_seen": 113304576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004877031093279839, + "loss": 3.8764, + "theoretical_loss": 4.690632876659439, + "tokens_seen": 113370112 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048769307923771316, + "loss": 3.8223, + "theoretical_loss": 4.690263691823102, + "tokens_seen": 113435648 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048768304914744234, + "loss": 3.8624, + "theoretical_loss": 4.689894779899414, + "tokens_seen": 113501184 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004876730190571715, + "loss": 3.7, + "theoretical_loss": 4.689526140529226, + "tokens_seen": 113566720 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004876629889669007, + "loss": 3.7971, + "theoretical_loss": 4.689157773354065, + "tokens_seen": 113632256 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048765295887662994, + "loss": 3.6977, + "theoretical_loss": 4.688789678016136, + "tokens_seen": 113697792 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048764292878635907, + "loss": 3.7481, + "theoretical_loss": 4.688421854158324, + "tokens_seen": 113763328 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004876328986960883, + "loss": 3.8044, + "theoretical_loss": 4.688054301424183, + "tokens_seen": 113828864 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048762286860581743, + "loss": 3.7587, + "theoretical_loss": 4.687687019457942, + "tokens_seen": 113894400 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048761283851554666, + "loss": 3.8538, + "theoretical_loss": 4.687320007904505, + "tokens_seen": 113959936 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048760280842527584, + "loss": 3.688, + "theoretical_loss": 4.686953266409439, + "tokens_seen": 114025472 + }, + { + "epoch": 1.01, + "learning_rate": 0.000487592778335005, + "loss": 3.8959, + "theoretical_loss": 4.686586794618986, + "tokens_seen": 114091008 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004875827482447342, + "loss": 3.8348, + "theoretical_loss": 4.686220592180047, + "tokens_seen": 114156544 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004875727181544634, + "loss": 3.8824, + "theoretical_loss": 4.685854658740192, + "tokens_seen": 114222080 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048756268806419257, + "loss": 3.8689, + "theoretical_loss": 4.685488993947655, + "tokens_seen": 114287616 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004875526579739218, + "loss": 3.8646, + "theoretical_loss": 4.685123597451328, + "tokens_seen": 114353152 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048754262788365093, + "loss": 3.8994, + "theoretical_loss": 4.684758468900765, + "tokens_seen": 114418688 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048753259779338017, + "loss": 3.8168, + "theoretical_loss": 4.684393607946177, + "tokens_seen": 114484224 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048752256770310935, + "loss": 3.857, + "theoretical_loss": 4.6840290142384315, + "tokens_seen": 114549760 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048751253761283853, + "loss": 3.8914, + "theoretical_loss": 4.683664687429053, + "tokens_seen": 114615296 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 317161, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7360217571258545, + "objective/train/theoretical_loss": 4.683300627170215, + "objective/train/tokens_used": 135140832, + "theoretical_loss": 4.683300627170215, + "tokens_seen": 114680832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004875025075225677, + "loss": 3.9052, + "theoretical_loss": 4.683300627170215, + "tokens_seen": 114680832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004874924774322969, + "loss": 3.9026, + "theoretical_loss": 4.6829368331147485, + "tokens_seen": 114746368 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048748244734202607, + "loss": 3.6596, + "theoretical_loss": 4.682573304916131, + "tokens_seen": 114811904 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004874724172517553, + "loss": 3.8071, + "theoretical_loss": 4.682210042228488, + "tokens_seen": 114877440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048746238716148443, + "loss": 3.8493, + "theoretical_loss": 4.681847044706593, + "tokens_seen": 114942976 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048745235707121367, + "loss": 3.8429, + "theoretical_loss": 4.681484312005866, + "tokens_seen": 115008512 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004874423269809428, + "loss": 3.9535, + "theoretical_loss": 4.681121843782369, + "tokens_seen": 115074048 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048743229689067203, + "loss": 3.6955, + "theoretical_loss": 4.680759639692808, + "tokens_seen": 115139584 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004874222668004012, + "loss": 3.8031, + "theoretical_loss": 4.680397699394528, + "tokens_seen": 115205120 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004874122367101304, + "loss": 3.74, + "theoretical_loss": 4.680036022545514, + "tokens_seen": 115270656 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004874022066198596, + "loss": 3.7374, + "theoretical_loss": 4.679674608804389, + "tokens_seen": 115336192 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048739217652958876, + "loss": 3.8434, + "theoretical_loss": 4.679313457830409, + "tokens_seen": 115401728 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048738214643931794, + "loss": 3.8561, + "theoretical_loss": 4.67895256928347, + "tokens_seen": 115467264 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048737211634904717, + "loss": 3.8852, + "theoretical_loss": 4.678591942824095, + "tokens_seen": 115532800 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004873620862587763, + "loss": 3.7778, + "theoretical_loss": 4.678231578113444, + "tokens_seen": 115598336 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048735205616850553, + "loss": 3.9136, + "theoretical_loss": 4.677871474813302, + "tokens_seen": 115663872 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004873420260782347, + "loss": 3.7162, + "theoretical_loss": 4.677511632586086, + "tokens_seen": 115729408 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004873319959879639, + "loss": 3.8423, + "theoretical_loss": 4.6771520510948354, + "tokens_seen": 115794944 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004873219658976931, + "loss": 3.7936, + "theoretical_loss": 4.67679273000322, + "tokens_seen": 115860480 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048731193580742226, + "loss": 3.897, + "theoretical_loss": 4.676433668975531, + "tokens_seen": 115926016 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048730190571715144, + "loss": 3.6542, + "theoretical_loss": 4.6760748676766815, + "tokens_seen": 115991552 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004872918756268807, + "loss": 3.9173, + "theoretical_loss": 4.675716325772205, + "tokens_seen": 116057088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004872818455366098, + "loss": 3.7968, + "theoretical_loss": 4.675358042928257, + "tokens_seen": 116122624 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048727181544633904, + "loss": 3.8134, + "theoretical_loss": 4.675000018811605, + "tokens_seen": 116188160 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048726178535606816, + "loss": 3.7287, + "theoretical_loss": 4.674642253089642, + "tokens_seen": 116253696 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 320427, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6847989559173584, + "objective/train/theoretical_loss": 4.674284745430366, + "objective/train/tokens_used": 136779232, + "theoretical_loss": 4.674284745430366, + "tokens_seen": 116319232 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004872517552657974, + "loss": 3.5945, + "theoretical_loss": 4.674284745430366, + "tokens_seen": 116319232 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004872417251755266, + "loss": 3.8892, + "theoretical_loss": 4.6739274955023955, + "tokens_seen": 116384768 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048723169508525576, + "loss": 3.8516, + "theoretical_loss": 4.673570502974956, + "tokens_seen": 116450304 + }, + { + "epoch": 1.01, + "learning_rate": 0.000487221664994985, + "loss": 3.8006, + "theoretical_loss": 4.673213767517886, + "tokens_seen": 116515840 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004872116349047141, + "loss": 3.8078, + "theoretical_loss": 4.672857288801632, + "tokens_seen": 116581376 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048720160481444336, + "loss": 3.8284, + "theoretical_loss": 4.67250106649725, + "tokens_seen": 116646912 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048719157472417254, + "loss": 3.9574, + "theoretical_loss": 4.672145100276398, + "tokens_seen": 116712448 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004871815446339017, + "loss": 3.8029, + "theoretical_loss": 4.671789389811342, + "tokens_seen": 116777984 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004871715145436309, + "loss": 3.8268, + "theoretical_loss": 4.671433934774949, + "tokens_seen": 116843520 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048716148445336014, + "loss": 3.7836, + "theoretical_loss": 4.671078734840689, + "tokens_seen": 116909056 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048715145436308927, + "loss": 3.7414, + "theoretical_loss": 4.670723789682631, + "tokens_seen": 116974592 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004871414242728185, + "loss": 3.8888, + "theoretical_loss": 4.670369098975444, + "tokens_seen": 117040128 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048713139418254763, + "loss": 3.8059, + "theoretical_loss": 4.670014662394392, + "tokens_seen": 117105664 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048712136409227686, + "loss": 3.7662, + "theoretical_loss": 4.669660479615338, + "tokens_seen": 117171200 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048711133400200604, + "loss": 3.8567, + "theoretical_loss": 4.6693065503147375, + "tokens_seen": 117236736 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004871013039117352, + "loss": 3.7476, + "theoretical_loss": 4.668952874169639, + "tokens_seen": 117302272 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004870912738214644, + "loss": 3.7042, + "theoretical_loss": 4.668599450857684, + "tokens_seen": 117367808 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004870812437311936, + "loss": 3.8124, + "theoretical_loss": 4.668246280057101, + "tokens_seen": 117433344 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048707121364092277, + "loss": 3.8326, + "theoretical_loss": 4.667893361446712, + "tokens_seen": 117498880 + }, + { + "epoch": 1.01, + "learning_rate": 0.000487061183550652, + "loss": 3.7307, + "theoretical_loss": 4.667540694705922, + "tokens_seen": 117564416 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048705115346038113, + "loss": 3.8212, + "theoretical_loss": 4.6671882795147255, + "tokens_seen": 117629952 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048704112337011037, + "loss": 3.832, + "theoretical_loss": 4.666836115553697, + "tokens_seen": 117695488 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048703109327983955, + "loss": 3.7178, + "theoretical_loss": 4.666484202504001, + "tokens_seen": 117761024 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048702106318956873, + "loss": 3.8385, + "theoretical_loss": 4.666132540047376, + "tokens_seen": 117826560 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004870110330992979, + "loss": 3.7573, + "theoretical_loss": 4.665781127866147, + "tokens_seen": 117892096 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 324214, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.768333911895752, + "objective/train/theoretical_loss": 4.665429965643216, + "objective/train/tokens_used": 138417632, + "theoretical_loss": 4.665429965643216, + "tokens_seen": 117957632 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004870010030090271, + "loss": 3.8131, + "theoretical_loss": 4.665429965643216, + "tokens_seen": 117957632 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048699097291875627, + "loss": 3.797, + "theoretical_loss": 4.665079053062062, + "tokens_seen": 118023168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004869809428284855, + "loss": 3.8027, + "theoretical_loss": 4.664728389806739, + "tokens_seen": 118088704 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048697091273821463, + "loss": 3.799, + "theoretical_loss": 4.66437797556188, + "tokens_seen": 118154240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048696088264794387, + "loss": 3.6595, + "theoretical_loss": 4.664027810012689, + "tokens_seen": 118219776 + }, + { + "epoch": 1.01, + "learning_rate": 0.000486950852557673, + "loss": 3.8912, + "theoretical_loss": 4.66367789284494, + "tokens_seen": 118285312 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048694082246740223, + "loss": 3.8459, + "theoretical_loss": 4.663328223744983, + "tokens_seen": 118350848 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004869307923771314, + "loss": 3.8515, + "theoretical_loss": 4.662978802399733, + "tokens_seen": 118416384 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004869207622868606, + "loss": 3.8705, + "theoretical_loss": 4.662629628496676, + "tokens_seen": 118481920 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004869107321965898, + "loss": 3.7818, + "theoretical_loss": 4.662280701723862, + "tokens_seen": 118547456 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048690070210631896, + "loss": 3.7876, + "theoretical_loss": 4.6619320217699105, + "tokens_seen": 118612992 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048689067201604814, + "loss": 3.7476, + "theoretical_loss": 4.661583588324, + "tokens_seen": 118678528 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048688064192577737, + "loss": 3.7923, + "theoretical_loss": 4.661235401075876, + "tokens_seen": 118744064 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004868706118355065, + "loss": 3.7882, + "theoretical_loss": 4.660887459715845, + "tokens_seen": 118809600 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048686058174523573, + "loss": 3.8378, + "theoretical_loss": 4.660539763934769, + "tokens_seen": 118875136 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004868505516549649, + "loss": 3.8098, + "theoretical_loss": 4.660192313424075, + "tokens_seen": 118940672 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004868405215646941, + "loss": 3.8788, + "theoretical_loss": 4.659845107875744, + "tokens_seen": 119006208 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004868304914744233, + "loss": 3.9372, + "theoretical_loss": 4.659498146982314, + "tokens_seen": 119071744 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048682046138415246, + "loss": 3.646, + "theoretical_loss": 4.659151430436877, + "tokens_seen": 119137280 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048681043129388164, + "loss": 3.8347, + "theoretical_loss": 4.658804957933081, + "tokens_seen": 119202816 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004868004012036109, + "loss": 3.8488, + "theoretical_loss": 4.658458729165124, + "tokens_seen": 119268352 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048679037111334, + "loss": 3.825, + "theoretical_loss": 4.658112743827756, + "tokens_seen": 119333888 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048678034102306924, + "loss": 3.8346, + "theoretical_loss": 4.657767001616275, + "tokens_seen": 119399424 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048677031093279836, + "loss": 3.8016, + "theoretical_loss": 4.657421502226529, + "tokens_seen": 119464960 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004867602808425276, + "loss": 3.9088, + "theoretical_loss": 4.657076245354913, + "tokens_seen": 119530496 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 328717, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.834892749786377, + "objective/train/theoretical_loss": 4.656731230698368, + "objective/train/tokens_used": 140056032, + "theoretical_loss": 4.656731230698368, + "tokens_seen": 119596032 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004867502507522568, + "loss": 3.8182, + "theoretical_loss": 4.656731230698368, + "tokens_seen": 119596032 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048674022066198596, + "loss": 3.7902, + "theoretical_loss": 4.656386457954378, + "tokens_seen": 119661568 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048673019057171514, + "loss": 3.9269, + "theoretical_loss": 4.656041926820972, + "tokens_seen": 119727104 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004867201604814443, + "loss": 3.9256, + "theoretical_loss": 4.655697636996719, + "tokens_seen": 119792640 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004867101303911735, + "loss": 3.7332, + "theoretical_loss": 4.655353588180732, + "tokens_seen": 119858176 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048670010030090274, + "loss": 3.8534, + "theoretical_loss": 4.6550097800726595, + "tokens_seen": 119923712 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048669007021063187, + "loss": 3.8307, + "theoretical_loss": 4.654666212372691, + "tokens_seen": 119989248 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004866800401203611, + "loss": 3.8143, + "theoretical_loss": 4.654322884781552, + "tokens_seen": 120054784 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004866700100300903, + "loss": 3.8246, + "theoretical_loss": 4.653979797000502, + "tokens_seen": 120120320 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048665997993981947, + "loss": 3.8461, + "theoretical_loss": 4.6536369487313385, + "tokens_seen": 120185856 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048664994984954865, + "loss": 3.8496, + "theoretical_loss": 4.653294339676389, + "tokens_seen": 120251392 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048663991975927783, + "loss": 3.8826, + "theoretical_loss": 4.652951969538515, + "tokens_seen": 120316928 + }, + { + "epoch": 1.01, + "learning_rate": 0.000486629889669007, + "loss": 3.9084, + "theoretical_loss": 4.652609838021108, + "tokens_seen": 120382464 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048661985957873624, + "loss": 3.8277, + "theoretical_loss": 4.652267944828087, + "tokens_seen": 120448000 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048660982948846537, + "loss": 3.7849, + "theoretical_loss": 4.651926289663903, + "tokens_seen": 120513536 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004865997993981946, + "loss": 3.7606, + "theoretical_loss": 4.65158487223353, + "tokens_seen": 120579072 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048658976930792373, + "loss": 3.904, + "theoretical_loss": 4.651243692242472, + "tokens_seen": 120644608 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048657973921765297, + "loss": 3.8381, + "theoretical_loss": 4.650902749396753, + "tokens_seen": 120710144 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048656970912738215, + "loss": 3.7665, + "theoretical_loss": 4.650562043402925, + "tokens_seen": 120775680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048655967903711133, + "loss": 3.8898, + "theoretical_loss": 4.6502215739680555, + "tokens_seen": 120841216 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004865496489468405, + "loss": 3.7905, + "theoretical_loss": 4.649881340799743, + "tokens_seen": 120906752 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048653961885656975, + "loss": 3.8524, + "theoretical_loss": 4.649541343606094, + "tokens_seen": 120972288 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004865295887662989, + "loss": 3.9006, + "theoretical_loss": 4.649201582095744, + "tokens_seen": 121037824 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004865195586760281, + "loss": 3.8063, + "theoretical_loss": 4.648862055977837, + "tokens_seen": 121103360 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048650952858575724, + "loss": 3.8215, + "theoretical_loss": 4.648522764962039, + "tokens_seen": 121168896 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 331852, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7816104888916016, + "objective/train/theoretical_loss": 4.6481837087585305, + "objective/train/tokens_used": 141694432, + "theoretical_loss": 4.6481837087585305, + "tokens_seen": 121234432 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048649949849548647, + "loss": 3.7711, + "theoretical_loss": 4.6481837087585305, + "tokens_seen": 121234432 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048648946840521565, + "loss": 3.7303, + "theoretical_loss": 4.647844887078001, + "tokens_seen": 121299968 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048647943831494483, + "loss": 3.8228, + "theoretical_loss": 4.647506299631658, + "tokens_seen": 121365504 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048646940822467407, + "loss": 3.8507, + "theoretical_loss": 4.647167946131217, + "tokens_seen": 121431040 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004864593781344032, + "loss": 3.837, + "theoretical_loss": 4.646829826288904, + "tokens_seen": 121496576 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048644934804413243, + "loss": 3.7749, + "theoretical_loss": 4.646491939817458, + "tokens_seen": 121562112 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004864393179538616, + "loss": 3.8541, + "theoretical_loss": 4.646154286430118, + "tokens_seen": 121627648 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004864292878635908, + "loss": 3.8228, + "theoretical_loss": 4.645816865840637, + "tokens_seen": 121693184 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048641925777332, + "loss": 3.8611, + "theoretical_loss": 4.645479677763269, + "tokens_seen": 121758720 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048640922768304916, + "loss": 3.8827, + "theoretical_loss": 4.645142721912775, + "tokens_seen": 121824256 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048639919759277834, + "loss": 3.8512, + "theoretical_loss": 4.644805998004418, + "tokens_seen": 121889792 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048638916750250757, + "loss": 3.7428, + "theoretical_loss": 4.6444695057539604, + "tokens_seen": 121955328 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004863791374122367, + "loss": 3.6853, + "theoretical_loss": 4.644133244877671, + "tokens_seen": 122020864 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048636910732196593, + "loss": 3.8334, + "theoretical_loss": 4.643797215092313, + "tokens_seen": 122086400 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004863590772316951, + "loss": 3.8216, + "theoretical_loss": 4.643461416115152, + "tokens_seen": 122151936 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004863490471414243, + "loss": 3.8532, + "theoretical_loss": 4.643125847663949, + "tokens_seen": 122217472 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004863390170511535, + "loss": 3.724, + "theoretical_loss": 4.64279050945696, + "tokens_seen": 122283008 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048632898696088266, + "loss": 3.8188, + "theoretical_loss": 4.642455401212939, + "tokens_seen": 122348544 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048631895687061184, + "loss": 3.8263, + "theoretical_loss": 4.642120522651133, + "tokens_seen": 122414080 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004863089267803411, + "loss": 3.8094, + "theoretical_loss": 4.64178587349128, + "tokens_seen": 122479616 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004862988966900702, + "loss": 3.8884, + "theoretical_loss": 4.6414514534536115, + "tokens_seen": 122545152 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048628886659979944, + "loss": 3.869, + "theoretical_loss": 4.641117262258851, + "tokens_seen": 122610688 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048627883650952857, + "loss": 3.7495, + "theoretical_loss": 4.640783299628206, + "tokens_seen": 122676224 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004862688064192578, + "loss": 3.9174, + "theoretical_loss": 4.6404495652833795, + "tokens_seen": 122741760 + }, + { + "epoch": 1.01, + "learning_rate": 0.000486258776328987, + "loss": 3.7989, + "theoretical_loss": 4.640116058946557, + "tokens_seen": 122807296 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 334859, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6581387519836426, + "objective/train/theoretical_loss": 4.63978278034041, + "objective/train/tokens_used": 143332832, + "theoretical_loss": 4.63978278034041, + "tokens_seen": 122872832 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048624874623871616, + "loss": 3.7498, + "theoretical_loss": 4.63978278034041, + "tokens_seen": 122872832 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048623871614844534, + "loss": 3.7022, + "theoretical_loss": 4.639449729188099, + "tokens_seen": 122938368 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004862286860581745, + "loss": 3.8196, + "theoretical_loss": 4.639116905213264, + "tokens_seen": 123003904 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004862186559679037, + "loss": 3.7981, + "theoretical_loss": 4.638784308140028, + "tokens_seen": 123069440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048620862587763294, + "loss": 3.7704, + "theoretical_loss": 4.638451937693001, + "tokens_seen": 123134976 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048619859578736207, + "loss": 3.7793, + "theoretical_loss": 4.638119793597268, + "tokens_seen": 123200512 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004861885656970913, + "loss": 3.6678, + "theoretical_loss": 4.637787875578393, + "tokens_seen": 123266048 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004861785356068205, + "loss": 3.782, + "theoretical_loss": 4.637456183362425, + "tokens_seen": 123331584 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048616850551654967, + "loss": 3.8283, + "theoretical_loss": 4.637124716675883, + "tokens_seen": 123397120 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048615847542627885, + "loss": 3.8371, + "theoretical_loss": 4.636793475245765, + "tokens_seen": 123462656 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048614844533600803, + "loss": 3.7563, + "theoretical_loss": 4.6364624587995475, + "tokens_seen": 123528192 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004861384152457372, + "loss": 3.742, + "theoretical_loss": 4.636131667065175, + "tokens_seen": 123593728 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048612838515546644, + "loss": 3.7707, + "theoretical_loss": 4.635801099771069, + "tokens_seen": 123659264 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048611835506519557, + "loss": 3.7768, + "theoretical_loss": 4.635470756646122, + "tokens_seen": 123724800 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004861083249749248, + "loss": 3.84, + "theoretical_loss": 4.635140637419697, + "tokens_seen": 123790336 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048609829488465393, + "loss": 3.8001, + "theoretical_loss": 4.634810741821628, + "tokens_seen": 123855872 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048608826479438317, + "loss": 3.796, + "theoretical_loss": 4.634481069582216, + "tokens_seen": 123921408 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048607823470411235, + "loss": 3.7413, + "theoretical_loss": 4.63415162043223, + "tokens_seen": 123986944 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048606820461384153, + "loss": 3.7611, + "theoretical_loss": 4.633822394102907, + "tokens_seen": 124052480 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004860581745235707, + "loss": 3.8469, + "theoretical_loss": 4.633493390325949, + "tokens_seen": 124118016 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048604814443329995, + "loss": 3.7448, + "theoretical_loss": 4.633164608833521, + "tokens_seen": 124183552 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004860381143430291, + "loss": 3.7905, + "theoretical_loss": 4.632836049358254, + "tokens_seen": 124249088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004860280842527583, + "loss": 3.692, + "theoretical_loss": 4.632507711633237, + "tokens_seen": 124314624 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048601805416248744, + "loss": 3.6894, + "theoretical_loss": 4.632179595392028, + "tokens_seen": 124380160 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048600802407221667, + "loss": 3.9255, + "theoretical_loss": 4.631851700368637, + "tokens_seen": 124445696 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 338590, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4689290523529053, + "objective/train/theoretical_loss": 4.631524026297538, + "objective/train/tokens_used": 144971232, + "theoretical_loss": 4.631524026297538, + "tokens_seen": 124511232 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048599799398194585, + "loss": 3.6594, + "theoretical_loss": 4.631524026297538, + "tokens_seen": 124511232 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048598796389167503, + "loss": 3.8042, + "theoretical_loss": 4.631196572913663, + "tokens_seen": 124576768 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004859779338014042, + "loss": 3.7094, + "theoretical_loss": 4.6308693399524, + "tokens_seen": 124642304 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004859679037111334, + "loss": 3.7755, + "theoretical_loss": 4.630542327149594, + "tokens_seen": 124707840 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004859578736208626, + "loss": 3.7276, + "theoretical_loss": 4.630215534241545, + "tokens_seen": 124773376 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004859478435305918, + "loss": 3.6713, + "theoretical_loss": 4.629888960965005, + "tokens_seen": 124838912 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048593781344032094, + "loss": 3.8595, + "theoretical_loss": 4.629562607057184, + "tokens_seen": 124904448 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004859277833500502, + "loss": 3.6869, + "theoretical_loss": 4.629236472255739, + "tokens_seen": 124969984 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004859177532597793, + "loss": 3.8106, + "theoretical_loss": 4.6289105562987825, + "tokens_seen": 125035520 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048590772316950854, + "loss": 3.7784, + "theoretical_loss": 4.628584858924873, + "tokens_seen": 125101056 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004858976930792377, + "loss": 3.7355, + "theoretical_loss": 4.62825937987302, + "tokens_seen": 125166592 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004858876629889669, + "loss": 3.7424, + "theoretical_loss": 4.627934118882682, + "tokens_seen": 125232128 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004858776328986961, + "loss": 3.7236, + "theoretical_loss": 4.627609075693764, + "tokens_seen": 125297664 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004858676028084253, + "loss": 3.7351, + "theoretical_loss": 4.627284250046616, + "tokens_seen": 125363200 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048585757271815444, + "loss": 3.75, + "theoretical_loss": 4.626959641682033, + "tokens_seen": 125428736 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004858475426278837, + "loss": 3.7105, + "theoretical_loss": 4.626635250341256, + "tokens_seen": 125494272 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004858375125376128, + "loss": 3.852, + "theoretical_loss": 4.626311075765967, + "tokens_seen": 125559808 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048582748244734204, + "loss": 3.8004, + "theoretical_loss": 4.625987117698292, + "tokens_seen": 125625344 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004858174523570712, + "loss": 3.7245, + "theoretical_loss": 4.625663375880797, + "tokens_seen": 125690880 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004858074222668004, + "loss": 3.8311, + "theoretical_loss": 4.625339850056488, + "tokens_seen": 125756416 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004857973921765296, + "loss": 3.7943, + "theoretical_loss": 4.62501653996881, + "tokens_seen": 125821952 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048578736208625877, + "loss": 3.7605, + "theoretical_loss": 4.624693445361646, + "tokens_seen": 125887488 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048577733199598795, + "loss": 3.7435, + "theoretical_loss": 4.6243705659793175, + "tokens_seen": 125953024 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004857673019057172, + "loss": 3.8491, + "theoretical_loss": 4.624047901566582, + "tokens_seen": 126018560 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004857572718154463, + "loss": 3.7093, + "theoretical_loss": 4.62372545186863, + "tokens_seen": 126084096 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 343588, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.83012318611145, + "objective/train/theoretical_loss": 4.62340321663109, + "objective/train/tokens_used": 146609632, + "theoretical_loss": 4.62340321663109, + "tokens_seen": 126149632 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048574724172517554, + "loss": 3.7979, + "theoretical_loss": 4.62340321663109, + "tokens_seen": 126149632 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048573721163490467, + "loss": 3.7951, + "theoretical_loss": 4.62308119560002, + "tokens_seen": 126215168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004857271815446339, + "loss": 3.804, + "theoretical_loss": 4.622759388521913, + "tokens_seen": 126280704 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048571715145436314, + "loss": 3.7249, + "theoretical_loss": 4.622437795143691, + "tokens_seen": 126346240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048570712136409227, + "loss": 3.7086, + "theoretical_loss": 4.6221164152127105, + "tokens_seen": 126411776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004856970912738215, + "loss": 3.7812, + "theoretical_loss": 4.621795248476753, + "tokens_seen": 126477312 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004856870611835507, + "loss": 3.8111, + "theoretical_loss": 4.621474294684031, + "tokens_seen": 126542848 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048567703109327987, + "loss": 3.8106, + "theoretical_loss": 4.621153553583183, + "tokens_seen": 126608384 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048566700100300905, + "loss": 3.7342, + "theoretical_loss": 4.620833024923277, + "tokens_seen": 126673920 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048565697091273823, + "loss": 3.7684, + "theoretical_loss": 4.620512708453802, + "tokens_seen": 126739456 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004856469408224674, + "loss": 3.7853, + "theoretical_loss": 4.620192603924678, + "tokens_seen": 126804992 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048563691073219664, + "loss": 3.7998, + "theoretical_loss": 4.619872711086241, + "tokens_seen": 126870528 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048562688064192577, + "loss": 3.7379, + "theoretical_loss": 4.619553029689259, + "tokens_seen": 126936064 + }, + { + "epoch": 1.01, + "learning_rate": 0.000485616850551655, + "loss": 3.7997, + "theoretical_loss": 4.619233559484913, + "tokens_seen": 127001600 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048560682046138413, + "loss": 3.6835, + "theoretical_loss": 4.6189143002248105, + "tokens_seen": 127067136 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048559679037111337, + "loss": 3.8059, + "theoretical_loss": 4.618595251660978, + "tokens_seen": 127132672 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048558676028084255, + "loss": 3.7946, + "theoretical_loss": 4.618276413545861, + "tokens_seen": 127198208 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048557673019057173, + "loss": 3.8466, + "theoretical_loss": 4.617957785632322, + "tokens_seen": 127263744 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004855667001003009, + "loss": 3.7214, + "theoretical_loss": 4.617639367673643, + "tokens_seen": 127329280 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048555667001003015, + "loss": 3.8058, + "theoretical_loss": 4.61732115942352, + "tokens_seen": 127394816 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004855466399197593, + "loss": 3.6589, + "theoretical_loss": 4.617003160636067, + "tokens_seen": 127460352 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004855366098294885, + "loss": 3.7833, + "theoretical_loss": 4.61668537106581, + "tokens_seen": 127525888 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048552657973921764, + "loss": 3.6741, + "theoretical_loss": 4.616367790467689, + "tokens_seen": 127591424 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048551654964894687, + "loss": 3.6351, + "theoretical_loss": 4.616050418597059, + "tokens_seen": 127656960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048550651955867605, + "loss": 3.7646, + "theoretical_loss": 4.615733255209685, + "tokens_seen": 127722496 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 346412, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.697233200073242, + "objective/train/theoretical_loss": 4.615416300061744, + "objective/train/tokens_used": 148248032, + "theoretical_loss": 4.615416300061744, + "tokens_seen": 127788032 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048549648946840523, + "loss": 3.835, + "theoretical_loss": 4.615416300061744, + "tokens_seen": 127788032 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004854864593781344, + "loss": 3.8913, + "theoretical_loss": 4.615099552909821, + "tokens_seen": 127853568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004854764292878636, + "loss": 3.8515, + "theoretical_loss": 4.614783013510912, + "tokens_seen": 127919104 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004854663991975928, + "loss": 3.8091, + "theoretical_loss": 4.61446668162242, + "tokens_seen": 127984640 + }, + { + "epoch": 1.01, + "learning_rate": 0.000485456369107322, + "loss": 3.7531, + "theoretical_loss": 4.614150557002157, + "tokens_seen": 128050176 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048544633901705114, + "loss": 3.7811, + "theoretical_loss": 4.6138346394083385, + "tokens_seen": 128115712 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004854363089267804, + "loss": 3.7923, + "theoretical_loss": 4.61351892859959, + "tokens_seen": 128181248 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004854262788365095, + "loss": 3.8267, + "theoretical_loss": 4.613203424334937, + "tokens_seen": 128246784 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048541624874623874, + "loss": 3.769, + "theoretical_loss": 4.612888126373813, + "tokens_seen": 128312320 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004854062186559679, + "loss": 3.6585, + "theoretical_loss": 4.612573034476049, + "tokens_seen": 128377856 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004853961885656971, + "loss": 3.8547, + "theoretical_loss": 4.612258148401883, + "tokens_seen": 128443392 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004853861584754263, + "loss": 3.6757, + "theoretical_loss": 4.611943467911953, + "tokens_seen": 128508928 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004853761283851555, + "loss": 3.5319, + "theoretical_loss": 4.6116289927672955, + "tokens_seen": 128574464 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048536609829488464, + "loss": 3.7209, + "theoretical_loss": 4.6113147227293485, + "tokens_seen": 128640000 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004853560682046139, + "loss": 3.763, + "theoretical_loss": 4.611000657559947, + "tokens_seen": 128705536 + }, + { + "epoch": 1.01, + "learning_rate": 0.000485346038114343, + "loss": 3.815, + "theoretical_loss": 4.610686797021323, + "tokens_seen": 128771072 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048533600802407224, + "loss": 3.6675, + "theoretical_loss": 4.610373140876108, + "tokens_seen": 128836608 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004853259779338014, + "loss": 3.6843, + "theoretical_loss": 4.610059688887326, + "tokens_seen": 128902144 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004853159478435306, + "loss": 3.7837, + "theoretical_loss": 4.6097464408184, + "tokens_seen": 128967680 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004853059177532598, + "loss": 3.6301, + "theoretical_loss": 4.609433396433144, + "tokens_seen": 129033216 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048529588766298897, + "loss": 3.7831, + "theoretical_loss": 4.6091205554957675, + "tokens_seen": 129098752 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048528585757271815, + "loss": 3.7939, + "theoretical_loss": 4.608807917770869, + "tokens_seen": 129164288 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004852758274824474, + "loss": 3.7605, + "theoretical_loss": 4.608495483023443, + "tokens_seen": 129229824 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004852657973921765, + "loss": 3.8294, + "theoretical_loss": 4.608183251018872, + "tokens_seen": 129295360 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048525576730190574, + "loss": 3.779, + "theoretical_loss": 4.607871221522928, + "tokens_seen": 129360896 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 351318, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8568546772003174, + "objective/train/theoretical_loss": 4.607559394301776, + "objective/train/tokens_used": 149886432, + "theoretical_loss": 4.607559394301776, + "tokens_seen": 129426432 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048524573721163487, + "loss": 3.7001, + "theoretical_loss": 4.607559394301776, + "tokens_seen": 129426432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004852357071213641, + "loss": 3.7905, + "theoretical_loss": 4.607247769121965, + "tokens_seen": 129491968 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004852256770310933, + "loss": 3.6399, + "theoretical_loss": 4.606936345750434, + "tokens_seen": 129557504 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048521564694082247, + "loss": 3.7224, + "theoretical_loss": 4.6066251239545055, + "tokens_seen": 129623040 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048520561685055165, + "loss": 3.6292, + "theoretical_loss": 4.606314103501891, + "tokens_seen": 129688576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004851955867602809, + "loss": 3.8001, + "theoretical_loss": 4.6060032841606855, + "tokens_seen": 129754112 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048518555667001, + "loss": 3.7694, + "theoretical_loss": 4.605692665699369, + "tokens_seen": 129819648 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048517552657973925, + "loss": 3.7386, + "theoretical_loss": 4.605382247886803, + "tokens_seen": 129885184 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004851654964894684, + "loss": 3.829, + "theoretical_loss": 4.605072030492233, + "tokens_seen": 129950720 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004851554663991976, + "loss": 3.7708, + "theoretical_loss": 4.604762013285283, + "tokens_seen": 130016256 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004851454363089268, + "loss": 3.6371, + "theoretical_loss": 4.604452196035964, + "tokens_seen": 130081792 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048513540621865597, + "loss": 3.6265, + "theoretical_loss": 4.604142578514661, + "tokens_seen": 130147328 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048512537612838515, + "loss": 3.7906, + "theoretical_loss": 4.603833160492139, + "tokens_seen": 130212864 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048511534603811433, + "loss": 3.8043, + "theoretical_loss": 4.603523941739546, + "tokens_seen": 130278400 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004851053159478435, + "loss": 3.7972, + "theoretical_loss": 4.6032149220284, + "tokens_seen": 130343936 + }, + { + "epoch": 1.01, + "learning_rate": 0.00048509528585757275, + "loss": 3.7194, + "theoretical_loss": 4.602906101130603, + "tokens_seen": 130409472 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004850852557673019, + "loss": 3.8168, + "theoretical_loss": 4.602597478818428, + "tokens_seen": 130475008 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004850752256770311, + "loss": 3.7898, + "theoretical_loss": 4.602289054864525, + "tokens_seen": 130540544 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048506519558676024, + "loss": 3.9004, + "theoretical_loss": 4.601980829041918, + "tokens_seen": 130606080 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004850551654964895, + "loss": 3.8352, + "theoretical_loss": 4.601672801124003, + "tokens_seen": 130671616 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048504513540621866, + "loss": 3.8035, + "theoretical_loss": 4.6013649708845525, + "tokens_seen": 130737152 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048503510531594784, + "loss": 3.6837, + "theoretical_loss": 4.601057338097706, + "tokens_seen": 130802688 + }, + { + "epoch": 1.02, + "learning_rate": 0.000485025075225677, + "loss": 3.7915, + "theoretical_loss": 4.600749902537977, + "tokens_seen": 130868224 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048501504513540625, + "loss": 3.8592, + "theoretical_loss": 4.6004426639802505, + "tokens_seen": 130933760 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004850050150451354, + "loss": 3.6793, + "theoretical_loss": 4.600135622199776, + "tokens_seen": 130999296 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 354144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7266104221343994, + "objective/train/theoretical_loss": 4.599828776972177, + "objective/train/tokens_used": 151524832, + "theoretical_loss": 4.599828776972177, + "tokens_seen": 131064832 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004849949849548646, + "loss": 3.7193, + "theoretical_loss": 4.599828776972177, + "tokens_seen": 131064832 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004849849548645938, + "loss": 3.7221, + "theoretical_loss": 4.599522128073442, + "tokens_seen": 131130368 + }, + { + "epoch": 1.02, + "learning_rate": 0.000484974924774323, + "loss": 3.7957, + "theoretical_loss": 4.599215675279925, + "tokens_seen": 131195904 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004849648946840522, + "loss": 3.7945, + "theoretical_loss": 4.598909418368353, + "tokens_seen": 131261440 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048495486459378134, + "loss": 3.6542, + "theoretical_loss": 4.598603357115811, + "tokens_seen": 131326976 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004849448345035106, + "loss": 3.758, + "theoretical_loss": 4.598297491299753, + "tokens_seen": 131392512 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004849348044132397, + "loss": 3.8012, + "theoretical_loss": 4.597991820697995, + "tokens_seen": 131458048 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048492477432296894, + "loss": 3.8249, + "theoretical_loss": 4.597686345088718, + "tokens_seen": 131523584 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004849147442326981, + "loss": 3.8587, + "theoretical_loss": 4.597381064250463, + "tokens_seen": 131589120 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004849047141424273, + "loss": 3.7248, + "theoretical_loss": 4.597075977962135, + "tokens_seen": 131654656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004848946840521565, + "loss": 3.685, + "theoretical_loss": 4.596771086002999, + "tokens_seen": 131720192 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004848846539618857, + "loss": 3.8187, + "theoretical_loss": 4.596466388152679, + "tokens_seen": 131785728 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048487462387161484, + "loss": 3.7983, + "theoretical_loss": 4.596161884191162, + "tokens_seen": 131851264 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004848645937813441, + "loss": 3.748, + "theoretical_loss": 4.595857573898789, + "tokens_seen": 131916800 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004848545636910732, + "loss": 3.9093, + "theoretical_loss": 4.595553457056258, + "tokens_seen": 131982336 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048484453360080244, + "loss": 3.6598, + "theoretical_loss": 4.595249533444633, + "tokens_seen": 132047872 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004848345035105316, + "loss": 3.6463, + "theoretical_loss": 4.5949458028453245, + "tokens_seen": 132113408 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004848244734202608, + "loss": 3.873, + "theoretical_loss": 4.594642265040102, + "tokens_seen": 132178944 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048481444332999, + "loss": 3.7172, + "theoretical_loss": 4.594338919811092, + "tokens_seen": 132244480 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048480441323971917, + "loss": 3.8216, + "theoretical_loss": 4.59403576694077, + "tokens_seen": 132310016 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048479438314944835, + "loss": 3.8195, + "theoretical_loss": 4.593732806211971, + "tokens_seen": 132375552 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004847843530591776, + "loss": 3.7041, + "theoretical_loss": 4.593430037407877, + "tokens_seen": 132441088 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004847743229689067, + "loss": 3.8046, + "theoretical_loss": 4.593127460312026, + "tokens_seen": 132506624 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048476429287863594, + "loss": 3.6229, + "theoretical_loss": 4.592825074708303, + "tokens_seen": 132572160 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048475426278836507, + "loss": 3.6642, + "theoretical_loss": 4.5925228803809475, + "tokens_seen": 132637696 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 357779, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9715874195098877, + "objective/train/theoretical_loss": 4.592220877114545, + "objective/train/tokens_used": 153163232, + "theoretical_loss": 4.592220877114545, + "tokens_seen": 132703232 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004847442326980943, + "loss": 3.8785, + "theoretical_loss": 4.592220877114545, + "tokens_seen": 132703232 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004847342026078235, + "loss": 3.7315, + "theoretical_loss": 4.591919064694032, + "tokens_seen": 132768768 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048472417251755267, + "loss": 3.6485, + "theoretical_loss": 4.591617442904693, + "tokens_seen": 132834304 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048471414242728185, + "loss": 3.8655, + "theoretical_loss": 4.591316011532158, + "tokens_seen": 132899840 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004847041123370111, + "loss": 3.7664, + "theoretical_loss": 4.591014770362406, + "tokens_seen": 132965376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004846940822467402, + "loss": 3.7334, + "theoretical_loss": 4.59071371918176, + "tokens_seen": 133030912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048468405215646945, + "loss": 3.8233, + "theoretical_loss": 4.590412857776891, + "tokens_seen": 133096448 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004846740220661986, + "loss": 3.8244, + "theoretical_loss": 4.59011218593481, + "tokens_seen": 133161984 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004846639919759278, + "loss": 3.7729, + "theoretical_loss": 4.589811703442875, + "tokens_seen": 133227520 + }, + { + "epoch": 1.02, + "learning_rate": 0.000484653961885657, + "loss": 3.6087, + "theoretical_loss": 4.589511410088786, + "tokens_seen": 133293056 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048464393179538617, + "loss": 3.678, + "theoretical_loss": 4.589211305660585, + "tokens_seen": 133358592 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048463390170511535, + "loss": 3.7855, + "theoretical_loss": 4.588911389946658, + "tokens_seen": 133424128 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048462387161484453, + "loss": 3.7685, + "theoretical_loss": 4.588611662735727, + "tokens_seen": 133489664 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004846138415245737, + "loss": 3.749, + "theoretical_loss": 4.588312123816858, + "tokens_seen": 133555200 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048460381143430295, + "loss": 3.6955, + "theoretical_loss": 4.5880127729794555, + "tokens_seen": 133620736 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004845937813440321, + "loss": 3.7819, + "theoretical_loss": 4.587713610013262, + "tokens_seen": 133686272 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004845837512537613, + "loss": 3.7788, + "theoretical_loss": 4.587414634708358, + "tokens_seen": 133751808 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048457372116349044, + "loss": 3.6489, + "theoretical_loss": 4.587115846855163, + "tokens_seen": 133817344 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004845636910732197, + "loss": 3.8082, + "theoretical_loss": 4.586817246244429, + "tokens_seen": 133882880 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048455366098294886, + "loss": 3.7914, + "theoretical_loss": 4.586518832667251, + "tokens_seen": 133948416 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048454363089267804, + "loss": 3.6503, + "theoretical_loss": 4.5862206059150505, + "tokens_seen": 134013952 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004845336008024072, + "loss": 3.8813, + "theoretical_loss": 4.585922565779589, + "tokens_seen": 134079488 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048452357071213645, + "loss": 3.7523, + "theoretical_loss": 4.585624712052962, + "tokens_seen": 134145024 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004845135406218656, + "loss": 3.8067, + "theoretical_loss": 4.5853270445275935, + "tokens_seen": 134210560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004845035105315948, + "loss": 3.7272, + "theoretical_loss": 4.585029562996246, + "tokens_seen": 134276096 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 363035, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8604016304016113, + "objective/train/theoretical_loss": 4.584732267252008, + "objective/train/tokens_used": 154801632, + "theoretical_loss": 4.584732267252008, + "tokens_seen": 134341632 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048449348044132394, + "loss": 3.7014, + "theoretical_loss": 4.584732267252008, + "tokens_seen": 134341632 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004844834503510532, + "loss": 3.5897, + "theoretical_loss": 4.584435157088302, + "tokens_seen": 134407168 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048447342026078236, + "loss": 3.7653, + "theoretical_loss": 4.584138232298881, + "tokens_seen": 134472704 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048446339017051154, + "loss": 3.7757, + "theoretical_loss": 4.583841492677826, + "tokens_seen": 134538240 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004844533600802407, + "loss": 3.8722, + "theoretical_loss": 4.583544938019549, + "tokens_seen": 134603776 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004844433299899699, + "loss": 3.7772, + "theoretical_loss": 4.583248568118787, + "tokens_seen": 134669312 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004844332998996991, + "loss": 3.6449, + "theoretical_loss": 4.582952382770609, + "tokens_seen": 134734848 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004844232698094283, + "loss": 3.8283, + "theoretical_loss": 4.582656381770406, + "tokens_seen": 134800384 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048441323971915745, + "loss": 3.7218, + "theoretical_loss": 4.582360564913898, + "tokens_seen": 134865920 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004844032096288867, + "loss": 3.7711, + "theoretical_loss": 4.582064931997131, + "tokens_seen": 134931456 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048439317953861586, + "loss": 3.8036, + "theoretical_loss": 4.5817694828164734, + "tokens_seen": 134996992 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048438314944834504, + "loss": 3.7968, + "theoretical_loss": 4.581474217168619, + "tokens_seen": 135062528 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004843731193580742, + "loss": 3.7431, + "theoretical_loss": 4.581179134850588, + "tokens_seen": 135128064 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004843630892678034, + "loss": 3.7526, + "theoretical_loss": 4.580884235659718, + "tokens_seen": 135193600 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004843530591775326, + "loss": 3.7348, + "theoretical_loss": 4.580589519393672, + "tokens_seen": 135259136 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004843430290872618, + "loss": 3.7728, + "theoretical_loss": 4.580294985850433, + "tokens_seen": 135324672 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048433299899699095, + "loss": 3.6933, + "theoretical_loss": 4.580000634828307, + "tokens_seen": 135390208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004843229689067202, + "loss": 3.7155, + "theoretical_loss": 4.579706466125919, + "tokens_seen": 135455744 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004843129388164493, + "loss": 3.5735, + "theoretical_loss": 4.579412479542212, + "tokens_seen": 135521280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048430290872617855, + "loss": 3.6568, + "theoretical_loss": 4.579118674876449, + "tokens_seen": 135586816 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048429287863590773, + "loss": 3.7939, + "theoretical_loss": 4.578825051928211, + "tokens_seen": 135652352 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004842828485456369, + "loss": 3.6225, + "theoretical_loss": 4.578531610497398, + "tokens_seen": 135717888 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004842728184553661, + "loss": 3.7879, + "theoretical_loss": 4.5782383503842246, + "tokens_seen": 135783424 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048426278836509527, + "loss": 3.5678, + "theoretical_loss": 4.577945271389222, + "tokens_seen": 135848960 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048425275827482445, + "loss": 3.8165, + "theoretical_loss": 4.577652373313239, + "tokens_seen": 135914496 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 365833, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7439584732055664, + "objective/train/theoretical_loss": 4.577359655957435, + "objective/train/tokens_used": 156440032, + "theoretical_loss": 4.577359655957435, + "tokens_seen": 135980032 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004842427281845537, + "loss": 3.6939, + "theoretical_loss": 4.577359655957435, + "tokens_seen": 135980032 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048423269809428287, + "loss": 3.5648, + "theoretical_loss": 4.577067119123289, + "tokens_seen": 136045568 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048422266800401205, + "loss": 3.6839, + "theoretical_loss": 4.576774762612589, + "tokens_seen": 136111104 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004842126379137413, + "loss": 3.7089, + "theoretical_loss": 4.576482586227439, + "tokens_seen": 136176640 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004842026078234704, + "loss": 3.6687, + "theoretical_loss": 4.576190589770254, + "tokens_seen": 136242176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048419257773319965, + "loss": 3.7441, + "theoretical_loss": 4.575898773043761, + "tokens_seen": 136307712 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004841825476429288, + "loss": 3.7224, + "theoretical_loss": 4.575607135850996, + "tokens_seen": 136373248 + }, + { + "epoch": 1.02, + "learning_rate": 0.000484172517552658, + "loss": 3.6871, + "theoretical_loss": 4.575315677995308, + "tokens_seen": 136438784 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004841624874623872, + "loss": 3.6906, + "theoretical_loss": 4.575024399280355, + "tokens_seen": 136504320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048415245737211637, + "loss": 3.7717, + "theoretical_loss": 4.574733299510106, + "tokens_seen": 136569856 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048414242728184555, + "loss": 3.5397, + "theoretical_loss": 4.574442378488833, + "tokens_seen": 136635392 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048413239719157473, + "loss": 3.775, + "theoretical_loss": 4.574151636021121, + "tokens_seen": 136700928 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004841223671013039, + "loss": 3.767, + "theoretical_loss": 4.5738610719118595, + "tokens_seen": 136766464 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048411233701103315, + "loss": 3.6912, + "theoretical_loss": 4.573570685966246, + "tokens_seen": 136832000 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004841023069207623, + "loss": 3.7008, + "theoretical_loss": 4.573280477989784, + "tokens_seen": 136897536 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004840922768304915, + "loss": 3.7569, + "theoretical_loss": 4.572990447788281, + "tokens_seen": 136963072 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048408224674022064, + "loss": 3.8734, + "theoretical_loss": 4.57270059516785, + "tokens_seen": 137028608 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004840722166499499, + "loss": 3.6652, + "theoretical_loss": 4.572410919934908, + "tokens_seen": 137094144 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048406218655967906, + "loss": 3.6926, + "theoretical_loss": 4.572121421896176, + "tokens_seen": 137159680 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048405215646940824, + "loss": 3.7466, + "theoretical_loss": 4.571832100858675, + "tokens_seen": 137225216 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004840421263791374, + "loss": 3.6944, + "theoretical_loss": 4.571542956629734, + "tokens_seen": 137290752 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048403209628886665, + "loss": 3.6618, + "theoretical_loss": 4.571253989016977, + "tokens_seen": 137356288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004840220661985958, + "loss": 3.7423, + "theoretical_loss": 4.5709651978283325, + "tokens_seen": 137421824 + }, + { + "epoch": 1.02, + "learning_rate": 0.000484012036108325, + "loss": 3.6444, + "theoretical_loss": 4.57067658287203, + "tokens_seen": 137487360 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048400200601805414, + "loss": 3.8544, + "theoretical_loss": 4.570388143956597, + "tokens_seen": 137552896 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 370629, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.609988212585449, + "objective/train/theoretical_loss": 4.57009988089086, + "objective/train/tokens_used": 158078432, + "theoretical_loss": 4.57009988089086, + "tokens_seen": 137618432 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004839919759277834, + "loss": 3.7497, + "theoretical_loss": 4.57009988089086, + "tokens_seen": 137618432 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048398194583751256, + "loss": 3.8137, + "theoretical_loss": 4.569811793483945, + "tokens_seen": 137683968 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048397191574724174, + "loss": 3.6443, + "theoretical_loss": 4.569523881545277, + "tokens_seen": 137749504 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004839618856569709, + "loss": 3.7402, + "theoretical_loss": 4.569236144884575, + "tokens_seen": 137815040 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004839518555667001, + "loss": 3.7829, + "theoretical_loss": 4.568948583311856, + "tokens_seen": 137880576 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004839418254764293, + "loss": 3.624, + "theoretical_loss": 4.5686611966374375, + "tokens_seen": 137946112 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004839317953861585, + "loss": 3.6905, + "theoretical_loss": 4.568373984671925, + "tokens_seen": 138011648 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048392176529588765, + "loss": 3.7963, + "theoretical_loss": 4.568086947226224, + "tokens_seen": 138077184 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004839117352056169, + "loss": 3.7021, + "theoretical_loss": 4.567800084111532, + "tokens_seen": 138142720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048390170511534606, + "loss": 3.6746, + "theoretical_loss": 4.567513395139342, + "tokens_seen": 138208256 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048389167502507524, + "loss": 3.5088, + "theoretical_loss": 4.5672268801214395, + "tokens_seen": 138273792 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004838816449348044, + "loss": 3.6548, + "theoretical_loss": 4.566940538869901, + "tokens_seen": 138339328 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004838716148445336, + "loss": 3.7809, + "theoretical_loss": 4.566654371197098, + "tokens_seen": 138404864 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004838615847542628, + "loss": 3.6627, + "theoretical_loss": 4.566368376915689, + "tokens_seen": 138470400 + }, + { + "epoch": 1.02, + "learning_rate": 0.000483851554663992, + "loss": 3.7693, + "theoretical_loss": 4.566082555838626, + "tokens_seen": 138535936 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048384152457372115, + "loss": 3.7206, + "theoretical_loss": 4.565796907779154, + "tokens_seen": 138601472 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004838314944834504, + "loss": 3.7692, + "theoretical_loss": 4.5655114325508, + "tokens_seen": 138667008 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004838214643931795, + "loss": 3.7645, + "theoretical_loss": 4.565226129967387, + "tokens_seen": 138732544 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048381143430290875, + "loss": 3.7529, + "theoretical_loss": 4.564940999843024, + "tokens_seen": 138798080 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048380140421263793, + "loss": 3.6997, + "theoretical_loss": 4.564656041992105, + "tokens_seen": 138863616 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004837913741223671, + "loss": 3.7603, + "theoretical_loss": 4.564371256229316, + "tokens_seen": 138929152 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004837813440320963, + "loss": 3.8247, + "theoretical_loss": 4.564086642369627, + "tokens_seen": 138994688 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048377131394182547, + "loss": 3.7409, + "theoretical_loss": 4.563802200228294, + "tokens_seen": 139060224 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048376128385155465, + "loss": 3.7429, + "theoretical_loss": 4.5635179296208594, + "tokens_seen": 139125760 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004837512537612839, + "loss": 3.7676, + "theoretical_loss": 4.563233830363149, + "tokens_seen": 139191296 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 373445, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.637946605682373, + "objective/train/theoretical_loss": 4.562949902271276, + "objective/train/tokens_used": 159716832, + "theoretical_loss": 4.562949902271276, + "tokens_seen": 139256832 + }, + { + "epoch": 1.02, + "learning_rate": 0.000483741223671013, + "loss": 3.7082, + "theoretical_loss": 4.562949902271276, + "tokens_seen": 139256832 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048373119358074225, + "loss": 3.7211, + "theoretical_loss": 4.562666145161632, + "tokens_seen": 139322368 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048372116349047143, + "loss": 3.6682, + "theoretical_loss": 4.562382558850899, + "tokens_seen": 139387904 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004837111334002006, + "loss": 3.6635, + "theoretical_loss": 4.562099143156036, + "tokens_seen": 139453440 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004837011033099298, + "loss": 3.7292, + "theoretical_loss": 4.561815897894286, + "tokens_seen": 139518976 + }, + { + "epoch": 1.02, + "learning_rate": 0.000483691073219659, + "loss": 3.7989, + "theoretical_loss": 4.561532822883173, + "tokens_seen": 139584512 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048368104312938816, + "loss": 3.7174, + "theoretical_loss": 4.561249917940502, + "tokens_seen": 139650048 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004836710130391174, + "loss": 3.711, + "theoretical_loss": 4.56096718288436, + "tokens_seen": 139715584 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004836609829488465, + "loss": 3.737, + "theoretical_loss": 4.560684617533111, + "tokens_seen": 139781120 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048365095285857575, + "loss": 3.7335, + "theoretical_loss": 4.5604022217054, + "tokens_seen": 139846656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004836409227683049, + "loss": 3.7431, + "theoretical_loss": 4.56011999522015, + "tokens_seen": 139912192 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004836308926780341, + "loss": 3.7259, + "theoretical_loss": 4.559837937896565, + "tokens_seen": 139977728 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004836208625877633, + "loss": 3.7794, + "theoretical_loss": 4.5595560495541205, + "tokens_seen": 140043264 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004836108324974925, + "loss": 3.7206, + "theoretical_loss": 4.559274330012574, + "tokens_seen": 140108800 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048360080240722166, + "loss": 3.7715, + "theoretical_loss": 4.558992779091959, + "tokens_seen": 140174336 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048359077231695084, + "loss": 3.686, + "theoretical_loss": 4.5587113966125825, + "tokens_seen": 140239872 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048358074222668, + "loss": 3.7536, + "theoretical_loss": 4.558430182395028, + "tokens_seen": 140305408 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048357071213640926, + "loss": 3.6563, + "theoretical_loss": 4.558149136260155, + "tokens_seen": 140370944 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004835606820461384, + "loss": 3.7016, + "theoretical_loss": 4.557868258029098, + "tokens_seen": 140436480 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004835506519558676, + "loss": 3.7652, + "theoretical_loss": 4.557587547523259, + "tokens_seen": 140502016 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004835406218655968, + "loss": 3.6841, + "theoretical_loss": 4.55730700456432, + "tokens_seen": 140567552 + }, + { + "epoch": 1.02, + "learning_rate": 0.000483530591775326, + "loss": 3.7381, + "theoretical_loss": 4.557026628974234, + "tokens_seen": 140633088 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048352056168505516, + "loss": 3.679, + "theoretical_loss": 4.556746420575225, + "tokens_seen": 140698624 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048351053159478434, + "loss": 3.7087, + "theoretical_loss": 4.5564663791897875, + "tokens_seen": 140764160 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004835005015045135, + "loss": 3.7854, + "theoretical_loss": 4.556186504640689, + "tokens_seen": 140829696 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 377221, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8285017013549805, + "objective/train/theoretical_loss": 4.555906796750967, + "objective/train/tokens_used": 161355232, + "theoretical_loss": 4.555906796750967, + "tokens_seen": 140895232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048349047141424276, + "loss": 3.8036, + "theoretical_loss": 4.555906796750967, + "tokens_seen": 140895232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048348044132397194, + "loss": 3.7058, + "theoretical_loss": 4.5556272553439285, + "tokens_seen": 140960768 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004834704112337011, + "loss": 3.6557, + "theoretical_loss": 4.555347880243151, + "tokens_seen": 141026304 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004834603811434303, + "loss": 3.7085, + "theoretical_loss": 4.555068671272477, + "tokens_seen": 141091840 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004834503510531595, + "loss": 3.6774, + "theoretical_loss": 4.554789628256023, + "tokens_seen": 141157376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004834403209628887, + "loss": 3.7209, + "theoretical_loss": 4.554510751018169, + "tokens_seen": 141222912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048343029087261785, + "loss": 3.6136, + "theoretical_loss": 4.554232039383566, + "tokens_seen": 141288448 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004834202607823471, + "loss": 3.7535, + "theoretical_loss": 4.553953493177125, + "tokens_seen": 141353984 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048341023069207626, + "loss": 3.6994, + "theoretical_loss": 4.553675112224031, + "tokens_seen": 141419520 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048340020060180544, + "loss": 3.632, + "theoretical_loss": 4.55339689634973, + "tokens_seen": 141485056 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004833901705115346, + "loss": 3.7244, + "theoretical_loss": 4.5531188453799345, + "tokens_seen": 141550592 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004833801404212638, + "loss": 3.7799, + "theoretical_loss": 4.552840959140621, + "tokens_seen": 141616128 + }, + { + "epoch": 1.02, + "learning_rate": 0.000483370110330993, + "loss": 3.6372, + "theoretical_loss": 4.552563237458031, + "tokens_seen": 141681664 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004833600802407222, + "loss": 3.7344, + "theoretical_loss": 4.552285680158668, + "tokens_seen": 141747200 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048335005015045135, + "loss": 3.5842, + "theoretical_loss": 4.552008287069301, + "tokens_seen": 141812736 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004833400200601806, + "loss": 3.7495, + "theoretical_loss": 4.5517310580169585, + "tokens_seen": 141878272 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004833299899699097, + "loss": 3.7445, + "theoretical_loss": 4.551453992828934, + "tokens_seen": 141943808 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048331995987963895, + "loss": 3.7199, + "theoretical_loss": 4.5511770913327805, + "tokens_seen": 142009344 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048330992978936813, + "loss": 3.7244, + "theoretical_loss": 4.550900353356311, + "tokens_seen": 142074880 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004832998996990973, + "loss": 3.739, + "theoretical_loss": 4.550623778727602, + "tokens_seen": 142140416 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004832898696088265, + "loss": 3.7083, + "theoretical_loss": 4.550347367274988, + "tokens_seen": 142205952 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048327983951855567, + "loss": 3.6906, + "theoretical_loss": 4.550071118827063, + "tokens_seen": 142271488 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048326980942828485, + "loss": 3.6918, + "theoretical_loss": 4.54979503321268, + "tokens_seen": 142337024 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004832597793380141, + "loss": 3.6823, + "theoretical_loss": 4.549519110260952, + "tokens_seen": 142402560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004832497492477432, + "loss": 3.7504, + "theoretical_loss": 4.549243349801245, + "tokens_seen": 142468096 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8125271797180176, + "objective/train/theoretical_loss": 4.54896775166319, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.54896775166319, + "tokens_seen": 142533632 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048323971915747245, + "loss": 3.7508, + "theoretical_loss": 4.54896775166319, + "tokens_seen": 142533632 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048322968906720163, + "loss": 3.7297, + "theoretical_loss": 4.548692315676666, + "tokens_seen": 142599168 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004832196589769308, + "loss": 3.6711, + "theoretical_loss": 4.548417041671817, + "tokens_seen": 142664704 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048320962888666, + "loss": 3.6459, + "theoretical_loss": 4.548141929479037, + "tokens_seen": 142730240 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004831995987963892, + "loss": 3.7878, + "theoretical_loss": 4.547866978928978, + "tokens_seen": 142795776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048318956870611836, + "loss": 3.8342, + "theoretical_loss": 4.547592189852544, + "tokens_seen": 142861312 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004831795386158476, + "loss": 3.7572, + "theoretical_loss": 4.547317562080899, + "tokens_seen": 142926848 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004831695085255767, + "loss": 3.6654, + "theoretical_loss": 4.547043095445454, + "tokens_seen": 142992384 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048315947843530595, + "loss": 3.5882, + "theoretical_loss": 4.546768789777878, + "tokens_seen": 143057920 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004831494483450351, + "loss": 3.7545, + "theoretical_loss": 4.546494644910092, + "tokens_seen": 143123456 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004831394182547643, + "loss": 3.722, + "theoretical_loss": 4.546220660674267, + "tokens_seen": 143188992 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004831293881644935, + "loss": 3.6521, + "theoretical_loss": 4.5459468369028295, + "tokens_seen": 143254528 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004831193580742227, + "loss": 3.6504, + "theoretical_loss": 4.5456731734284554, + "tokens_seen": 143320064 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048310932798395186, + "loss": 3.5334, + "theoretical_loss": 4.5453996700840715, + "tokens_seen": 143385600 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048309929789368104, + "loss": 3.7136, + "theoretical_loss": 4.545126326702855, + "tokens_seen": 143451136 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004830892678034102, + "loss": 3.7636, + "theoretical_loss": 4.544853143118232, + "tokens_seen": 143516672 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048307923771313946, + "loss": 3.728, + "theoretical_loss": 4.544580119163882, + "tokens_seen": 143582208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004830692076228686, + "loss": 3.5888, + "theoretical_loss": 4.544307254673728, + "tokens_seen": 143647744 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004830591775325978, + "loss": 3.7348, + "theoretical_loss": 4.544034549481946, + "tokens_seen": 143713280 + }, + { + "epoch": 1.02, + "learning_rate": 0.000483049147442327, + "loss": 3.723, + "theoretical_loss": 4.543762003422957, + "tokens_seen": 143778816 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004830391173520562, + "loss": 3.7265, + "theoretical_loss": 4.5434896163314304, + "tokens_seen": 143844352 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048302908726178536, + "loss": 3.7132, + "theoretical_loss": 4.543217388042283, + "tokens_seen": 143909888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048301905717151454, + "loss": 3.7319, + "theoretical_loss": 4.542945318390679, + "tokens_seen": 143975424 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004830090270812437, + "loss": 3.7385, + "theoretical_loss": 4.542673407212025, + "tokens_seen": 144040960 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048299899699097296, + "loss": 3.6283, + "theoretical_loss": 4.542401654341978, + "tokens_seen": 144106496 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9029579162597656, + "objective/train/theoretical_loss": 4.5421300596164365, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.5421300596164365, + "tokens_seen": 144172032 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004829889669007021, + "loss": 3.7445, + "theoretical_loss": 4.5421300596164365, + "tokens_seen": 144172032 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004829789368104313, + "loss": 3.7529, + "theoretical_loss": 4.541858622871545, + "tokens_seen": 144237568 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048296890672016045, + "loss": 3.701, + "theoretical_loss": 4.541587343943692, + "tokens_seen": 144303104 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004829588766298897, + "loss": 3.8045, + "theoretical_loss": 4.54131622266951, + "tokens_seen": 144368640 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048294884653961887, + "loss": 3.7733, + "theoretical_loss": 4.541045258885872, + "tokens_seen": 144434176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048293881644934805, + "loss": 3.697, + "theoretical_loss": 4.540774452429899, + "tokens_seen": 144499712 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048292878635907723, + "loss": 3.7281, + "theoretical_loss": 4.54050380313895, + "tokens_seen": 144565248 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048291875626880646, + "loss": 3.6427, + "theoretical_loss": 4.540233310850626, + "tokens_seen": 144630784 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004829087261785356, + "loss": 3.6351, + "theoretical_loss": 4.539962975402773, + "tokens_seen": 144696320 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004828986960882648, + "loss": 3.7009, + "theoretical_loss": 4.539692796633471, + "tokens_seen": 144761856 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048288866599799395, + "loss": 3.6983, + "theoretical_loss": 4.539422774381048, + "tokens_seen": 144827392 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004828786359077232, + "loss": 3.7105, + "theoretical_loss": 4.539152908484066, + "tokens_seen": 144892928 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048286860581745237, + "loss": 3.6208, + "theoretical_loss": 4.53888319878133, + "tokens_seen": 144958464 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048285857572718155, + "loss": 3.6817, + "theoretical_loss": 4.538613645111883, + "tokens_seen": 145024000 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048284854563691073, + "loss": 3.7789, + "theoretical_loss": 4.538344247315004, + "tokens_seen": 145089536 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004828385155466399, + "loss": 3.6659, + "theoretical_loss": 4.538075005230216, + "tokens_seen": 145155072 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004828284854563691, + "loss": 3.6851, + "theoretical_loss": 4.5378059186972735, + "tokens_seen": 145220608 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048281845536609833, + "loss": 3.7169, + "theoretical_loss": 4.5375369875561695, + "tokens_seen": 145286144 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048280842527582746, + "loss": 3.6826, + "theoretical_loss": 4.537268211647137, + "tokens_seen": 145351680 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004827983951855567, + "loss": 3.6585, + "theoretical_loss": 4.536999590810641, + "tokens_seen": 145417216 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004827883650952858, + "loss": 3.7408, + "theoretical_loss": 4.536731124887385, + "tokens_seen": 145482752 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048277833500501505, + "loss": 3.7399, + "theoretical_loss": 4.536462813718308, + "tokens_seen": 145548288 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048276830491474423, + "loss": 3.7107, + "theoretical_loss": 4.536194657144581, + "tokens_seen": 145613824 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004827582748244734, + "loss": 3.6612, + "theoretical_loss": 4.535926655007612, + "tokens_seen": 145679360 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004827482447342026, + "loss": 3.6323, + "theoretical_loss": 4.535658807149042, + "tokens_seen": 145744896 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6512744426727295, + "objective/train/theoretical_loss": 4.5353911134107445, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.5353911134107445, + "tokens_seen": 145810432 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048273821464393183, + "loss": 3.7073, + "theoretical_loss": 4.5353911134107445, + "tokens_seen": 145810432 + }, + { + "epoch": 1.02, + "learning_rate": 0.000482728184553661, + "loss": 3.731, + "theoretical_loss": 4.535123573634829, + "tokens_seen": 145875968 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004827181544633902, + "loss": 3.7874, + "theoretical_loss": 4.534856187663635, + "tokens_seen": 145941504 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004827081243731194, + "loss": 3.733, + "theoretical_loss": 4.534588955339735, + "tokens_seen": 146007040 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048269809428284856, + "loss": 3.7537, + "theoretical_loss": 4.534321876505933, + "tokens_seen": 146072576 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004826880641925778, + "loss": 3.6098, + "theoretical_loss": 4.534054951005263, + "tokens_seen": 146138112 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004826780341023069, + "loss": 3.8, + "theoretical_loss": 4.533788178680995, + "tokens_seen": 146203648 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048266800401203615, + "loss": 3.6791, + "theoretical_loss": 4.533521559376622, + "tokens_seen": 146269184 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004826579739217653, + "loss": 3.566, + "theoretical_loss": 4.533255092935871, + "tokens_seen": 146334720 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004826479438314945, + "loss": 3.6087, + "theoretical_loss": 4.532988779202698, + "tokens_seen": 146400256 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004826379137412237, + "loss": 3.6915, + "theoretical_loss": 4.532722618021291, + "tokens_seen": 146465792 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004826278836509529, + "loss": 3.6967, + "theoretical_loss": 4.5324566092360605, + "tokens_seen": 146531328 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048261785356068206, + "loss": 3.5981, + "theoretical_loss": 4.5321907526916485, + "tokens_seen": 146596864 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048260782347041124, + "loss": 3.8127, + "theoretical_loss": 4.5319250482329245, + "tokens_seen": 146662400 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004825977933801404, + "loss": 3.5108, + "theoretical_loss": 4.531659495704988, + "tokens_seen": 146727936 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048258776328986966, + "loss": 3.7245, + "theoretical_loss": 4.53139409495316, + "tokens_seen": 146793472 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004825777331995988, + "loss": 3.742, + "theoretical_loss": 4.531128845822991, + "tokens_seen": 146859008 + }, + { + "epoch": 1.02, + "learning_rate": 0.000482567703109328, + "loss": 3.6645, + "theoretical_loss": 4.530863748160258, + "tokens_seen": 146924544 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004825576730190572, + "loss": 3.6404, + "theoretical_loss": 4.530598801810963, + "tokens_seen": 146990080 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004825476429287864, + "loss": 3.6453, + "theoretical_loss": 4.530334006621332, + "tokens_seen": 147055616 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048253761283851556, + "loss": 3.8921, + "theoretical_loss": 4.530069362437818, + "tokens_seen": 147121152 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048252758274824474, + "loss": 3.6029, + "theoretical_loss": 4.529804869107094, + "tokens_seen": 147186688 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004825175526579739, + "loss": 3.7036, + "theoretical_loss": 4.529540526476063, + "tokens_seen": 147252224 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048250752256770316, + "loss": 3.6382, + "theoretical_loss": 4.529276334391846, + "tokens_seen": 147317760 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004824974924774323, + "loss": 3.7768, + "theoretical_loss": 4.529012292701791, + "tokens_seen": 147383296 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7816004753112793, + "objective/train/theoretical_loss": 4.5287484012534645, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.5287484012534645, + "tokens_seen": 147448832 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004824874623871615, + "loss": 3.8217, + "theoretical_loss": 4.5287484012534645, + "tokens_seen": 147448832 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048247743229689065, + "loss": 3.7557, + "theoretical_loss": 4.52848465989466, + "tokens_seen": 147514368 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004824674022066199, + "loss": 3.6635, + "theoretical_loss": 4.528221068473389, + "tokens_seen": 147579904 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048245737211634907, + "loss": 3.6092, + "theoretical_loss": 4.527957626837885, + "tokens_seen": 147645440 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048244734202607825, + "loss": 3.6638, + "theoretical_loss": 4.527694334836603, + "tokens_seen": 147710976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048243731193580743, + "loss": 3.7425, + "theoretical_loss": 4.527431192318219, + "tokens_seen": 147776512 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048242728184553666, + "loss": 3.6615, + "theoretical_loss": 4.527168199131628, + "tokens_seen": 147842048 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004824172517552658, + "loss": 3.6915, + "theoretical_loss": 4.526905355125946, + "tokens_seen": 147907584 + }, + { + "epoch": 1.02, + "learning_rate": 0.000482407221664995, + "loss": 3.6277, + "theoretical_loss": 4.526642660150506, + "tokens_seen": 147973120 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048239719157472415, + "loss": 3.6726, + "theoretical_loss": 4.52638011405486, + "tokens_seen": 148038656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004823871614844534, + "loss": 3.6396, + "theoretical_loss": 4.526117716688782, + "tokens_seen": 148104192 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048237713139418257, + "loss": 3.6824, + "theoretical_loss": 4.525855467902261, + "tokens_seen": 148169728 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048236710130391175, + "loss": 3.702, + "theoretical_loss": 4.525593367545502, + "tokens_seen": 148235264 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048235707121364093, + "loss": 3.6609, + "theoretical_loss": 4.525331415468933, + "tokens_seen": 148300800 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004823470411233701, + "loss": 3.7115, + "theoretical_loss": 4.525069611523191, + "tokens_seen": 148366336 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004823370110330993, + "loss": 3.6906, + "theoretical_loss": 4.524807955559135, + "tokens_seen": 148431872 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048232698094282853, + "loss": 3.6583, + "theoretical_loss": 4.524546447427838, + "tokens_seen": 148497408 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048231695085255766, + "loss": 3.8171, + "theoretical_loss": 4.524285086980589, + "tokens_seen": 148562944 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004823069207622869, + "loss": 3.677, + "theoretical_loss": 4.524023874068892, + "tokens_seen": 148628480 + }, + { + "epoch": 1.02, + "learning_rate": 0.000482296890672016, + "loss": 3.6625, + "theoretical_loss": 4.523762808544463, + "tokens_seen": 148694016 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048228686058174525, + "loss": 3.6727, + "theoretical_loss": 4.523501890259238, + "tokens_seen": 148759552 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048227683049147443, + "loss": 3.7059, + "theoretical_loss": 4.523241119065363, + "tokens_seen": 148825088 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004822668004012036, + "loss": 3.6794, + "theoretical_loss": 4.522980494815198, + "tokens_seen": 148890624 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004822567703109328, + "loss": 3.6631, + "theoretical_loss": 4.522720017361316, + "tokens_seen": 148956160 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048224674022066203, + "loss": 3.5377, + "theoretical_loss": 4.522459686556503, + "tokens_seen": 149021696 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7438864707946777, + "objective/train/theoretical_loss": 4.522199502253757, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.522199502253757, + "tokens_seen": 149087232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048223671013039116, + "loss": 3.7476, + "theoretical_loss": 4.522199502253757, + "tokens_seen": 149087232 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004822266800401204, + "loss": 3.7479, + "theoretical_loss": 4.521939464306289, + "tokens_seen": 149152768 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004822166499498495, + "loss": 3.6908, + "theoretical_loss": 4.521679572567521, + "tokens_seen": 149218304 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048220661985957876, + "loss": 3.7784, + "theoretical_loss": 4.5214198268910835, + "tokens_seen": 149283840 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048219658976930794, + "loss": 3.5865, + "theoretical_loss": 4.521160227130823, + "tokens_seen": 149349376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004821865596790371, + "loss": 3.5473, + "theoretical_loss": 4.520900773140791, + "tokens_seen": 149414912 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004821765295887663, + "loss": 3.4981, + "theoretical_loss": 4.520641464775252, + "tokens_seen": 149480448 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004821664994984955, + "loss": 3.4658, + "theoretical_loss": 4.520382301888679, + "tokens_seen": 149545984 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048215646940822466, + "loss": 3.7246, + "theoretical_loss": 4.520123284335755, + "tokens_seen": 149611520 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004821464393179539, + "loss": 3.6043, + "theoretical_loss": 4.51986441197137, + "tokens_seen": 149677056 + }, + { + "epoch": 1.02, + "learning_rate": 0.000482136409227683, + "loss": 3.7952, + "theoretical_loss": 4.5196056846506245, + "tokens_seen": 149742592 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048212637913741226, + "loss": 3.7056, + "theoretical_loss": 4.519347102228824, + "tokens_seen": 149808128 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004821163490471414, + "loss": 3.6667, + "theoretical_loss": 4.519088664561487, + "tokens_seen": 149873664 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004821063189568706, + "loss": 3.7137, + "theoretical_loss": 4.518830371504331, + "tokens_seen": 149939200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004820962888665998, + "loss": 3.7935, + "theoretical_loss": 4.5185722229132885, + "tokens_seen": 150004736 + }, + { + "epoch": 1.02, + "learning_rate": 0.000482086258776329, + "loss": 3.5954, + "theoretical_loss": 4.518314218644493, + "tokens_seen": 150070272 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048207622868605816, + "loss": 3.5365, + "theoretical_loss": 4.5180563585542854, + "tokens_seen": 150135808 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004820661985957874, + "loss": 3.6198, + "theoretical_loss": 4.517798642499214, + "tokens_seen": 150201344 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048205616850551653, + "loss": 3.6498, + "theoretical_loss": 4.51754107033603, + "tokens_seen": 150266880 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048204613841524576, + "loss": 3.691, + "theoretical_loss": 4.517283641921691, + "tokens_seen": 150332416 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004820361083249749, + "loss": 3.6536, + "theoretical_loss": 4.517026357113357, + "tokens_seen": 150397952 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004820260782347041, + "loss": 3.7524, + "theoretical_loss": 4.516769215768396, + "tokens_seen": 150463488 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004820160481444333, + "loss": 3.681, + "theoretical_loss": 4.516512217744376, + "tokens_seen": 150529024 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004820060180541625, + "loss": 3.7235, + "theoretical_loss": 4.516255362899068, + "tokens_seen": 150594560 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048199598796389167, + "loss": 3.7437, + "theoretical_loss": 4.515998651090451, + "tokens_seen": 150660096 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9150028228759766, + "objective/train/theoretical_loss": 4.5157420821767, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.5157420821767, + "tokens_seen": 150725632 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048198595787362085, + "loss": 3.7785, + "theoretical_loss": 4.5157420821767, + "tokens_seen": 150725632 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004819759277833501, + "loss": 3.7221, + "theoretical_loss": 4.515485656016197, + "tokens_seen": 150791168 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048196589769307927, + "loss": 3.7634, + "theoretical_loss": 4.515229372467524, + "tokens_seen": 150856704 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048195586760280845, + "loss": 3.6404, + "theoretical_loss": 4.514973231389464, + "tokens_seen": 150922240 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048194583751253763, + "loss": 3.6345, + "theoretical_loss": 4.5147172326410026, + "tokens_seen": 150987776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048193580742226686, + "loss": 3.6265, + "theoretical_loss": 4.514461376081325, + "tokens_seen": 151053312 + }, + { + "epoch": 1.02, + "learning_rate": 0.000481925777331996, + "loss": 3.64, + "theoretical_loss": 4.514205661569817, + "tokens_seen": 151118848 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004819157472417252, + "loss": 3.6527, + "theoretical_loss": 4.513950088966064, + "tokens_seen": 151184384 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048190571715145435, + "loss": 3.6978, + "theoretical_loss": 4.513694658129852, + "tokens_seen": 151249920 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004818956870611836, + "loss": 3.674, + "theoretical_loss": 4.513439368921164, + "tokens_seen": 151315456 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048188565697091277, + "loss": 3.7029, + "theoretical_loss": 4.513184221200184, + "tokens_seen": 151380992 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048187562688064195, + "loss": 3.7216, + "theoretical_loss": 4.512929214827295, + "tokens_seen": 151446528 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048186559679037113, + "loss": 3.746, + "theoretical_loss": 4.512674349663076, + "tokens_seen": 151512064 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004818555667001003, + "loss": 3.6122, + "theoretical_loss": 4.512419625568306, + "tokens_seen": 151577600 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004818455366098295, + "loss": 3.78, + "theoretical_loss": 4.512165042403957, + "tokens_seen": 151643136 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048183550651955873, + "loss": 3.6996, + "theoretical_loss": 4.511910600031205, + "tokens_seen": 151708672 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048182547642928786, + "loss": 3.5587, + "theoretical_loss": 4.511656298311416, + "tokens_seen": 151774208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004818154463390171, + "loss": 3.6627, + "theoretical_loss": 4.511402137106156, + "tokens_seen": 151839744 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004818054162487462, + "loss": 3.6568, + "theoretical_loss": 4.511148116277186, + "tokens_seen": 151905280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048179538615847545, + "loss": 3.652, + "theoretical_loss": 4.510894235686463, + "tokens_seen": 151970816 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048178535606820463, + "loss": 3.7072, + "theoretical_loss": 4.510640495196137, + "tokens_seen": 152036352 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004817753259779338, + "loss": 3.746, + "theoretical_loss": 4.510386894668557, + "tokens_seen": 152101888 + }, + { + "epoch": 1.02, + "learning_rate": 0.000481765295887663, + "loss": 3.7231, + "theoretical_loss": 4.510133433966263, + "tokens_seen": 152167424 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048175526579739223, + "loss": 3.6866, + "theoretical_loss": 4.50988011295199, + "tokens_seen": 152232960 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048174523570712136, + "loss": 3.5797, + "theoretical_loss": 4.509626931488667, + "tokens_seen": 152298496 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.528425455093384, + "objective/train/theoretical_loss": 4.509373889439416, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.509373889439416, + "tokens_seen": 152364032 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004817352056168506, + "loss": 3.5808, + "theoretical_loss": 4.509373889439416, + "tokens_seen": 152364032 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004817251755265797, + "loss": 3.6927, + "theoretical_loss": 4.509120986667554, + "tokens_seen": 152429568 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048171514543630896, + "loss": 3.5532, + "theoretical_loss": 4.508868223036588, + "tokens_seen": 152495104 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048170511534603814, + "loss": 3.6516, + "theoretical_loss": 4.508615598410218, + "tokens_seen": 152560640 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004816950852557673, + "loss": 3.8181, + "theoretical_loss": 4.508363112652338, + "tokens_seen": 152626176 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004816850551654965, + "loss": 3.6529, + "theoretical_loss": 4.5081107656270305, + "tokens_seen": 152691712 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004816750250752257, + "loss": 3.6503, + "theoretical_loss": 4.507858557198572, + "tokens_seen": 152757248 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048166499498495486, + "loss": 3.7746, + "theoretical_loss": 4.507606487231429, + "tokens_seen": 152822784 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004816549648946841, + "loss": 3.6185, + "theoretical_loss": 4.5073545555902585, + "tokens_seen": 152888320 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004816449348044132, + "loss": 3.7135, + "theoretical_loss": 4.5071027621399065, + "tokens_seen": 152953856 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048163490471414246, + "loss": 3.7363, + "theoretical_loss": 4.506851106745412, + "tokens_seen": 153019392 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004816248746238716, + "loss": 3.5944, + "theoretical_loss": 4.506599589271999, + "tokens_seen": 153084928 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004816148445336008, + "loss": 3.6403, + "theoretical_loss": 4.506348209585084, + "tokens_seen": 153150464 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048160481444333, + "loss": 3.6333, + "theoretical_loss": 4.506096967550274, + "tokens_seen": 153216000 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004815947843530592, + "loss": 3.691, + "theoretical_loss": 4.505845863033361, + "tokens_seen": 153281536 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048158475426278837, + "loss": 3.7128, + "theoretical_loss": 4.505594895900325, + "tokens_seen": 153347072 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004815747241725176, + "loss": 3.7034, + "theoretical_loss": 4.5053440660173365, + "tokens_seen": 153412608 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048156469408224673, + "loss": 3.628, + "theoretical_loss": 4.50509337325075, + "tokens_seen": 153478144 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048155466399197596, + "loss": 3.6603, + "theoretical_loss": 4.504842817467113, + "tokens_seen": 153543680 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004815446339017051, + "loss": 3.6497, + "theoretical_loss": 4.504592398533152, + "tokens_seen": 153609216 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004815346038114343, + "loss": 3.6738, + "theoretical_loss": 4.504342116315787, + "tokens_seen": 153674752 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004815245737211635, + "loss": 3.6473, + "theoretical_loss": 4.504091970682119, + "tokens_seen": 153740288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004815145436308927, + "loss": 3.6884, + "theoretical_loss": 4.503841961499439, + "tokens_seen": 153805824 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048150451354062187, + "loss": 3.6602, + "theoretical_loss": 4.503592088635219, + "tokens_seen": 153871360 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048149448345035105, + "loss": 3.6205, + "theoretical_loss": 4.503342351957119, + "tokens_seen": 153936896 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9903054237365723, + "objective/train/theoretical_loss": 4.503092751332984, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.503092751332984, + "tokens_seen": 154002432 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048148445336008023, + "loss": 3.7092, + "theoretical_loss": 4.503092751332984, + "tokens_seen": 154002432 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048147442326980947, + "loss": 3.7146, + "theoretical_loss": 4.502843286630842, + "tokens_seen": 154067968 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004814643931795386, + "loss": 3.6569, + "theoretical_loss": 4.502593957718904, + "tokens_seen": 154133504 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048145436308926783, + "loss": 3.7043, + "theoretical_loss": 4.502344764465569, + "tokens_seen": 154199040 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048144433299899696, + "loss": 3.6249, + "theoretical_loss": 4.502095706739416, + "tokens_seen": 154264576 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004814343029087262, + "loss": 3.6976, + "theoretical_loss": 4.5018467844092065, + "tokens_seen": 154330112 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048142427281845537, + "loss": 3.538, + "theoretical_loss": 4.501597997343886, + "tokens_seen": 154395648 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048141424272818455, + "loss": 3.7439, + "theoretical_loss": 4.501349345412585, + "tokens_seen": 154461184 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048140421263791373, + "loss": 3.6392, + "theoretical_loss": 4.501100828484611, + "tokens_seen": 154526720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048139418254764297, + "loss": 3.6978, + "theoretical_loss": 4.500852446429457, + "tokens_seen": 154592256 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004813841524573721, + "loss": 3.7374, + "theoretical_loss": 4.500604199116795, + "tokens_seen": 154657792 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048137412236710133, + "loss": 3.6013, + "theoretical_loss": 4.500356086416481, + "tokens_seen": 154723328 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048136409227683046, + "loss": 3.7863, + "theoretical_loss": 4.500108108198549, + "tokens_seen": 154788864 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004813540621865597, + "loss": 3.5435, + "theoretical_loss": 4.499860264333215, + "tokens_seen": 154854400 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004813440320962889, + "loss": 3.7738, + "theoretical_loss": 4.499612554690874, + "tokens_seen": 154919936 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048133400200601806, + "loss": 3.6152, + "theoretical_loss": 4.4993649791421015, + "tokens_seen": 154985472 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048132397191574724, + "loss": 3.6656, + "theoretical_loss": 4.4991175375576535, + "tokens_seen": 155051008 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004813139418254764, + "loss": 3.6998, + "theoretical_loss": 4.498870229808462, + "tokens_seen": 155116544 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004813039117352056, + "loss": 3.6164, + "theoretical_loss": 4.498623055765641, + "tokens_seen": 155182080 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048129388164493483, + "loss": 3.647, + "theoretical_loss": 4.498376015300483, + "tokens_seen": 155247616 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048128385155466396, + "loss": 3.6911, + "theoretical_loss": 4.498129108284456, + "tokens_seen": 155313152 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004812738214643932, + "loss": 3.6691, + "theoretical_loss": 4.497882334589206, + "tokens_seen": 155378688 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004812637913741223, + "loss": 3.6302, + "theoretical_loss": 4.49763569408656, + "tokens_seen": 155444224 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048125376128385156, + "loss": 3.6573, + "theoretical_loss": 4.497389186648519, + "tokens_seen": 155509760 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048124373119358074, + "loss": 3.6349, + "theoretical_loss": 4.4971428121472625, + "tokens_seen": 155575296 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6590099334716797, + "objective/train/theoretical_loss": 4.496896570455146, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.496896570455146, + "tokens_seen": 155640832 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004812337011033099, + "loss": 3.7404, + "theoretical_loss": 4.496896570455146, + "tokens_seen": 155640832 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048122367101303916, + "loss": 3.7132, + "theoretical_loss": 4.4966504614447, + "tokens_seen": 155706368 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048121364092276834, + "loss": 3.5395, + "theoretical_loss": 4.496404484988633, + "tokens_seen": 155771904 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004812036108324975, + "loss": 3.6316, + "theoretical_loss": 4.496158640959829, + "tokens_seen": 155837440 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004811935807422267, + "loss": 3.6939, + "theoretical_loss": 4.495912929231345, + "tokens_seen": 155902976 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004811835506519559, + "loss": 3.5446, + "theoretical_loss": 4.495667349676415, + "tokens_seen": 155968512 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048117352056168506, + "loss": 3.626, + "theoretical_loss": 4.495421902168448, + "tokens_seen": 156034048 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004811634904714143, + "loss": 3.7031, + "theoretical_loss": 4.495176586581025, + "tokens_seen": 156099584 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004811534603811434, + "loss": 3.6082, + "theoretical_loss": 4.494931402787904, + "tokens_seen": 156165120 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048114343029087266, + "loss": 3.6272, + "theoretical_loss": 4.494686350663015, + "tokens_seen": 156230656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004811334002006018, + "loss": 3.7121, + "theoretical_loss": 4.494441430080462, + "tokens_seen": 156296192 + }, + { + "epoch": 1.02, + "learning_rate": 0.000481123370110331, + "loss": 3.5994, + "theoretical_loss": 4.49419664091452, + "tokens_seen": 156361728 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004811133400200602, + "loss": 3.6507, + "theoretical_loss": 4.493951983039641, + "tokens_seen": 156427264 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004811033099297894, + "loss": 3.7754, + "theoretical_loss": 4.493707456330445, + "tokens_seen": 156492800 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048109327983951857, + "loss": 3.6652, + "theoretical_loss": 4.493463060661728, + "tokens_seen": 156558336 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004810832497492478, + "loss": 3.7158, + "theoretical_loss": 4.493218795908454, + "tokens_seen": 156623872 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048107321965897693, + "loss": 3.6131, + "theoretical_loss": 4.492974661945763, + "tokens_seen": 156689408 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048106318956870616, + "loss": 3.6603, + "theoretical_loss": 4.492730658648963, + "tokens_seen": 156754944 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004810531594784353, + "loss": 3.6872, + "theoretical_loss": 4.492486785893534, + "tokens_seen": 156820480 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004810431293881645, + "loss": 3.6959, + "theoretical_loss": 4.492243043555126, + "tokens_seen": 156886016 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004810330992978937, + "loss": 3.6186, + "theoretical_loss": 4.4919994315095595, + "tokens_seen": 156951552 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004810230692076229, + "loss": 3.7711, + "theoretical_loss": 4.491755949632826, + "tokens_seen": 157017088 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048101303911735207, + "loss": 3.5053, + "theoretical_loss": 4.491512597801087, + "tokens_seen": 157082624 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048100300902708125, + "loss": 3.665, + "theoretical_loss": 4.4912693758906705, + "tokens_seen": 157148160 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048099297893681043, + "loss": 3.7549, + "theoretical_loss": 4.491026283778077, + "tokens_seen": 157213696 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8013460636138916, + "objective/train/theoretical_loss": 4.490783321339975, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.490783321339975, + "tokens_seen": 157279232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048098294884653967, + "loss": 3.708, + "theoretical_loss": 4.490783321339975, + "tokens_seen": 157279232 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004809729187562688, + "loss": 3.6307, + "theoretical_loss": 4.490540488453199, + "tokens_seen": 157344768 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048096288866599803, + "loss": 3.5993, + "theoretical_loss": 4.490297784994755, + "tokens_seen": 157410304 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048095285857572716, + "loss": 3.7192, + "theoretical_loss": 4.490055210841815, + "tokens_seen": 157475840 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004809428284854564, + "loss": 3.5845, + "theoretical_loss": 4.48981276587172, + "tokens_seen": 157541376 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048093279839518557, + "loss": 3.6995, + "theoretical_loss": 4.4895704499619775, + "tokens_seen": 157606912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048092276830491475, + "loss": 3.6753, + "theoretical_loss": 4.489328262990261, + "tokens_seen": 157672448 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048091273821464393, + "loss": 3.7296, + "theoretical_loss": 4.489086204834413, + "tokens_seen": 157737984 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048090270812437317, + "loss": 3.6132, + "theoretical_loss": 4.48884427537244, + "tokens_seen": 157803520 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004808926780341023, + "loss": 3.612, + "theoretical_loss": 4.488602474482516, + "tokens_seen": 157869056 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048088264794383153, + "loss": 3.7045, + "theoretical_loss": 4.4883608020429815, + "tokens_seen": 157934592 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048087261785356066, + "loss": 3.6699, + "theoretical_loss": 4.488119257932342, + "tokens_seen": 158000128 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004808625877632899, + "loss": 3.6723, + "theoretical_loss": 4.487877842029267, + "tokens_seen": 158065664 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004808525576730191, + "loss": 3.6252, + "theoretical_loss": 4.487636554212592, + "tokens_seen": 158131200 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048084252758274826, + "loss": 3.7377, + "theoretical_loss": 4.487395394361318, + "tokens_seen": 158196736 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048083249749247744, + "loss": 3.7388, + "theoretical_loss": 4.487154362354608, + "tokens_seen": 158262272 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004808224674022066, + "loss": 3.6241, + "theoretical_loss": 4.4869134580717915, + "tokens_seen": 158327808 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004808124373119358, + "loss": 3.7877, + "theoretical_loss": 4.486672681392361, + "tokens_seen": 158393344 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048080240722166503, + "loss": 3.6914, + "theoretical_loss": 4.486432032195971, + "tokens_seen": 158458880 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048079237713139416, + "loss": 3.7101, + "theoretical_loss": 4.486191510362443, + "tokens_seen": 158524416 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004807823470411234, + "loss": 3.6839, + "theoretical_loss": 4.485951115771756, + "tokens_seen": 158589952 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004807723169508525, + "loss": 3.6754, + "theoretical_loss": 4.485710848304057, + "tokens_seen": 158655488 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048076228686058176, + "loss": 3.6772, + "theoretical_loss": 4.485470707839651, + "tokens_seen": 158721024 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048075225677031094, + "loss": 3.7069, + "theoretical_loss": 4.485230694259009, + "tokens_seen": 158786560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004807422266800401, + "loss": 3.5777, + "theoretical_loss": 4.48499080744276, + "tokens_seen": 158852096 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5406033992767334, + "objective/train/theoretical_loss": 4.484751047271697, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.484751047271697, + "tokens_seen": 158917632 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004807321965897693, + "loss": 3.5702, + "theoretical_loss": 4.484751047271697, + "tokens_seen": 158917632 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048072216649949854, + "loss": 3.6636, + "theoretical_loss": 4.484511413626774, + "tokens_seen": 158983168 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048071213640922766, + "loss": 3.6384, + "theoretical_loss": 4.484271906389105, + "tokens_seen": 159048704 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004807021063189569, + "loss": 3.7286, + "theoretical_loss": 4.4840325254399644, + "tokens_seen": 159114240 + }, + { + "epoch": 1.02, + "learning_rate": 0.000480692076228686, + "loss": 3.7293, + "theoretical_loss": 4.483793270660788, + "tokens_seen": 159179776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048068204613841526, + "loss": 3.6596, + "theoretical_loss": 4.483554141933171, + "tokens_seen": 159245312 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048067201604814444, + "loss": 3.7506, + "theoretical_loss": 4.483315139138869, + "tokens_seen": 159310848 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004806619859578736, + "loss": 3.5622, + "theoretical_loss": 4.483076262159797, + "tokens_seen": 159376384 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004806519558676028, + "loss": 3.5633, + "theoretical_loss": 4.482837510878028, + "tokens_seen": 159441920 + }, + { + "epoch": 1.02, + "learning_rate": 0.000480641925777332, + "loss": 3.5939, + "theoretical_loss": 4.482598885175795, + "tokens_seen": 159507456 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048063189568706117, + "loss": 3.6573, + "theoretical_loss": 4.482360384935491, + "tokens_seen": 159572992 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004806218655967904, + "loss": 3.6659, + "theoretical_loss": 4.482122010039664, + "tokens_seen": 159638528 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048061183550651953, + "loss": 3.7111, + "theoretical_loss": 4.481883760371024, + "tokens_seen": 159704064 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048060180541624877, + "loss": 3.6421, + "theoretical_loss": 4.481645635812435, + "tokens_seen": 159769600 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048059177532597795, + "loss": 3.6759, + "theoretical_loss": 4.481407636246922, + "tokens_seen": 159835136 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048058174523570713, + "loss": 3.6274, + "theoretical_loss": 4.481169761557664, + "tokens_seen": 159900672 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004805717151454363, + "loss": 3.4985, + "theoretical_loss": 4.480932011628001, + "tokens_seen": 159966208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004805616850551655, + "loss": 3.6544, + "theoretical_loss": 4.480694386341424, + "tokens_seen": 160031744 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048055165496489467, + "loss": 3.6408, + "theoretical_loss": 4.480456885581589, + "tokens_seen": 160097280 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004805416248746239, + "loss": 3.592, + "theoretical_loss": 4.480219509232298, + "tokens_seen": 160162816 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048053159478435303, + "loss": 3.6107, + "theoretical_loss": 4.479982257177516, + "tokens_seen": 160228352 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048052156469408227, + "loss": 3.7043, + "theoretical_loss": 4.479745129301364, + "tokens_seen": 160293888 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004805115346038114, + "loss": 3.7045, + "theoretical_loss": 4.4795081254881115, + "tokens_seen": 160359424 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048050150451354063, + "loss": 3.7341, + "theoretical_loss": 4.47927124562219, + "tokens_seen": 160424960 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004804914744232698, + "loss": 3.6445, + "theoretical_loss": 4.479034489588184, + "tokens_seen": 160490496 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 378176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.355400800704956, + "objective/train/theoretical_loss": 4.4787978572708305, + "objective/train/tokens_used": 161970656, + "theoretical_loss": 4.4787978572708305, + "tokens_seen": 160556032 + }, + { + "epoch": 1.02, + "learning_rate": 0.000480481444332999, + "loss": 3.6239, + "theoretical_loss": 4.4787978572708305, + "tokens_seen": 160556032 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048047141424272823, + "loss": 3.7061, + "theoretical_loss": 4.478561348555022, + "tokens_seen": 160621568 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048046138415245736, + "loss": 3.6489, + "theoretical_loss": 4.478324963325807, + "tokens_seen": 160687104 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004804513540621866, + "loss": 3.4935, + "theoretical_loss": 4.478088701468385, + "tokens_seen": 160752640 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048044132397191577, + "loss": 3.6658, + "theoretical_loss": 4.47785256286811, + "tokens_seen": 160818176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048043129388164495, + "loss": 3.6588, + "theoretical_loss": 4.477616547410488, + "tokens_seen": 160883712 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048042126379137413, + "loss": 3.698, + "theoretical_loss": 4.4773806549811805, + "tokens_seen": 160949248 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048041123370110337, + "loss": 3.7416, + "theoretical_loss": 4.477144885465998, + "tokens_seen": 161014784 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004804012036108325, + "loss": 3.6872, + "theoretical_loss": 4.47690923875091, + "tokens_seen": 161080320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048039117352056173, + "loss": 3.7372, + "theoretical_loss": 4.476673714722029, + "tokens_seen": 161145856 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048038114343029086, + "loss": 3.4777, + "theoretical_loss": 4.476438313265627, + "tokens_seen": 161211392 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004803711133400201, + "loss": 3.7179, + "theoretical_loss": 4.476203034268124, + "tokens_seen": 161276928 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004803610832497493, + "loss": 3.7069, + "theoretical_loss": 4.475967877616092, + "tokens_seen": 161342464 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048035105315947846, + "loss": 3.6934, + "theoretical_loss": 4.475732843196253, + "tokens_seen": 161408000 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048034102306920764, + "loss": 3.4894, + "theoretical_loss": 4.475497930895483, + "tokens_seen": 161473536 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004803309929789368, + "loss": 3.5751, + "theoretical_loss": 4.475263140600806, + "tokens_seen": 161539072 + }, + { + "epoch": 1.02, + "learning_rate": 0.000480320962888666, + "loss": 3.6215, + "theoretical_loss": 4.475028472199396, + "tokens_seen": 161604608 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048031093279839523, + "loss": 3.5943, + "theoretical_loss": 4.474793925578577, + "tokens_seen": 161670144 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048030090270812436, + "loss": 3.6031, + "theoretical_loss": 4.474559500625827, + "tokens_seen": 161735680 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004802908726178536, + "loss": 3.6, + "theoretical_loss": 4.474325197228768, + "tokens_seen": 161801216 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004802808425275827, + "loss": 3.5645, + "theoretical_loss": 4.474091015275173, + "tokens_seen": 161866752 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048027081243731196, + "loss": 3.5662, + "theoretical_loss": 4.473856954652965, + "tokens_seen": 161932288 + }, + { + "epoch": 1.02, + "learning_rate": 0.00048026078234704114, + "loss": 3.5927, + "theoretical_loss": 4.473648596471702, + "tokens_seen": 161990656 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004802507522567703, + "loss": 3.6077, + "theoretical_loss": 4.473414764935917, + "tokens_seen": 162056192 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004802407221664995, + "loss": 3.7407, + "theoretical_loss": 4.473181054408381, + "tokens_seen": 162121728 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 426851, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.77402663230896, + "objective/train/theoretical_loss": 4.472947464777593, + "objective/train/tokens_used": 182647264, + "theoretical_loss": 4.472947464777593, + "tokens_seen": 162187264 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048023069207622874, + "loss": 3.6008, + "theoretical_loss": 4.472947464777593, + "tokens_seen": 162187264 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048022066198595786, + "loss": 3.5477, + "theoretical_loss": 4.4727139959321995, + "tokens_seen": 162252800 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004802106318956871, + "loss": 3.5316, + "theoretical_loss": 4.472480647760996, + "tokens_seen": 162318336 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048020060180541623, + "loss": 3.6883, + "theoretical_loss": 4.472247420152925, + "tokens_seen": 162383872 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048019057171514546, + "loss": 3.5626, + "theoretical_loss": 4.472014312997077, + "tokens_seen": 162449408 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048018054162487464, + "loss": 3.7414, + "theoretical_loss": 4.471781326182684, + "tokens_seen": 162514944 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004801705115346038, + "loss": 3.6046, + "theoretical_loss": 4.471548459599131, + "tokens_seen": 162580480 + }, + { + "epoch": 2.0, + "learning_rate": 0.000480160481444333, + "loss": 3.5305, + "theoretical_loss": 4.4713157131359464, + "tokens_seen": 162646016 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004801504513540622, + "loss": 3.6908, + "theoretical_loss": 4.471083086682803, + "tokens_seen": 162711552 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048014042126379137, + "loss": 3.5507, + "theoretical_loss": 4.4708505801295235, + "tokens_seen": 162777088 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004801303911735206, + "loss": 3.5437, + "theoretical_loss": 4.4706181933660725, + "tokens_seen": 162842624 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048012036108324973, + "loss": 3.6483, + "theoretical_loss": 4.470385926282559, + "tokens_seen": 162908160 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048011033099297897, + "loss": 3.5597, + "theoretical_loss": 4.470153778769243, + "tokens_seen": 162973696 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048010030090270815, + "loss": 3.6892, + "theoretical_loss": 4.469921750716519, + "tokens_seen": 163039232 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048009027081243733, + "loss": 3.5257, + "theoretical_loss": 4.4696898420149385, + "tokens_seen": 163104768 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004800802407221665, + "loss": 3.6206, + "theoretical_loss": 4.4694580525551855, + "tokens_seen": 163170304 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004800702106318957, + "loss": 3.7363, + "theoretical_loss": 4.469226382228097, + "tokens_seen": 163235840 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048006018054162487, + "loss": 3.6159, + "theoretical_loss": 4.468994830924647, + "tokens_seen": 163301376 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004800501504513541, + "loss": 3.6262, + "theoretical_loss": 4.468763398535957, + "tokens_seen": 163366912 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048004012036108323, + "loss": 3.5062, + "theoretical_loss": 4.468532084953291, + "tokens_seen": 163432448 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048003009027081247, + "loss": 3.6885, + "theoretical_loss": 4.468300890068056, + "tokens_seen": 163497984 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004800200601805416, + "loss": 3.6097, + "theoretical_loss": 4.4680698137718, + "tokens_seen": 163563520 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048001003009027083, + "loss": 3.7044, + "theoretical_loss": 4.467838855956215, + "tokens_seen": 163629056 + }, + { + "epoch": 2.0, + "learning_rate": 0.00048, + "loss": 3.6113, + "theoretical_loss": 4.467608016513135, + "tokens_seen": 163694592 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004799899699097292, + "loss": 3.6085, + "theoretical_loss": 4.467377295334537, + "tokens_seen": 163760128 + }, + { + "debugging/Self-BLEU-5": 0.7156837570907468, + "debugging/distinct-1-grams": 0.7373837852215696, + "debugging/distinct-2-grams": 0.9386051065222842, + "debugging/entropy-1-grams": 6.441300866766138, + "debugging/entropy-2-grams": 7.774271728448435, + "debugging/length": 594.1212121212121, + "debugging/num_segments": 33, + "epoch": 2.0, + "objective/train/docs_used": 430506, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.498753547668457, + "objective/train/theoretical_loss": 4.467146692312539, + "objective/train/tokens_used": 184285664, + "theoretical_loss": 4.467146692312539, + "tokens_seen": 163825664 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004799799398194584, + "loss": 3.6607, + "theoretical_loss": 4.467146692312539, + "tokens_seen": 163825664 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047996990972918756, + "loss": 3.6664, + "theoretical_loss": 4.466916207339398, + "tokens_seen": 163891200 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047995987963891674, + "loss": 3.6741, + "theoretical_loss": 4.466685840307518, + "tokens_seen": 163956736 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047994984954864597, + "loss": 3.5604, + "theoretical_loss": 4.466455591109439, + "tokens_seen": 164022272 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004799398194583751, + "loss": 3.5872, + "theoretical_loss": 4.466225459637842, + "tokens_seen": 164087808 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047992978936810433, + "loss": 3.5496, + "theoretical_loss": 4.465995445785552, + "tokens_seen": 164153344 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004799197592778335, + "loss": 3.5098, + "theoretical_loss": 4.46576554944553, + "tokens_seen": 164218880 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004799097291875627, + "loss": 3.6409, + "theoretical_loss": 4.465535770510883, + "tokens_seen": 164284416 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004798996990972919, + "loss": 3.5091, + "theoretical_loss": 4.465306108874851, + "tokens_seen": 164349952 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047988966900702106, + "loss": 3.5647, + "theoretical_loss": 4.465076564430818, + "tokens_seen": 164415488 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047987963891675024, + "loss": 3.6075, + "theoretical_loss": 4.464847137072307, + "tokens_seen": 164481024 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004798696088264795, + "loss": 3.5129, + "theoretical_loss": 4.464617826692978, + "tokens_seen": 164546560 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004798595787362086, + "loss": 3.6512, + "theoretical_loss": 4.464388633186632, + "tokens_seen": 164612096 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047984954864593784, + "loss": 3.5029, + "theoretical_loss": 4.464159556447206, + "tokens_seen": 164677632 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047983951855566696, + "loss": 3.6575, + "theoretical_loss": 4.46393059636878, + "tokens_seen": 164743168 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004798294884653962, + "loss": 3.5135, + "theoretical_loss": 4.463701752845567, + "tokens_seen": 164808704 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004798194583751254, + "loss": 3.6073, + "theoretical_loss": 4.463473025771921, + "tokens_seen": 164874240 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047980942828485456, + "loss": 3.4507, + "theoretical_loss": 4.463244415042334, + "tokens_seen": 164939776 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047979939819458374, + "loss": 3.6124, + "theoretical_loss": 4.463015920551433, + "tokens_seen": 165005312 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004797893681043129, + "loss": 3.5483, + "theoretical_loss": 4.462787542193985, + "tokens_seen": 165070848 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004797793380140421, + "loss": 3.6642, + "theoretical_loss": 4.462559279864893, + "tokens_seen": 165136384 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047976930792377134, + "loss": 3.7073, + "theoretical_loss": 4.462331133459195, + "tokens_seen": 165201920 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047975927783350047, + "loss": 3.5634, + "theoretical_loss": 4.462103102872067, + "tokens_seen": 165267456 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004797492477432297, + "loss": 3.612, + "theoretical_loss": 4.4618751879988245, + "tokens_seen": 165332992 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004797392176529589, + "loss": 3.5723, + "theoretical_loss": 4.461647388734912, + "tokens_seen": 165398528 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 435439, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.663811445236206, + "objective/train/theoretical_loss": 4.461419704975915, + "objective/train/tokens_used": 185924064, + "theoretical_loss": 4.461419704975915, + "tokens_seen": 165464064 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047972918756268807, + "loss": 3.5897, + "theoretical_loss": 4.461419704975915, + "tokens_seen": 165464064 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004797191574724173, + "loss": 3.6495, + "theoretical_loss": 4.461192136617554, + "tokens_seen": 165529600 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047970912738214643, + "loss": 3.6417, + "theoretical_loss": 4.460964683555684, + "tokens_seen": 165595136 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047969909729187566, + "loss": 3.6949, + "theoretical_loss": 4.460737345686296, + "tokens_seen": 165660672 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047968906720160484, + "loss": 3.7034, + "theoretical_loss": 4.460510122905514, + "tokens_seen": 165726208 + }, + { + "epoch": 2.0, + "learning_rate": 0.000479679037111334, + "loss": 3.6535, + "theoretical_loss": 4.4602830151096, + "tokens_seen": 165791744 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004796690070210632, + "loss": 3.5573, + "theoretical_loss": 4.460056022194946, + "tokens_seen": 165857280 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004796589769307924, + "loss": 3.7105, + "theoretical_loss": 4.459829144058084, + "tokens_seen": 165922816 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047964894684052157, + "loss": 3.6686, + "theoretical_loss": 4.4596023805956735, + "tokens_seen": 165988352 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004796389167502508, + "loss": 3.6904, + "theoretical_loss": 4.459375731704513, + "tokens_seen": 166053888 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047962888665997993, + "loss": 3.6895, + "theoretical_loss": 4.459149197281532, + "tokens_seen": 166119424 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047961885656970917, + "loss": 3.5273, + "theoretical_loss": 4.458922777223793, + "tokens_seen": 166184960 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047960882647943835, + "loss": 3.6895, + "theoretical_loss": 4.458696471428495, + "tokens_seen": 166250496 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047959879638916753, + "loss": 3.6107, + "theoretical_loss": 4.458470279792966, + "tokens_seen": 166316032 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004795887662988967, + "loss": 3.675, + "theoretical_loss": 4.458244202214668, + "tokens_seen": 166381568 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004795787362086259, + "loss": 3.5779, + "theoretical_loss": 4.458018238591196, + "tokens_seen": 166447104 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047956870611835507, + "loss": 3.3576, + "theoretical_loss": 4.457792388820277, + "tokens_seen": 166512640 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004795586760280843, + "loss": 3.6385, + "theoretical_loss": 4.457566652799768, + "tokens_seen": 166578176 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047954864593781343, + "loss": 3.7339, + "theoretical_loss": 4.457341030427663, + "tokens_seen": 166643712 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047953861584754267, + "loss": 3.7391, + "theoretical_loss": 4.457115521602081, + "tokens_seen": 166709248 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004795285857572718, + "loss": 3.7443, + "theoretical_loss": 4.456890126221277, + "tokens_seen": 166774784 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047951855566700103, + "loss": 3.5718, + "theoretical_loss": 4.456664844183635, + "tokens_seen": 166840320 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004795085255767302, + "loss": 3.5754, + "theoretical_loss": 4.456439675387671, + "tokens_seen": 166905856 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004794984954864594, + "loss": 3.5943, + "theoretical_loss": 4.456214619732029, + "tokens_seen": 166971392 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004794884653961886, + "loss": 3.7053, + "theoretical_loss": 4.45598967711549, + "tokens_seen": 167036928 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 438623, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8556952476501465, + "objective/train/theoretical_loss": 4.455764847436956, + "objective/train/tokens_used": 187562464, + "theoretical_loss": 4.455764847436956, + "tokens_seen": 167102464 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047947843530591776, + "loss": 3.6919, + "theoretical_loss": 4.455764847436956, + "tokens_seen": 167102464 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047946840521564694, + "loss": 3.6003, + "theoretical_loss": 4.455540130595467, + "tokens_seen": 167168000 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047945837512537617, + "loss": 3.7931, + "theoretical_loss": 4.455315526490188, + "tokens_seen": 167233536 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004794483450351053, + "loss": 3.6974, + "theoretical_loss": 4.455091035020416, + "tokens_seen": 167299072 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047943831494483453, + "loss": 3.6041, + "theoretical_loss": 4.454866656085576, + "tokens_seen": 167364608 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004794282848545637, + "loss": 3.5106, + "theoretical_loss": 4.4546423895852225, + "tokens_seen": 167430144 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004794182547642929, + "loss": 3.5492, + "theoretical_loss": 4.454418235419041, + "tokens_seen": 167495680 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004794082246740221, + "loss": 3.6944, + "theoretical_loss": 4.454194193486841, + "tokens_seen": 167561216 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047939819458375126, + "loss": 3.7105, + "theoretical_loss": 4.453970263688565, + "tokens_seen": 167626752 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047938816449348044, + "loss": 3.609, + "theoretical_loss": 4.453746445924282, + "tokens_seen": 167692288 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004793781344032097, + "loss": 3.723, + "theoretical_loss": 4.453522740094188, + "tokens_seen": 167757824 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004793681043129388, + "loss": 3.6943, + "theoretical_loss": 4.45329914609861, + "tokens_seen": 167823360 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047935807422266804, + "loss": 3.5447, + "theoretical_loss": 4.453075663838, + "tokens_seen": 167888896 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047934804413239716, + "loss": 3.5804, + "theoretical_loss": 4.452852293212937, + "tokens_seen": 167954432 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004793380140421264, + "loss": 3.5918, + "theoretical_loss": 4.452629034124128, + "tokens_seen": 168019968 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004793279839518556, + "loss": 3.698, + "theoretical_loss": 4.452405886472409, + "tokens_seen": 168085504 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047931795386158476, + "loss": 3.515, + "theoretical_loss": 4.45218285015874, + "tokens_seen": 168151040 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047930792377131394, + "loss": 3.6059, + "theoretical_loss": 4.451959925084211, + "tokens_seen": 168216576 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004792978936810431, + "loss": 3.5616, + "theoretical_loss": 4.451737111150031, + "tokens_seen": 168282112 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004792878635907723, + "loss": 3.6087, + "theoretical_loss": 4.451514408257545, + "tokens_seen": 168347648 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047927783350050154, + "loss": 3.6422, + "theoretical_loss": 4.451291816308217, + "tokens_seen": 168413184 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047926780341023067, + "loss": 3.5509, + "theoretical_loss": 4.451069335203638, + "tokens_seen": 168478720 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004792577733199599, + "loss": 3.6852, + "theoretical_loss": 4.450846964845526, + "tokens_seen": 168544256 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004792477432296891, + "loss": 3.5128, + "theoretical_loss": 4.450624705135726, + "tokens_seen": 168609792 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047923771313941827, + "loss": 3.712, + "theoretical_loss": 4.450402555976203, + "tokens_seen": 168675328 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 443283, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.689483880996704, + "objective/train/theoretical_loss": 4.450180517269051, + "objective/train/tokens_used": 189200864, + "theoretical_loss": 4.450180517269051, + "tokens_seen": 168740864 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047922768304914745, + "loss": 3.5774, + "theoretical_loss": 4.450180517269051, + "tokens_seen": 168740864 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047921765295887663, + "loss": 3.5022, + "theoretical_loss": 4.449958588916487, + "tokens_seen": 168806400 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004792076228686058, + "loss": 3.505, + "theoretical_loss": 4.449736770820852, + "tokens_seen": 168871936 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047919759277833504, + "loss": 3.4933, + "theoretical_loss": 4.449515062884615, + "tokens_seen": 168937472 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047918756268806417, + "loss": 3.3951, + "theoretical_loss": 4.449293465010362, + "tokens_seen": 169003008 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004791775325977934, + "loss": 3.5627, + "theoretical_loss": 4.449071977100811, + "tokens_seen": 169068544 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047916750250752253, + "loss": 3.5957, + "theoretical_loss": 4.448850599058797, + "tokens_seen": 169134080 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047915747241725177, + "loss": 3.5257, + "theoretical_loss": 4.448629330787282, + "tokens_seen": 169199616 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047914744232698095, + "loss": 3.4879, + "theoretical_loss": 4.448408172189353, + "tokens_seen": 169265152 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047913741223671013, + "loss": 3.6382, + "theoretical_loss": 4.448187123168214, + "tokens_seen": 169330688 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004791273821464393, + "loss": 3.3982, + "theoretical_loss": 4.447966183627196, + "tokens_seen": 169396224 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047911735205616855, + "loss": 3.5768, + "theoretical_loss": 4.447745353469753, + "tokens_seen": 169461760 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004791073219658977, + "loss": 3.5387, + "theoretical_loss": 4.44752463259946, + "tokens_seen": 169527296 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004790972918756269, + "loss": 3.5598, + "theoretical_loss": 4.447304020920015, + "tokens_seen": 169592832 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047908726178535604, + "loss": 3.6398, + "theoretical_loss": 4.447083518335238, + "tokens_seen": 169658368 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047907723169508527, + "loss": 3.598, + "theoretical_loss": 4.446863124749069, + "tokens_seen": 169723904 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047906720160481445, + "loss": 3.5996, + "theoretical_loss": 4.446642840065573, + "tokens_seen": 169789440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047905717151454363, + "loss": 3.6348, + "theoretical_loss": 4.446422664188933, + "tokens_seen": 169854976 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004790471414242728, + "loss": 3.6233, + "theoretical_loss": 4.446202597023456, + "tokens_seen": 169920512 + }, + { + "epoch": 2.0, + "learning_rate": 0.000479037111334002, + "loss": 3.6429, + "theoretical_loss": 4.445982638473568, + "tokens_seen": 169986048 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004790270812437312, + "loss": 3.5427, + "theoretical_loss": 4.445762788443817, + "tokens_seen": 170051584 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004790170511534604, + "loss": 3.641, + "theoretical_loss": 4.445543046838872, + "tokens_seen": 170117120 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047900702106318954, + "loss": 3.6395, + "theoretical_loss": 4.4453234135635205, + "tokens_seen": 170182656 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004789969909729188, + "loss": 3.6181, + "theoretical_loss": 4.445103888522672, + "tokens_seen": 170248192 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004789869608826479, + "loss": 3.6038, + "theoretical_loss": 4.444884471621355, + "tokens_seen": 170313728 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 446384, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4876255989074707, + "objective/train/theoretical_loss": 4.4446651627647205, + "objective/train/tokens_used": 190839264, + "theoretical_loss": 4.4446651627647205, + "tokens_seen": 170379264 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047897693079237714, + "loss": 3.5053, + "theoretical_loss": 4.4446651627647205, + "tokens_seen": 170379264 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047896690070210637, + "loss": 3.5625, + "theoretical_loss": 4.444445961858034, + "tokens_seen": 170444800 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004789568706118355, + "loss": 3.6433, + "theoretical_loss": 4.444226868806686, + "tokens_seen": 170510336 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047894684052156473, + "loss": 3.42, + "theoretical_loss": 4.44400788351618, + "tokens_seen": 170575872 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004789368104312939, + "loss": 3.6576, + "theoretical_loss": 4.443789005892147, + "tokens_seen": 170641408 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004789267803410231, + "loss": 3.5027, + "theoretical_loss": 4.443570235840328, + "tokens_seen": 170706944 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004789167502507523, + "loss": 3.6012, + "theoretical_loss": 4.44335157326659, + "tokens_seen": 170772480 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047890672016048146, + "loss": 3.5763, + "theoretical_loss": 4.443133018076912, + "tokens_seen": 170838016 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047889669007021064, + "loss": 3.4666, + "theoretical_loss": 4.442914570177397, + "tokens_seen": 170903552 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004788866599799399, + "loss": 3.66, + "theoretical_loss": 4.442696229474263, + "tokens_seen": 170969088 + }, + { + "epoch": 2.0, + "learning_rate": 0.000478876629889669, + "loss": 3.6924, + "theoretical_loss": 4.442477995873846, + "tokens_seen": 171034624 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047886659979939824, + "loss": 3.3546, + "theoretical_loss": 4.442259869282598, + "tokens_seen": 171100160 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047885656970912736, + "loss": 3.6204, + "theoretical_loss": 4.442041849607095, + "tokens_seen": 171165696 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004788465396188566, + "loss": 3.7245, + "theoretical_loss": 4.441823936754023, + "tokens_seen": 171231232 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004788365095285858, + "loss": 3.6665, + "theoretical_loss": 4.441606130630188, + "tokens_seen": 171296768 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047882647943831496, + "loss": 3.6331, + "theoretical_loss": 4.441388431142514, + "tokens_seen": 171362304 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047881644934804414, + "loss": 3.6869, + "theoretical_loss": 4.4411708381980395, + "tokens_seen": 171427840 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004788064192577733, + "loss": 3.6115, + "theoretical_loss": 4.4409533517039215, + "tokens_seen": 171493376 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004787963891675025, + "loss": 3.6146, + "theoretical_loss": 4.440735971567433, + "tokens_seen": 171558912 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047878635907723174, + "loss": 3.6107, + "theoretical_loss": 4.440518697695962, + "tokens_seen": 171624448 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047877632898696087, + "loss": 3.5579, + "theoretical_loss": 4.440301529997013, + "tokens_seen": 171689984 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004787662988966901, + "loss": 3.6426, + "theoretical_loss": 4.440084468378207, + "tokens_seen": 171755520 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004787562688064193, + "loss": 3.5663, + "theoretical_loss": 4.43986751274728, + "tokens_seen": 171821056 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047874623871614847, + "loss": 3.6195, + "theoretical_loss": 4.439650663012083, + "tokens_seen": 171886592 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047873620862587765, + "loss": 3.708, + "theoretical_loss": 4.4394339190805825, + "tokens_seen": 171952128 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 450235, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.655320882797241, + "objective/train/theoretical_loss": 4.439217280860861, + "objective/train/tokens_used": 192477664, + "theoretical_loss": 4.439217280860861, + "tokens_seen": 172017664 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047872617853560683, + "loss": 3.7381, + "theoretical_loss": 4.439217280860861, + "tokens_seen": 172017664 + }, + { + "epoch": 2.0, + "learning_rate": 0.000478716148445336, + "loss": 3.5715, + "theoretical_loss": 4.439000748261114, + "tokens_seen": 172083200 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047870611835506524, + "loss": 3.6153, + "theoretical_loss": 4.438784321189653, + "tokens_seen": 172148736 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047869608826479437, + "loss": 3.5956, + "theoretical_loss": 4.438567999554904, + "tokens_seen": 172214272 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004786860581745236, + "loss": 3.5524, + "theoretical_loss": 4.438351783265407, + "tokens_seen": 172279808 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047867602808425273, + "loss": 3.614, + "theoretical_loss": 4.438135672229815, + "tokens_seen": 172345344 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047866599799398197, + "loss": 3.5213, + "theoretical_loss": 4.437919666356896, + "tokens_seen": 172410880 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047865596790371115, + "loss": 3.5783, + "theoretical_loss": 4.437703765555533, + "tokens_seen": 172476416 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047864593781344033, + "loss": 3.5306, + "theoretical_loss": 4.43748796973472, + "tokens_seen": 172541952 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004786359077231695, + "loss": 3.5212, + "theoretical_loss": 4.437272278803566, + "tokens_seen": 172607488 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047862587763289875, + "loss": 3.6123, + "theoretical_loss": 4.437056692671292, + "tokens_seen": 172673024 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004786158475426279, + "loss": 3.6877, + "theoretical_loss": 4.436841211247232, + "tokens_seen": 172738560 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004786058174523571, + "loss": 3.6359, + "theoretical_loss": 4.436625834440836, + "tokens_seen": 172804096 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047859578736208624, + "loss": 3.4673, + "theoretical_loss": 4.436410562161662, + "tokens_seen": 172869632 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047858575727181547, + "loss": 3.6679, + "theoretical_loss": 4.436195394319383, + "tokens_seen": 172935168 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047857572718154465, + "loss": 3.6206, + "theoretical_loss": 4.435980330823783, + "tokens_seen": 173000704 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047856569709127383, + "loss": 3.6728, + "theoretical_loss": 4.435765371584759, + "tokens_seen": 173066240 + }, + { + "epoch": 2.0, + "learning_rate": 0.000478555667001003, + "loss": 3.4296, + "theoretical_loss": 4.435550516512322, + "tokens_seen": 173131776 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004785456369107322, + "loss": 3.5975, + "theoretical_loss": 4.435335765516589, + "tokens_seen": 173197312 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004785356068204614, + "loss": 3.6644, + "theoretical_loss": 4.4351211185077934, + "tokens_seen": 173262848 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004785255767301906, + "loss": 3.6997, + "theoretical_loss": 4.434906575396278, + "tokens_seen": 173328384 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047851554663991974, + "loss": 3.5828, + "theoretical_loss": 4.434692136092499, + "tokens_seen": 173393920 + }, + { + "epoch": 2.0, + "learning_rate": 0.000478505516549649, + "loss": 3.7316, + "theoretical_loss": 4.434477800507019, + "tokens_seen": 173459456 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004784954864593781, + "loss": 3.6055, + "theoretical_loss": 4.4342635685505165, + "tokens_seen": 173524992 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047848545636910734, + "loss": 3.6023, + "theoretical_loss": 4.434049440133776, + "tokens_seen": 173590528 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 454835, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6979739665985107, + "objective/train/theoretical_loss": 4.433835415167698, + "objective/train/tokens_used": 194116064, + "theoretical_loss": 4.433835415167698, + "tokens_seen": 173656064 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004784754262788365, + "loss": 3.5398, + "theoretical_loss": 4.433835415167698, + "tokens_seen": 173656064 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004784653961885657, + "loss": 3.5655, + "theoretical_loss": 4.433621493563288, + "tokens_seen": 173721600 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004784553660982949, + "loss": 3.5522, + "theoretical_loss": 4.433407675231663, + "tokens_seen": 173787136 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004784453360080241, + "loss": 3.4722, + "theoretical_loss": 4.4331939600840515, + "tokens_seen": 173852672 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047843530591775324, + "loss": 3.6181, + "theoretical_loss": 4.432980348031791, + "tokens_seen": 173918208 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004784252758274825, + "loss": 3.6486, + "theoretical_loss": 4.432766838986328, + "tokens_seen": 173983744 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004784152457372116, + "loss": 3.6054, + "theoretical_loss": 4.432553432859219, + "tokens_seen": 174049280 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047840521564694084, + "loss": 3.4635, + "theoretical_loss": 4.432340129562128, + "tokens_seen": 174114816 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047839518555667, + "loss": 3.5047, + "theoretical_loss": 4.432126929006833, + "tokens_seen": 174180352 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004783851554663992, + "loss": 3.5354, + "theoretical_loss": 4.431913831105213, + "tokens_seen": 174245888 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004783751253761284, + "loss": 3.6739, + "theoretical_loss": 4.431700835769261, + "tokens_seen": 174311424 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047836509528585756, + "loss": 3.5662, + "theoretical_loss": 4.43148794291108, + "tokens_seen": 174376960 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047835506519558675, + "loss": 3.4641, + "theoretical_loss": 4.431275152442877, + "tokens_seen": 174442496 + }, + { + "epoch": 2.0, + "learning_rate": 0.000478345035105316, + "loss": 3.525, + "theoretical_loss": 4.431062464276969, + "tokens_seen": 174508032 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004783350050150451, + "loss": 3.5995, + "theoretical_loss": 4.43084987832578, + "tokens_seen": 174573568 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047832497492477434, + "loss": 3.7173, + "theoretical_loss": 4.430637394501845, + "tokens_seen": 174639104 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047831494483450347, + "loss": 3.6189, + "theoretical_loss": 4.430425012717803, + "tokens_seen": 174704640 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004783049147442327, + "loss": 3.6207, + "theoretical_loss": 4.430212732886403, + "tokens_seen": 174770176 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004782948846539619, + "loss": 3.587, + "theoretical_loss": 4.430000554920499, + "tokens_seen": 174835712 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047828485456369107, + "loss": 3.6188, + "theoretical_loss": 4.4297884787330535, + "tokens_seen": 174901248 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047827482447342025, + "loss": 3.5284, + "theoretical_loss": 4.429576504237135, + "tokens_seen": 174966784 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004782647943831495, + "loss": 3.6435, + "theoretical_loss": 4.429364631345921, + "tokens_seen": 175032320 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004782547642928786, + "loss": 3.5804, + "theoretical_loss": 4.429152859972692, + "tokens_seen": 175097856 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047824473420260785, + "loss": 3.5734, + "theoretical_loss": 4.42894119003084, + "tokens_seen": 175163392 + }, + { + "epoch": 2.0, + "learning_rate": 0.000478234704112337, + "loss": 3.6917, + "theoretical_loss": 4.428729621433858, + "tokens_seen": 175228928 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 457957, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5127925872802734, + "objective/train/theoretical_loss": 4.428518154095348, + "objective/train/tokens_used": 195754464, + "theoretical_loss": 4.428518154095348, + "tokens_seen": 175294464 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004782246740220662, + "loss": 3.6008, + "theoretical_loss": 4.428518154095348, + "tokens_seen": 175294464 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047821464393179544, + "loss": 3.6774, + "theoretical_loss": 4.428306787929017, + "tokens_seen": 175360000 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047820461384152457, + "loss": 3.5193, + "theoretical_loss": 4.428095522848679, + "tokens_seen": 175425536 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004781945837512538, + "loss": 3.4399, + "theoretical_loss": 4.427884358768251, + "tokens_seen": 175491072 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047818455366098293, + "loss": 3.5005, + "theoretical_loss": 4.427673295601759, + "tokens_seen": 175556608 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047817452357071217, + "loss": 3.5741, + "theoretical_loss": 4.427462333263332, + "tokens_seen": 175622144 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047816449348044135, + "loss": 3.6113, + "theoretical_loss": 4.427251471667202, + "tokens_seen": 175687680 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047815446339017053, + "loss": 3.5429, + "theoretical_loss": 4.427040710727711, + "tokens_seen": 175753216 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004781444332998997, + "loss": 3.6412, + "theoretical_loss": 4.426830050359302, + "tokens_seen": 175818752 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047813440320962895, + "loss": 3.6116, + "theoretical_loss": 4.426619490476524, + "tokens_seen": 175884288 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004781243731193581, + "loss": 3.5965, + "theoretical_loss": 4.426409030994029, + "tokens_seen": 175949824 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004781143430290873, + "loss": 3.5645, + "theoretical_loss": 4.426198671826576, + "tokens_seen": 176015360 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047810431293881644, + "loss": 3.5557, + "theoretical_loss": 4.425988412889025, + "tokens_seen": 176080896 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047809428284854567, + "loss": 3.6326, + "theoretical_loss": 4.425778254096341, + "tokens_seen": 176146432 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047808425275827485, + "loss": 3.6524, + "theoretical_loss": 4.425568195363594, + "tokens_seen": 176211968 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047807422266800403, + "loss": 3.6334, + "theoretical_loss": 4.425358236605955, + "tokens_seen": 176277504 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004780641925777332, + "loss": 3.5218, + "theoretical_loss": 4.425148377738703, + "tokens_seen": 176343040 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004780541624874624, + "loss": 3.6006, + "theoretical_loss": 4.4249386186772135, + "tokens_seen": 176408576 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004780441323971916, + "loss": 3.523, + "theoretical_loss": 4.4247289593369725, + "tokens_seen": 176474112 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004780341023069208, + "loss": 3.5788, + "theoretical_loss": 4.424519399633562, + "tokens_seen": 176539648 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047802407221664994, + "loss": 3.632, + "theoretical_loss": 4.424309939482672, + "tokens_seen": 176605184 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004780140421263792, + "loss": 3.6347, + "theoretical_loss": 4.424100578800093, + "tokens_seen": 176670720 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004780040120361083, + "loss": 3.5626, + "theoretical_loss": 4.423891317501718, + "tokens_seen": 176736256 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047799398194583754, + "loss": 3.4092, + "theoretical_loss": 4.423682155503541, + "tokens_seen": 176801792 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004779839518555667, + "loss": 3.6029, + "theoretical_loss": 4.423473092721661, + "tokens_seen": 176867328 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 460775, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.163076639175415, + "objective/train/theoretical_loss": 4.423264129072278, + "objective/train/tokens_used": 197392864, + "theoretical_loss": 4.423264129072278, + "tokens_seen": 176932864 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004779739217652959, + "loss": 3.37, + "theoretical_loss": 4.423264129072278, + "tokens_seen": 176932864 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004779638916750251, + "loss": 3.5481, + "theoretical_loss": 4.4230552644716905, + "tokens_seen": 176998400 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004779538615847543, + "loss": 3.6455, + "theoretical_loss": 4.422846498836302, + "tokens_seen": 177063936 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047794383149448344, + "loss": 3.6583, + "theoretical_loss": 4.42263783208262, + "tokens_seen": 177129472 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004779338014042127, + "loss": 3.6714, + "theoretical_loss": 4.422429264127246, + "tokens_seen": 177195008 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004779237713139418, + "loss": 3.6047, + "theoretical_loss": 4.422220794886888, + "tokens_seen": 177260544 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047791374122367104, + "loss": 3.5472, + "theoretical_loss": 4.422012424278355, + "tokens_seen": 177326080 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004779037111334002, + "loss": 3.4656, + "theoretical_loss": 4.421804152218554, + "tokens_seen": 177391616 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004778936810431294, + "loss": 3.5652, + "theoretical_loss": 4.421595978624495, + "tokens_seen": 177457152 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004778836509528586, + "loss": 3.6251, + "theoretical_loss": 4.4213879034132875, + "tokens_seen": 177522688 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047787362086258776, + "loss": 3.5653, + "theoretical_loss": 4.421179926502141, + "tokens_seen": 177588224 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047786359077231695, + "loss": 3.6025, + "theoretical_loss": 4.420972047808367, + "tokens_seen": 177653760 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004778535606820462, + "loss": 3.5416, + "theoretical_loss": 4.420764267249375, + "tokens_seen": 177719296 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004778435305917753, + "loss": 3.6489, + "theoretical_loss": 4.420556584742673, + "tokens_seen": 177784832 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047783350050150454, + "loss": 3.478, + "theoretical_loss": 4.420349000205874, + "tokens_seen": 177850368 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047782347041123367, + "loss": 3.5431, + "theoretical_loss": 4.420141513556687, + "tokens_seen": 177915904 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004778134403209629, + "loss": 3.5822, + "theoretical_loss": 4.41993412471292, + "tokens_seen": 177981440 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004778034102306921, + "loss": 3.5225, + "theoretical_loss": 4.41972683359248, + "tokens_seen": 178046976 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047779338014042127, + "loss": 3.536, + "theoretical_loss": 4.419519640113375, + "tokens_seen": 178112512 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047778335005015045, + "loss": 3.5597, + "theoretical_loss": 4.419312544193712, + "tokens_seen": 178178048 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004777733199598797, + "loss": 3.5714, + "theoretical_loss": 4.4191055457516955, + "tokens_seen": 178243584 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004777632898696088, + "loss": 3.4338, + "theoretical_loss": 4.418898644705628, + "tokens_seen": 178309120 + }, + { + "epoch": 2.0, + "learning_rate": 0.00047775325977933805, + "loss": 3.5667, + "theoretical_loss": 4.418691840973912, + "tokens_seen": 178374656 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004777432296890672, + "loss": 3.5599, + "theoretical_loss": 4.418485134475047, + "tokens_seen": 178440192 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004777331995987964, + "loss": 3.6155, + "theoretical_loss": 4.418278525127633, + "tokens_seen": 178505728 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 465584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.479161262512207, + "objective/train/theoretical_loss": 4.418072012850365, + "objective/train/tokens_used": 199031264, + "theoretical_loss": 4.418072012850365, + "tokens_seen": 178571264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004777231695085256, + "loss": 3.5651, + "theoretical_loss": 4.418072012850365, + "tokens_seen": 178571264 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047771313941825477, + "loss": 3.6205, + "theoretical_loss": 4.417865597562036, + "tokens_seen": 178636800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047770310932798395, + "loss": 3.4811, + "theoretical_loss": 4.417659279181542, + "tokens_seen": 178702336 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047769307923771313, + "loss": 3.4966, + "theoretical_loss": 4.417453057627869, + "tokens_seen": 178767872 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004776830491474423, + "loss": 3.5267, + "theoretical_loss": 4.417246932820104, + "tokens_seen": 178833408 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047767301905717155, + "loss": 3.6723, + "theoretical_loss": 4.4170409046774335, + "tokens_seen": 178898944 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004776629889669007, + "loss": 3.6552, + "theoretical_loss": 4.416834973119137, + "tokens_seen": 178964480 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004776529588766299, + "loss": 3.505, + "theoretical_loss": 4.416629138064591, + "tokens_seen": 179030016 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047764292878635904, + "loss": 3.5941, + "theoretical_loss": 4.416423399433274, + "tokens_seen": 179095552 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004776328986960883, + "loss": 3.4672, + "theoretical_loss": 4.4162177571447545, + "tokens_seen": 179161088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047762286860581746, + "loss": 3.6449, + "theoretical_loss": 4.4160122111187015, + "tokens_seen": 179226624 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047761283851554664, + "loss": 3.4784, + "theoretical_loss": 4.4158067612748795, + "tokens_seen": 179292160 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004776028084252758, + "loss": 3.6779, + "theoretical_loss": 4.415601407533149, + "tokens_seen": 179357696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047759277833500505, + "loss": 3.443, + "theoretical_loss": 4.415396149813467, + "tokens_seen": 179423232 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004775827482447342, + "loss": 3.5475, + "theoretical_loss": 4.415190988035885, + "tokens_seen": 179488768 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004775727181544634, + "loss": 3.4474, + "theoretical_loss": 4.41498592212055, + "tokens_seen": 179554304 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047756268806419254, + "loss": 3.5958, + "theoretical_loss": 4.414780951987709, + "tokens_seen": 179619840 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004775526579739218, + "loss": 3.5748, + "theoretical_loss": 4.414576077557697, + "tokens_seen": 179685376 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047754262788365096, + "loss": 3.5453, + "theoretical_loss": 4.414371298750951, + "tokens_seen": 179750912 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047753259779338014, + "loss": 3.5971, + "theoretical_loss": 4.414166615488001, + "tokens_seen": 179816448 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004775225677031093, + "loss": 3.5169, + "theoretical_loss": 4.41396202768947, + "tokens_seen": 179881984 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004775125376128385, + "loss": 3.6132, + "theoretical_loss": 4.413757535276077, + "tokens_seen": 179947520 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004775025075225677, + "loss": 3.7105, + "theoretical_loss": 4.413553138168638, + "tokens_seen": 180013056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004774924774322969, + "loss": 3.6092, + "theoretical_loss": 4.413348836288058, + "tokens_seen": 180078592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047748244734202605, + "loss": 3.5486, + "theoretical_loss": 4.413144629555345, + "tokens_seen": 180144128 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 469314, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5523335933685303, + "objective/train/theoretical_loss": 4.412940517891592, + "objective/train/tokens_used": 200669664, + "theoretical_loss": 4.412940517891592, + "tokens_seen": 180209664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004774724172517553, + "loss": 3.6258, + "theoretical_loss": 4.412940517891592, + "tokens_seen": 180209664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004774623871614845, + "loss": 3.5852, + "theoretical_loss": 4.412736501217992, + "tokens_seen": 180275200 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047745235707121364, + "loss": 3.4049, + "theoretical_loss": 4.41253257945583, + "tokens_seen": 180340736 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004774423269809429, + "loss": 3.6243, + "theoretical_loss": 4.412328752526484, + "tokens_seen": 180406272 + }, + { + "epoch": 2.01, + "learning_rate": 0.000477432296890672, + "loss": 3.5559, + "theoretical_loss": 4.412125020351428, + "tokens_seen": 180471808 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047742226680040124, + "loss": 3.5374, + "theoretical_loss": 4.411921382852228, + "tokens_seen": 180537344 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004774122367101304, + "loss": 3.5985, + "theoretical_loss": 4.411717839950542, + "tokens_seen": 180602880 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004774022066198596, + "loss": 3.5828, + "theoretical_loss": 4.411514391568125, + "tokens_seen": 180668416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004773921765295888, + "loss": 3.4637, + "theoretical_loss": 4.411311037626822, + "tokens_seen": 180733952 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047738214643931797, + "loss": 3.4052, + "theoretical_loss": 4.411107778048571, + "tokens_seen": 180799488 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047737211634904715, + "loss": 3.5656, + "theoretical_loss": 4.410904612755405, + "tokens_seen": 180865024 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004773620862587764, + "loss": 3.526, + "theoretical_loss": 4.410701541669447, + "tokens_seen": 180930560 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004773520561685055, + "loss": 3.5731, + "theoretical_loss": 4.410498564712913, + "tokens_seen": 180996096 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047734202607823474, + "loss": 3.5841, + "theoretical_loss": 4.410295681808115, + "tokens_seen": 181061632 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047733199598796387, + "loss": 3.4679, + "theoretical_loss": 4.410092892877451, + "tokens_seen": 181127168 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004773219658976931, + "loss": 3.4683, + "theoretical_loss": 4.409890197843417, + "tokens_seen": 181192704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004773119358074223, + "loss": 3.5185, + "theoretical_loss": 4.409687596628596, + "tokens_seen": 181258240 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047730190571715147, + "loss": 3.5363, + "theoretical_loss": 4.409485089155666, + "tokens_seen": 181323776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047729187562688065, + "loss": 3.5015, + "theoretical_loss": 4.409282675347397, + "tokens_seen": 181389312 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004772818455366099, + "loss": 3.6104, + "theoretical_loss": 4.4090803551266475, + "tokens_seen": 181454848 + }, + { + "epoch": 2.01, + "learning_rate": 0.000477271815446339, + "loss": 3.5437, + "theoretical_loss": 4.40887812841637, + "tokens_seen": 181520384 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047726178535606825, + "loss": 3.6207, + "theoretical_loss": 4.408675995139606, + "tokens_seen": 181585920 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004772517552657974, + "loss": 3.5969, + "theoretical_loss": 4.40847395521949, + "tokens_seen": 181651456 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004772417251755266, + "loss": 3.6604, + "theoretical_loss": 4.408272008579249, + "tokens_seen": 181716992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004772316950852558, + "loss": 3.5759, + "theoretical_loss": 4.408070155142195, + "tokens_seen": 181782528 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 474224, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3052873611450195, + "objective/train/theoretical_loss": 4.407868394831737, + "objective/train/tokens_used": 202308064, + "theoretical_loss": 4.407868394831737, + "tokens_seen": 181848064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047722166499498497, + "loss": 3.4964, + "theoretical_loss": 4.407868394831737, + "tokens_seen": 181848064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047721163490471415, + "loss": 3.3808, + "theoretical_loss": 4.407666727571369, + "tokens_seen": 181913600 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047720160481444333, + "loss": 3.5385, + "theoretical_loss": 4.407465153284682, + "tokens_seen": 181979136 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004771915747241725, + "loss": 3.6339, + "theoretical_loss": 4.40726367189535, + "tokens_seen": 182044672 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047718154463390175, + "loss": 3.7586, + "theoretical_loss": 4.407062283327142, + "tokens_seen": 182110208 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004771715145436309, + "loss": 3.6541, + "theoretical_loss": 4.406860987503916, + "tokens_seen": 182175744 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004771614844533601, + "loss": 3.5891, + "theoretical_loss": 4.4066597843496185, + "tokens_seen": 182241280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047715145436308924, + "loss": 3.5611, + "theoretical_loss": 4.406458673788286, + "tokens_seen": 182306816 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004771414242728185, + "loss": 3.5061, + "theoretical_loss": 4.406257655744046, + "tokens_seen": 182372352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047713139418254766, + "loss": 3.5567, + "theoretical_loss": 4.406056730141113, + "tokens_seen": 182437888 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047712136409227684, + "loss": 3.6076, + "theoretical_loss": 4.405855896903795, + "tokens_seen": 182503424 + }, + { + "epoch": 2.01, + "learning_rate": 0.000477111334002006, + "loss": 3.6347, + "theoretical_loss": 4.405655155956484, + "tokens_seen": 182568960 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047710130391173525, + "loss": 3.6174, + "theoretical_loss": 4.405454507223664, + "tokens_seen": 182634496 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004770912738214644, + "loss": 3.5966, + "theoretical_loss": 4.405253950629906, + "tokens_seen": 182700032 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004770812437311936, + "loss": 3.6313, + "theoretical_loss": 4.405053486099873, + "tokens_seen": 182765568 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047707121364092274, + "loss": 3.5797, + "theoretical_loss": 4.404853113558314, + "tokens_seen": 182831104 + }, + { + "epoch": 2.01, + "learning_rate": 0.000477061183550652, + "loss": 3.512, + "theoretical_loss": 4.404652832930066, + "tokens_seen": 182896640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047705115346038116, + "loss": 3.5198, + "theoretical_loss": 4.404452644140055, + "tokens_seen": 182962176 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047704112337011034, + "loss": 3.5957, + "theoretical_loss": 4.404252547113297, + "tokens_seen": 183027712 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004770310932798395, + "loss": 3.5973, + "theoretical_loss": 4.404052541774894, + "tokens_seen": 183093248 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004770210631895687, + "loss": 3.5137, + "theoretical_loss": 4.4038526280500365, + "tokens_seen": 183158784 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004770110330992979, + "loss": 3.5251, + "theoretical_loss": 4.403652805864002, + "tokens_seen": 183224320 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004770010030090271, + "loss": 3.5324, + "theoretical_loss": 4.403453075142157, + "tokens_seen": 183289856 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047699097291875625, + "loss": 3.6324, + "theoretical_loss": 4.403253435809955, + "tokens_seen": 183355392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004769809428284855, + "loss": 3.6267, + "theoretical_loss": 4.403053887792936, + "tokens_seen": 183420928 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 477206, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4084765911102295, + "objective/train/theoretical_loss": 4.402854431016729, + "objective/train/tokens_used": 203946464, + "theoretical_loss": 4.402854431016729, + "tokens_seen": 183486464 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047697091273821466, + "loss": 3.5757, + "theoretical_loss": 4.402854431016729, + "tokens_seen": 183486464 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047696088264794384, + "loss": 3.6208, + "theoretical_loss": 4.402655065407048, + "tokens_seen": 183552000 + }, + { + "epoch": 2.01, + "learning_rate": 0.000476950852557673, + "loss": 3.5556, + "theoretical_loss": 4.402455790889695, + "tokens_seen": 183617536 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004769408224674022, + "loss": 3.4296, + "theoretical_loss": 4.4022566073905605, + "tokens_seen": 183683072 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004769307923771314, + "loss": 3.571, + "theoretical_loss": 4.402057514835618, + "tokens_seen": 183748608 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004769207622868606, + "loss": 3.3675, + "theoretical_loss": 4.4018585131509305, + "tokens_seen": 183814144 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047691073219658975, + "loss": 3.608, + "theoretical_loss": 4.401659602262647, + "tokens_seen": 183879680 + }, + { + "epoch": 2.01, + "learning_rate": 0.000476900702106319, + "loss": 3.5714, + "theoretical_loss": 4.401460782097004, + "tokens_seen": 183945216 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004768906720160481, + "loss": 3.5718, + "theoretical_loss": 4.40126205258032, + "tokens_seen": 184010752 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047688064192577735, + "loss": 3.4188, + "theoretical_loss": 4.401063413639003, + "tokens_seen": 184076288 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047687061183550653, + "loss": 3.6145, + "theoretical_loss": 4.400864865199546, + "tokens_seen": 184141824 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004768605817452357, + "loss": 3.5692, + "theoretical_loss": 4.4006664071885275, + "tokens_seen": 184207360 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004768505516549649, + "loss": 3.4575, + "theoretical_loss": 4.400468039532614, + "tokens_seen": 184272896 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047684052156469407, + "loss": 3.5572, + "theoretical_loss": 4.400269762158553, + "tokens_seen": 184338432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047683049147442325, + "loss": 3.5934, + "theoretical_loss": 4.4000715749931825, + "tokens_seen": 184403968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004768204613841525, + "loss": 3.5529, + "theoretical_loss": 4.399873477963421, + "tokens_seen": 184469504 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004768104312938816, + "loss": 3.5065, + "theoretical_loss": 4.399675470996275, + "tokens_seen": 184535040 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047680040120361085, + "loss": 3.5619, + "theoretical_loss": 4.399477554018837, + "tokens_seen": 184600576 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047679037111334003, + "loss": 3.5382, + "theoretical_loss": 4.399279726958281, + "tokens_seen": 184666112 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004767803410230692, + "loss": 3.5648, + "theoretical_loss": 4.399081989741868, + "tokens_seen": 184731648 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004767703109327984, + "loss": 3.5214, + "theoretical_loss": 4.398884342296943, + "tokens_seen": 184797184 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004767602808425276, + "loss": 3.4607, + "theoretical_loss": 4.398686784550936, + "tokens_seen": 184862720 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047675025075225676, + "loss": 3.4903, + "theoretical_loss": 4.398489316431361, + "tokens_seen": 184928256 + }, + { + "epoch": 2.01, + "learning_rate": 0.000476740220661986, + "loss": 3.6422, + "theoretical_loss": 4.398291937865816, + "tokens_seen": 184993792 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004767301905717151, + "loss": 3.6089, + "theoretical_loss": 4.398094648781982, + "tokens_seen": 185059328 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 482116, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4747917652130127, + "objective/train/theoretical_loss": 4.397897449107628, + "objective/train/tokens_used": 205584864, + "theoretical_loss": 4.397897449107628, + "tokens_seen": 185124864 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047672016048144435, + "loss": 3.4744, + "theoretical_loss": 4.397897449107628, + "tokens_seen": 185124864 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047671013039117353, + "loss": 3.4674, + "theoretical_loss": 4.397700338770603, + "tokens_seen": 185190400 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004767001003009027, + "loss": 3.4592, + "theoretical_loss": 4.39750331769884, + "tokens_seen": 185255936 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047669007021063195, + "loss": 3.5466, + "theoretical_loss": 4.397306385820358, + "tokens_seen": 185321472 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004766800401203611, + "loss": 3.5487, + "theoretical_loss": 4.397109543063258, + "tokens_seen": 185387008 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004766700100300903, + "loss": 3.5, + "theoretical_loss": 4.3969127893557225, + "tokens_seen": 185452544 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047665997993981944, + "loss": 3.5458, + "theoretical_loss": 4.39671612462602, + "tokens_seen": 185518080 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004766499498495487, + "loss": 3.6392, + "theoretical_loss": 4.396519548802503, + "tokens_seen": 185583616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047663991975927786, + "loss": 3.5676, + "theoretical_loss": 4.396323061813602, + "tokens_seen": 185649152 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047662988966900704, + "loss": 3.5825, + "theoretical_loss": 4.396126663587835, + "tokens_seen": 185714688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004766198595787362, + "loss": 3.5843, + "theoretical_loss": 4.395930354053802, + "tokens_seen": 185780224 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047660982948846545, + "loss": 3.6489, + "theoretical_loss": 4.395734133140184, + "tokens_seen": 185845760 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004765997993981946, + "loss": 3.6019, + "theoretical_loss": 4.395538000775744, + "tokens_seen": 185911296 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004765897693079238, + "loss": 3.5181, + "theoretical_loss": 4.3953419568893315, + "tokens_seen": 185976832 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047657973921765294, + "loss": 3.5915, + "theoretical_loss": 4.3951460014098735, + "tokens_seen": 186042368 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004765697091273822, + "loss": 3.601, + "theoretical_loss": 4.39495013426638, + "tokens_seen": 186107904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047655967903711136, + "loss": 3.5721, + "theoretical_loss": 4.394754355387946, + "tokens_seen": 186173440 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047654964894684054, + "loss": 3.5465, + "theoretical_loss": 4.3945586647037445, + "tokens_seen": 186238976 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004765396188565697, + "loss": 3.5639, + "theoretical_loss": 4.394363062143033, + "tokens_seen": 186304512 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004765295887662989, + "loss": 3.466, + "theoretical_loss": 4.39416754763515, + "tokens_seen": 186370048 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004765195586760281, + "loss": 3.6056, + "theoretical_loss": 4.393972121109514, + "tokens_seen": 186435584 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004765095285857573, + "loss": 3.5473, + "theoretical_loss": 4.393776782495626, + "tokens_seen": 186501120 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047649949849548645, + "loss": 3.4869, + "theoretical_loss": 4.39358153172307, + "tokens_seen": 186566656 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004764894684052157, + "loss": 3.4495, + "theoretical_loss": 4.393386368721507, + "tokens_seen": 186632192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047647943831494486, + "loss": 3.6192, + "theoretical_loss": 4.393191293420682, + "tokens_seen": 186697728 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 485014, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.613365888595581, + "objective/train/theoretical_loss": 4.392996305750421, + "objective/train/tokens_used": 207223264, + "theoretical_loss": 4.392996305750421, + "tokens_seen": 186763264 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047646940822467404, + "loss": 3.5835, + "theoretical_loss": 4.392996305750421, + "tokens_seen": 186763264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004764593781344032, + "loss": 3.4762, + "theoretical_loss": 4.392801405640628, + "tokens_seen": 186828800 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004764493480441324, + "loss": 3.583, + "theoretical_loss": 4.392606593021291, + "tokens_seen": 186894336 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004764393179538616, + "loss": 3.5041, + "theoretical_loss": 4.392411867822478, + "tokens_seen": 186959872 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004764292878635908, + "loss": 3.5238, + "theoretical_loss": 4.392217229974334, + "tokens_seen": 187025408 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047641925777331995, + "loss": 3.6095, + "theoretical_loss": 4.392022679407089, + "tokens_seen": 187090944 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004764092276830492, + "loss": 3.5204, + "theoretical_loss": 4.391828216051049, + "tokens_seen": 187156480 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004763991975927783, + "loss": 3.4982, + "theoretical_loss": 4.391633839836603, + "tokens_seen": 187222016 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047638916750250755, + "loss": 3.4697, + "theoretical_loss": 4.391439550694218, + "tokens_seen": 187287552 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047637913741223673, + "loss": 3.5052, + "theoretical_loss": 4.391245348554443, + "tokens_seen": 187353088 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004763691073219659, + "loss": 3.5901, + "theoretical_loss": 4.391051233347904, + "tokens_seen": 187418624 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004763590772316951, + "loss": 3.588, + "theoretical_loss": 4.390857205005307, + "tokens_seen": 187484160 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047634904714142427, + "loss": 3.5706, + "theoretical_loss": 4.39066326345744, + "tokens_seen": 187549696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047633901705115345, + "loss": 3.5538, + "theoretical_loss": 4.390469408635168, + "tokens_seen": 187615232 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004763289869608827, + "loss": 3.4749, + "theoretical_loss": 4.390275640469435, + "tokens_seen": 187680768 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004763189568706118, + "loss": 3.5386, + "theoretical_loss": 4.390081958891265, + "tokens_seen": 187746304 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047630892678034105, + "loss": 3.5279, + "theoretical_loss": 4.389888363831762, + "tokens_seen": 187811840 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047629889669007023, + "loss": 3.5085, + "theoretical_loss": 4.389694855222106, + "tokens_seen": 187877376 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004762888665997994, + "loss": 3.5306, + "theoretical_loss": 4.389501432993558, + "tokens_seen": 187942912 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004762788365095286, + "loss": 3.5872, + "theoretical_loss": 4.389308097077457, + "tokens_seen": 188008448 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004762688064192578, + "loss": 3.598, + "theoretical_loss": 4.389114847405221, + "tokens_seen": 188073984 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047625877632898696, + "loss": 3.5507, + "theoretical_loss": 4.388921683908343, + "tokens_seen": 188139520 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004762487462387162, + "loss": 3.537, + "theoretical_loss": 4.3887286065184, + "tokens_seen": 188205056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004762387161484453, + "loss": 3.5351, + "theoretical_loss": 4.388535615167044, + "tokens_seen": 188270592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047622868605817455, + "loss": 3.5493, + "theoretical_loss": 4.388342709786004, + "tokens_seen": 188336128 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 488811, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.492114543914795, + "objective/train/theoretical_loss": 4.388149890307088, + "objective/train/tokens_used": 208861664, + "theoretical_loss": 4.388149890307088, + "tokens_seen": 188401664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004762186559679037, + "loss": 3.4991, + "theoretical_loss": 4.388149890307088, + "tokens_seen": 188401664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004762086258776329, + "loss": 3.4662, + "theoretical_loss": 4.387957156662182, + "tokens_seen": 188467200 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004761985957873621, + "loss": 3.6804, + "theoretical_loss": 4.38776450878325, + "tokens_seen": 188532736 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004761885656970913, + "loss": 3.5524, + "theoretical_loss": 4.387571946602333, + "tokens_seen": 188598272 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047617853560682046, + "loss": 3.5073, + "theoretical_loss": 4.38737947005155, + "tokens_seen": 188663808 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047616850551654964, + "loss": 3.5624, + "theoretical_loss": 4.387187079063096, + "tokens_seen": 188729344 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004761584754262788, + "loss": 3.5436, + "theoretical_loss": 4.386994773569244, + "tokens_seen": 188794880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047614844533600806, + "loss": 3.6221, + "theoretical_loss": 4.386802553502344, + "tokens_seen": 188860416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004761384152457372, + "loss": 3.568, + "theoretical_loss": 4.386610418794824, + "tokens_seen": 188925952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004761283851554664, + "loss": 3.6086, + "theoretical_loss": 4.386418369379188, + "tokens_seen": 188991488 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004761183550651956, + "loss": 3.5068, + "theoretical_loss": 4.386226405188015, + "tokens_seen": 189057024 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004761083249749248, + "loss": 3.5862, + "theoretical_loss": 4.386034526153965, + "tokens_seen": 189122560 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047609829488465396, + "loss": 3.5284, + "theoretical_loss": 4.385842732209771, + "tokens_seen": 189188096 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047608826479438314, + "loss": 3.5745, + "theoretical_loss": 4.385651023288243, + "tokens_seen": 189253632 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004760782347041123, + "loss": 3.4265, + "theoretical_loss": 4.385459399322267, + "tokens_seen": 189319168 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047606820461384156, + "loss": 3.5244, + "theoretical_loss": 4.385267860244807, + "tokens_seen": 189384704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004760581745235707, + "loss": 3.5031, + "theoretical_loss": 4.385076405988901, + "tokens_seen": 189450240 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004760481444332999, + "loss": 3.6076, + "theoretical_loss": 4.384885036487664, + "tokens_seen": 189515776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047603811434302905, + "loss": 3.4015, + "theoretical_loss": 4.384693751674287, + "tokens_seen": 189581312 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004760280842527583, + "loss": 3.6573, + "theoretical_loss": 4.384502551482036, + "tokens_seen": 189646848 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047601805416248746, + "loss": 3.5614, + "theoretical_loss": 4.384311435844254, + "tokens_seen": 189712384 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047600802407221665, + "loss": 3.5403, + "theoretical_loss": 4.384120404694358, + "tokens_seen": 189777920 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004759979939819458, + "loss": 3.6301, + "theoretical_loss": 4.383929457965841, + "tokens_seen": 189843456 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047598796389167506, + "loss": 3.4861, + "theoretical_loss": 4.383738595592271, + "tokens_seen": 189908992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004759779338014042, + "loss": 3.5132, + "theoretical_loss": 4.383547817507291, + "tokens_seen": 189974528 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 493606, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4268393516540527, + "objective/train/theoretical_loss": 4.383357123644621, + "objective/train/tokens_used": 210500064, + "theoretical_loss": 4.383357123644621, + "tokens_seen": 190040064 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004759679037111334, + "loss": 3.5326, + "theoretical_loss": 4.383357123644621, + "tokens_seen": 190040064 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004759578736208626, + "loss": 3.6468, + "theoretical_loss": 4.383166513938053, + "tokens_seen": 190105600 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004759478435305918, + "loss": 3.5442, + "theoretical_loss": 4.382975988321457, + "tokens_seen": 190171136 + }, + { + "epoch": 2.01, + "learning_rate": 0.000475937813440321, + "loss": 3.5474, + "theoretical_loss": 4.3827855467287735, + "tokens_seen": 190236672 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047592778335005015, + "loss": 3.5693, + "theoretical_loss": 4.382595189094022, + "tokens_seen": 190302208 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004759177532597794, + "loss": 3.5902, + "theoretical_loss": 4.382404915351294, + "tokens_seen": 190367744 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004759077231695085, + "loss": 3.5827, + "theoretical_loss": 4.382214725434757, + "tokens_seen": 190433280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047589769307923775, + "loss": 3.511, + "theoretical_loss": 4.382024619278651, + "tokens_seen": 190498816 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047588766298896693, + "loss": 3.5899, + "theoretical_loss": 4.38183459681729, + "tokens_seen": 190564352 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004758776328986961, + "loss": 3.4236, + "theoretical_loss": 4.381644657985065, + "tokens_seen": 190629888 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004758676028084253, + "loss": 3.591, + "theoretical_loss": 4.381454802716439, + "tokens_seen": 190695424 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047585757271815447, + "loss": 3.3518, + "theoretical_loss": 4.381265030945949, + "tokens_seen": 190760960 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047584754262788365, + "loss": 3.5549, + "theoretical_loss": 4.381075342608204, + "tokens_seen": 190826496 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004758375125376129, + "loss": 3.4846, + "theoretical_loss": 4.3808857376378905, + "tokens_seen": 190892032 + }, + { + "epoch": 2.01, + "learning_rate": 0.000475827482447342, + "loss": 3.5085, + "theoretical_loss": 4.380696215969765, + "tokens_seen": 190957568 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047581745235707125, + "loss": 3.6623, + "theoretical_loss": 4.380506777538659, + "tokens_seen": 191023104 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047580742226680043, + "loss": 3.474, + "theoretical_loss": 4.380317422279477, + "tokens_seen": 191088640 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004757973921765296, + "loss": 3.5508, + "theoretical_loss": 4.380128150127199, + "tokens_seen": 191154176 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004757873620862588, + "loss": 3.5287, + "theoretical_loss": 4.379938961016872, + "tokens_seen": 191219712 + }, + { + "epoch": 2.01, + "learning_rate": 0.000475777331995988, + "loss": 3.4079, + "theoretical_loss": 4.379749854883624, + "tokens_seen": 191285248 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047576730190571716, + "loss": 3.3501, + "theoretical_loss": 4.379560831662648, + "tokens_seen": 191350784 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004757572718154464, + "loss": 3.6007, + "theoretical_loss": 4.379371891289217, + "tokens_seen": 191416320 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004757472417251755, + "loss": 3.5266, + "theoretical_loss": 4.379183033698672, + "tokens_seen": 191481856 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047573721163490475, + "loss": 3.4822, + "theoretical_loss": 4.378994258826427, + "tokens_seen": 191547392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004757271815446339, + "loss": 3.592, + "theoretical_loss": 4.378805566607969, + "tokens_seen": 191612928 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 496574, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.585700750350952, + "objective/train/theoretical_loss": 4.37861695697886, + "objective/train/tokens_used": 212138464, + "theoretical_loss": 4.37861695697886, + "tokens_seen": 191678464 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004757171514543631, + "loss": 3.587, + "theoretical_loss": 4.37861695697886, + "tokens_seen": 191678464 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004757071213640923, + "loss": 3.3445, + "theoretical_loss": 4.378428429874731, + "tokens_seen": 191744000 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004756970912738215, + "loss": 3.4925, + "theoretical_loss": 4.378239985231286, + "tokens_seen": 191809536 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047568706118355066, + "loss": 3.4713, + "theoretical_loss": 4.378051622984299, + "tokens_seen": 191875072 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047567703109327984, + "loss": 3.5155, + "theoretical_loss": 4.377863343069621, + "tokens_seen": 191940608 + }, + { + "epoch": 2.01, + "learning_rate": 0.000475667001003009, + "loss": 3.4518, + "theoretical_loss": 4.377675145423171, + "tokens_seen": 192006144 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047565697091273826, + "loss": 3.545, + "theoretical_loss": 4.377487029980941, + "tokens_seen": 192071680 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004756469408224674, + "loss": 3.5388, + "theoretical_loss": 4.3772989966789915, + "tokens_seen": 192137216 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004756369107321966, + "loss": 3.4881, + "theoretical_loss": 4.377111045453461, + "tokens_seen": 192202752 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004756268806419258, + "loss": 3.5359, + "theoretical_loss": 4.376923176240553, + "tokens_seen": 192268288 + }, + { + "epoch": 2.01, + "learning_rate": 0.000475616850551655, + "loss": 3.5641, + "theoretical_loss": 4.376735388976547, + "tokens_seen": 192333824 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047560682046138416, + "loss": 3.6022, + "theoretical_loss": 4.37654768359779, + "tokens_seen": 192399360 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047559679037111334, + "loss": 3.558, + "theoretical_loss": 4.376360060040702, + "tokens_seen": 192464896 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004755867602808425, + "loss": 3.61, + "theoretical_loss": 4.376172518241775, + "tokens_seen": 192530432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047557673019057176, + "loss": 3.5381, + "theoretical_loss": 4.375985058137569, + "tokens_seen": 192595968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004755667001003009, + "loss": 3.5972, + "theoretical_loss": 4.375797679664718, + "tokens_seen": 192661504 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004755566700100301, + "loss": 3.5826, + "theoretical_loss": 4.375610382759923, + "tokens_seen": 192727040 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047554663991975925, + "loss": 3.5533, + "theoretical_loss": 4.37542316735996, + "tokens_seen": 192792576 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004755366098294885, + "loss": 3.5548, + "theoretical_loss": 4.375236033401673, + "tokens_seen": 192858112 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047552657973921766, + "loss": 3.5238, + "theoretical_loss": 4.375048980821974, + "tokens_seen": 192923648 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047551654964894685, + "loss": 3.5404, + "theoretical_loss": 4.374862009557851, + "tokens_seen": 192989184 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047550651955867603, + "loss": 3.6674, + "theoretical_loss": 4.374675119546357, + "tokens_seen": 193054720 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047549648946840526, + "loss": 3.5788, + "theoretical_loss": 4.374488310724617, + "tokens_seen": 193120256 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004754864593781344, + "loss": 3.606, + "theoretical_loss": 4.374301583029828, + "tokens_seen": 193185792 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004754764292878636, + "loss": 3.5633, + "theoretical_loss": 4.374114936399253, + "tokens_seen": 193251328 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 499644, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6705703735351562, + "objective/train/theoretical_loss": 4.373928370770227, + "objective/train/tokens_used": 213776864, + "theoretical_loss": 4.373928370770227, + "tokens_seen": 193316864 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047546639919759275, + "loss": 3.5738, + "theoretical_loss": 4.373928370770227, + "tokens_seen": 193316864 + }, + { + "epoch": 2.01, + "learning_rate": 0.000475456369107322, + "loss": 3.5628, + "theoretical_loss": 4.373741886080156, + "tokens_seen": 193382400 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047544633901705117, + "loss": 3.6092, + "theoretical_loss": 4.373555482266511, + "tokens_seen": 193447936 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047543630892678035, + "loss": 3.5094, + "theoretical_loss": 4.373369159266838, + "tokens_seen": 193513472 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047542627883650953, + "loss": 3.6108, + "theoretical_loss": 4.373182917018748, + "tokens_seen": 193579008 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004754162487462387, + "loss": 3.4594, + "theoretical_loss": 4.3729967554599245, + "tokens_seen": 193644544 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004754062186559679, + "loss": 3.6022, + "theoretical_loss": 4.372810674528117, + "tokens_seen": 193710080 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047539618856569713, + "loss": 3.6082, + "theoretical_loss": 4.372624674161147, + "tokens_seen": 193775616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047538615847542625, + "loss": 3.5734, + "theoretical_loss": 4.372438754296904, + "tokens_seen": 193841152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004753761283851555, + "loss": 3.5464, + "theoretical_loss": 4.372252914873345, + "tokens_seen": 193906688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004753660982948846, + "loss": 3.5216, + "theoretical_loss": 4.372067155828496, + "tokens_seen": 193972224 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047535606820461385, + "loss": 3.6916, + "theoretical_loss": 4.371881477100455, + "tokens_seen": 194037760 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047534603811434303, + "loss": 3.4982, + "theoretical_loss": 4.371695878627385, + "tokens_seen": 194103296 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004753360080240722, + "loss": 3.5814, + "theoretical_loss": 4.371510360347517, + "tokens_seen": 194168832 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004753259779338014, + "loss": 3.6354, + "theoretical_loss": 4.371324922199154, + "tokens_seen": 194234368 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047531594784353063, + "loss": 3.6146, + "theoretical_loss": 4.371139564120663, + "tokens_seen": 194299904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047530591775325976, + "loss": 3.5113, + "theoretical_loss": 4.370954286050483, + "tokens_seen": 194365440 + }, + { + "epoch": 2.01, + "learning_rate": 0.000475295887662989, + "loss": 3.6263, + "theoretical_loss": 4.370769087927119, + "tokens_seen": 194430976 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004752858575727181, + "loss": 3.415, + "theoretical_loss": 4.370583969689145, + "tokens_seen": 194496512 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047527582748244736, + "loss": 3.5835, + "theoretical_loss": 4.370398931275201, + "tokens_seen": 194562048 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047526579739217654, + "loss": 3.5582, + "theoretical_loss": 4.370213972623996, + "tokens_seen": 194627584 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004752557673019057, + "loss": 3.5806, + "theoretical_loss": 4.370029093674307, + "tokens_seen": 194693120 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004752457372116349, + "loss": 3.5134, + "theoretical_loss": 4.369844294364979, + "tokens_seen": 194758656 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004752357071213641, + "loss": 3.5662, + "theoretical_loss": 4.369659574634923, + "tokens_seen": 194824192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047522567703109326, + "loss": 3.6488, + "theoretical_loss": 4.369474934423119, + "tokens_seen": 194889728 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 503564, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.529350996017456, + "objective/train/theoretical_loss": 4.369290373668611, + "objective/train/tokens_used": 215415264, + "theoretical_loss": 4.369290373668611, + "tokens_seen": 194955264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004752156469408225, + "loss": 3.495, + "theoretical_loss": 4.369290373668611, + "tokens_seen": 194955264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004752056168505517, + "loss": 3.607, + "theoretical_loss": 4.3691058923105155, + "tokens_seen": 195020800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047519558676028086, + "loss": 3.4587, + "theoretical_loss": 4.368921490288012, + "tokens_seen": 195086336 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047518555667001004, + "loss": 3.4191, + "theoretical_loss": 4.368737167540348, + "tokens_seen": 195151872 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004751755265797392, + "loss": 3.4852, + "theoretical_loss": 4.368552924006838, + "tokens_seen": 195217408 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047516549648946846, + "loss": 3.6482, + "theoretical_loss": 4.368368759626865, + "tokens_seen": 195282944 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004751554663991976, + "loss": 3.5468, + "theoretical_loss": 4.368184674339875, + "tokens_seen": 195348480 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004751454363089268, + "loss": 3.5029, + "theoretical_loss": 4.368000668085384, + "tokens_seen": 195414016 + }, + { + "epoch": 2.01, + "learning_rate": 0.000475135406218656, + "loss": 3.5649, + "theoretical_loss": 4.367816740802972, + "tokens_seen": 195479552 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004751253761283852, + "loss": 3.4809, + "theoretical_loss": 4.3676328924322885, + "tokens_seen": 195545088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047511534603811436, + "loss": 3.5001, + "theoretical_loss": 4.3674491229130465, + "tokens_seen": 195610624 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047510531594784354, + "loss": 3.5336, + "theoretical_loss": 4.367265432185027, + "tokens_seen": 195676160 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004750952858575727, + "loss": 3.5321, + "theoretical_loss": 4.367081820188075, + "tokens_seen": 195741696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047508525576730196, + "loss": 3.5267, + "theoretical_loss": 4.366898286862104, + "tokens_seen": 195807232 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004750752256770311, + "loss": 3.3981, + "theoretical_loss": 4.3667148321470926, + "tokens_seen": 195872768 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004750651955867603, + "loss": 3.55, + "theoretical_loss": 4.366531455983084, + "tokens_seen": 195938304 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047505516549648945, + "loss": 3.5039, + "theoretical_loss": 4.366348158310189, + "tokens_seen": 196003840 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004750451354062187, + "loss": 3.585, + "theoretical_loss": 4.366164939068583, + "tokens_seen": 196069376 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047503510531594787, + "loss": 3.5381, + "theoretical_loss": 4.365981798198508, + "tokens_seen": 196134912 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047502507522567705, + "loss": 3.6526, + "theoretical_loss": 4.365798735640269, + "tokens_seen": 196200448 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047501504513540623, + "loss": 3.5673, + "theoretical_loss": 4.36561575133424, + "tokens_seen": 196265984 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047500501504513546, + "loss": 3.6258, + "theoretical_loss": 4.365432845220857, + "tokens_seen": 196331520 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004749949849548646, + "loss": 3.5227, + "theoretical_loss": 4.365250017240625, + "tokens_seen": 196397056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004749849548645938, + "loss": 3.625, + "theoretical_loss": 4.3650672673341075, + "tokens_seen": 196462592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047497492477432295, + "loss": 3.4968, + "theoretical_loss": 4.364884595441941, + "tokens_seen": 196528128 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 508047, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.517518997192383, + "objective/train/theoretical_loss": 4.3647020015048215, + "objective/train/tokens_used": 217053664, + "theoretical_loss": 4.3647020015048215, + "tokens_seen": 196593664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004749648946840522, + "loss": 3.5201, + "theoretical_loss": 4.3647020015048215, + "tokens_seen": 196593664 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047495486459378137, + "loss": 3.4958, + "theoretical_loss": 4.3645194854635125, + "tokens_seen": 196659200 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047494483450351055, + "loss": 3.627, + "theoretical_loss": 4.364337047258839, + "tokens_seen": 196724736 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047493480441323973, + "loss": 3.672, + "theoretical_loss": 4.3641546868316965, + "tokens_seen": 196790272 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004749247743229689, + "loss": 3.6357, + "theoretical_loss": 4.363972404123038, + "tokens_seen": 196855808 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004749147442326981, + "loss": 3.5196, + "theoretical_loss": 4.363790199073886, + "tokens_seen": 196921344 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047490471414242733, + "loss": 3.6511, + "theoretical_loss": 4.363608071625325, + "tokens_seen": 196986880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047489468405215646, + "loss": 3.5831, + "theoretical_loss": 4.363426021718505, + "tokens_seen": 197052416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004748846539618857, + "loss": 3.5921, + "theoretical_loss": 4.36324404929464, + "tokens_seen": 197117952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004748746238716148, + "loss": 3.6157, + "theoretical_loss": 4.363062154295007, + "tokens_seen": 197183488 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047486459378134405, + "loss": 3.5177, + "theoretical_loss": 4.3628803366609485, + "tokens_seen": 197249024 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047485456369107323, + "loss": 3.5556, + "theoretical_loss": 4.36269859633387, + "tokens_seen": 197314560 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004748445336008024, + "loss": 3.5134, + "theoretical_loss": 4.362516933255241, + "tokens_seen": 197380096 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004748345035105316, + "loss": 3.5533, + "theoretical_loss": 4.362335347366595, + "tokens_seen": 197445632 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047482447342026083, + "loss": 3.5238, + "theoretical_loss": 4.362153838609528, + "tokens_seen": 197511168 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047481444332998996, + "loss": 3.5406, + "theoretical_loss": 4.361972406925701, + "tokens_seen": 197576704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004748044132397192, + "loss": 3.6006, + "theoretical_loss": 4.361791052256838, + "tokens_seen": 197642240 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004747943831494483, + "loss": 3.5347, + "theoretical_loss": 4.361609774544727, + "tokens_seen": 197707776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047478435305917756, + "loss": 3.592, + "theoretical_loss": 4.361428573731216, + "tokens_seen": 197773312 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047477432296890674, + "loss": 3.5404, + "theoretical_loss": 4.361247449758221, + "tokens_seen": 197838848 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004747642928786359, + "loss": 3.4818, + "theoretical_loss": 4.361066402567719, + "tokens_seen": 197904384 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004747542627883651, + "loss": 3.4339, + "theoretical_loss": 4.36088543210175, + "tokens_seen": 197969920 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004747442326980943, + "loss": 3.5413, + "theoretical_loss": 4.360704538302414, + "tokens_seen": 198035456 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047473420260782346, + "loss": 3.526, + "theoretical_loss": 4.360523721111879, + "tokens_seen": 198100992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004747241725175527, + "loss": 3.5904, + "theoretical_loss": 4.360342980472374, + "tokens_seen": 198166528 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 511455, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.64715576171875, + "objective/train/theoretical_loss": 4.360162316326187, + "objective/train/tokens_used": 218692064, + "theoretical_loss": 4.360162316326187, + "tokens_seen": 198232064 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004747141424272818, + "loss": 3.4809, + "theoretical_loss": 4.360162316326187, + "tokens_seen": 198232064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047470411233701106, + "loss": 3.4271, + "theoretical_loss": 4.359981728615676, + "tokens_seen": 198297600 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004746940822467402, + "loss": 3.6857, + "theoretical_loss": 4.359801217283252, + "tokens_seen": 198363136 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004746840521564694, + "loss": 3.6266, + "theoretical_loss": 4.3596207822713975, + "tokens_seen": 198428672 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004746740220661986, + "loss": 3.5168, + "theoretical_loss": 4.35944042352265, + "tokens_seen": 198494208 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004746639919759278, + "loss": 3.5221, + "theoretical_loss": 4.3592601409796154, + "tokens_seen": 198559744 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047465396188565696, + "loss": 3.6598, + "theoretical_loss": 4.3590799345849565, + "tokens_seen": 198625280 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004746439317953862, + "loss": 3.4688, + "theoretical_loss": 4.358899804281402, + "tokens_seen": 198690816 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004746339017051153, + "loss": 3.532, + "theoretical_loss": 4.35871975001174, + "tokens_seen": 198756352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047462387161484456, + "loss": 3.6016, + "theoretical_loss": 4.358539771718821, + "tokens_seen": 198821888 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004746138415245737, + "loss": 3.6397, + "theoretical_loss": 4.358359869345559, + "tokens_seen": 198887424 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004746038114343029, + "loss": 3.6323, + "theoretical_loss": 4.3581800428349275, + "tokens_seen": 198952960 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004745937813440321, + "loss": 3.6666, + "theoretical_loss": 4.3580002921299625, + "tokens_seen": 199018496 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004745837512537613, + "loss": 3.5238, + "theoretical_loss": 4.357820617173761, + "tokens_seen": 199084032 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047457372116349047, + "loss": 3.5451, + "theoretical_loss": 4.357641017909484, + "tokens_seen": 199149568 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047456369107321965, + "loss": 3.5532, + "theoretical_loss": 4.357461494280349, + "tokens_seen": 199215104 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047455366098294883, + "loss": 3.4765, + "theoretical_loss": 4.357282046229639, + "tokens_seen": 199280640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047454363089267807, + "loss": 3.5618, + "theoretical_loss": 4.357102673700696, + "tokens_seen": 199346176 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004745336008024072, + "loss": 3.6004, + "theoretical_loss": 4.356923376636926, + "tokens_seen": 199411712 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047452357071213643, + "loss": 3.5968, + "theoretical_loss": 4.35674415498179, + "tokens_seen": 199477248 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047451354062186555, + "loss": 3.5489, + "theoretical_loss": 4.356565008678817, + "tokens_seen": 199542784 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004745035105315948, + "loss": 3.5412, + "theoretical_loss": 4.356385937671591, + "tokens_seen": 199608320 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047449348044132397, + "loss": 3.5911, + "theoretical_loss": 4.356206941903761, + "tokens_seen": 199673856 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047448345035105315, + "loss": 3.5755, + "theoretical_loss": 4.356028021319034, + "tokens_seen": 199739392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004744734202607824, + "loss": 3.5969, + "theoretical_loss": 4.355849175861178, + "tokens_seen": 199804928 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 516165, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.62249755859375, + "objective/train/theoretical_loss": 4.355670405474023, + "objective/train/tokens_used": 220330464, + "theoretical_loss": 4.355670405474023, + "tokens_seen": 199870464 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047446339017051157, + "loss": 3.5849, + "theoretical_loss": 4.355670405474023, + "tokens_seen": 199870464 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047445336008024075, + "loss": 3.5631, + "theoretical_loss": 4.355491710101457, + "tokens_seen": 199936000 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047444332998996993, + "loss": 3.3571, + "theoretical_loss": 4.3553130896874315, + "tokens_seen": 200001536 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004744332998996991, + "loss": 3.6033, + "theoretical_loss": 4.3551345441759555, + "tokens_seen": 200067072 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004744232698094283, + "loss": 3.5046, + "theoretical_loss": 4.354956073511099, + "tokens_seen": 200132608 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047441323971915753, + "loss": 3.6375, + "theoretical_loss": 4.35477767763699, + "tokens_seen": 200198144 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047440320962888666, + "loss": 3.5742, + "theoretical_loss": 4.354599356497823, + "tokens_seen": 200263680 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004743931795386159, + "loss": 3.6395, + "theoretical_loss": 4.354421110037844, + "tokens_seen": 200329216 + }, + { + "epoch": 2.01, + "learning_rate": 0.000474383149448345, + "loss": 3.5385, + "theoretical_loss": 4.354242938201365, + "tokens_seen": 200394752 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047437311935807425, + "loss": 3.5607, + "theoretical_loss": 4.354064840932754, + "tokens_seen": 200460288 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047436308926780343, + "loss": 3.4314, + "theoretical_loss": 4.3538868181764405, + "tokens_seen": 200525824 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004743530591775326, + "loss": 3.5576, + "theoretical_loss": 4.353708869876914, + "tokens_seen": 200591360 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004743430290872618, + "loss": 3.5616, + "theoretical_loss": 4.353530995978722, + "tokens_seen": 200656896 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047433299899699103, + "loss": 3.4852, + "theoretical_loss": 4.353353196426472, + "tokens_seen": 200722432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047432296890672016, + "loss": 3.5202, + "theoretical_loss": 4.353175471164831, + "tokens_seen": 200787968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004743129388164494, + "loss": 3.5644, + "theoretical_loss": 4.352997820138524, + "tokens_seen": 200853504 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004743029087261785, + "loss": 3.5993, + "theoretical_loss": 4.3528202432923395, + "tokens_seen": 200919040 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047429287863590776, + "loss": 3.6171, + "theoretical_loss": 4.3526427405711186, + "tokens_seen": 200984576 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047428284854563694, + "loss": 3.5986, + "theoretical_loss": 4.352465311919765, + "tokens_seen": 201050112 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004742728184553661, + "loss": 3.5601, + "theoretical_loss": 4.352287957283242, + "tokens_seen": 201115648 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004742627883650953, + "loss": 3.6006, + "theoretical_loss": 4.352110676606569, + "tokens_seen": 201181184 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004742527582748245, + "loss": 3.5632, + "theoretical_loss": 4.351933469834827, + "tokens_seen": 201246720 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047424272818455366, + "loss": 3.6715, + "theoretical_loss": 4.351756336913154, + "tokens_seen": 201312256 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004742326980942829, + "loss": 3.5202, + "theoretical_loss": 4.3515792777867475, + "tokens_seen": 201377792 + }, + { + "epoch": 2.01, + "learning_rate": 0.000474222668004012, + "loss": 3.5321, + "theoretical_loss": 4.351402292400861, + "tokens_seen": 201443328 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 519105, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4599125385284424, + "objective/train/theoretical_loss": 4.351225380700811, + "objective/train/tokens_used": 221968864, + "theoretical_loss": 4.351225380700811, + "tokens_seen": 201508864 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047421263791374126, + "loss": 3.5788, + "theoretical_loss": 4.351225380700811, + "tokens_seen": 201508864 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004742026078234704, + "loss": 3.49, + "theoretical_loss": 4.351048542631967, + "tokens_seen": 201574400 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004741925777331996, + "loss": 3.4445, + "theoretical_loss": 4.350871778139761, + "tokens_seen": 201639936 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004741825476429288, + "loss": 3.5298, + "theoretical_loss": 4.350695087169681, + "tokens_seen": 201705472 + }, + { + "epoch": 2.01, + "learning_rate": 0.000474172517552658, + "loss": 3.43, + "theoretical_loss": 4.350518469667275, + "tokens_seen": 201771008 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047416248746238716, + "loss": 3.4616, + "theoretical_loss": 4.350341925578145, + "tokens_seen": 201836544 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004741524573721164, + "loss": 3.5717, + "theoretical_loss": 4.350165454847955, + "tokens_seen": 201902080 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004741424272818455, + "loss": 3.6817, + "theoretical_loss": 4.349989057422426, + "tokens_seen": 201967616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047413239719157476, + "loss": 3.6421, + "theoretical_loss": 4.349812733247335, + "tokens_seen": 202033152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004741223671013039, + "loss": 3.5152, + "theoretical_loss": 4.349636482268518, + "tokens_seen": 202098688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004741123370110331, + "loss": 3.5247, + "theoretical_loss": 4.349460304431869, + "tokens_seen": 202164224 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004741023069207623, + "loss": 3.4047, + "theoretical_loss": 4.349284199683338, + "tokens_seen": 202229760 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004740922768304915, + "loss": 3.3939, + "theoretical_loss": 4.349108167968934, + "tokens_seen": 202295296 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047408224674022067, + "loss": 3.6333, + "theoretical_loss": 4.348932209234723, + "tokens_seen": 202360832 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047407221664994985, + "loss": 3.4077, + "theoretical_loss": 4.348756323426828, + "tokens_seen": 202426368 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047406218655967903, + "loss": 3.5865, + "theoretical_loss": 4.348580510491429, + "tokens_seen": 202491904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047405215646940827, + "loss": 3.5951, + "theoretical_loss": 4.348404770374763, + "tokens_seen": 202557440 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004740421263791374, + "loss": 3.447, + "theoretical_loss": 4.348229103023124, + "tokens_seen": 202622976 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047403209628886663, + "loss": 3.6328, + "theoretical_loss": 4.348053508382864, + "tokens_seen": 202688512 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047402206619859575, + "loss": 3.5084, + "theoretical_loss": 4.347877986400391, + "tokens_seen": 202754048 + }, + { + "epoch": 2.01, + "learning_rate": 0.000474012036108325, + "loss": 3.526, + "theoretical_loss": 4.347702537022171, + "tokens_seen": 202819584 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047400200601805417, + "loss": 3.4155, + "theoretical_loss": 4.3475271601947245, + "tokens_seen": 202885120 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047399197592778335, + "loss": 3.5777, + "theoretical_loss": 4.3473518558646305, + "tokens_seen": 202950656 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047398194583751253, + "loss": 3.5917, + "theoretical_loss": 4.347176623978523, + "tokens_seen": 203016192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047397191574724177, + "loss": 3.3783, + "theoretical_loss": 4.347001464483096, + "tokens_seen": 203081728 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 522738, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6642072200775146, + "objective/train/theoretical_loss": 4.346826377325094, + "objective/train/tokens_used": 223607264, + "theoretical_loss": 4.346826377325094, + "tokens_seen": 203147264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004739618856569709, + "loss": 3.4369, + "theoretical_loss": 4.346826377325094, + "tokens_seen": 203147264 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047395185556670013, + "loss": 3.5607, + "theoretical_loss": 4.346651362451324, + "tokens_seen": 203212800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047394182547642926, + "loss": 3.4604, + "theoretical_loss": 4.346476419808645, + "tokens_seen": 203278336 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004739317953861585, + "loss": 3.5111, + "theoretical_loss": 4.346301549343973, + "tokens_seen": 203343872 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004739217652958877, + "loss": 3.5535, + "theoretical_loss": 4.346126751004283, + "tokens_seen": 203409408 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047391173520561686, + "loss": 3.5831, + "theoretical_loss": 4.345952024736603, + "tokens_seen": 203474944 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047390170511534604, + "loss": 3.4355, + "theoretical_loss": 4.345777370488015, + "tokens_seen": 203540480 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004738916750250752, + "loss": 3.5024, + "theoretical_loss": 4.345602788205664, + "tokens_seen": 203606016 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004738816449348044, + "loss": 3.5619, + "theoretical_loss": 4.345428277836744, + "tokens_seen": 203671552 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047387161484453363, + "loss": 3.5421, + "theoretical_loss": 4.345253839328507, + "tokens_seen": 203737088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047386158475426276, + "loss": 3.4201, + "theoretical_loss": 4.345079472628261, + "tokens_seen": 203802624 + }, + { + "epoch": 2.01, + "learning_rate": 0.000473851554663992, + "loss": 3.5769, + "theoretical_loss": 4.344905177683369, + "tokens_seen": 203868160 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004738415245737211, + "loss": 3.494, + "theoretical_loss": 4.344730954441252, + "tokens_seen": 203933696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047383149448345036, + "loss": 3.5223, + "theoretical_loss": 4.3445568028493815, + "tokens_seen": 203999232 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047382146439317954, + "loss": 3.5065, + "theoretical_loss": 4.344382722855288, + "tokens_seen": 204064768 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004738114343029087, + "loss": 3.493, + "theoretical_loss": 4.344208714406557, + "tokens_seen": 204130304 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004738014042126379, + "loss": 3.5223, + "theoretical_loss": 4.344034777450828, + "tokens_seen": 204195840 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047379137412236714, + "loss": 3.5254, + "theoretical_loss": 4.343860911935796, + "tokens_seen": 204261376 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047378134403209626, + "loss": 3.5647, + "theoretical_loss": 4.3436871178092105, + "tokens_seen": 204326912 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004737713139418255, + "loss": 3.5348, + "theoretical_loss": 4.343513395018878, + "tokens_seen": 204392448 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004737612838515546, + "loss": 3.3911, + "theoretical_loss": 4.343339743512657, + "tokens_seen": 204457984 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047375125376128386, + "loss": 3.5446, + "theoretical_loss": 4.343166163238464, + "tokens_seen": 204523520 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047374122367101304, + "loss": 3.5518, + "theoretical_loss": 4.342992654144267, + "tokens_seen": 204589056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004737311935807422, + "loss": 3.5104, + "theoretical_loss": 4.342819216178091, + "tokens_seen": 204654592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047372116349047146, + "loss": 3.4975, + "theoretical_loss": 4.342645849288015, + "tokens_seen": 204720128 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 525781, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5652801990509033, + "objective/train/theoretical_loss": 4.342472553422172, + "objective/train/tokens_used": 225245664, + "theoretical_loss": 4.342472553422172, + "tokens_seen": 204785664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004737111334002006, + "loss": 3.5657, + "theoretical_loss": 4.342472553422172, + "tokens_seen": 204785664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004737011033099298, + "loss": 3.6219, + "theoretical_loss": 4.342299328528749, + "tokens_seen": 204851200 + }, + { + "epoch": 2.01, + "learning_rate": 0.000473691073219659, + "loss": 3.5461, + "theoretical_loss": 4.342126174555989, + "tokens_seen": 204916736 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004736810431293882, + "loss": 3.577, + "theoretical_loss": 4.341953091452187, + "tokens_seen": 204982272 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047367101303911736, + "loss": 3.6389, + "theoretical_loss": 4.341780079165696, + "tokens_seen": 205047808 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004736609829488466, + "loss": 3.5338, + "theoretical_loss": 4.341607137644918, + "tokens_seen": 205113344 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004736509528585757, + "loss": 3.4623, + "theoretical_loss": 4.341434266838314, + "tokens_seen": 205178880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047364092276830496, + "loss": 3.5569, + "theoretical_loss": 4.3412614666943945, + "tokens_seen": 205244416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004736308926780341, + "loss": 3.4291, + "theoretical_loss": 4.341088737161728, + "tokens_seen": 205309952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004736208625877633, + "loss": 3.5917, + "theoretical_loss": 4.340916078188934, + "tokens_seen": 205375488 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004736108324974925, + "loss": 3.4533, + "theoretical_loss": 4.340743489724687, + "tokens_seen": 205441024 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004736008024072217, + "loss": 3.5951, + "theoretical_loss": 4.3405709717177166, + "tokens_seen": 205506560 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047359077231695087, + "loss": 3.5929, + "theoretical_loss": 4.340398524116801, + "tokens_seen": 205572096 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047358074222668005, + "loss": 3.4004, + "theoretical_loss": 4.340226146870778, + "tokens_seen": 205637632 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047357071213640923, + "loss": 3.6161, + "theoretical_loss": 4.340053839928535, + "tokens_seen": 205703168 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047356068204613847, + "loss": 3.4254, + "theoretical_loss": 4.3398816032390135, + "tokens_seen": 205768704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004735506519558676, + "loss": 3.5485, + "theoretical_loss": 4.33970943675121, + "tokens_seen": 205834240 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047354062186559683, + "loss": 3.5822, + "theoretical_loss": 4.339537340414173, + "tokens_seen": 205899776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047353059177532595, + "loss": 3.4578, + "theoretical_loss": 4.339365314177004, + "tokens_seen": 205965312 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004735205616850552, + "loss": 3.5145, + "theoretical_loss": 4.339193357988858, + "tokens_seen": 206030848 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047351053159478437, + "loss": 3.5751, + "theoretical_loss": 4.339021471798942, + "tokens_seen": 206096384 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047350050150451355, + "loss": 3.5358, + "theoretical_loss": 4.338849655556517, + "tokens_seen": 206161920 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047349047141424273, + "loss": 3.6066, + "theoretical_loss": 4.338677909210899, + "tokens_seen": 206227456 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047348044132397197, + "loss": 3.4367, + "theoretical_loss": 4.338506232711451, + "tokens_seen": 206292992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004734704112337011, + "loss": 3.4188, + "theoretical_loss": 4.338334626007595, + "tokens_seen": 206358528 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 530732, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3274929523468018, + "objective/train/theoretical_loss": 4.338163089048802, + "objective/train/tokens_used": 226884064, + "theoretical_loss": 4.338163089048802, + "tokens_seen": 206424064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047346038114343033, + "loss": 3.4322, + "theoretical_loss": 4.338163089048802, + "tokens_seen": 206424064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047345035105315946, + "loss": 3.5568, + "theoretical_loss": 4.337991621784597, + "tokens_seen": 206489600 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004734403209628887, + "loss": 3.4543, + "theoretical_loss": 4.337820224164557, + "tokens_seen": 206555136 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004734302908726179, + "loss": 3.4834, + "theoretical_loss": 4.337648896138311, + "tokens_seen": 206620672 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047342026078234706, + "loss": 3.4646, + "theoretical_loss": 4.3374776376555415, + "tokens_seen": 206686208 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047341023069207624, + "loss": 3.5433, + "theoretical_loss": 4.3373064486659825, + "tokens_seen": 206751744 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004734002006018054, + "loss": 3.6442, + "theoretical_loss": 4.3371353291194215, + "tokens_seen": 206817280 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004733901705115346, + "loss": 3.5188, + "theoretical_loss": 4.336964278965697, + "tokens_seen": 206882816 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047338014042126383, + "loss": 3.5405, + "theoretical_loss": 4.336793298154699, + "tokens_seen": 206948352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047337011033099296, + "loss": 3.5722, + "theoretical_loss": 4.336622386636371, + "tokens_seen": 207013888 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004733600802407222, + "loss": 3.5075, + "theoretical_loss": 4.336451544360708, + "tokens_seen": 207079424 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004733500501504513, + "loss": 3.4249, + "theoretical_loss": 4.336280771277756, + "tokens_seen": 207144960 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047334002006018056, + "loss": 3.5341, + "theoretical_loss": 4.336110067337614, + "tokens_seen": 207210496 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047332998996990974, + "loss": 3.5561, + "theoretical_loss": 4.335939432490433, + "tokens_seen": 207276032 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004733199598796389, + "loss": 3.5164, + "theoretical_loss": 4.335768866686415, + "tokens_seen": 207341568 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004733099297893681, + "loss": 3.5516, + "theoretical_loss": 4.335598369875812, + "tokens_seen": 207407104 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047329989969909734, + "loss": 3.5482, + "theoretical_loss": 4.335427942008931, + "tokens_seen": 207472640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047328986960882646, + "loss": 3.5736, + "theoretical_loss": 4.335257583036127, + "tokens_seen": 207538176 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004732798395185557, + "loss": 3.5971, + "theoretical_loss": 4.335087292907811, + "tokens_seen": 207603712 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004732698094282848, + "loss": 3.5366, + "theoretical_loss": 4.334917071574439, + "tokens_seen": 207669248 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047325977933801406, + "loss": 3.4868, + "theoretical_loss": 4.334746918986523, + "tokens_seen": 207734784 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047324974924774324, + "loss": 3.4383, + "theoretical_loss": 4.334576835094626, + "tokens_seen": 207800320 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004732397191574724, + "loss": 3.4931, + "theoretical_loss": 4.3344068198493595, + "tokens_seen": 207865856 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004732296890672016, + "loss": 3.4858, + "theoretical_loss": 4.334236873201388, + "tokens_seen": 207931392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004732196589769308, + "loss": 3.547, + "theoretical_loss": 4.334066995101427, + "tokens_seen": 207996928 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 533676, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.408191204071045, + "objective/train/theoretical_loss": 4.333897185500243, + "objective/train/tokens_used": 228522464, + "theoretical_loss": 4.333897185500243, + "tokens_seen": 208062464 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047320962888665997, + "loss": 3.5086, + "theoretical_loss": 4.333897185500243, + "tokens_seen": 208062464 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004731995987963892, + "loss": 3.4499, + "theoretical_loss": 4.333727444348652, + "tokens_seen": 208128000 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047318956870611833, + "loss": 3.5037, + "theoretical_loss": 4.333557771597521, + "tokens_seen": 208193536 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047317953861584757, + "loss": 3.4235, + "theoretical_loss": 4.333388167197769, + "tokens_seen": 208259072 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047316950852557675, + "loss": 3.5958, + "theoretical_loss": 4.333218631100365, + "tokens_seen": 208324608 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047315947843530593, + "loss": 3.5683, + "theoretical_loss": 4.333049163256329, + "tokens_seen": 208390144 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004731494483450351, + "loss": 3.6294, + "theoretical_loss": 4.332879763616731, + "tokens_seen": 208455680 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004731394182547643, + "loss": 3.5576, + "theoretical_loss": 4.332710432132691, + "tokens_seen": 208521216 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047312938816449347, + "loss": 3.3825, + "theoretical_loss": 4.33254116875538, + "tokens_seen": 208586752 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004731193580742227, + "loss": 3.5248, + "theoretical_loss": 4.332371973436021, + "tokens_seen": 208652288 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047310932798395183, + "loss": 3.4738, + "theoretical_loss": 4.332202846125883, + "tokens_seen": 208717824 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047309929789368107, + "loss": 3.572, + "theoretical_loss": 4.332033786776288, + "tokens_seen": 208783360 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004730892678034102, + "loss": 3.4842, + "theoretical_loss": 4.33186479533861, + "tokens_seen": 208848896 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047307923771313943, + "loss": 3.4974, + "theoretical_loss": 4.331695871764268, + "tokens_seen": 208914432 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004730692076228686, + "loss": 3.5421, + "theoretical_loss": 4.331527016004735, + "tokens_seen": 208979968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004730591775325978, + "loss": 3.5115, + "theoretical_loss": 4.331358228011533, + "tokens_seen": 209045504 + }, + { + "epoch": 2.01, + "learning_rate": 0.000473049147442327, + "loss": 3.4748, + "theoretical_loss": 4.331189507736233, + "tokens_seen": 209111040 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047303911735205615, + "loss": 3.5951, + "theoretical_loss": 4.331020855130457, + "tokens_seen": 209176576 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047302908726178534, + "loss": 3.4746, + "theoretical_loss": 4.330852270145875, + "tokens_seen": 209242112 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047301905717151457, + "loss": 3.5223, + "theoretical_loss": 4.330683752734208, + "tokens_seen": 209307648 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004730090270812437, + "loss": 3.5456, + "theoretical_loss": 4.3305153028472265, + "tokens_seen": 209373184 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047299899699097293, + "loss": 3.4369, + "theoretical_loss": 4.33034692043675, + "tokens_seen": 209438720 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004729889669007021, + "loss": 3.5197, + "theoretical_loss": 4.330178605454648, + "tokens_seen": 209504256 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004729789368104313, + "loss": 3.4279, + "theoretical_loss": 4.330010357852839, + "tokens_seen": 209569792 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047296890672016053, + "loss": 3.5293, + "theoretical_loss": 4.3298421775832905, + "tokens_seen": 209635328 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 537370, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3988702297210693, + "objective/train/theoretical_loss": 4.32967406459802, + "objective/train/tokens_used": 230160864, + "theoretical_loss": 4.32967406459802, + "tokens_seen": 209700864 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047295887662988966, + "loss": 3.4078, + "theoretical_loss": 4.32967406459802, + "tokens_seen": 209700864 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004729488465396189, + "loss": 3.4953, + "theoretical_loss": 4.329506018849093, + "tokens_seen": 209766400 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004729388164493481, + "loss": 3.5096, + "theoretical_loss": 4.329338040288626, + "tokens_seen": 209831936 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047292878635907726, + "loss": 3.4425, + "theoretical_loss": 4.329170128868782, + "tokens_seen": 209897472 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047291875626880644, + "loss": 3.5937, + "theoretical_loss": 4.329002284541775, + "tokens_seen": 209963008 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004729087261785356, + "loss": 3.4947, + "theoretical_loss": 4.3288345072598675, + "tokens_seen": 210028544 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004728986960882648, + "loss": 3.4862, + "theoretical_loss": 4.328666796975371, + "tokens_seen": 210094080 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047288866599799403, + "loss": 3.603, + "theoretical_loss": 4.328499153640645, + "tokens_seen": 210159616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047287863590772316, + "loss": 3.5939, + "theoretical_loss": 4.328331577208099, + "tokens_seen": 210225152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004728686058174524, + "loss": 3.4641, + "theoretical_loss": 4.328164067630188, + "tokens_seen": 210290688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004728585757271815, + "loss": 3.501, + "theoretical_loss": 4.32799662485942, + "tokens_seen": 210356224 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047284854563691076, + "loss": 3.4972, + "theoretical_loss": 4.327829248848349, + "tokens_seen": 210421760 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047283851554663994, + "loss": 3.5557, + "theoretical_loss": 4.327661939549579, + "tokens_seen": 210487296 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004728284854563691, + "loss": 3.6029, + "theoretical_loss": 4.327494696915758, + "tokens_seen": 210552832 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004728184553660983, + "loss": 3.5317, + "theoretical_loss": 4.327327520899588, + "tokens_seen": 210618368 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047280842527582754, + "loss": 3.4473, + "theoretical_loss": 4.327160411453817, + "tokens_seen": 210683904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047279839518555666, + "loss": 3.4225, + "theoretical_loss": 4.326993368531241, + "tokens_seen": 210749440 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004727883650952859, + "loss": 3.471, + "theoretical_loss": 4.326826392084703, + "tokens_seen": 210814976 + }, + { + "epoch": 2.01, + "learning_rate": 0.000472778335005015, + "loss": 3.5957, + "theoretical_loss": 4.326659482067096, + "tokens_seen": 210880512 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047276830491474426, + "loss": 3.5059, + "theoretical_loss": 4.326492638431361, + "tokens_seen": 210946048 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047275827482447344, + "loss": 3.5943, + "theoretical_loss": 4.326325861130485, + "tokens_seen": 211011584 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004727482447342026, + "loss": 3.5783, + "theoretical_loss": 4.326159150117505, + "tokens_seen": 211077120 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004727382146439318, + "loss": 3.5972, + "theoretical_loss": 4.325992505345504, + "tokens_seen": 211142656 + }, + { + "epoch": 2.01, + "learning_rate": 0.000472728184553661, + "loss": 3.5506, + "theoretical_loss": 4.325825926767616, + "tokens_seen": 211208192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047271815446339017, + "loss": 3.5501, + "theoretical_loss": 4.325659414337017, + "tokens_seen": 211273728 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 542358, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5822219848632812, + "objective/train/theoretical_loss": 4.3254929680069365, + "objective/train/tokens_used": 231799264, + "theoretical_loss": 4.3254929680069365, + "tokens_seen": 211339264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0004727081243731194, + "loss": 3.4963, + "theoretical_loss": 4.3254929680069365, + "tokens_seen": 211339264 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047269809428284853, + "loss": 3.6078, + "theoretical_loss": 4.325326587730648, + "tokens_seen": 211404800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00047268806419257777, + "loss": 3.5847, + "theoretical_loss": 4.325160273461473, + "tokens_seen": 211470336 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047267803410230695, + "loss": 3.5477, + "theoretical_loss": 4.324994025152783, + "tokens_seen": 211535872 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047266800401203613, + "loss": 3.5611, + "theoretical_loss": 4.324827842757994, + "tokens_seen": 211601408 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004726579739217653, + "loss": 3.4698, + "theoretical_loss": 4.32466172623057, + "tokens_seen": 211666944 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004726479438314945, + "loss": 3.5386, + "theoretical_loss": 4.324495675524021, + "tokens_seen": 211732480 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047263791374122367, + "loss": 3.4764, + "theoretical_loss": 4.324329690591909, + "tokens_seen": 211798016 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004726278836509529, + "loss": 3.5165, + "theoretical_loss": 4.324163771387837, + "tokens_seen": 211863552 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047261785356068203, + "loss": 3.4291, + "theoretical_loss": 4.323997917865459, + "tokens_seen": 211929088 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047260782347041127, + "loss": 3.5651, + "theoretical_loss": 4.323832129978474, + "tokens_seen": 211994624 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004725977933801404, + "loss": 3.4549, + "theoretical_loss": 4.323666407680628, + "tokens_seen": 212060160 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047258776328986963, + "loss": 3.4293, + "theoretical_loss": 4.323500750925718, + "tokens_seen": 212125696 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004725777331995988, + "loss": 3.5204, + "theoretical_loss": 4.3233351596675815, + "tokens_seen": 212191232 + }, + { + "epoch": 2.02, + "learning_rate": 0.000472567703109328, + "loss": 3.5473, + "theoretical_loss": 4.323169633860107, + "tokens_seen": 212256768 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004725576730190572, + "loss": 3.4994, + "theoretical_loss": 4.323004173457226, + "tokens_seen": 212322304 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047254764292878636, + "loss": 3.543, + "theoretical_loss": 4.322838778412923, + "tokens_seen": 212387840 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047253761283851554, + "loss": 3.4792, + "theoretical_loss": 4.322673448681223, + "tokens_seen": 212453376 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047252758274824477, + "loss": 3.5077, + "theoretical_loss": 4.322508184216199, + "tokens_seen": 212518912 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004725175526579739, + "loss": 3.4982, + "theoretical_loss": 4.322342984971972, + "tokens_seen": 212584448 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047250752256770313, + "loss": 3.5648, + "theoretical_loss": 4.322177850902708, + "tokens_seen": 212649984 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004724974924774323, + "loss": 3.5266, + "theoretical_loss": 4.32201278196262, + "tokens_seen": 212715520 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004724874623871615, + "loss": 3.5893, + "theoretical_loss": 4.321847778105967, + "tokens_seen": 212781056 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004724774322968907, + "loss": 3.5389, + "theoretical_loss": 4.321682839287054, + "tokens_seen": 212846592 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047246740220661986, + "loss": 3.608, + "theoretical_loss": 4.321517965460232, + "tokens_seen": 212912128 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 545114, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6596953868865967, + "objective/train/theoretical_loss": 4.3213531565798995, + "objective/train/tokens_used": 233437664, + "theoretical_loss": 4.3213531565798995, + "tokens_seen": 212977664 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047245737211634904, + "loss": 3.5446, + "theoretical_loss": 4.3213531565798995, + "tokens_seen": 212977664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004724473420260783, + "loss": 3.5543, + "theoretical_loss": 4.321188412600499, + "tokens_seen": 213043200 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004724373119358074, + "loss": 3.5091, + "theoretical_loss": 4.321023733476521, + "tokens_seen": 213108736 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047242728184553664, + "loss": 3.5667, + "theoretical_loss": 4.320859119162499, + "tokens_seen": 213174272 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047241725175526576, + "loss": 3.5197, + "theoretical_loss": 4.320694569613015, + "tokens_seen": 213239808 + }, + { + "epoch": 2.02, + "learning_rate": 0.000472407221664995, + "loss": 3.485, + "theoretical_loss": 4.320530084782696, + "tokens_seen": 213305344 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004723971915747242, + "loss": 3.4852, + "theoretical_loss": 4.320365664626216, + "tokens_seen": 213370880 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047238716148445336, + "loss": 3.5734, + "theoretical_loss": 4.320201309098291, + "tokens_seen": 213436416 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047237713139418254, + "loss": 3.5038, + "theoretical_loss": 4.320037018153686, + "tokens_seen": 213501952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004723671013039117, + "loss": 3.5054, + "theoretical_loss": 4.31987279174721, + "tokens_seen": 213567488 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004723570712136409, + "loss": 3.5454, + "theoretical_loss": 4.319708629833719, + "tokens_seen": 213633024 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047234704112337014, + "loss": 3.5085, + "theoretical_loss": 4.319544532368112, + "tokens_seen": 213698560 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047233701103309927, + "loss": 3.643, + "theoretical_loss": 4.319380499305335, + "tokens_seen": 213764096 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004723269809428285, + "loss": 3.5277, + "theoretical_loss": 4.319216530600379, + "tokens_seen": 213829632 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004723169508525577, + "loss": 3.401, + "theoretical_loss": 4.31905262620828, + "tokens_seen": 213895168 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047230692076228686, + "loss": 3.5099, + "theoretical_loss": 4.31888878608412, + "tokens_seen": 213960704 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047229689067201605, + "loss": 3.4677, + "theoretical_loss": 4.318725010183025, + "tokens_seen": 214026240 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004722868605817452, + "loss": 3.5114, + "theoretical_loss": 4.318561298460167, + "tokens_seen": 214091776 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004722768304914744, + "loss": 3.5128, + "theoretical_loss": 4.318397650870762, + "tokens_seen": 214157312 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047226680040120364, + "loss": 3.5029, + "theoretical_loss": 4.318234067370072, + "tokens_seen": 214222848 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047225677031093277, + "loss": 3.4886, + "theoretical_loss": 4.318070547913402, + "tokens_seen": 214288384 + }, + { + "epoch": 2.02, + "learning_rate": 0.000472246740220662, + "loss": 3.5092, + "theoretical_loss": 4.317907092456105, + "tokens_seen": 214353920 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047223671013039113, + "loss": 3.5363, + "theoretical_loss": 4.317743700953577, + "tokens_seen": 214419456 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047222668004012037, + "loss": 3.57, + "theoretical_loss": 4.317580373361257, + "tokens_seen": 214484992 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004722166499498496, + "loss": 3.3968, + "theoretical_loss": 4.317417109634632, + "tokens_seen": 214550528 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 550135, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5746114253997803, + "objective/train/theoretical_loss": 4.3172539097292315, + "objective/train/tokens_used": 235076064, + "theoretical_loss": 4.3172539097292315, + "tokens_seen": 214616064 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047220661985957873, + "loss": 3.4466, + "theoretical_loss": 4.3172539097292315, + "tokens_seen": 214616064 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047219658976930797, + "loss": 3.6227, + "theoretical_loss": 4.317090773600628, + "tokens_seen": 214681600 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047218655967903715, + "loss": 3.4589, + "theoretical_loss": 4.316927701204444, + "tokens_seen": 214747136 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047217652958876633, + "loss": 3.4819, + "theoretical_loss": 4.316764692496339, + "tokens_seen": 214812672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004721664994984955, + "loss": 3.5422, + "theoretical_loss": 4.316601747432024, + "tokens_seen": 214878208 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004721564694082247, + "loss": 3.4997, + "theoretical_loss": 4.31643886596725, + "tokens_seen": 214943744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047214643931795387, + "loss": 3.4477, + "theoretical_loss": 4.316276048057812, + "tokens_seen": 215009280 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004721364092276831, + "loss": 3.5056, + "theoretical_loss": 4.316113293659551, + "tokens_seen": 215074816 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047212637913741223, + "loss": 3.4899, + "theoretical_loss": 4.3159506027283525, + "tokens_seen": 215140352 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047211634904714147, + "loss": 3.5342, + "theoretical_loss": 4.315787975220144, + "tokens_seen": 215205888 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004721063189568706, + "loss": 3.4396, + "theoretical_loss": 4.315625411090899, + "tokens_seen": 215271424 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047209628886659983, + "loss": 3.5406, + "theoretical_loss": 4.315462910296633, + "tokens_seen": 215336960 + }, + { + "epoch": 2.02, + "learning_rate": 0.000472086258776329, + "loss": 3.502, + "theoretical_loss": 4.315300472793409, + "tokens_seen": 215402496 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004720762286860582, + "loss": 3.4525, + "theoretical_loss": 4.315138098537327, + "tokens_seen": 215468032 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004720661985957874, + "loss": 3.4729, + "theoretical_loss": 4.3149757874845385, + "tokens_seen": 215533568 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047205616850551656, + "loss": 3.4921, + "theoretical_loss": 4.314813539591235, + "tokens_seen": 215599104 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047204613841524574, + "loss": 3.4757, + "theoretical_loss": 4.314651354813651, + "tokens_seen": 215664640 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047203610832497497, + "loss": 3.5845, + "theoretical_loss": 4.314489233108066, + "tokens_seen": 215730176 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004720260782347041, + "loss": 3.4918, + "theoretical_loss": 4.314327174430803, + "tokens_seen": 215795712 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047201604814443333, + "loss": 3.4526, + "theoretical_loss": 4.314165178738227, + "tokens_seen": 215861248 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004720060180541625, + "loss": 3.4875, + "theoretical_loss": 4.31400324598675, + "tokens_seen": 215926784 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004719959879638917, + "loss": 3.5337, + "theoretical_loss": 4.313841376132823, + "tokens_seen": 215992320 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004719859578736209, + "loss": 3.5402, + "theoretical_loss": 4.313679569132942, + "tokens_seen": 216057856 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047197592778335006, + "loss": 3.526, + "theoretical_loss": 4.313517824943648, + "tokens_seen": 216123392 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047196589769307924, + "loss": 3.5455, + "theoretical_loss": 4.313356143521522, + "tokens_seen": 216188928 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 553058, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.71616792678833, + "objective/train/theoretical_loss": 4.313194524823193, + "objective/train/tokens_used": 236714464, + "theoretical_loss": 4.313194524823193, + "tokens_seen": 216254464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004719558676028085, + "loss": 3.5154, + "theoretical_loss": 4.313194524823193, + "tokens_seen": 216254464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004719458375125376, + "loss": 3.5368, + "theoretical_loss": 4.313032968805328, + "tokens_seen": 216320000 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047193580742226684, + "loss": 3.5379, + "theoretical_loss": 4.3128714754246396, + "tokens_seen": 216385536 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047192577733199596, + "loss": 3.513, + "theoretical_loss": 4.312710044637882, + "tokens_seen": 216451072 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004719157472417252, + "loss": 3.5465, + "theoretical_loss": 4.312548676401855, + "tokens_seen": 216516608 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004719057171514544, + "loss": 3.576, + "theoretical_loss": 4.312387370673398, + "tokens_seen": 216582144 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047189568706118356, + "loss": 3.5269, + "theoretical_loss": 4.312226127409396, + "tokens_seen": 216647680 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047188565697091274, + "loss": 3.4898, + "theoretical_loss": 4.312064946566775, + "tokens_seen": 216713216 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004718756268806419, + "loss": 3.4091, + "theoretical_loss": 4.311903828102503, + "tokens_seen": 216778752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004718655967903711, + "loss": 3.4946, + "theoretical_loss": 4.311742771973594, + "tokens_seen": 216844288 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047185556670010034, + "loss": 3.4048, + "theoretical_loss": 4.311581778137102, + "tokens_seen": 216909824 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047184553660982947, + "loss": 3.4713, + "theoretical_loss": 4.311420846550123, + "tokens_seen": 216975360 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004718355065195587, + "loss": 3.5293, + "theoretical_loss": 4.311259977169798, + "tokens_seen": 217040896 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004718254764292879, + "loss": 3.5843, + "theoretical_loss": 4.311099169953309, + "tokens_seen": 217106432 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047181544633901706, + "loss": 3.495, + "theoretical_loss": 4.310938424857879, + "tokens_seen": 217171968 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047180541624874625, + "loss": 3.4924, + "theoretical_loss": 4.310777741840775, + "tokens_seen": 217237504 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004717953861584754, + "loss": 3.5409, + "theoretical_loss": 4.310617120859308, + "tokens_seen": 217303040 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004717853560682046, + "loss": 3.5249, + "theoretical_loss": 4.310456561870827, + "tokens_seen": 217368576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047177532597793384, + "loss": 3.5145, + "theoretical_loss": 4.310296064832726, + "tokens_seen": 217434112 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047176529588766297, + "loss": 3.4386, + "theoretical_loss": 4.310135629702441, + "tokens_seen": 217499648 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004717552657973922, + "loss": 3.5258, + "theoretical_loss": 4.30997525643745, + "tokens_seen": 217565184 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047174523570712133, + "loss": 3.5504, + "theoretical_loss": 4.309814944995271, + "tokens_seen": 217630720 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047173520561685057, + "loss": 3.5799, + "theoretical_loss": 4.3096546953334665, + "tokens_seen": 217696256 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047172517552657975, + "loss": 3.5283, + "theoretical_loss": 4.3094945074096405, + "tokens_seen": 217761792 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047171514543630893, + "loss": 3.4125, + "theoretical_loss": 4.309334381181437, + "tokens_seen": 217827328 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 556918, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4384846687316895, + "objective/train/theoretical_loss": 4.309174316606544, + "objective/train/tokens_used": 238352864, + "theoretical_loss": 4.309174316606544, + "tokens_seen": 217892864 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004717051153460381, + "loss": 3.4979, + "theoretical_loss": 4.309174316606544, + "tokens_seen": 217892864 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047169508525576735, + "loss": 3.597, + "theoretical_loss": 4.30901431364269, + "tokens_seen": 217958400 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004716850551654965, + "loss": 3.4769, + "theoretical_loss": 4.308854372247646, + "tokens_seen": 218023936 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004716750250752257, + "loss": 3.5185, + "theoretical_loss": 4.308694492379223, + "tokens_seen": 218089472 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047166499498495484, + "loss": 3.4959, + "theoretical_loss": 4.3085346739952755, + "tokens_seen": 218155008 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047165496489468407, + "loss": 3.4516, + "theoretical_loss": 4.3083749170536985, + "tokens_seen": 218220544 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047164493480441325, + "loss": 3.4684, + "theoretical_loss": 4.3082152215124285, + "tokens_seen": 218286080 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047163490471414243, + "loss": 3.6809, + "theoretical_loss": 4.308055587329444, + "tokens_seen": 218351616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004716248746238716, + "loss": 3.4248, + "theoretical_loss": 4.307896014462764, + "tokens_seen": 218417152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004716148445336008, + "loss": 3.3885, + "theoretical_loss": 4.3077365028704495, + "tokens_seen": 218482688 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047160481444333, + "loss": 3.6242, + "theoretical_loss": 4.307577052510602, + "tokens_seen": 218548224 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004715947843530592, + "loss": 3.4485, + "theoretical_loss": 4.307417663341366, + "tokens_seen": 218613760 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047158475426278834, + "loss": 3.4714, + "theoretical_loss": 4.307258335320925, + "tokens_seen": 218679296 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004715747241725176, + "loss": 3.5173, + "theoretical_loss": 4.307099068407504, + "tokens_seen": 218744832 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004715646940822467, + "loss": 3.5628, + "theoretical_loss": 4.30693986255937, + "tokens_seen": 218810368 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047155466399197594, + "loss": 3.5403, + "theoretical_loss": 4.3067807177348305, + "tokens_seen": 218875904 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004715446339017051, + "loss": 3.5236, + "theoretical_loss": 4.3066216338922345, + "tokens_seen": 218941440 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004715346038114343, + "loss": 3.5609, + "theoretical_loss": 4.3064626109899695, + "tokens_seen": 219006976 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004715245737211635, + "loss": 3.5777, + "theoretical_loss": 4.306303648986468, + "tokens_seen": 219072512 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004715145436308927, + "loss": 3.4777, + "theoretical_loss": 4.306144747840199, + "tokens_seen": 219138048 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047150451354062184, + "loss": 3.475, + "theoretical_loss": 4.305985907509675, + "tokens_seen": 219203584 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004714944834503511, + "loss": 3.5617, + "theoretical_loss": 4.305827127953449, + "tokens_seen": 219269120 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004714844533600802, + "loss": 3.5481, + "theoretical_loss": 4.305668409130113, + "tokens_seen": 219334656 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047147442326980944, + "loss": 3.5231, + "theoretical_loss": 4.305509750998301, + "tokens_seen": 219400192 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004714643931795387, + "loss": 3.4572, + "theoretical_loss": 4.3053511535166855, + "tokens_seen": 219465728 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 561573, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3734734058380127, + "objective/train/theoretical_loss": 4.305192616643984, + "objective/train/tokens_used": 239991264, + "theoretical_loss": 4.305192616643984, + "tokens_seen": 219531264 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004714543630892678, + "loss": 3.4805, + "theoretical_loss": 4.305192616643984, + "tokens_seen": 219531264 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047144433299899704, + "loss": 3.5554, + "theoretical_loss": 4.3050341403389485, + "tokens_seen": 219596800 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047143430290872616, + "loss": 3.4474, + "theoretical_loss": 4.304875724560375, + "tokens_seen": 219662336 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004714242728184554, + "loss": 3.5675, + "theoretical_loss": 4.3047173692671015, + "tokens_seen": 219727872 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004714142427281846, + "loss": 3.3757, + "theoretical_loss": 4.304559074418002, + "tokens_seen": 219793408 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047140421263791376, + "loss": 3.5046, + "theoretical_loss": 4.304400839971992, + "tokens_seen": 219858944 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047139418254764294, + "loss": 3.3892, + "theoretical_loss": 4.304242665888028, + "tokens_seen": 219924480 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004713841524573721, + "loss": 3.5336, + "theoretical_loss": 4.304084552125107, + "tokens_seen": 219990016 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004713741223671013, + "loss": 3.4788, + "theoretical_loss": 4.303926498642266, + "tokens_seen": 220055552 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047136409227683054, + "loss": 3.4571, + "theoretical_loss": 4.303768505398581, + "tokens_seen": 220121088 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047135406218655967, + "loss": 3.5411, + "theoretical_loss": 4.303610572353167, + "tokens_seen": 220186624 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004713440320962889, + "loss": 3.5203, + "theoretical_loss": 4.303452699465181, + "tokens_seen": 220252160 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004713340020060181, + "loss": 3.5259, + "theoretical_loss": 4.30329488669382, + "tokens_seen": 220317696 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047132397191574726, + "loss": 3.4983, + "theoretical_loss": 4.3031371339983195, + "tokens_seen": 220383232 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047131394182547645, + "loss": 3.5327, + "theoretical_loss": 4.302979441337956, + "tokens_seen": 220448768 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047130391173520563, + "loss": 3.536, + "theoretical_loss": 4.302821808672043, + "tokens_seen": 220514304 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004712938816449348, + "loss": 3.4482, + "theoretical_loss": 4.302664235959937, + "tokens_seen": 220579840 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047128385155466404, + "loss": 3.509, + "theoretical_loss": 4.3025067231610326, + "tokens_seen": 220645376 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047127382146439317, + "loss": 3.5041, + "theoretical_loss": 4.302349270234763, + "tokens_seen": 220710912 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004712637913741224, + "loss": 3.5342, + "theoretical_loss": 4.3021918771406025, + "tokens_seen": 220776448 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047125376128385153, + "loss": 3.4421, + "theoretical_loss": 4.302034543838065, + "tokens_seen": 220841984 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047124373119358077, + "loss": 3.333, + "theoretical_loss": 4.3018772702867025, + "tokens_seen": 220907520 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047123370110330995, + "loss": 3.512, + "theoretical_loss": 4.301720056446106, + "tokens_seen": 220973056 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047122367101303913, + "loss": 3.3241, + "theoretical_loss": 4.301562902275908, + "tokens_seen": 221038592 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004712136409227683, + "loss": 3.4896, + "theoretical_loss": 4.301405807735779, + "tokens_seen": 221104128 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 564339, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.649526357650757, + "objective/train/theoretical_loss": 4.301248772785428, + "objective/train/tokens_used": 241629664, + "theoretical_loss": 4.301248772785428, + "tokens_seen": 221169664 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047120361083249755, + "loss": 3.6293, + "theoretical_loss": 4.301248772785428, + "tokens_seen": 221169664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004711935807422267, + "loss": 3.5068, + "theoretical_loss": 4.301091797384603, + "tokens_seen": 221235200 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004711835506519559, + "loss": 3.5103, + "theoretical_loss": 4.300934881493094, + "tokens_seen": 221300736 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047117352056168504, + "loss": 3.4145, + "theoretical_loss": 4.300778025070727, + "tokens_seen": 221366272 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047116349047141427, + "loss": 3.4655, + "theoretical_loss": 4.300621228077367, + "tokens_seen": 221431808 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047115346038114345, + "loss": 3.481, + "theoretical_loss": 4.300464490472921, + "tokens_seen": 221497344 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047114343029087263, + "loss": 3.5694, + "theoretical_loss": 4.30030781221733, + "tokens_seen": 221562880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004711334002006018, + "loss": 3.5297, + "theoretical_loss": 4.30015119327058, + "tokens_seen": 221628416 + }, + { + "epoch": 2.02, + "learning_rate": 0.000471123370110331, + "loss": 3.4081, + "theoretical_loss": 4.299994633592689, + "tokens_seen": 221693952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004711133400200602, + "loss": 3.4691, + "theoretical_loss": 4.29983813314372, + "tokens_seen": 221759488 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004711033099297894, + "loss": 3.5062, + "theoretical_loss": 4.299681691883771, + "tokens_seen": 221825024 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047109327983951854, + "loss": 3.4014, + "theoretical_loss": 4.29952530977298, + "tokens_seen": 221890560 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004710832497492478, + "loss": 3.4811, + "theoretical_loss": 4.299368986771523, + "tokens_seen": 221956096 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004710732196589769, + "loss": 3.3963, + "theoretical_loss": 4.299212722839614, + "tokens_seen": 222021632 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047106318956870614, + "loss": 3.5809, + "theoretical_loss": 4.299056517937506, + "tokens_seen": 222087168 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004710531594784353, + "loss": 3.4679, + "theoretical_loss": 4.298900372025494, + "tokens_seen": 222152704 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004710431293881645, + "loss": 3.3793, + "theoretical_loss": 4.298744285063904, + "tokens_seen": 222218240 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004710330992978937, + "loss": 3.4076, + "theoretical_loss": 4.298588257013107, + "tokens_seen": 222283776 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004710230692076229, + "loss": 3.3633, + "theoretical_loss": 4.29843228783351, + "tokens_seen": 222349312 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047101303911735204, + "loss": 3.4524, + "theoretical_loss": 4.298276377485556, + "tokens_seen": 222414848 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004710030090270813, + "loss": 3.448, + "theoretical_loss": 4.298120525929731, + "tokens_seen": 222480384 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004709929789368104, + "loss": 3.4639, + "theoretical_loss": 4.2979647331265545, + "tokens_seen": 222545920 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047098294884653964, + "loss": 3.4657, + "theoretical_loss": 4.297808999036587, + "tokens_seen": 222611456 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004709729187562688, + "loss": 3.4771, + "theoretical_loss": 4.297653323620426, + "tokens_seen": 222676992 + }, + { + "epoch": 2.02, + "learning_rate": 0.000470962888665998, + "loss": 3.5086, + "theoretical_loss": 4.2974977068387075, + "tokens_seen": 222742528 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4262208938598633, + "objective/train/theoretical_loss": 4.297342148652105, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.297342148652105, + "tokens_seen": 222808064 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004709528585757272, + "loss": 3.4734, + "theoretical_loss": 4.297342148652105, + "tokens_seen": 222808064 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047094282848545636, + "loss": 3.5238, + "theoretical_loss": 4.297186649021331, + "tokens_seen": 222873600 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047093279839518555, + "loss": 3.4887, + "theoretical_loss": 4.297031207907134, + "tokens_seen": 222939136 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004709227683049148, + "loss": 3.6223, + "theoretical_loss": 4.296875825270302, + "tokens_seen": 223004672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004709127382146439, + "loss": 3.5369, + "theoretical_loss": 4.296720501071659, + "tokens_seen": 223070208 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047090270812437314, + "loss": 3.499, + "theoretical_loss": 4.29656523527207, + "tokens_seen": 223135744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047089267803410227, + "loss": 3.5285, + "theoretical_loss": 4.296410027832434, + "tokens_seen": 223201280 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004708826479438315, + "loss": 3.5638, + "theoretical_loss": 4.29625487871369, + "tokens_seen": 223266816 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004708726178535607, + "loss": 3.4945, + "theoretical_loss": 4.2960997878768135, + "tokens_seen": 223332352 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047086258776328987, + "loss": 3.5083, + "theoretical_loss": 4.295944755282818, + "tokens_seen": 223397888 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047085255767301905, + "loss": 3.5533, + "theoretical_loss": 4.295789780892754, + "tokens_seen": 223463424 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004708425275827483, + "loss": 3.5261, + "theoretical_loss": 4.295634864667711, + "tokens_seen": 223528960 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004708324974924774, + "loss": 3.4927, + "theoretical_loss": 4.295480006568814, + "tokens_seen": 223594496 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047082246740220665, + "loss": 3.4799, + "theoretical_loss": 4.295325206557227, + "tokens_seen": 223660032 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004708124373119358, + "loss": 3.5125, + "theoretical_loss": 4.29517046459415, + "tokens_seen": 223725568 + }, + { + "epoch": 2.02, + "learning_rate": 0.000470802407221665, + "loss": 3.3489, + "theoretical_loss": 4.295015780640821, + "tokens_seen": 223791104 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004707923771313942, + "loss": 3.4377, + "theoretical_loss": 4.294861154658513, + "tokens_seen": 223856640 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047078234704112337, + "loss": 3.4895, + "theoretical_loss": 4.294706586608541, + "tokens_seen": 223922176 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047077231695085255, + "loss": 3.5373, + "theoretical_loss": 4.294552076452252, + "tokens_seen": 223987712 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047076228686058173, + "loss": 3.5546, + "theoretical_loss": 4.294397624151035, + "tokens_seen": 224053248 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004707522567703109, + "loss": 3.5722, + "theoretical_loss": 4.294243229666311, + "tokens_seen": 224118784 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047074222668004015, + "loss": 3.4146, + "theoretical_loss": 4.29408889295954, + "tokens_seen": 224184320 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004707321965897693, + "loss": 3.4941, + "theoretical_loss": 4.2939346139922225, + "tokens_seen": 224249856 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004707221664994985, + "loss": 3.3877, + "theoretical_loss": 4.293780392725891, + "tokens_seen": 224315392 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047071213640922775, + "loss": 3.606, + "theoretical_loss": 4.293626229122116, + "tokens_seen": 224380928 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6407711505889893, + "objective/train/theoretical_loss": 4.293472123142506, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.293472123142506, + "tokens_seen": 224446464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004707021063189569, + "loss": 3.5011, + "theoretical_loss": 4.293472123142506, + "tokens_seen": 224446464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004706920762286861, + "loss": 3.5283, + "theoretical_loss": 4.293318074748706, + "tokens_seen": 224512000 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047068204613841524, + "loss": 3.4724, + "theoretical_loss": 4.293164083902397, + "tokens_seen": 224577536 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047067201604814447, + "loss": 3.5297, + "theoretical_loss": 4.293010150565297, + "tokens_seen": 224643072 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047066198595787365, + "loss": 3.4834, + "theoretical_loss": 4.292856274699161, + "tokens_seen": 224708608 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047065195586760283, + "loss": 3.4768, + "theoretical_loss": 4.29270245626578, + "tokens_seen": 224774144 + }, + { + "epoch": 2.02, + "learning_rate": 0.000470641925777332, + "loss": 3.4684, + "theoretical_loss": 4.292548695226982, + "tokens_seen": 224839680 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004706318956870612, + "loss": 3.4689, + "theoretical_loss": 4.292394991544631, + "tokens_seen": 224905216 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004706218655967904, + "loss": 3.4973, + "theoretical_loss": 4.292241345180629, + "tokens_seen": 224970752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004706118355065196, + "loss": 3.4436, + "theoretical_loss": 4.292087756096911, + "tokens_seen": 225036288 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047060180541624874, + "loss": 3.3471, + "theoretical_loss": 4.291934224255452, + "tokens_seen": 225101824 + }, + { + "epoch": 2.02, + "learning_rate": 0.000470591775325978, + "loss": 3.4582, + "theoretical_loss": 4.291780749618262, + "tokens_seen": 225167360 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004705817452357071, + "loss": 3.5647, + "theoretical_loss": 4.291627332147385, + "tokens_seen": 225232896 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047057171514543634, + "loss": 3.4789, + "theoretical_loss": 4.291473971804907, + "tokens_seen": 225298432 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004705616850551655, + "loss": 3.3833, + "theoretical_loss": 4.291320668552943, + "tokens_seen": 225363968 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004705516549648947, + "loss": 3.4301, + "theoretical_loss": 4.291167422353648, + "tokens_seen": 225429504 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004705416248746239, + "loss": 3.5329, + "theoretical_loss": 4.291014233169214, + "tokens_seen": 225495040 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004705315947843531, + "loss": 3.4672, + "theoretical_loss": 4.290861100961867, + "tokens_seen": 225560576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047052156469408224, + "loss": 3.4672, + "theoretical_loss": 4.2907080256938706, + "tokens_seen": 225626112 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004705115346038115, + "loss": 3.5427, + "theoretical_loss": 4.290555007327521, + "tokens_seen": 225691648 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004705015045135406, + "loss": 3.6193, + "theoretical_loss": 4.290402045825156, + "tokens_seen": 225757184 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047049147442326984, + "loss": 3.5078, + "theoretical_loss": 4.290249141149143, + "tokens_seen": 225822720 + }, + { + "epoch": 2.02, + "learning_rate": 0.000470481444332999, + "loss": 3.4615, + "theoretical_loss": 4.290096293261891, + "tokens_seen": 225888256 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004704714142427282, + "loss": 3.3696, + "theoretical_loss": 4.28994350212584, + "tokens_seen": 225953792 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004704613841524574, + "loss": 3.4101, + "theoretical_loss": 4.289790767703467, + "tokens_seen": 226019328 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5094780921936035, + "objective/train/theoretical_loss": 4.289638089957288, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.289638089957288, + "tokens_seen": 226084864 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047045135406218656, + "loss": 3.4668, + "theoretical_loss": 4.289638089957288, + "tokens_seen": 226084864 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047044132397191575, + "loss": 3.5102, + "theoretical_loss": 4.28948546884985, + "tokens_seen": 226150400 + }, + { + "epoch": 2.02, + "learning_rate": 0.000470431293881645, + "loss": 3.5214, + "theoretical_loss": 4.289332904343738, + "tokens_seen": 226215936 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004704212637913741, + "loss": 3.4844, + "theoretical_loss": 4.289180396401572, + "tokens_seen": 226281472 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047041123370110334, + "loss": 3.4733, + "theoretical_loss": 4.2890279449860085, + "tokens_seen": 226347008 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047040120361083247, + "loss": 3.5547, + "theoretical_loss": 4.288875550059737, + "tokens_seen": 226412544 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004703911735205617, + "loss": 3.5465, + "theoretical_loss": 4.288723211585486, + "tokens_seen": 226478080 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004703811434302909, + "loss": 3.4884, + "theoretical_loss": 4.288570929526016, + "tokens_seen": 226543616 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047037111334002007, + "loss": 3.4375, + "theoretical_loss": 4.288418703844125, + "tokens_seen": 226609152 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047036108324974925, + "loss": 3.5254, + "theoretical_loss": 4.288266534502645, + "tokens_seen": 226674688 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004703510531594785, + "loss": 3.4548, + "theoretical_loss": 4.288114421464444, + "tokens_seen": 226740224 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004703410230692076, + "loss": 3.4267, + "theoretical_loss": 4.287962364692424, + "tokens_seen": 226805760 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047033099297893685, + "loss": 3.3983, + "theoretical_loss": 4.287810364149525, + "tokens_seen": 226871296 + }, + { + "epoch": 2.02, + "learning_rate": 0.000470320962888666, + "loss": 3.401, + "theoretical_loss": 4.287658419798718, + "tokens_seen": 226936832 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004703109327983952, + "loss": 3.5318, + "theoretical_loss": 4.287506531603013, + "tokens_seen": 227002368 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004703009027081244, + "loss": 3.5887, + "theoretical_loss": 4.287354699525452, + "tokens_seen": 227067904 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047029087261785357, + "loss": 3.6061, + "theoretical_loss": 4.287202923529115, + "tokens_seen": 227133440 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047028084252758275, + "loss": 3.5292, + "theoretical_loss": 4.287051203577113, + "tokens_seen": 227198976 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047027081243731193, + "loss": 3.5184, + "theoretical_loss": 4.286899539632596, + "tokens_seen": 227264512 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004702607823470411, + "loss": 3.387, + "theoretical_loss": 4.286747931658745, + "tokens_seen": 227330048 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047025075225677035, + "loss": 3.4091, + "theoretical_loss": 4.28659637961878, + "tokens_seen": 227395584 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004702407221664995, + "loss": 3.4782, + "theoretical_loss": 4.286444883475951, + "tokens_seen": 227461120 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004702306920762287, + "loss": 3.4827, + "theoretical_loss": 4.286293443193548, + "tokens_seen": 227526656 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047022066198595784, + "loss": 3.3926, + "theoretical_loss": 4.2861420587348915, + "tokens_seen": 227592192 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004702106318956871, + "loss": 3.5525, + "theoretical_loss": 4.285990730063338, + "tokens_seen": 227657728 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.667811155319214, + "objective/train/theoretical_loss": 4.285839457142278, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.285839457142278, + "tokens_seen": 227723264 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047020060180541626, + "loss": 3.4378, + "theoretical_loss": 4.285839457142278, + "tokens_seen": 227723264 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047019057171514544, + "loss": 3.4424, + "theoretical_loss": 4.285688239935138, + "tokens_seen": 227788800 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004701805416248746, + "loss": 3.5495, + "theoretical_loss": 4.285537078405378, + "tokens_seen": 227854336 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047017051153460385, + "loss": 3.4353, + "theoretical_loss": 4.285385972516494, + "tokens_seen": 227919872 + }, + { + "epoch": 2.02, + "learning_rate": 0.000470160481444333, + "loss": 3.5859, + "theoretical_loss": 4.2852349222320125, + "tokens_seen": 227985408 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004701504513540622, + "loss": 3.4626, + "theoretical_loss": 4.285083927515498, + "tokens_seen": 228050944 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047014042126379134, + "loss": 3.6006, + "theoretical_loss": 4.28493298833055, + "tokens_seen": 228116480 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004701303911735206, + "loss": 3.4955, + "theoretical_loss": 4.284782104640799, + "tokens_seen": 228182016 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047012036108324976, + "loss": 3.4942, + "theoretical_loss": 4.284631276409911, + "tokens_seen": 228247552 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047011033099297894, + "loss": 3.456, + "theoretical_loss": 4.284480503601587, + "tokens_seen": 228313088 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004701003009027081, + "loss": 3.5162, + "theoretical_loss": 4.284329786179563, + "tokens_seen": 228378624 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004700902708124373, + "loss": 3.4798, + "theoretical_loss": 4.284179124107606, + "tokens_seen": 228444160 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004700802407221665, + "loss": 3.451, + "theoretical_loss": 4.284028517349519, + "tokens_seen": 228509696 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004700702106318957, + "loss": 3.4841, + "theoretical_loss": 4.283877965869141, + "tokens_seen": 228575232 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047006018054162484, + "loss": 3.342, + "theoretical_loss": 4.283727469630341, + "tokens_seen": 228640768 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004700501504513541, + "loss": 3.472, + "theoretical_loss": 4.2835770285970245, + "tokens_seen": 228706304 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047004012036108326, + "loss": 3.4488, + "theoretical_loss": 4.283426642733131, + "tokens_seen": 228771840 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047003009027081244, + "loss": 3.4493, + "theoretical_loss": 4.2832763120026325, + "tokens_seen": 228837376 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004700200601805416, + "loss": 3.421, + "theoretical_loss": 4.283126036369536, + "tokens_seen": 228902912 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004700100300902708, + "loss": 3.4266, + "theoretical_loss": 4.282975815797882, + "tokens_seen": 228968448 + }, + { + "epoch": 2.02, + "learning_rate": 0.00047, + "loss": 3.6056, + "theoretical_loss": 4.282825650251745, + "tokens_seen": 229033984 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004699899699097292, + "loss": 3.4103, + "theoretical_loss": 4.282675539695231, + "tokens_seen": 229099520 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046997993981945835, + "loss": 3.4648, + "theoretical_loss": 4.282525484092483, + "tokens_seen": 229165056 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004699699097291876, + "loss": 3.4618, + "theoretical_loss": 4.282375483407677, + "tokens_seen": 229230592 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046995987963891676, + "loss": 3.5067, + "theoretical_loss": 4.28222553760502, + "tokens_seen": 229296128 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3004913330078125, + "objective/train/theoretical_loss": 4.282075646648755, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.282075646648755, + "tokens_seen": 229361664 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046994984954864595, + "loss": 3.4202, + "theoretical_loss": 4.282075646648755, + "tokens_seen": 229361664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004699398194583752, + "loss": 3.3929, + "theoretical_loss": 4.281925810503157, + "tokens_seen": 229427200 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004699297893681043, + "loss": 3.4206, + "theoretical_loss": 4.281776029132537, + "tokens_seen": 229492736 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046991975927783354, + "loss": 3.4838, + "theoretical_loss": 4.281626302501236, + "tokens_seen": 229558272 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046990972918756267, + "loss": 3.4581, + "theoretical_loss": 4.281476630573632, + "tokens_seen": 229623808 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004698996990972919, + "loss": 3.4672, + "theoretical_loss": 4.281327013314131, + "tokens_seen": 229689344 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004698896690070211, + "loss": 3.495, + "theoretical_loss": 4.28117745068718, + "tokens_seen": 229754880 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046987963891675027, + "loss": 3.2944, + "theoretical_loss": 4.281027942657252, + "tokens_seen": 229820416 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046986960882647945, + "loss": 3.4439, + "theoretical_loss": 4.280878489188858, + "tokens_seen": 229885952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004698595787362087, + "loss": 3.5105, + "theoretical_loss": 4.280729090246538, + "tokens_seen": 229951488 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004698495486459378, + "loss": 3.5542, + "theoretical_loss": 4.28057974579487, + "tokens_seen": 230017024 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046983951855566705, + "loss": 3.3953, + "theoretical_loss": 4.28043045579846, + "tokens_seen": 230082560 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004698294884653962, + "loss": 3.5254, + "theoretical_loss": 4.280281220221953, + "tokens_seen": 230148096 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004698194583751254, + "loss": 3.4414, + "theoretical_loss": 4.280132039030021, + "tokens_seen": 230213632 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004698094282848546, + "loss": 3.4009, + "theoretical_loss": 4.279982912187372, + "tokens_seen": 230279168 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046979939819458377, + "loss": 3.4663, + "theoretical_loss": 4.279833839658748, + "tokens_seen": 230344704 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046978936810431295, + "loss": 3.5093, + "theoretical_loss": 4.279684821408922, + "tokens_seen": 230410240 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046977933801404213, + "loss": 3.5052, + "theoretical_loss": 4.279535857402699, + "tokens_seen": 230475776 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004697693079237713, + "loss": 3.5071, + "theoretical_loss": 4.27938694760492, + "tokens_seen": 230541312 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046975927783350055, + "loss": 3.3882, + "theoretical_loss": 4.2792380919804565, + "tokens_seen": 230606848 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004697492477432297, + "loss": 3.412, + "theoretical_loss": 4.279089290494212, + "tokens_seen": 230672384 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004697392176529589, + "loss": 3.4704, + "theoretical_loss": 4.2789405431111245, + "tokens_seen": 230737920 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046972918756268804, + "loss": 3.538, + "theoretical_loss": 4.278791849796165, + "tokens_seen": 230803456 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004697191574724173, + "loss": 3.5494, + "theoretical_loss": 4.2786432105143355, + "tokens_seen": 230868992 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046970912738214646, + "loss": 3.4956, + "theoretical_loss": 4.278494625230671, + "tokens_seen": 230934528 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5833725929260254, + "objective/train/theoretical_loss": 4.27834609391024, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.27834609391024, + "tokens_seen": 231000064 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046969909729187564, + "loss": 3.5909, + "theoretical_loss": 4.27834609391024, + "tokens_seen": 231000064 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004696890672016048, + "loss": 3.5367, + "theoretical_loss": 4.278197616518142, + "tokens_seen": 231065600 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046967903711133405, + "loss": 3.5433, + "theoretical_loss": 4.27804919301951, + "tokens_seen": 231131136 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004696690070210632, + "loss": 3.5276, + "theoretical_loss": 4.277900823379509, + "tokens_seen": 231196672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004696589769307924, + "loss": 3.5167, + "theoretical_loss": 4.277752507563337, + "tokens_seen": 231262208 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046964894684052154, + "loss": 3.4563, + "theoretical_loss": 4.2776042455362235, + "tokens_seen": 231327744 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004696389167502508, + "loss": 3.4646, + "theoretical_loss": 4.277456037263431, + "tokens_seen": 231393280 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046962888665997996, + "loss": 3.3889, + "theoretical_loss": 4.277307882710255, + "tokens_seen": 231458816 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046961885656970914, + "loss": 3.3983, + "theoretical_loss": 4.27715978184202, + "tokens_seen": 231524352 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004696088264794383, + "loss": 3.3848, + "theoretical_loss": 4.277011734624085, + "tokens_seen": 231589888 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004695987963891675, + "loss": 3.494, + "theoretical_loss": 4.2768637410218435, + "tokens_seen": 231655424 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004695887662988967, + "loss": 3.4501, + "theoretical_loss": 4.276715801000716, + "tokens_seen": 231720960 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004695787362086259, + "loss": 3.4997, + "theoretical_loss": 4.27656791452616, + "tokens_seen": 231786496 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046956870611835505, + "loss": 3.5721, + "theoretical_loss": 4.276420081563661, + "tokens_seen": 231852032 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004695586760280843, + "loss": 3.5595, + "theoretical_loss": 4.2762723020787385, + "tokens_seen": 231917568 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046954864593781346, + "loss": 3.4204, + "theoretical_loss": 4.276124576036944, + "tokens_seen": 231983104 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046953861584754264, + "loss": 3.375, + "theoretical_loss": 4.27597690340386, + "tokens_seen": 232048640 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004695285857572718, + "loss": 3.5189, + "theoretical_loss": 4.275829284145104, + "tokens_seen": 232114176 + }, + { + "epoch": 2.02, + "learning_rate": 0.000469518555667001, + "loss": 3.4095, + "theoretical_loss": 4.275681718226319, + "tokens_seen": 232179712 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004695085255767302, + "loss": 3.4642, + "theoretical_loss": 4.2755342056131855, + "tokens_seen": 232245248 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004694984954864594, + "loss": 3.441, + "theoretical_loss": 4.275386746271415, + "tokens_seen": 232310784 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046948846539618855, + "loss": 3.3844, + "theoretical_loss": 4.275239340166747, + "tokens_seen": 232376320 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004694784353059178, + "loss": 3.5081, + "theoretical_loss": 4.275091987264958, + "tokens_seen": 232441856 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004694684052156469, + "loss": 3.4688, + "theoretical_loss": 4.2749446875318515, + "tokens_seen": 232507392 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046945837512537615, + "loss": 3.4289, + "theoretical_loss": 4.274797440933265, + "tokens_seen": 232572928 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5150699615478516, + "objective/train/theoretical_loss": 4.274650247435068, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.274650247435068, + "tokens_seen": 232638464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004694483450351053, + "loss": 3.469, + "theoretical_loss": 4.274650247435068, + "tokens_seen": 232638464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004694383149448345, + "loss": 3.4642, + "theoretical_loss": 4.274503107003159, + "tokens_seen": 232704000 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004694282848545637, + "loss": 3.5994, + "theoretical_loss": 4.274356019603472, + "tokens_seen": 232769536 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046941825476429287, + "loss": 3.4261, + "theoretical_loss": 4.274208985201967, + "tokens_seen": 232835072 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046940822467402205, + "loss": 3.4685, + "theoretical_loss": 4.274062003764641, + "tokens_seen": 232900608 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004693981945837513, + "loss": 3.4264, + "theoretical_loss": 4.273915075257518, + "tokens_seen": 232966144 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004693881644934804, + "loss": 3.4324, + "theoretical_loss": 4.273768199646657, + "tokens_seen": 233031680 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046937813440320965, + "loss": 3.4811, + "theoretical_loss": 4.273621376898146, + "tokens_seen": 233097216 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046936810431293883, + "loss": 3.4523, + "theoretical_loss": 4.2734746069781036, + "tokens_seen": 233162752 + }, + { + "epoch": 2.02, + "learning_rate": 0.000469358074222668, + "loss": 3.4757, + "theoretical_loss": 4.273327889852682, + "tokens_seen": 233228288 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004693480441323972, + "loss": 3.5692, + "theoretical_loss": 4.273181225488064, + "tokens_seen": 233293824 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004693380140421264, + "loss": 3.5445, + "theoretical_loss": 4.273034613850461, + "tokens_seen": 233359360 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046932798395185555, + "loss": 3.5012, + "theoretical_loss": 4.27288805490612, + "tokens_seen": 233424896 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004693179538615848, + "loss": 3.4574, + "theoretical_loss": 4.272741548621314, + "tokens_seen": 233490432 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004693079237713139, + "loss": 3.486, + "theoretical_loss": 4.272595094962352, + "tokens_seen": 233555968 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046929789368104315, + "loss": 3.543, + "theoretical_loss": 4.272448693895569, + "tokens_seen": 233621504 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004692878635907723, + "loss": 3.491, + "theoretical_loss": 4.272302345387335, + "tokens_seen": 233687040 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004692778335005015, + "loss": 3.5102, + "theoretical_loss": 4.2721560494040505, + "tokens_seen": 233752576 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004692678034102307, + "loss": 3.394, + "theoretical_loss": 4.272009805912145, + "tokens_seen": 233818112 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004692577733199599, + "loss": 3.4718, + "theoretical_loss": 4.271863614878079, + "tokens_seen": 233883648 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046924774322968906, + "loss": 3.4245, + "theoretical_loss": 4.271717476268345, + "tokens_seen": 233949184 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046923771313941824, + "loss": 3.5273, + "theoretical_loss": 4.271571390049467, + "tokens_seen": 234014720 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004692276830491474, + "loss": 3.5172, + "theoretical_loss": 4.271425356187998, + "tokens_seen": 234080256 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046921765295887666, + "loss": 3.414, + "theoretical_loss": 4.271279374650521, + "tokens_seen": 234145792 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046920762286860584, + "loss": 3.4805, + "theoretical_loss": 4.2711334454036525, + "tokens_seen": 234211328 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3115851879119873, + "objective/train/theoretical_loss": 4.270987568414038, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.270987568414038, + "tokens_seen": 234276864 + }, + { + "epoch": 2.02, + "learning_rate": 0.000469197592778335, + "loss": 3.4741, + "theoretical_loss": 4.270987568414038, + "tokens_seen": 234276864 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046918756268806425, + "loss": 3.4641, + "theoretical_loss": 4.270841743648353, + "tokens_seen": 234342400 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004691775325977934, + "loss": 3.4636, + "theoretical_loss": 4.270695971073305, + "tokens_seen": 234407936 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004691675025075226, + "loss": 3.4732, + "theoretical_loss": 4.2705502506556305, + "tokens_seen": 234473472 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046915747241725174, + "loss": 3.5255, + "theoretical_loss": 4.270404582362098, + "tokens_seen": 234539008 + }, + { + "epoch": 2.02, + "learning_rate": 0.000469147442326981, + "loss": 3.5126, + "theoretical_loss": 4.270258966159506, + "tokens_seen": 234604544 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046913741223671016, + "loss": 3.4864, + "theoretical_loss": 4.270113402014682, + "tokens_seen": 234670080 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046912738214643934, + "loss": 3.4404, + "theoretical_loss": 4.269967889894486, + "tokens_seen": 234735616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004691173520561685, + "loss": 3.5357, + "theoretical_loss": 4.269822429765807, + "tokens_seen": 234801152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004691073219658977, + "loss": 3.44, + "theoretical_loss": 4.269677021595566, + "tokens_seen": 234866688 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004690972918756269, + "loss": 3.5593, + "theoretical_loss": 4.26953166535071, + "tokens_seen": 234932224 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004690872617853561, + "loss": 3.5327, + "theoretical_loss": 4.269386360998222, + "tokens_seen": 234997760 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046907723169508525, + "loss": 3.4481, + "theoretical_loss": 4.269241108505112, + "tokens_seen": 235063296 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004690672016048145, + "loss": 3.3127, + "theoretical_loss": 4.269095907838419, + "tokens_seen": 235128832 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046905717151454366, + "loss": 3.4706, + "theoretical_loss": 4.2689507589652145, + "tokens_seen": 235194368 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046904714142427284, + "loss": 3.4553, + "theoretical_loss": 4.2688056618526, + "tokens_seen": 235259904 + }, + { + "epoch": 2.02, + "learning_rate": 0.000469037111334002, + "loss": 3.5823, + "theoretical_loss": 4.268660616467706, + "tokens_seen": 235325440 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004690270812437312, + "loss": 3.5889, + "theoretical_loss": 4.268515622777692, + "tokens_seen": 235390976 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004690170511534604, + "loss": 3.4879, + "theoretical_loss": 4.268370680749751, + "tokens_seen": 235456512 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004690070210631896, + "loss": 3.3844, + "theoretical_loss": 4.268225790351103, + "tokens_seen": 235522048 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046899699097291875, + "loss": 3.4982, + "theoretical_loss": 4.268080951548998, + "tokens_seen": 235587584 + }, + { + "epoch": 2.02, + "learning_rate": 0.000468986960882648, + "loss": 3.4759, + "theoretical_loss": 4.267936164310717, + "tokens_seen": 235653120 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004689769307923771, + "loss": 3.4707, + "theoretical_loss": 4.26779142860357, + "tokens_seen": 235718656 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046896690070210635, + "loss": 3.4282, + "theoretical_loss": 4.267646744394899, + "tokens_seen": 235784192 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046895687061183553, + "loss": 3.5829, + "theoretical_loss": 4.267502111652071, + "tokens_seen": 235849728 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5653693675994873, + "objective/train/theoretical_loss": 4.267357530342489, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.267357530342489, + "tokens_seen": 235915264 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004689468405215647, + "loss": 3.4388, + "theoretical_loss": 4.267357530342489, + "tokens_seen": 235915264 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004689368104312939, + "loss": 3.4281, + "theoretical_loss": 4.267213000433579, + "tokens_seen": 235980800 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046892678034102307, + "loss": 3.4959, + "theoretical_loss": 4.267068521892803, + "tokens_seen": 236046336 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046891675025075225, + "loss": 3.5007, + "theoretical_loss": 4.266924094687649, + "tokens_seen": 236111872 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004689067201604815, + "loss": 3.414, + "theoretical_loss": 4.266779718785634, + "tokens_seen": 236177408 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004688966900702106, + "loss": 3.5623, + "theoretical_loss": 4.266635394154307, + "tokens_seen": 236242944 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046888665997993985, + "loss": 3.3703, + "theoretical_loss": 4.266491120761246, + "tokens_seen": 236308480 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046887662988966903, + "loss": 3.5236, + "theoretical_loss": 4.2663468985740565, + "tokens_seen": 236374016 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004688665997993982, + "loss": 3.3497, + "theoretical_loss": 4.266202727560374, + "tokens_seen": 236439552 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004688565697091274, + "loss": 3.5417, + "theoretical_loss": 4.266058607687867, + "tokens_seen": 236505088 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004688465396188566, + "loss": 3.4712, + "theoretical_loss": 4.26591453892423, + "tokens_seen": 236570624 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046883650952858575, + "loss": 3.5026, + "theoretical_loss": 4.265770521237185, + "tokens_seen": 236636160 + }, + { + "epoch": 2.02, + "learning_rate": 0.000468826479438315, + "loss": 3.4472, + "theoretical_loss": 4.265626554594489, + "tokens_seen": 236701696 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004688164493480441, + "loss": 3.4746, + "theoretical_loss": 4.265482638963922, + "tokens_seen": 236767232 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046880641925777335, + "loss": 3.3811, + "theoretical_loss": 4.2653387743132996, + "tokens_seen": 236832768 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004687963891675025, + "loss": 3.4135, + "theoretical_loss": 4.265194960610461, + "tokens_seen": 236898304 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004687863590772317, + "loss": 3.4291, + "theoretical_loss": 4.2650511978232775, + "tokens_seen": 236963840 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004687763289869609, + "loss": 3.3768, + "theoretical_loss": 4.2649074859196485, + "tokens_seen": 237029376 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004687662988966901, + "loss": 3.4189, + "theoretical_loss": 4.264763824867504, + "tokens_seen": 237094912 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046875626880641926, + "loss": 3.4079, + "theoretical_loss": 4.264620214634801, + "tokens_seen": 237160448 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046874623871614844, + "loss": 3.4803, + "theoretical_loss": 4.264476655189528, + "tokens_seen": 237225984 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004687362086258776, + "loss": 3.4199, + "theoretical_loss": 4.264333146499699, + "tokens_seen": 237291520 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046872617853560686, + "loss": 3.4734, + "theoretical_loss": 4.264189688533361, + "tokens_seen": 237357056 + }, + { + "epoch": 2.02, + "learning_rate": 0.000468716148445336, + "loss": 3.4398, + "theoretical_loss": 4.264046281258587, + "tokens_seen": 237422592 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004687061183550652, + "loss": 3.4782, + "theoretical_loss": 4.263902924643479, + "tokens_seen": 237488128 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.439204216003418, + "objective/train/theoretical_loss": 4.26375961865617, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.26375961865617, + "tokens_seen": 237553664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004686960882647944, + "loss": 3.4987, + "theoretical_loss": 4.26375961865617, + "tokens_seen": 237553664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004686860581745236, + "loss": 3.4144, + "theoretical_loss": 4.26361636326482, + "tokens_seen": 237619200 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046867602808425276, + "loss": 3.5284, + "theoretical_loss": 4.26347315843762, + "tokens_seen": 237684736 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046866599799398194, + "loss": 3.606, + "theoretical_loss": 4.263330004142785, + "tokens_seen": 237750272 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004686559679037111, + "loss": 3.5015, + "theoretical_loss": 4.263186900348564, + "tokens_seen": 237815808 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046864593781344036, + "loss": 3.4355, + "theoretical_loss": 4.263043847023232, + "tokens_seen": 237881344 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004686359077231695, + "loss": 3.3787, + "theoretical_loss": 4.2629008441350935, + "tokens_seen": 237946880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004686258776328987, + "loss": 3.4793, + "theoretical_loss": 4.262757891652481, + "tokens_seen": 238012416 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046861584754262785, + "loss": 3.5023, + "theoretical_loss": 4.262614989543756, + "tokens_seen": 238077952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004686058174523571, + "loss": 3.4444, + "theoretical_loss": 4.262472137777309, + "tokens_seen": 238143488 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046859578736208626, + "loss": 3.3768, + "theoretical_loss": 4.2623293363215575, + "tokens_seen": 238209024 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046858575727181545, + "loss": 3.5466, + "theoretical_loss": 4.26218658514495, + "tokens_seen": 238274560 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004685757271815446, + "loss": 3.4439, + "theoretical_loss": 4.26204388421596, + "tokens_seen": 238340096 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004685656970912738, + "loss": 3.5216, + "theoretical_loss": 4.261901233503093, + "tokens_seen": 238405632 + }, + { + "epoch": 2.02, + "learning_rate": 0.000468555667001003, + "loss": 3.4962, + "theoretical_loss": 4.261758632974881, + "tokens_seen": 238471168 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004685456369107322, + "loss": 3.4472, + "theoretical_loss": 4.261616082599884, + "tokens_seen": 238536704 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046853560682046135, + "loss": 3.4676, + "theoretical_loss": 4.261473582346692, + "tokens_seen": 238602240 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004685255767301906, + "loss": 3.552, + "theoretical_loss": 4.261331132183921, + "tokens_seen": 238667776 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046851554663991977, + "loss": 3.4921, + "theoretical_loss": 4.2611887320802175, + "tokens_seen": 238733312 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046850551654964895, + "loss": 3.4938, + "theoretical_loss": 4.261046382004255, + "tokens_seen": 238798848 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046849548645937813, + "loss": 3.4676, + "theoretical_loss": 4.2609040819247355, + "tokens_seen": 238864384 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004684854563691073, + "loss": 3.4017, + "theoretical_loss": 4.260761831810389, + "tokens_seen": 238929920 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004684754262788365, + "loss": 3.5401, + "theoretical_loss": 4.260619631629974, + "tokens_seen": 238995456 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046846539618856573, + "loss": 3.4934, + "theoretical_loss": 4.260477481352276, + "tokens_seen": 239060992 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004684553660982949, + "loss": 3.4291, + "theoretical_loss": 4.2603353809461115, + "tokens_seen": 239126528 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4320733547210693, + "objective/train/theoretical_loss": 4.26019333038032, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.26019333038032, + "tokens_seen": 239192064 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004684453360080241, + "loss": 3.4611, + "theoretical_loss": 4.26019333038032, + "tokens_seen": 239192064 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046843530591775327, + "loss": 3.375, + "theoretical_loss": 4.260051329623774, + "tokens_seen": 239257600 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046842527582748245, + "loss": 3.4951, + "theoretical_loss": 4.2599093786453714, + "tokens_seen": 239323136 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004684152457372117, + "loss": 3.3314, + "theoretical_loss": 4.259767477414038, + "tokens_seen": 239388672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004684052156469408, + "loss": 3.4878, + "theoretical_loss": 4.259625625898729, + "tokens_seen": 239454208 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046839518555667005, + "loss": 3.3614, + "theoretical_loss": 4.259483824068425, + "tokens_seen": 239519744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046838515546639923, + "loss": 3.5823, + "theoretical_loss": 4.259342071892138, + "tokens_seen": 239585280 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004683751253761284, + "loss": 3.5727, + "theoretical_loss": 4.259200369338904, + "tokens_seen": 239650816 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004683650952858576, + "loss": 3.5667, + "theoretical_loss": 4.259058716377789, + "tokens_seen": 239716352 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004683550651955868, + "loss": 3.4661, + "theoretical_loss": 4.258917112977886, + "tokens_seen": 239781888 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046834503510531595, + "loss": 3.4185, + "theoretical_loss": 4.258775559108317, + "tokens_seen": 239847424 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004683350050150452, + "loss": 3.4376, + "theoretical_loss": 4.258634054738229, + "tokens_seen": 239912960 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004683249749247743, + "loss": 3.4657, + "theoretical_loss": 4.258492599836799, + "tokens_seen": 239978496 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046831494483450355, + "loss": 3.345, + "theoretical_loss": 4.2583511943732315, + "tokens_seen": 240044032 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004683049147442327, + "loss": 3.4154, + "theoretical_loss": 4.258209838316756, + "tokens_seen": 240109568 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004682948846539619, + "loss": 3.4702, + "theoretical_loss": 4.258068531636634, + "tokens_seen": 240175104 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004682848545636911, + "loss": 3.5274, + "theoretical_loss": 4.25792727430215, + "tokens_seen": 240240640 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004682748244734203, + "loss": 3.5393, + "theoretical_loss": 4.257786066282619, + "tokens_seen": 240306176 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046826479438314946, + "loss": 3.4486, + "theoretical_loss": 4.257644907547381, + "tokens_seen": 240371712 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046825476429287864, + "loss": 3.5143, + "theoretical_loss": 4.257503798065807, + "tokens_seen": 240437248 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004682447342026078, + "loss": 3.3546, + "theoretical_loss": 4.2573627378072905, + "tokens_seen": 240502784 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046823470411233706, + "loss": 3.5681, + "theoretical_loss": 4.257221726741257, + "tokens_seen": 240568320 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004682246740220662, + "loss": 3.4727, + "theoretical_loss": 4.257080764837157, + "tokens_seen": 240633856 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004682146439317954, + "loss": 3.3799, + "theoretical_loss": 4.256939852064468, + "tokens_seen": 240699392 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004682046138415246, + "loss": 3.4338, + "theoretical_loss": 4.256798988392696, + "tokens_seen": 240764928 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.747337818145752, + "objective/train/theoretical_loss": 4.256658173791373, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.256658173791373, + "tokens_seen": 240830464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004681945837512538, + "loss": 3.4531, + "theoretical_loss": 4.256658173791373, + "tokens_seen": 240830464 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046818455366098296, + "loss": 3.5203, + "theoretical_loss": 4.256517408230059, + "tokens_seen": 240896000 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046817452357071214, + "loss": 3.4055, + "theoretical_loss": 4.256376691678342, + "tokens_seen": 240961536 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004681644934804413, + "loss": 3.378, + "theoretical_loss": 4.256236024105834, + "tokens_seen": 241027072 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046815446339017056, + "loss": 3.4125, + "theoretical_loss": 4.256095405482178, + "tokens_seen": 241092608 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004681444332998997, + "loss": 3.2709, + "theoretical_loss": 4.255954835777041, + "tokens_seen": 241158144 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004681344032096289, + "loss": 3.4265, + "theoretical_loss": 4.255814314960118, + "tokens_seen": 241223680 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046812437311935805, + "loss": 3.4345, + "theoretical_loss": 4.255673843001134, + "tokens_seen": 241289216 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004681143430290873, + "loss": 3.5304, + "theoretical_loss": 4.255533419869835, + "tokens_seen": 241354752 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046810431293881646, + "loss": 3.5578, + "theoretical_loss": 4.255393045535998, + "tokens_seen": 241420288 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046809428284854565, + "loss": 3.4293, + "theoretical_loss": 4.255252719969427, + "tokens_seen": 241485824 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004680842527582748, + "loss": 3.4474, + "theoretical_loss": 4.255112443139952, + "tokens_seen": 241551360 + }, + { + "epoch": 2.02, + "learning_rate": 0.000468074222668004, + "loss": 3.5217, + "theoretical_loss": 4.254972215017427, + "tokens_seen": 241616896 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004680641925777332, + "loss": 3.4646, + "theoretical_loss": 4.25483203557174, + "tokens_seen": 241682432 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004680541624874624, + "loss": 3.5005, + "theoretical_loss": 4.254691904772798, + "tokens_seen": 241747968 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046804413239719155, + "loss": 3.5584, + "theoretical_loss": 4.25455182259054, + "tokens_seen": 241813504 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004680341023069208, + "loss": 3.4313, + "theoretical_loss": 4.254411788994929, + "tokens_seen": 241879040 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046802407221664997, + "loss": 3.4063, + "theoretical_loss": 4.254271803955955, + "tokens_seen": 241944576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046801404212637915, + "loss": 3.5062, + "theoretical_loss": 4.254131867443637, + "tokens_seen": 242010112 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046800401203610833, + "loss": 3.4869, + "theoretical_loss": 4.253991979428017, + "tokens_seen": 242075648 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004679939819458375, + "loss": 3.4524, + "theoretical_loss": 4.253852139879166, + "tokens_seen": 242141184 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004679839518555667, + "loss": 3.4626, + "theoretical_loss": 4.2537123487671815, + "tokens_seen": 242206720 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046797392176529593, + "loss": 3.4744, + "theoretical_loss": 4.253572606062186, + "tokens_seen": 242272256 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046796389167502505, + "loss": 3.4427, + "theoretical_loss": 4.253432911734331, + "tokens_seen": 242337792 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004679538615847543, + "loss": 3.5673, + "theoretical_loss": 4.253293265753792, + "tokens_seen": 242403328 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 567264, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6890623569488525, + "objective/train/theoretical_loss": 4.253153668090771, + "objective/train/tokens_used": 242965984, + "theoretical_loss": 4.253153668090771, + "tokens_seen": 242468864 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004679438314944834, + "loss": 3.481, + "theoretical_loss": 4.253153668090771, + "tokens_seen": 242468864 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046793380140421265, + "loss": 3.501, + "theoretical_loss": 4.2530141187155, + "tokens_seen": 242534400 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046792377131394183, + "loss": 3.5207, + "theoretical_loss": 4.252874617598232, + "tokens_seen": 242599936 + }, + { + "epoch": 2.02, + "learning_rate": 0.000467913741223671, + "loss": 3.5433, + "theoretical_loss": 4.252735164709252, + "tokens_seen": 242665472 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004679037111334002, + "loss": 3.5594, + "theoretical_loss": 4.252595760018865, + "tokens_seen": 242731008 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046789368104312943, + "loss": 3.4624, + "theoretical_loss": 4.2524564034974075, + "tokens_seen": 242796544 + }, + { + "epoch": 2.02, + "learning_rate": 0.00046788365095285856, + "loss": 3.5935, + "theoretical_loss": 4.252317095115241, + "tokens_seen": 242862080 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004678736208625878, + "loss": 3.4748, + "theoretical_loss": 4.252177834842751, + "tokens_seen": 242927616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0004678635907723169, + "loss": 3.3296, + "theoretical_loss": 4.252053846643013, + "tokens_seen": 242985984 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046785356068204616, + "loss": 3.3455, + "theoretical_loss": 4.251914677247056, + "tokens_seen": 243051520 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046784353059177534, + "loss": 3.4769, + "theoretical_loss": 4.251775555875326, + "tokens_seen": 243117056 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004678335005015045, + "loss": 3.3548, + "theoretical_loss": 4.251636482498309, + "tokens_seen": 243182592 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004678234704112337, + "loss": 3.4627, + "theoretical_loss": 4.251497457086521, + "tokens_seen": 243248128 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004678134403209629, + "loss": 3.3824, + "theoretical_loss": 4.251358479610504, + "tokens_seen": 243313664 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046780341023069206, + "loss": 3.4554, + "theoretical_loss": 4.251219550040823, + "tokens_seen": 243379200 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004677933801404213, + "loss": 3.3578, + "theoretical_loss": 4.251080668348074, + "tokens_seen": 243444736 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004677833500501504, + "loss": 3.409, + "theoretical_loss": 4.250941834502873, + "tokens_seen": 243510272 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046777331995987966, + "loss": 3.4296, + "theoretical_loss": 4.250803048475867, + "tokens_seen": 243575808 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004677632898696088, + "loss": 3.1251, + "theoretical_loss": 4.250664310237727, + "tokens_seen": 243641344 + }, + { + "epoch": 3.0, + "learning_rate": 0.000467753259779338, + "loss": 3.3785, + "theoretical_loss": 4.250525619759148, + "tokens_seen": 243706880 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004677432296890672, + "loss": 3.2767, + "theoretical_loss": 4.250386977010853, + "tokens_seen": 243772416 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004677331995987964, + "loss": 3.3807, + "theoretical_loss": 4.250248381963592, + "tokens_seen": 243837952 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046772316950852556, + "loss": 3.3321, + "theoretical_loss": 4.250109834588138, + "tokens_seen": 243903488 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004677131394182548, + "loss": 3.2971, + "theoretical_loss": 4.249971334855291, + "tokens_seen": 243969024 + }, + { + "epoch": 3.0, + "learning_rate": 0.000467703109327984, + "loss": 3.3752, + "theoretical_loss": 4.249832882735878, + "tokens_seen": 244034560 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 617708, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5776028633117676, + "objective/train/theoretical_loss": 4.249694478200748, + "objective/train/tokens_used": 264560096, + "theoretical_loss": 4.249694478200748, + "tokens_seen": 244100096 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046769307923771316, + "loss": 3.4955, + "theoretical_loss": 4.249694478200748, + "tokens_seen": 244100096 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046768304914744234, + "loss": 3.42, + "theoretical_loss": 4.249556121220779, + "tokens_seen": 244165632 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004676730190571715, + "loss": 3.3031, + "theoretical_loss": 4.249417811766874, + "tokens_seen": 244231168 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046766298896690076, + "loss": 3.4557, + "theoretical_loss": 4.249279549809962, + "tokens_seen": 244296704 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004676529588766299, + "loss": 3.3932, + "theoretical_loss": 4.249141335320995, + "tokens_seen": 244362240 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004676429287863591, + "loss": 3.3086, + "theoretical_loss": 4.249003168270954, + "tokens_seen": 244427776 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046763289869608825, + "loss": 3.4533, + "theoretical_loss": 4.248865048630842, + "tokens_seen": 244493312 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004676228686058175, + "loss": 3.4026, + "theoretical_loss": 4.248726976371692, + "tokens_seen": 244558848 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046761283851554666, + "loss": 3.512, + "theoretical_loss": 4.248588951464558, + "tokens_seen": 244624384 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046760280842527585, + "loss": 3.3988, + "theoretical_loss": 4.248450973880521, + "tokens_seen": 244689920 + }, + { + "epoch": 3.0, + "learning_rate": 0.000467592778335005, + "loss": 3.4345, + "theoretical_loss": 4.24831304359069, + "tokens_seen": 244755456 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004675827482447342, + "loss": 3.3965, + "theoretical_loss": 4.248175160566195, + "tokens_seen": 244820992 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004675727181544634, + "loss": 3.278, + "theoretical_loss": 4.248037324778194, + "tokens_seen": 244886528 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004675626880641926, + "loss": 3.3659, + "theoretical_loss": 4.247899536197869, + "tokens_seen": 244952064 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046755265797392175, + "loss": 3.4802, + "theoretical_loss": 4.247761794796428, + "tokens_seen": 245017600 + }, + { + "epoch": 3.0, + "learning_rate": 0.000467542627883651, + "loss": 3.323, + "theoretical_loss": 4.247624100545106, + "tokens_seen": 245083136 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046753259779338017, + "loss": 3.4578, + "theoretical_loss": 4.24748645341516, + "tokens_seen": 245148672 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046752256770310935, + "loss": 3.4801, + "theoretical_loss": 4.247348853377874, + "tokens_seen": 245214208 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046751253761283853, + "loss": 3.4449, + "theoretical_loss": 4.247211300404556, + "tokens_seen": 245279744 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004675025075225677, + "loss": 3.4369, + "theoretical_loss": 4.247073794466543, + "tokens_seen": 245345280 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004674924774322969, + "loss": 3.2605, + "theoretical_loss": 4.24693633553519, + "tokens_seen": 245410816 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046748244734202613, + "loss": 3.4328, + "theoretical_loss": 4.246798923581884, + "tokens_seen": 245476352 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046747241725175525, + "loss": 3.3628, + "theoretical_loss": 4.246661558578032, + "tokens_seen": 245541888 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004674623871614845, + "loss": 3.4923, + "theoretical_loss": 4.246524240495071, + "tokens_seen": 245607424 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004674523570712136, + "loss": 3.3141, + "theoretical_loss": 4.246386969304458, + "tokens_seen": 245672960 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 622673, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5313405990600586, + "objective/train/theoretical_loss": 4.246249744977678, + "objective/train/tokens_used": 266198496, + "theoretical_loss": 4.246249744977678, + "tokens_seen": 245738496 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046744232698094285, + "loss": 3.4107, + "theoretical_loss": 4.246249744977678, + "tokens_seen": 245738496 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046743229689067203, + "loss": 3.3752, + "theoretical_loss": 4.246112567486241, + "tokens_seen": 245804032 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004674222668004012, + "loss": 3.554, + "theoretical_loss": 4.24597543680168, + "tokens_seen": 245869568 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004674122367101304, + "loss": 3.4436, + "theoretical_loss": 4.245838352895554, + "tokens_seen": 245935104 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046740220661985963, + "loss": 3.2945, + "theoretical_loss": 4.245701315739447, + "tokens_seen": 246000640 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046739217652958876, + "loss": 3.3638, + "theoretical_loss": 4.245564325304968, + "tokens_seen": 246066176 + }, + { + "epoch": 3.0, + "learning_rate": 0.000467382146439318, + "loss": 3.2771, + "theoretical_loss": 4.245427381563749, + "tokens_seen": 246131712 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004673721163490471, + "loss": 3.2234, + "theoretical_loss": 4.24529048448745, + "tokens_seen": 246197248 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046736208625877636, + "loss": 3.3897, + "theoretical_loss": 4.245153634047753, + "tokens_seen": 246262784 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046735205616850554, + "loss": 3.415, + "theoretical_loss": 4.245016830216366, + "tokens_seen": 246328320 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004673420260782347, + "loss": 3.4516, + "theoretical_loss": 4.244880072965023, + "tokens_seen": 246393856 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004673319959879639, + "loss": 3.4033, + "theoretical_loss": 4.244743362265477, + "tokens_seen": 246459392 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004673219658976931, + "loss": 3.421, + "theoretical_loss": 4.2446066980895125, + "tokens_seen": 246524928 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046731193580742226, + "loss": 3.4119, + "theoretical_loss": 4.244470080408934, + "tokens_seen": 246590464 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004673019057171515, + "loss": 3.3777, + "theoretical_loss": 4.244333509195575, + "tokens_seen": 246656000 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004672918756268806, + "loss": 3.3541, + "theoretical_loss": 4.244196984421289, + "tokens_seen": 246721536 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046728184553660986, + "loss": 3.2542, + "theoretical_loss": 4.244060506057956, + "tokens_seen": 246787072 + }, + { + "epoch": 3.0, + "learning_rate": 0.000467271815446339, + "loss": 3.475, + "theoretical_loss": 4.24392407407748, + "tokens_seen": 246852608 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004672617853560682, + "loss": 3.3489, + "theoretical_loss": 4.24378768845179, + "tokens_seen": 246918144 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004672517552657974, + "loss": 3.4009, + "theoretical_loss": 4.24365134915284, + "tokens_seen": 246983680 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004672417251755266, + "loss": 3.328, + "theoretical_loss": 4.2435150561526065, + "tokens_seen": 247049216 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046723169508525576, + "loss": 3.3618, + "theoretical_loss": 4.243378809423093, + "tokens_seen": 247114752 + }, + { + "epoch": 3.0, + "learning_rate": 0.000467221664994985, + "loss": 3.4207, + "theoretical_loss": 4.243242608936325, + "tokens_seen": 247180288 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004672116349047141, + "loss": 3.2045, + "theoretical_loss": 4.243106454664352, + "tokens_seen": 247245824 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046720160481444336, + "loss": 3.3636, + "theoretical_loss": 4.242970346579252, + "tokens_seen": 247311360 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 625791, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.321624279022217, + "objective/train/theoretical_loss": 4.242834284653122, + "objective/train/tokens_used": 267836896, + "theoretical_loss": 4.242834284653122, + "tokens_seen": 247376896 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004671915747241725, + "loss": 3.2954, + "theoretical_loss": 4.242834284653122, + "tokens_seen": 247376896 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004671815446339017, + "loss": 3.3967, + "theoretical_loss": 4.242698268858086, + "tokens_seen": 247442432 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004671715145436309, + "loss": 3.3997, + "theoretical_loss": 4.242562299166291, + "tokens_seen": 247507968 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004671614844533601, + "loss": 3.3829, + "theoretical_loss": 4.24242637554991, + "tokens_seen": 247573504 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046715145436308927, + "loss": 3.3789, + "theoretical_loss": 4.2422904979811396, + "tokens_seen": 247639040 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046714142427281845, + "loss": 3.3531, + "theoretical_loss": 4.2421546664321985, + "tokens_seen": 247704576 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046713139418254763, + "loss": 3.3342, + "theoretical_loss": 4.242018880875332, + "tokens_seen": 247770112 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046712136409227686, + "loss": 3.434, + "theoretical_loss": 4.241883141282807, + "tokens_seen": 247835648 + }, + { + "epoch": 3.0, + "learning_rate": 0.000467111334002006, + "loss": 3.3904, + "theoretical_loss": 4.241747447626918, + "tokens_seen": 247901184 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004671013039117352, + "loss": 3.2635, + "theoretical_loss": 4.24161179987998, + "tokens_seen": 247966720 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046709127382146435, + "loss": 3.3626, + "theoretical_loss": 4.241476198014334, + "tokens_seen": 248032256 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004670812437311936, + "loss": 3.3987, + "theoretical_loss": 4.241340642002345, + "tokens_seen": 248097792 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046707121364092277, + "loss": 3.3292, + "theoretical_loss": 4.2412051318163995, + "tokens_seen": 248163328 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046706118355065195, + "loss": 3.4402, + "theoretical_loss": 4.241069667428912, + "tokens_seen": 248228864 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046705115346038113, + "loss": 3.4031, + "theoretical_loss": 4.240934248812316, + "tokens_seen": 248294400 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046704112337011037, + "loss": 3.2248, + "theoretical_loss": 4.240798875939074, + "tokens_seen": 248359936 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004670310932798395, + "loss": 3.5611, + "theoretical_loss": 4.240663548781669, + "tokens_seen": 248425472 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046702106318956873, + "loss": 3.4016, + "theoretical_loss": 4.240528267312609, + "tokens_seen": 248491008 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046701103309929786, + "loss": 3.1504, + "theoretical_loss": 4.240393031504424, + "tokens_seen": 248556544 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004670010030090271, + "loss": 3.4436, + "theoretical_loss": 4.2402578413296705, + "tokens_seen": 248622080 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004669909729187563, + "loss": 3.2758, + "theoretical_loss": 4.240122696760927, + "tokens_seen": 248687616 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046698094282848545, + "loss": 3.4012, + "theoretical_loss": 4.239987597770796, + "tokens_seen": 248753152 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046697091273821464, + "loss": 3.3318, + "theoretical_loss": 4.239852544331904, + "tokens_seen": 248818688 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004669608826479438, + "loss": 3.4029, + "theoretical_loss": 4.2397175364169, + "tokens_seen": 248884224 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046695085255767305, + "loss": 3.427, + "theoretical_loss": 4.239582573998459, + "tokens_seen": 248949760 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 629674, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3873817920684814, + "objective/train/theoretical_loss": 4.239447657049277, + "objective/train/tokens_used": 269475296, + "theoretical_loss": 4.239447657049277, + "tokens_seen": 249015296 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046694082246740223, + "loss": 3.3962, + "theoretical_loss": 4.239447657049277, + "tokens_seen": 249015296 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004669307923771314, + "loss": 3.2594, + "theoretical_loss": 4.239312785542076, + "tokens_seen": 249080832 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004669207622868606, + "loss": 3.4452, + "theoretical_loss": 4.239177959449599, + "tokens_seen": 249146368 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046691073219658983, + "loss": 3.4319, + "theoretical_loss": 4.2390431787446134, + "tokens_seen": 249211904 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046690070210631896, + "loss": 3.4141, + "theoretical_loss": 4.238908443399912, + "tokens_seen": 249277440 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004668906720160482, + "loss": 3.4438, + "theoretical_loss": 4.238773753388307, + "tokens_seen": 249342976 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004668806419257773, + "loss": 3.4251, + "theoretical_loss": 4.23863910868264, + "tokens_seen": 249408512 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046687061183550656, + "loss": 3.4414, + "theoretical_loss": 4.238504509255769, + "tokens_seen": 249474048 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046686058174523574, + "loss": 3.3442, + "theoretical_loss": 4.238369955080581, + "tokens_seen": 249539584 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004668505516549649, + "loss": 3.3745, + "theoretical_loss": 4.238235446129984, + "tokens_seen": 249605120 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004668405215646941, + "loss": 3.3523, + "theoretical_loss": 4.23810098237691, + "tokens_seen": 249670656 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004668304914744233, + "loss": 3.426, + "theoretical_loss": 4.237966563794312, + "tokens_seen": 249736192 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046682046138415246, + "loss": 3.4638, + "theoretical_loss": 4.237832190355169, + "tokens_seen": 249801728 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004668104312938817, + "loss": 3.4005, + "theoretical_loss": 4.237697862032483, + "tokens_seen": 249867264 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004668004012036108, + "loss": 3.4751, + "theoretical_loss": 4.237563578799279, + "tokens_seen": 249932800 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046679037111334006, + "loss": 3.4012, + "theoretical_loss": 4.237429340628605, + "tokens_seen": 249998336 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004667803410230692, + "loss": 3.4029, + "theoretical_loss": 4.23729514749353, + "tokens_seen": 250063872 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004667703109327984, + "loss": 3.3168, + "theoretical_loss": 4.237160999367148, + "tokens_seen": 250129408 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004667602808425276, + "loss": 3.4434, + "theoretical_loss": 4.23702689622258, + "tokens_seen": 250194944 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004667502507522568, + "loss": 3.3044, + "theoretical_loss": 4.236892838032962, + "tokens_seen": 250260480 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046674022066198596, + "loss": 3.4625, + "theoretical_loss": 4.23675882477146, + "tokens_seen": 250326016 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004667301905717152, + "loss": 3.4397, + "theoretical_loss": 4.23662485641126, + "tokens_seen": 250391552 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004667201604814443, + "loss": 3.4859, + "theoretical_loss": 4.236490932925571, + "tokens_seen": 250457088 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046671013039117356, + "loss": 3.3105, + "theoretical_loss": 4.236357054287627, + "tokens_seen": 250522624 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004667001003009027, + "loss": 3.3793, + "theoretical_loss": 4.236223220470681, + "tokens_seen": 250588160 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 634247, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.476658821105957, + "objective/train/theoretical_loss": 4.236089431448011, + "objective/train/tokens_used": 271113696, + "theoretical_loss": 4.236089431448011, + "tokens_seen": 250653696 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004666900702106319, + "loss": 3.3582, + "theoretical_loss": 4.236089431448011, + "tokens_seen": 250653696 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004666800401203611, + "loss": 3.381, + "theoretical_loss": 4.235955687192922, + "tokens_seen": 250719232 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004666700100300903, + "loss": 3.3743, + "theoretical_loss": 4.235821987678735, + "tokens_seen": 250784768 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046665997993981947, + "loss": 3.3719, + "theoretical_loss": 4.235688332878798, + "tokens_seen": 250850304 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046664994984954865, + "loss": 3.2238, + "theoretical_loss": 4.23555472276648, + "tokens_seen": 250915840 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046663991975927783, + "loss": 3.3988, + "theoretical_loss": 4.235421157315176, + "tokens_seen": 250981376 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046662988966900706, + "loss": 3.3772, + "theoretical_loss": 4.235287636498299, + "tokens_seen": 251046912 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004666198595787362, + "loss": 3.4718, + "theoretical_loss": 4.235154160289288, + "tokens_seen": 251112448 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046660982948846543, + "loss": 3.46, + "theoretical_loss": 4.235020728661604, + "tokens_seen": 251177984 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046659979939819455, + "loss": 3.4704, + "theoretical_loss": 4.234887341588731, + "tokens_seen": 251243520 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004665897693079238, + "loss": 3.5539, + "theoretical_loss": 4.234753999044175, + "tokens_seen": 251309056 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046657973921765297, + "loss": 3.4658, + "theoretical_loss": 4.234620701001465, + "tokens_seen": 251374592 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046656970912738215, + "loss": 3.4751, + "theoretical_loss": 4.234487447434153, + "tokens_seen": 251440128 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046655967903711133, + "loss": 3.4929, + "theoretical_loss": 4.234354238315813, + "tokens_seen": 251505664 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046654964894684057, + "loss": 3.344, + "theoretical_loss": 4.234221073620041, + "tokens_seen": 251571200 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004665396188565697, + "loss": 3.3864, + "theoretical_loss": 4.234087953320458, + "tokens_seen": 251636736 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046652958876629893, + "loss": 3.3849, + "theoretical_loss": 4.233954877390705, + "tokens_seen": 251702272 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046651955867602806, + "loss": 3.3495, + "theoretical_loss": 4.233821845804446, + "tokens_seen": 251767808 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004665095285857573, + "loss": 3.3466, + "theoretical_loss": 4.233688858535368, + "tokens_seen": 251833344 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004664994984954865, + "loss": 3.4657, + "theoretical_loss": 4.233555915557181, + "tokens_seen": 251898880 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046648946840521565, + "loss": 3.4057, + "theoretical_loss": 4.233423016843616, + "tokens_seen": 251964416 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046647943831494484, + "loss": 3.3842, + "theoretical_loss": 4.233290162368428, + "tokens_seen": 252029952 + }, + { + "epoch": 3.0, + "learning_rate": 0.000466469408224674, + "loss": 3.3689, + "theoretical_loss": 4.233157352105393, + "tokens_seen": 252095488 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004664593781344032, + "loss": 3.3533, + "theoretical_loss": 4.233024586028311, + "tokens_seen": 252161024 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046644934804413243, + "loss": 3.3963, + "theoretical_loss": 4.232891864111002, + "tokens_seen": 252226560 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 637432, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3189682960510254, + "objective/train/theoretical_loss": 4.232759186327309, + "objective/train/tokens_used": 272752096, + "theoretical_loss": 4.232759186327309, + "tokens_seen": 252292096 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046643931795386156, + "loss": 3.2588, + "theoretical_loss": 4.232759186327309, + "tokens_seen": 252292096 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004664292878635908, + "loss": 3.4053, + "theoretical_loss": 4.232626552651099, + "tokens_seen": 252357632 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004664192577733199, + "loss": 3.371, + "theoretical_loss": 4.23249396305626, + "tokens_seen": 252423168 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046640922768304916, + "loss": 3.4771, + "theoretical_loss": 4.232361417516703, + "tokens_seen": 252488704 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046639919759277834, + "loss": 3.3868, + "theoretical_loss": 4.232228916006359, + "tokens_seen": 252554240 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004663891675025075, + "loss": 3.3168, + "theoretical_loss": 4.232096458499184, + "tokens_seen": 252619776 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004663791374122367, + "loss": 3.3231, + "theoretical_loss": 4.231964044969153, + "tokens_seen": 252685312 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046636910732196594, + "loss": 3.5008, + "theoretical_loss": 4.231831675390268, + "tokens_seen": 252750848 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046635907723169506, + "loss": 3.3414, + "theoretical_loss": 4.231699349736548, + "tokens_seen": 252816384 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004663490471414243, + "loss": 3.3492, + "theoretical_loss": 4.231567067982036, + "tokens_seen": 252881920 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004663390170511534, + "loss": 3.3025, + "theoretical_loss": 4.231434830100799, + "tokens_seen": 252947456 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046632898696088266, + "loss": 3.4124, + "theoretical_loss": 4.231302636066923, + "tokens_seen": 253012992 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046631895687061184, + "loss": 3.4322, + "theoretical_loss": 4.231170485854518, + "tokens_seen": 253078528 + }, + { + "epoch": 3.0, + "learning_rate": 0.000466308926780341, + "loss": 3.4329, + "theoretical_loss": 4.231038379437714, + "tokens_seen": 253144064 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004662988966900702, + "loss": 3.3851, + "theoretical_loss": 4.230906316790666, + "tokens_seen": 253209600 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004662888665997994, + "loss": 3.3117, + "theoretical_loss": 4.230774297887548, + "tokens_seen": 253275136 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046627883650952857, + "loss": 3.3254, + "theoretical_loss": 4.230642322702558, + "tokens_seen": 253340672 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004662688064192578, + "loss": 3.3743, + "theoretical_loss": 4.230510391209913, + "tokens_seen": 253406208 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046625877632898693, + "loss": 3.4403, + "theoretical_loss": 4.230378503383857, + "tokens_seen": 253471744 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046624874623871616, + "loss": 3.4631, + "theoretical_loss": 4.230246659198651, + "tokens_seen": 253537280 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046623871614844535, + "loss": 3.3498, + "theoretical_loss": 4.23011485862858, + "tokens_seen": 253602816 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004662286860581745, + "loss": 3.4715, + "theoretical_loss": 4.229983101647949, + "tokens_seen": 253668352 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004662186559679037, + "loss": 3.3781, + "theoretical_loss": 4.229851388231088, + "tokens_seen": 253733888 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004662086258776329, + "loss": 3.4429, + "theoretical_loss": 4.229719718352346, + "tokens_seen": 253799424 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004661985957873621, + "loss": 3.4437, + "theoretical_loss": 4.229588091986093, + "tokens_seen": 253864960 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 642123, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.346550226211548, + "objective/train/theoretical_loss": 4.2294565091067255, + "objective/train/tokens_used": 274390496, + "theoretical_loss": 4.2294565091067255, + "tokens_seen": 253930496 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004661885656970913, + "loss": 3.4123, + "theoretical_loss": 4.2294565091067255, + "tokens_seen": 253930496 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004661785356068205, + "loss": 3.4815, + "theoretical_loss": 4.229324969688657, + "tokens_seen": 253996032 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046616850551654967, + "loss": 3.3957, + "theoretical_loss": 4.229193473706323, + "tokens_seen": 254061568 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046615847542627885, + "loss": 3.4834, + "theoretical_loss": 4.229062021134182, + "tokens_seen": 254127104 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046614844533600803, + "loss": 3.4137, + "theoretical_loss": 4.228930611946715, + "tokens_seen": 254192640 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046613841524573727, + "loss": 3.4436, + "theoretical_loss": 4.228799246118422, + "tokens_seen": 254258176 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004661283851554664, + "loss": 3.4781, + "theoretical_loss": 4.228667923623828, + "tokens_seen": 254323712 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046611835506519563, + "loss": 3.4553, + "theoretical_loss": 4.228536644437476, + "tokens_seen": 254389248 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046610832497492475, + "loss": 3.4574, + "theoretical_loss": 4.2284054085339315, + "tokens_seen": 254454784 + }, + { + "epoch": 3.0, + "learning_rate": 0.000466098294884654, + "loss": 3.3848, + "theoretical_loss": 4.228274215887783, + "tokens_seen": 254520320 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046608826479438317, + "loss": 3.3241, + "theoretical_loss": 4.228143066473638, + "tokens_seen": 254585856 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046607823470411235, + "loss": 3.4069, + "theoretical_loss": 4.228011960266129, + "tokens_seen": 254651392 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046606820461384153, + "loss": 3.3983, + "theoretical_loss": 4.227880897239906, + "tokens_seen": 254716928 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046605817452357077, + "loss": 3.4137, + "theoretical_loss": 4.227749877369644, + "tokens_seen": 254782464 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004660481444332999, + "loss": 3.3647, + "theoretical_loss": 4.227618900630036, + "tokens_seen": 254848000 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046603811434302913, + "loss": 3.3206, + "theoretical_loss": 4.227487966995797, + "tokens_seen": 254913536 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046602808425275826, + "loss": 3.4647, + "theoretical_loss": 4.227357076441667, + "tokens_seen": 254979072 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004660180541624875, + "loss": 3.5316, + "theoretical_loss": 4.227226228942401, + "tokens_seen": 255044608 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004660080240722167, + "loss": 3.3789, + "theoretical_loss": 4.2270954244727825, + "tokens_seen": 255110144 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046599799398194586, + "loss": 3.2425, + "theoretical_loss": 4.226964663007609, + "tokens_seen": 255175680 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046598796389167504, + "loss": 3.1657, + "theoretical_loss": 4.226833944521705, + "tokens_seen": 255241216 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004659779338014042, + "loss": 3.3573, + "theoretical_loss": 4.2267032689899136, + "tokens_seen": 255306752 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004659679037111334, + "loss": 3.3217, + "theoretical_loss": 4.226572636387099, + "tokens_seen": 255372288 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046595787362086263, + "loss": 3.4465, + "theoretical_loss": 4.226442046688146, + "tokens_seen": 255437824 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046594784353059176, + "loss": 3.4378, + "theoretical_loss": 4.226311499867963, + "tokens_seen": 255503360 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 645079, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.348341703414917, + "objective/train/theoretical_loss": 4.226180995901478, + "objective/train/tokens_used": 276028896, + "theoretical_loss": 4.226180995901478, + "tokens_seen": 255568896 + }, + { + "epoch": 3.0, + "learning_rate": 0.000465937813440321, + "loss": 3.3293, + "theoretical_loss": 4.226180995901478, + "tokens_seen": 255568896 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004659277833500501, + "loss": 3.3739, + "theoretical_loss": 4.2260505347636395, + "tokens_seen": 255634432 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046591775325977936, + "loss": 3.4505, + "theoretical_loss": 4.225920116429417, + "tokens_seen": 255699968 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046590772316950854, + "loss": 3.3236, + "theoretical_loss": 4.225789740873802, + "tokens_seen": 255765504 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004658976930792377, + "loss": 3.3581, + "theoretical_loss": 4.225659408071808, + "tokens_seen": 255831040 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004658876629889669, + "loss": 3.4039, + "theoretical_loss": 4.225529117998467, + "tokens_seen": 255896576 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046587763289869614, + "loss": 3.4795, + "theoretical_loss": 4.225398870628833, + "tokens_seen": 255962112 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046586760280842526, + "loss": 3.3516, + "theoretical_loss": 4.225268665937982, + "tokens_seen": 256027648 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004658575727181545, + "loss": 3.4754, + "theoretical_loss": 4.225138503901008, + "tokens_seen": 256093184 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004658475426278836, + "loss": 3.4911, + "theoretical_loss": 4.225008384493031, + "tokens_seen": 256158720 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046583751253761286, + "loss": 3.2802, + "theoretical_loss": 4.2248783076891865, + "tokens_seen": 256224256 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046582748244734204, + "loss": 3.41, + "theoretical_loss": 4.224748273464634, + "tokens_seen": 256289792 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004658174523570712, + "loss": 3.3752, + "theoretical_loss": 4.224618281794553, + "tokens_seen": 256355328 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004658074222668004, + "loss": 3.3747, + "theoretical_loss": 4.2244883326541425, + "tokens_seen": 256420864 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004657973921765296, + "loss": 3.5078, + "theoretical_loss": 4.224358426018625, + "tokens_seen": 256486400 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046578736208625877, + "loss": 3.3532, + "theoretical_loss": 4.224228561863243, + "tokens_seen": 256551936 + }, + { + "epoch": 3.0, + "learning_rate": 0.000465777331995988, + "loss": 3.4016, + "theoretical_loss": 4.2240987401632575, + "tokens_seen": 256617472 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046576730190571713, + "loss": 3.3422, + "theoretical_loss": 4.223968960893952, + "tokens_seen": 256683008 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046575727181544636, + "loss": 3.4192, + "theoretical_loss": 4.223839224030631, + "tokens_seen": 256748544 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046574724172517555, + "loss": 3.4726, + "theoretical_loss": 4.22370952954862, + "tokens_seen": 256814080 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004657372116349047, + "loss": 3.3545, + "theoretical_loss": 4.223579877423262, + "tokens_seen": 256879616 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004657271815446339, + "loss": 3.497, + "theoretical_loss": 4.223450267629925, + "tokens_seen": 256945152 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004657171514543631, + "loss": 3.4486, + "theoretical_loss": 4.223320700143995, + "tokens_seen": 257010688 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046570712136409227, + "loss": 3.1252, + "theoretical_loss": 4.2231911749408795, + "tokens_seen": 257076224 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004656970912738215, + "loss": 3.3811, + "theoretical_loss": 4.223061691996005, + "tokens_seen": 257141760 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 648782, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3789634704589844, + "objective/train/theoretical_loss": 4.222932251284821, + "objective/train/tokens_used": 277667296, + "theoretical_loss": 4.222932251284821, + "tokens_seen": 257207296 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046568706118355063, + "loss": 3.458, + "theoretical_loss": 4.222932251284821, + "tokens_seen": 257207296 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046567703109327987, + "loss": 3.3099, + "theoretical_loss": 4.222802852782794, + "tokens_seen": 257272832 + }, + { + "epoch": 3.0, + "learning_rate": 0.000465667001003009, + "loss": 3.3821, + "theoretical_loss": 4.222673496465417, + "tokens_seen": 257338368 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046565697091273823, + "loss": 3.3208, + "theoretical_loss": 4.222544182308196, + "tokens_seen": 257403904 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004656469408224674, + "loss": 3.3079, + "theoretical_loss": 4.2224149102866635, + "tokens_seen": 257469440 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004656369107321966, + "loss": 3.4442, + "theoretical_loss": 4.22228568037637, + "tokens_seen": 257534976 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004656268806419258, + "loss": 3.3449, + "theoretical_loss": 4.222156492552885, + "tokens_seen": 257600512 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046561685055165495, + "loss": 3.3237, + "theoretical_loss": 4.222027346791801, + "tokens_seen": 257666048 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046560682046138414, + "loss": 3.3449, + "theoretical_loss": 4.221898243068729, + "tokens_seen": 257731584 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046559679037111337, + "loss": 3.4006, + "theoretical_loss": 4.221769181359303, + "tokens_seen": 257797120 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004655867602808425, + "loss": 3.4254, + "theoretical_loss": 4.221640161639172, + "tokens_seen": 257862656 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046557673019057173, + "loss": 3.4284, + "theoretical_loss": 4.221511183884012, + "tokens_seen": 257928192 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004655667001003009, + "loss": 3.2966, + "theoretical_loss": 4.221382248069514, + "tokens_seen": 257993728 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004655566700100301, + "loss": 3.351, + "theoretical_loss": 4.221253354171392, + "tokens_seen": 258059264 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004655466399197593, + "loss": 3.4197, + "theoretical_loss": 4.221124502165379, + "tokens_seen": 258124800 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046553660982948846, + "loss": 3.3733, + "theoretical_loss": 4.220995692027229, + "tokens_seen": 258190336 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046552657973921764, + "loss": 3.3537, + "theoretical_loss": 4.220866923732715, + "tokens_seen": 258255872 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004655165496489469, + "loss": 3.3075, + "theoretical_loss": 4.220738197257632, + "tokens_seen": 258321408 + }, + { + "epoch": 3.0, + "learning_rate": 0.000465506519558676, + "loss": 3.3802, + "theoretical_loss": 4.220609512577794, + "tokens_seen": 258386944 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046549648946840524, + "loss": 3.4045, + "theoretical_loss": 4.220480869669035, + "tokens_seen": 258452480 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046548645937813436, + "loss": 3.5202, + "theoretical_loss": 4.220352268507209, + "tokens_seen": 258518016 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004654764292878636, + "loss": 3.421, + "theoretical_loss": 4.220223709068191, + "tokens_seen": 258583552 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004654663991975928, + "loss": 3.4554, + "theoretical_loss": 4.220095191327874, + "tokens_seen": 258649088 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046545636910732196, + "loss": 3.4138, + "theoretical_loss": 4.219966715262174, + "tokens_seen": 258714624 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004654463390170512, + "loss": 3.3009, + "theoretical_loss": 4.219838280847026, + "tokens_seen": 258780160 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 653642, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1768276691436768, + "objective/train/theoretical_loss": 4.219709888058383, + "objective/train/tokens_used": 279305696, + "theoretical_loss": 4.219709888058383, + "tokens_seen": 258845696 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004654363089267803, + "loss": 3.3205, + "theoretical_loss": 4.219709888058383, + "tokens_seen": 258845696 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046542627883650956, + "loss": 3.4772, + "theoretical_loss": 4.21958153687222, + "tokens_seen": 258911232 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046541624874623874, + "loss": 3.4511, + "theoretical_loss": 4.219453227264532, + "tokens_seen": 258976768 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004654062186559679, + "loss": 3.4917, + "theoretical_loss": 4.219324959211332, + "tokens_seen": 259042304 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004653961885656971, + "loss": 3.4374, + "theoretical_loss": 4.219196732688656, + "tokens_seen": 259107840 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046538615847542634, + "loss": 3.478, + "theoretical_loss": 4.219068547672556, + "tokens_seen": 259173376 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046537612838515546, + "loss": 3.5206, + "theoretical_loss": 4.218940404139107, + "tokens_seen": 259238912 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004653660982948847, + "loss": 3.4472, + "theoretical_loss": 4.2188123020644035, + "tokens_seen": 259304448 + }, + { + "epoch": 3.0, + "learning_rate": 0.0004653560682046138, + "loss": 3.401, + "theoretical_loss": 4.218684241424557, + "tokens_seen": 259369984 + }, + { + "epoch": 3.0, + "learning_rate": 0.00046534603811434306, + "loss": 3.3878, + "theoretical_loss": 4.218556222195703, + "tokens_seen": 259435520 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046533600802407224, + "loss": 3.4614, + "theoretical_loss": 4.218428244353994, + "tokens_seen": 259501056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004653259779338014, + "loss": 3.3575, + "theoretical_loss": 4.218300307875603, + "tokens_seen": 259566592 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004653159478435306, + "loss": 3.4058, + "theoretical_loss": 4.218172412736723, + "tokens_seen": 259632128 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004653059177532598, + "loss": 3.5254, + "theoretical_loss": 4.218044558913565, + "tokens_seen": 259697664 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046529588766298897, + "loss": 3.2466, + "theoretical_loss": 4.217916746382363, + "tokens_seen": 259763200 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004652858575727182, + "loss": 3.3662, + "theoretical_loss": 4.217788975119366, + "tokens_seen": 259828736 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046527582748244733, + "loss": 3.4491, + "theoretical_loss": 4.217661245100848, + "tokens_seen": 259894272 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046526579739217656, + "loss": 3.449, + "theoretical_loss": 4.217533556303098, + "tokens_seen": 259959808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046525576730190575, + "loss": 3.441, + "theoretical_loss": 4.217405908702428, + "tokens_seen": 260025344 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004652457372116349, + "loss": 3.3518, + "theoretical_loss": 4.217278302275167, + "tokens_seen": 260090880 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004652357071213641, + "loss": 3.5187, + "theoretical_loss": 4.217150736997667, + "tokens_seen": 260156416 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004652256770310933, + "loss": 3.2874, + "theoretical_loss": 4.217023212846294, + "tokens_seen": 260221952 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046521564694082247, + "loss": 3.4771, + "theoretical_loss": 4.2168957297974385, + "tokens_seen": 260287488 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004652056168505517, + "loss": 3.4028, + "theoretical_loss": 4.216768287827509, + "tokens_seen": 260353024 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046519558676028083, + "loss": 3.391, + "theoretical_loss": 4.216640886912932, + "tokens_seen": 260418560 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 656561, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6094577312469482, + "objective/train/theoretical_loss": 4.216513527030156, + "objective/train/tokens_used": 280944096, + "theoretical_loss": 4.216513527030156, + "tokens_seen": 260484096 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046518555667001007, + "loss": 3.4231, + "theoretical_loss": 4.216513527030156, + "tokens_seen": 260484096 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004651755265797392, + "loss": 3.442, + "theoretical_loss": 4.216386208155648, + "tokens_seen": 260549632 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046516549648946843, + "loss": 3.4605, + "theoretical_loss": 4.216258930265893, + "tokens_seen": 260615168 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004651554663991976, + "loss": 3.3183, + "theoretical_loss": 4.216131693337395, + "tokens_seen": 260680704 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004651454363089268, + "loss": 3.4199, + "theoretical_loss": 4.216004497346682, + "tokens_seen": 260746240 + }, + { + "epoch": 3.01, + "learning_rate": 0.000465135406218656, + "loss": 3.3744, + "theoretical_loss": 4.215877342270296, + "tokens_seen": 260811776 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046512537612838515, + "loss": 3.3971, + "theoretical_loss": 4.215750228084802, + "tokens_seen": 260877312 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046511534603811434, + "loss": 3.3152, + "theoretical_loss": 4.215623154766781, + "tokens_seen": 260942848 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046510531594784357, + "loss": 3.3525, + "theoretical_loss": 4.215496122292835, + "tokens_seen": 261008384 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004650952858575727, + "loss": 3.2163, + "theoretical_loss": 4.215369130639587, + "tokens_seen": 261073920 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046508525576730193, + "loss": 3.37, + "theoretical_loss": 4.215242179783678, + "tokens_seen": 261139456 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004650752256770311, + "loss": 3.3979, + "theoretical_loss": 4.215115269701765, + "tokens_seen": 261204992 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004650651955867603, + "loss": 3.376, + "theoretical_loss": 4.21498840037053, + "tokens_seen": 261270528 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004650551654964895, + "loss": 3.4052, + "theoretical_loss": 4.214861571766669, + "tokens_seen": 261336064 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046504513540621866, + "loss": 3.4374, + "theoretical_loss": 4.214734783866899, + "tokens_seen": 261401600 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046503510531594784, + "loss": 3.4098, + "theoretical_loss": 4.2146080366479595, + "tokens_seen": 261467136 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004650250752256771, + "loss": 3.4825, + "theoretical_loss": 4.2144813300866035, + "tokens_seen": 261532672 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004650150451354062, + "loss": 3.4162, + "theoretical_loss": 4.2143546641596075, + "tokens_seen": 261598208 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046500501504513544, + "loss": 3.3753, + "theoretical_loss": 4.214228038843764, + "tokens_seen": 261663744 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046499498495486456, + "loss": 3.4224, + "theoretical_loss": 4.214101454115886, + "tokens_seen": 261729280 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004649849548645938, + "loss": 3.329, + "theoretical_loss": 4.213974909952805, + "tokens_seen": 261794816 + }, + { + "epoch": 3.01, + "learning_rate": 0.000464974924774323, + "loss": 3.3338, + "theoretical_loss": 4.2138484063313735, + "tokens_seen": 261860352 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046496489468405216, + "loss": 3.2873, + "theoretical_loss": 4.21372194322846, + "tokens_seen": 261925888 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046495486459378134, + "loss": 3.4923, + "theoretical_loss": 4.213595520620955, + "tokens_seen": 261991424 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004649448345035105, + "loss": 3.3379, + "theoretical_loss": 4.213469138485765, + "tokens_seen": 262056960 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 661392, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.550494432449341, + "objective/train/theoretical_loss": 4.213342796799816, + "objective/train/tokens_used": 282582496, + "theoretical_loss": 4.213342796799816, + "tokens_seen": 262122496 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004649348044132397, + "loss": 3.3802, + "theoretical_loss": 4.213342796799816, + "tokens_seen": 262122496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046492477432296894, + "loss": 3.3717, + "theoretical_loss": 4.213216495540056, + "tokens_seen": 262188032 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046491474423269807, + "loss": 3.3835, + "theoretical_loss": 4.213090234683449, + "tokens_seen": 262253568 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004649047141424273, + "loss": 3.2907, + "theoretical_loss": 4.212964014206976, + "tokens_seen": 262319104 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004648946840521565, + "loss": 3.3499, + "theoretical_loss": 4.212837834087644, + "tokens_seen": 262384640 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046488465396188566, + "loss": 3.2264, + "theoretical_loss": 4.212711694302469, + "tokens_seen": 262450176 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046487462387161485, + "loss": 3.3777, + "theoretical_loss": 4.212585594828495, + "tokens_seen": 262515712 + }, + { + "epoch": 3.01, + "learning_rate": 0.000464864593781344, + "loss": 3.3031, + "theoretical_loss": 4.212459535642779, + "tokens_seen": 262581248 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004648545636910732, + "loss": 3.4965, + "theoretical_loss": 4.212333516722399, + "tokens_seen": 262646784 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046484453360080244, + "loss": 3.3272, + "theoretical_loss": 4.212207538044452, + "tokens_seen": 262712320 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046483450351053157, + "loss": 3.3113, + "theoretical_loss": 4.212081599586051, + "tokens_seen": 262777856 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004648244734202608, + "loss": 3.4357, + "theoretical_loss": 4.211955701324333, + "tokens_seen": 262843392 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046481444332998993, + "loss": 3.3326, + "theoretical_loss": 4.211829843236448, + "tokens_seen": 262908928 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046480441323971917, + "loss": 3.3284, + "theoretical_loss": 4.211704025299568, + "tokens_seen": 262974464 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046479438314944835, + "loss": 3.427, + "theoretical_loss": 4.211578247490882, + "tokens_seen": 263040000 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046478435305917753, + "loss": 3.3472, + "theoretical_loss": 4.2114525097876, + "tokens_seen": 263105536 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004647743229689067, + "loss": 3.4523, + "theoretical_loss": 4.2113268121669485, + "tokens_seen": 263171072 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046476429287863595, + "loss": 3.4151, + "theoretical_loss": 4.211201154606172, + "tokens_seen": 263236608 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004647542627883651, + "loss": 3.3648, + "theoretical_loss": 4.211075537082538, + "tokens_seen": 263302144 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004647442326980943, + "loss": 3.4123, + "theoretical_loss": 4.210949959573325, + "tokens_seen": 263367680 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046473420260782344, + "loss": 3.4546, + "theoretical_loss": 4.210824422055837, + "tokens_seen": 263433216 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046472417251755267, + "loss": 3.3841, + "theoretical_loss": 4.210698924507394, + "tokens_seen": 263498752 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004647141424272819, + "loss": 3.4206, + "theoretical_loss": 4.210573466905332, + "tokens_seen": 263564288 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046470411233701103, + "loss": 3.4705, + "theoretical_loss": 4.210448049227011, + "tokens_seen": 263629824 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046469408224674027, + "loss": 3.4193, + "theoretical_loss": 4.210322671449806, + "tokens_seen": 263695360 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 664381, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4015915393829346, + "objective/train/theoretical_loss": 4.210197333551108, + "objective/train/tokens_used": 284220896, + "theoretical_loss": 4.210197333551108, + "tokens_seen": 263760896 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004646840521564694, + "loss": 3.4292, + "theoretical_loss": 4.210197333551108, + "tokens_seen": 263760896 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046467402206619863, + "loss": 3.4281, + "theoretical_loss": 4.210072035508332, + "tokens_seen": 263826432 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004646639919759278, + "loss": 3.3333, + "theoretical_loss": 4.2099467772989065, + "tokens_seen": 263891968 + }, + { + "epoch": 3.01, + "learning_rate": 0.000464653961885657, + "loss": 3.3972, + "theoretical_loss": 4.209821558900282, + "tokens_seen": 263957504 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004646439317953862, + "loss": 3.4562, + "theoretical_loss": 4.209696380289925, + "tokens_seen": 264023040 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046463390170511535, + "loss": 3.3578, + "theoretical_loss": 4.209571241445322, + "tokens_seen": 264088576 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046462387161484454, + "loss": 3.3594, + "theoretical_loss": 4.2094461423439755, + "tokens_seen": 264154112 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046461384152457377, + "loss": 3.4309, + "theoretical_loss": 4.20932108296341, + "tokens_seen": 264219648 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004646038114343029, + "loss": 3.4409, + "theoretical_loss": 4.209196063281164, + "tokens_seen": 264285184 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046459378134403213, + "loss": 3.3235, + "theoretical_loss": 4.209071083274797, + "tokens_seen": 264350720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004645837512537613, + "loss": 3.4002, + "theoretical_loss": 4.208946142921887, + "tokens_seen": 264416256 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004645737211634905, + "loss": 3.436, + "theoretical_loss": 4.2088212422000275, + "tokens_seen": 264481792 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004645636910732197, + "loss": 3.3535, + "theoretical_loss": 4.208696381086835, + "tokens_seen": 264547328 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046455366098294886, + "loss": 3.4504, + "theoretical_loss": 4.208571559559937, + "tokens_seen": 264612864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046454363089267804, + "loss": 3.4515, + "theoretical_loss": 4.208446777596987, + "tokens_seen": 264678400 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004645336008024073, + "loss": 3.4721, + "theoretical_loss": 4.208322035175652, + "tokens_seen": 264743936 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004645235707121364, + "loss": 3.4366, + "theoretical_loss": 4.208197332273617, + "tokens_seen": 264809472 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046451354062186564, + "loss": 3.39, + "theoretical_loss": 4.208072668868588, + "tokens_seen": 264875008 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046450351053159476, + "loss": 3.4919, + "theoretical_loss": 4.207948044938286, + "tokens_seen": 264940544 + }, + { + "epoch": 3.01, + "learning_rate": 0.000464493480441324, + "loss": 3.323, + "theoretical_loss": 4.207823460460451, + "tokens_seen": 265006080 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004644834503510532, + "loss": 3.4856, + "theoretical_loss": 4.207698915412844, + "tokens_seen": 265071616 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046447342026078236, + "loss": 3.4023, + "theoretical_loss": 4.2075744097732395, + "tokens_seen": 265137152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046446339017051154, + "loss": 3.3296, + "theoretical_loss": 4.207449943519432, + "tokens_seen": 265202688 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004644533600802407, + "loss": 3.3874, + "theoretical_loss": 4.207325516629235, + "tokens_seen": 265268224 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004644433299899699, + "loss": 3.2964, + "theoretical_loss": 4.207201129080478, + "tokens_seen": 265333760 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 668227, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3647165298461914, + "objective/train/theoretical_loss": 4.207076780851011, + "objective/train/tokens_used": 285859296, + "theoretical_loss": 4.207076780851011, + "tokens_seen": 265399296 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046443329989969914, + "loss": 3.4103, + "theoretical_loss": 4.207076780851011, + "tokens_seen": 265399296 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046442326980942827, + "loss": 3.4182, + "theoretical_loss": 4.206952471918699, + "tokens_seen": 265464832 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004644132397191575, + "loss": 3.4631, + "theoretical_loss": 4.206828202261426, + "tokens_seen": 265530368 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004644032096288867, + "loss": 3.4277, + "theoretical_loss": 4.206703971857095, + "tokens_seen": 265595904 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046439317953861586, + "loss": 3.3753, + "theoretical_loss": 4.206579780683627, + "tokens_seen": 265661440 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046438314944834505, + "loss": 3.3836, + "theoretical_loss": 4.206455628718958, + "tokens_seen": 265726976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004643731193580742, + "loss": 3.3762, + "theoretical_loss": 4.206331515941045, + "tokens_seen": 265792512 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004643630892678034, + "loss": 3.2912, + "theoretical_loss": 4.2062074423278615, + "tokens_seen": 265858048 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046435305917753264, + "loss": 3.3818, + "theoretical_loss": 4.206083407857399, + "tokens_seen": 265923584 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046434302908726177, + "loss": 3.3536, + "theoretical_loss": 4.205959412507666, + "tokens_seen": 265989120 + }, + { + "epoch": 3.01, + "learning_rate": 0.000464332998996991, + "loss": 3.3396, + "theoretical_loss": 4.205835456256691, + "tokens_seen": 266054656 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046432296890672013, + "loss": 3.2584, + "theoretical_loss": 4.205711539082517, + "tokens_seen": 266120192 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046431293881644937, + "loss": 3.3777, + "theoretical_loss": 4.205587660963207, + "tokens_seen": 266185728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046430290872617855, + "loss": 3.3898, + "theoretical_loss": 4.205463821876843, + "tokens_seen": 266251264 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046429287863590773, + "loss": 3.4271, + "theoretical_loss": 4.205340021801521, + "tokens_seen": 266316800 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004642828485456369, + "loss": 3.399, + "theoretical_loss": 4.205216260715357, + "tokens_seen": 266382336 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046427281845536615, + "loss": 3.411, + "theoretical_loss": 4.2050925385964835, + "tokens_seen": 266447872 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004642627883650953, + "loss": 3.3635, + "theoretical_loss": 4.204968855423052, + "tokens_seen": 266513408 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004642527582748245, + "loss": 3.3501, + "theoretical_loss": 4.2048452111732315, + "tokens_seen": 266578944 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046424272818455364, + "loss": 3.4098, + "theoretical_loss": 4.204721605825207, + "tokens_seen": 266644480 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046423269809428287, + "loss": 3.3396, + "theoretical_loss": 4.204598039357183, + "tokens_seen": 266710016 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046422266800401205, + "loss": 3.3498, + "theoretical_loss": 4.204474511747382, + "tokens_seen": 266775552 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046421263791374123, + "loss": 3.3655, + "theoretical_loss": 4.2043510229740395, + "tokens_seen": 266841088 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004642026078234704, + "loss": 3.426, + "theoretical_loss": 4.204227573015414, + "tokens_seen": 266906624 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004641925777331996, + "loss": 3.3086, + "theoretical_loss": 4.204104161849779, + "tokens_seen": 266972160 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 672980, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.329946279525757, + "objective/train/theoretical_loss": 4.203980789455425, + "objective/train/tokens_used": 287497696, + "theoretical_loss": 4.203980789455425, + "tokens_seen": 267037696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004641825476429288, + "loss": 3.3812, + "theoretical_loss": 4.203980789455425, + "tokens_seen": 267037696 + }, + { + "epoch": 3.01, + "learning_rate": 0.000464172517552658, + "loss": 3.3324, + "theoretical_loss": 4.203857455810662, + "tokens_seen": 267103232 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046416248746238714, + "loss": 3.4843, + "theoretical_loss": 4.203734160893816, + "tokens_seen": 267168768 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004641524573721164, + "loss": 3.4237, + "theoretical_loss": 4.20361090468323, + "tokens_seen": 267234304 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004641424272818455, + "loss": 3.3996, + "theoretical_loss": 4.203487687157265, + "tokens_seen": 267299840 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046413239719157474, + "loss": 3.4251, + "theoretical_loss": 4.203364508294301, + "tokens_seen": 267365376 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004641223671013039, + "loss": 3.3018, + "theoretical_loss": 4.203241368072732, + "tokens_seen": 267430912 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004641123370110331, + "loss": 3.2996, + "theoretical_loss": 4.203118266470972, + "tokens_seen": 267496448 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004641023069207623, + "loss": 3.4139, + "theoretical_loss": 4.202995203467451, + "tokens_seen": 267561984 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004640922768304915, + "loss": 3.4632, + "theoretical_loss": 4.202872179040618, + "tokens_seen": 267627520 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046408224674022064, + "loss": 3.4364, + "theoretical_loss": 4.202749193168938, + "tokens_seen": 267693056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004640722166499499, + "loss": 3.4329, + "theoretical_loss": 4.202626245830893, + "tokens_seen": 267758592 + }, + { + "epoch": 3.01, + "learning_rate": 0.000464062186559679, + "loss": 3.3622, + "theoretical_loss": 4.202503337004982, + "tokens_seen": 267824128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046405215646940824, + "loss": 3.383, + "theoretical_loss": 4.202380466669723, + "tokens_seen": 267889664 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004640421263791374, + "loss": 3.3963, + "theoretical_loss": 4.202257634803651, + "tokens_seen": 267955200 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004640320962888666, + "loss": 3.4551, + "theoretical_loss": 4.202134841385316, + "tokens_seen": 268020736 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004640220661985958, + "loss": 3.3539, + "theoretical_loss": 4.202012086393287, + "tokens_seen": 268086272 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046401203610832496, + "loss": 3.5227, + "theoretical_loss": 4.201889369806151, + "tokens_seen": 268151808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046400200601805414, + "loss": 3.3732, + "theoretical_loss": 4.20176669160251, + "tokens_seen": 268217344 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004639919759277834, + "loss": 3.3953, + "theoretical_loss": 4.201644051760983, + "tokens_seen": 268282880 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004639819458375125, + "loss": 3.4103, + "theoretical_loss": 4.201521450260209, + "tokens_seen": 268348416 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046397191574724174, + "loss": 3.3416, + "theoretical_loss": 4.201398887078843, + "tokens_seen": 268413952 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004639618856569709, + "loss": 3.4257, + "theoretical_loss": 4.2012763621955544, + "tokens_seen": 268479488 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004639518555667001, + "loss": 3.4142, + "theoretical_loss": 4.201153875589033, + "tokens_seen": 268545024 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046394182547642934, + "loss": 3.4054, + "theoretical_loss": 4.201031427237984, + "tokens_seen": 268610560 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 676060, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3552777767181396, + "objective/train/theoretical_loss": 4.2009090171211305, + "objective/train/tokens_used": 289136096, + "theoretical_loss": 4.2009090171211305, + "tokens_seen": 268676096 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046393179538615847, + "loss": 3.3167, + "theoretical_loss": 4.2009090171211305, + "tokens_seen": 268676096 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004639217652958877, + "loss": 3.4372, + "theoretical_loss": 4.200786645217212, + "tokens_seen": 268741632 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004639117352056169, + "loss": 3.3825, + "theoretical_loss": 4.200664311504985, + "tokens_seen": 268807168 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046390170511534606, + "loss": 3.307, + "theoretical_loss": 4.200542015963223, + "tokens_seen": 268872704 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046389167502507525, + "loss": 3.4198, + "theoretical_loss": 4.2004197585707175, + "tokens_seen": 268938240 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004638816449348044, + "loss": 3.4339, + "theoretical_loss": 4.200297539306275, + "tokens_seen": 269003776 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004638716148445336, + "loss": 3.3019, + "theoretical_loss": 4.200175358148721, + "tokens_seen": 269069312 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046386158475426284, + "loss": 3.271, + "theoretical_loss": 4.200053215076895, + "tokens_seen": 269134848 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046385155466399197, + "loss": 3.3723, + "theoretical_loss": 4.199931110069659, + "tokens_seen": 269200384 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004638415245737212, + "loss": 3.4563, + "theoretical_loss": 4.199809043105884, + "tokens_seen": 269265920 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046383149448345033, + "loss": 3.4305, + "theoretical_loss": 4.199687014164465, + "tokens_seen": 269331456 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046382146439317957, + "loss": 3.3174, + "theoretical_loss": 4.199565023224309, + "tokens_seen": 269396992 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046381143430290875, + "loss": 3.4399, + "theoretical_loss": 4.199443070264344, + "tokens_seen": 269462528 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046380140421263793, + "loss": 3.365, + "theoretical_loss": 4.19932115526351, + "tokens_seen": 269528064 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004637913741223671, + "loss": 3.4065, + "theoretical_loss": 4.199199278200768, + "tokens_seen": 269593600 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046378134403209635, + "loss": 3.4128, + "theoretical_loss": 4.199077439055094, + "tokens_seen": 269659136 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004637713139418255, + "loss": 3.4325, + "theoretical_loss": 4.198955637805479, + "tokens_seen": 269724672 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004637612838515547, + "loss": 3.3599, + "theoretical_loss": 4.198833874430936, + "tokens_seen": 269790208 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046375125376128384, + "loss": 3.3522, + "theoretical_loss": 4.198712148910488, + "tokens_seen": 269855744 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046374122367101307, + "loss": 3.4158, + "theoretical_loss": 4.19859046122318, + "tokens_seen": 269921280 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046373119358074225, + "loss": 3.3592, + "theoretical_loss": 4.1984688113480715, + "tokens_seen": 269986816 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046372116349047143, + "loss": 3.4255, + "theoretical_loss": 4.198347199264238, + "tokens_seen": 270052352 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004637111334002006, + "loss": 3.4467, + "theoretical_loss": 4.198225624950773, + "tokens_seen": 270117888 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004637011033099298, + "loss": 3.4413, + "theoretical_loss": 4.198104088386787, + "tokens_seen": 270183424 + }, + { + "epoch": 3.01, + "learning_rate": 0.000463691073219659, + "loss": 3.2933, + "theoretical_loss": 4.197982589551405, + "tokens_seen": 270248960 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 680862, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9977571964263916, + "objective/train/theoretical_loss": 4.197861128423771, + "objective/train/tokens_used": 290774496, + "theoretical_loss": 4.197861128423771, + "tokens_seen": 270314496 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004636810431293882, + "loss": 3.2736, + "theoretical_loss": 4.197861128423771, + "tokens_seen": 270314496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046367101303911734, + "loss": 3.348, + "theoretical_loss": 4.197739704983043, + "tokens_seen": 270380032 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004636609829488466, + "loss": 3.349, + "theoretical_loss": 4.197618319208399, + "tokens_seen": 270445568 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004636509528585757, + "loss": 3.3513, + "theoretical_loss": 4.197496971079029, + "tokens_seen": 270511104 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046364092276830494, + "loss": 3.4003, + "theoretical_loss": 4.197375660574145, + "tokens_seen": 270576640 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004636308926780341, + "loss": 3.2995, + "theoretical_loss": 4.19725438767297, + "tokens_seen": 270642176 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004636208625877633, + "loss": 3.4457, + "theoretical_loss": 4.197133152354748, + "tokens_seen": 270707712 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004636108324974925, + "loss": 3.402, + "theoretical_loss": 4.197011954598738, + "tokens_seen": 270773248 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004636008024072217, + "loss": 3.4164, + "theoretical_loss": 4.196890794384213, + "tokens_seen": 270838784 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046359077231695084, + "loss": 3.4, + "theoretical_loss": 4.1967696716904666, + "tokens_seen": 270904320 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004635807422266801, + "loss": 3.4499, + "theoretical_loss": 4.1966485864968055, + "tokens_seen": 270969856 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004635707121364092, + "loss": 3.3508, + "theoretical_loss": 4.196527538782554, + "tokens_seen": 271035392 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046356068204613844, + "loss": 3.3753, + "theoretical_loss": 4.196406528527054, + "tokens_seen": 271100928 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004635506519558676, + "loss": 3.473, + "theoretical_loss": 4.196285555709661, + "tokens_seen": 271166464 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004635406218655968, + "loss": 3.4181, + "theoretical_loss": 4.19616462030975, + "tokens_seen": 271232000 + }, + { + "epoch": 3.01, + "learning_rate": 0.000463530591775326, + "loss": 3.4042, + "theoretical_loss": 4.19604372230671, + "tokens_seen": 271297536 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046352056168505516, + "loss": 3.2683, + "theoretical_loss": 4.195922861679947, + "tokens_seen": 271363072 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046351053159478434, + "loss": 3.5133, + "theoretical_loss": 4.195802038408884, + "tokens_seen": 271428608 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004635005015045136, + "loss": 3.3885, + "theoretical_loss": 4.19568125247296, + "tokens_seen": 271494144 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004634904714142427, + "loss": 3.4177, + "theoretical_loss": 4.195560503851629, + "tokens_seen": 271559680 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046348044132397194, + "loss": 3.4121, + "theoretical_loss": 4.195439792524365, + "tokens_seen": 271625216 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046347041123370107, + "loss": 3.3606, + "theoretical_loss": 4.1953191184706515, + "tokens_seen": 271690752 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004634603811434303, + "loss": 3.3997, + "theoretical_loss": 4.195198481669995, + "tokens_seen": 271756288 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004634503510531595, + "loss": 3.3857, + "theoretical_loss": 4.195077882101915, + "tokens_seen": 271821824 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046344032096288867, + "loss": 3.4494, + "theoretical_loss": 4.1949573197459475, + "tokens_seen": 271887360 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 683835, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.57450532913208, + "objective/train/theoretical_loss": 4.1948367945816445, + "objective/train/tokens_used": 292412896, + "theoretical_loss": 4.1948367945816445, + "tokens_seen": 271952896 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046343029087261785, + "loss": 3.4521, + "theoretical_loss": 4.1948367945816445, + "tokens_seen": 271952896 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004634202607823471, + "loss": 3.3578, + "theoretical_loss": 4.194716306588575, + "tokens_seen": 272018432 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004634102306920762, + "loss": 3.2741, + "theoretical_loss": 4.194595855746324, + "tokens_seen": 272083968 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046340020060180545, + "loss": 3.3542, + "theoretical_loss": 4.194475442034491, + "tokens_seen": 272149504 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046339017051153457, + "loss": 3.3717, + "theoretical_loss": 4.1943550654326955, + "tokens_seen": 272215040 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004633801404212638, + "loss": 3.3426, + "theoretical_loss": 4.194234725920568, + "tokens_seen": 272280576 + }, + { + "epoch": 3.01, + "learning_rate": 0.000463370110330993, + "loss": 3.4286, + "theoretical_loss": 4.194114423477758, + "tokens_seen": 272346112 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046336008024072217, + "loss": 3.3023, + "theoretical_loss": 4.193994158083931, + "tokens_seen": 272411648 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046335005015045135, + "loss": 3.4973, + "theoretical_loss": 4.193873929718769, + "tokens_seen": 272477184 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046334002006018053, + "loss": 3.3608, + "theoretical_loss": 4.193753738361967, + "tokens_seen": 272542720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004633299899699097, + "loss": 3.5121, + "theoretical_loss": 4.193633583993241, + "tokens_seen": 272608256 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046331995987963895, + "loss": 3.381, + "theoretical_loss": 4.193513466592318, + "tokens_seen": 272673792 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004633099297893681, + "loss": 3.3017, + "theoretical_loss": 4.193393386138944, + "tokens_seen": 272739328 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004632998996990973, + "loss": 3.4342, + "theoretical_loss": 4.193273342612881, + "tokens_seen": 272804864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046328986960882644, + "loss": 3.4019, + "theoretical_loss": 4.1931533359939035, + "tokens_seen": 272870400 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004632798395185557, + "loss": 3.4335, + "theoretical_loss": 4.193033366261808, + "tokens_seen": 272935936 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046326980942828485, + "loss": 3.3777, + "theoretical_loss": 4.1929134333964, + "tokens_seen": 273001472 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046325977933801404, + "loss": 3.4087, + "theoretical_loss": 4.192793537377508, + "tokens_seen": 273067008 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004632497492477432, + "loss": 3.3746, + "theoretical_loss": 4.19267367818497, + "tokens_seen": 273132544 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046323971915747245, + "loss": 3.305, + "theoretical_loss": 4.192553855798643, + "tokens_seen": 273198080 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004632296890672016, + "loss": 3.4063, + "theoretical_loss": 4.1924340701983995, + "tokens_seen": 273263616 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004632196589769308, + "loss": 3.3984, + "theoretical_loss": 4.192314321364129, + "tokens_seen": 273329152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046320962888666, + "loss": 3.3772, + "theoretical_loss": 4.192194609275733, + "tokens_seen": 273394688 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004631995987963892, + "loss": 3.3399, + "theoretical_loss": 4.192074933913134, + "tokens_seen": 273460224 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004631895687061184, + "loss": 3.3995, + "theoretical_loss": 4.191955295256267, + "tokens_seen": 273525760 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 687504, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.303570032119751, + "objective/train/theoretical_loss": 4.191835693285082, + "objective/train/tokens_used": 294051296, + "theoretical_loss": 4.191835693285082, + "tokens_seen": 273591296 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046317953861584754, + "loss": 3.3196, + "theoretical_loss": 4.191835693285082, + "tokens_seen": 273591296 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004631695085255768, + "loss": 3.4797, + "theoretical_loss": 4.1917161279795465, + "tokens_seen": 273656832 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004631594784353059, + "loss": 3.5129, + "theoretical_loss": 4.191596599319645, + "tokens_seen": 273722368 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046314944834503514, + "loss": 3.4964, + "theoretical_loss": 4.191477107285375, + "tokens_seen": 273787904 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004631394182547643, + "loss": 3.3794, + "theoretical_loss": 4.191357651856752, + "tokens_seen": 273853440 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004631293881644935, + "loss": 3.2252, + "theoretical_loss": 4.191238233013803, + "tokens_seen": 273918976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004631193580742227, + "loss": 3.3023, + "theoretical_loss": 4.191118850736577, + "tokens_seen": 273984512 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004631093279839519, + "loss": 3.4911, + "theoretical_loss": 4.1909995050051325, + "tokens_seen": 274050048 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046309929789368104, + "loss": 3.372, + "theoretical_loss": 4.190880195799549, + "tokens_seen": 274115584 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004630892678034103, + "loss": 3.4378, + "theoretical_loss": 4.190760923099917, + "tokens_seen": 274181120 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004630792377131394, + "loss": 3.3986, + "theoretical_loss": 4.190641686886346, + "tokens_seen": 274246656 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046306920762286864, + "loss": 3.4045, + "theoretical_loss": 4.19052248713896, + "tokens_seen": 274312192 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004630591775325978, + "loss": 3.4442, + "theoretical_loss": 4.190403323837898, + "tokens_seen": 274377728 + }, + { + "epoch": 3.01, + "learning_rate": 0.000463049147442327, + "loss": 3.3904, + "theoretical_loss": 4.190284196963313, + "tokens_seen": 274443264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004630391173520562, + "loss": 3.2976, + "theoretical_loss": 4.190165106495378, + "tokens_seen": 274508800 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046302908726178536, + "loss": 3.3259, + "theoretical_loss": 4.190046052414279, + "tokens_seen": 274574336 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046301905717151455, + "loss": 3.3849, + "theoretical_loss": 4.189927034700215, + "tokens_seen": 274639872 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004630090270812438, + "loss": 3.4397, + "theoretical_loss": 4.1898080533334054, + "tokens_seen": 274705408 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004629989969909729, + "loss": 3.225, + "theoretical_loss": 4.189689108294082, + "tokens_seen": 274770944 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046298896690070214, + "loss": 3.3894, + "theoretical_loss": 4.189570199562492, + "tokens_seen": 274836480 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046297893681043127, + "loss": 3.387, + "theoretical_loss": 4.189451327118899, + "tokens_seen": 274902016 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004629689067201605, + "loss": 3.313, + "theoretical_loss": 4.189332490943582, + "tokens_seen": 274967552 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004629588766298897, + "loss": 3.287, + "theoretical_loss": 4.189213691016835, + "tokens_seen": 275033088 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046294884653961887, + "loss": 3.3659, + "theoretical_loss": 4.189094927318968, + "tokens_seen": 275098624 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046293881644934805, + "loss": 3.3163, + "theoretical_loss": 4.188976199830306, + "tokens_seen": 275164160 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 692652, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.117398738861084, + "objective/train/theoretical_loss": 4.188857508531189, + "objective/train/tokens_used": 295689696, + "theoretical_loss": 4.188857508531189, + "tokens_seen": 275229696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004629287863590773, + "loss": 3.3616, + "theoretical_loss": 4.188857508531189, + "tokens_seen": 275229696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004629187562688064, + "loss": 3.3581, + "theoretical_loss": 4.188738853401973, + "tokens_seen": 275295232 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046290872617853565, + "loss": 3.4643, + "theoretical_loss": 4.188620234423029, + "tokens_seen": 275360768 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046289869608826477, + "loss": 3.3939, + "theoretical_loss": 4.188501651574743, + "tokens_seen": 275426304 + }, + { + "epoch": 3.01, + "learning_rate": 0.000462888665997994, + "loss": 3.3426, + "theoretical_loss": 4.188383104837516, + "tokens_seen": 275491840 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004628786359077232, + "loss": 3.4265, + "theoretical_loss": 4.188264594191768, + "tokens_seen": 275557376 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046286860581745237, + "loss": 3.4033, + "theoretical_loss": 4.188146119617928, + "tokens_seen": 275622912 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046285857572718155, + "loss": 3.386, + "theoretical_loss": 4.188027681096444, + "tokens_seen": 275688448 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046284854563691073, + "loss": 3.4147, + "theoretical_loss": 4.18790927860778, + "tokens_seen": 275753984 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004628385155466399, + "loss": 3.3285, + "theoretical_loss": 4.187790912132414, + "tokens_seen": 275819520 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046282848545636915, + "loss": 3.3315, + "theoretical_loss": 4.187672581650837, + "tokens_seen": 275885056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004628184553660983, + "loss": 3.3746, + "theoretical_loss": 4.187554287143559, + "tokens_seen": 275950592 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004628084252758275, + "loss": 3.3893, + "theoretical_loss": 4.1874360285911045, + "tokens_seen": 276016128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046279839518555664, + "loss": 3.3526, + "theoretical_loss": 4.18731780597401, + "tokens_seen": 276081664 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004627883650952859, + "loss": 3.408, + "theoretical_loss": 4.187199619272832, + "tokens_seen": 276147200 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046277833500501505, + "loss": 3.3676, + "theoretical_loss": 4.187081468468136, + "tokens_seen": 276212736 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046276830491474424, + "loss": 3.3552, + "theoretical_loss": 4.186963353540509, + "tokens_seen": 276278272 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004627582748244734, + "loss": 3.198, + "theoretical_loss": 4.186845274470549, + "tokens_seen": 276343808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046274824473420265, + "loss": 3.3533, + "theoretical_loss": 4.186727231238872, + "tokens_seen": 276409344 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004627382146439318, + "loss": 3.3701, + "theoretical_loss": 4.186609223826105, + "tokens_seen": 276474880 + }, + { + "epoch": 3.01, + "learning_rate": 0.000462728184553661, + "loss": 3.3664, + "theoretical_loss": 4.1864912522128925, + "tokens_seen": 276540416 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046271815446339014, + "loss": 3.4682, + "theoretical_loss": 4.186373316379896, + "tokens_seen": 276605952 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004627081243731194, + "loss": 3.3776, + "theoretical_loss": 4.186255416307789, + "tokens_seen": 276671488 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046269809428284856, + "loss": 3.3974, + "theoretical_loss": 4.186137551977261, + "tokens_seen": 276737024 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046268806419257774, + "loss": 3.3266, + "theoretical_loss": 4.186019723369016, + "tokens_seen": 276802560 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 695337, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2643723487854004, + "objective/train/theoretical_loss": 4.185901930463775, + "objective/train/tokens_used": 297328096, + "theoretical_loss": 4.185901930463775, + "tokens_seen": 276868096 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004626780341023069, + "loss": 3.3214, + "theoretical_loss": 4.185901930463775, + "tokens_seen": 276868096 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004626680040120361, + "loss": 3.3822, + "theoretical_loss": 4.185784173242271, + "tokens_seen": 276933632 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004626579739217653, + "loss": 3.4402, + "theoretical_loss": 4.185666451685254, + "tokens_seen": 276999168 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004626479438314945, + "loss": 3.3671, + "theoretical_loss": 4.185548765773488, + "tokens_seen": 277064704 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046263791374122364, + "loss": 3.4917, + "theoretical_loss": 4.185431115487753, + "tokens_seen": 277130240 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004626278836509529, + "loss": 3.2716, + "theoretical_loss": 4.185313500808842, + "tokens_seen": 277195776 + }, + { + "epoch": 3.01, + "learning_rate": 0.000462617853560682, + "loss": 3.3507, + "theoretical_loss": 4.1851959217175665, + "tokens_seen": 277261312 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046260782347041124, + "loss": 3.4305, + "theoretical_loss": 4.185078378194747, + "tokens_seen": 277326848 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004625977933801404, + "loss": 3.4656, + "theoretical_loss": 4.184960870221225, + "tokens_seen": 277392384 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004625877632898696, + "loss": 3.3496, + "theoretical_loss": 4.184843397777853, + "tokens_seen": 277457920 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004625777331995988, + "loss": 3.4266, + "theoretical_loss": 4.1847259608455, + "tokens_seen": 277523456 + }, + { + "epoch": 3.01, + "learning_rate": 0.000462567703109328, + "loss": 3.4373, + "theoretical_loss": 4.18460855940505, + "tokens_seen": 277588992 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046255767301905715, + "loss": 3.344, + "theoretical_loss": 4.184491193437399, + "tokens_seen": 277654528 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004625476429287864, + "loss": 3.4012, + "theoretical_loss": 4.1843738629234615, + "tokens_seen": 277720064 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004625376128385155, + "loss": 3.3798, + "theoretical_loss": 4.184256567844166, + "tokens_seen": 277785600 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046252758274824475, + "loss": 3.441, + "theoretical_loss": 4.184139308180452, + "tokens_seen": 277851136 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004625175526579739, + "loss": 3.321, + "theoretical_loss": 4.184022083913279, + "tokens_seen": 277916672 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004625075225677031, + "loss": 3.3215, + "theoretical_loss": 4.183904895023618, + "tokens_seen": 277982208 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004624974924774323, + "loss": 3.4134, + "theoretical_loss": 4.183787741492457, + "tokens_seen": 278047744 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046248746238716147, + "loss": 3.4962, + "theoretical_loss": 4.183670623300795, + "tokens_seen": 278113280 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046247743229689065, + "loss": 3.471, + "theoretical_loss": 4.18355354042965, + "tokens_seen": 278178816 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004624674022066199, + "loss": 3.3646, + "theoretical_loss": 4.183436492860052, + "tokens_seen": 278244352 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046245737211634907, + "loss": 3.3308, + "theoretical_loss": 4.183319480573045, + "tokens_seen": 278309888 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046244734202607825, + "loss": 3.4522, + "theoretical_loss": 4.183202503549692, + "tokens_seen": 278375424 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004624373119358075, + "loss": 3.3521, + "theoretical_loss": 4.1830855617710645, + "tokens_seen": 278440960 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 698603, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2970354557037354, + "objective/train/theoretical_loss": 4.182968655218254, + "objective/train/tokens_used": 298966496, + "theoretical_loss": 4.182968655218254, + "tokens_seen": 278506496 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004624272818455366, + "loss": 3.3687, + "theoretical_loss": 4.182968655218254, + "tokens_seen": 278506496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046241725175526585, + "loss": 3.2631, + "theoretical_loss": 4.182851783872362, + "tokens_seen": 278572032 + }, + { + "epoch": 3.01, + "learning_rate": 0.000462407221664995, + "loss": 3.3008, + "theoretical_loss": 4.182734947714508, + "tokens_seen": 278637568 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004623971915747242, + "loss": 3.456, + "theoretical_loss": 4.182618146725825, + "tokens_seen": 278703104 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004623871614844534, + "loss": 3.4675, + "theoretical_loss": 4.182501380887461, + "tokens_seen": 278768640 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046237713139418257, + "loss": 3.3068, + "theoretical_loss": 4.182384650180577, + "tokens_seen": 278834176 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046236710130391175, + "loss": 3.3648, + "theoretical_loss": 4.18226795458635, + "tokens_seen": 278899712 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046235707121364093, + "loss": 3.3963, + "theoretical_loss": 4.182151294085971, + "tokens_seen": 278965248 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004623470411233701, + "loss": 3.4232, + "theoretical_loss": 4.182034668660645, + "tokens_seen": 279030784 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046233701103309935, + "loss": 3.2932, + "theoretical_loss": 4.181918078291593, + "tokens_seen": 279096320 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004623269809428285, + "loss": 3.2852, + "theoretical_loss": 4.181801522960049, + "tokens_seen": 279161856 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004623169508525577, + "loss": 3.4534, + "theoretical_loss": 4.181685002647262, + "tokens_seen": 279227392 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046230692076228684, + "loss": 3.4165, + "theoretical_loss": 4.181568517334494, + "tokens_seen": 279292928 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004622968906720161, + "loss": 3.3712, + "theoretical_loss": 4.181452067003025, + "tokens_seen": 279358464 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046228686058174525, + "loss": 3.3194, + "theoretical_loss": 4.181335651634146, + "tokens_seen": 279424000 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046227683049147444, + "loss": 3.3525, + "theoretical_loss": 4.181219271209162, + "tokens_seen": 279489536 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004622668004012036, + "loss": 3.2827, + "theoretical_loss": 4.181102925709397, + "tokens_seen": 279555072 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046225677031093285, + "loss": 3.4366, + "theoretical_loss": 4.180986615116185, + "tokens_seen": 279620608 + }, + { + "epoch": 3.01, + "learning_rate": 0.000462246740220662, + "loss": 3.4087, + "theoretical_loss": 4.180870339410875, + "tokens_seen": 279686144 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004622367101303912, + "loss": 3.4317, + "theoretical_loss": 4.180754098574832, + "tokens_seen": 279751680 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046222668004012034, + "loss": 3.3375, + "theoretical_loss": 4.1806378925894325, + "tokens_seen": 279817216 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004622166499498496, + "loss": 3.3483, + "theoretical_loss": 4.1805217214360715, + "tokens_seen": 279882752 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046220661985957876, + "loss": 3.4034, + "theoretical_loss": 4.180405585096154, + "tokens_seen": 279948288 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046219658976930794, + "loss": 3.4875, + "theoretical_loss": 4.180289483551102, + "tokens_seen": 280013824 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004621865596790371, + "loss": 3.3726, + "theoretical_loss": 4.180173416782351, + "tokens_seen": 280079360 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 702390, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6370158195495605, + "objective/train/theoretical_loss": 4.18005738477135, + "objective/train/tokens_used": 300604896, + "theoretical_loss": 4.18005738477135, + "tokens_seen": 280144896 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004621765295887663, + "loss": 3.4775, + "theoretical_loss": 4.18005738477135, + "tokens_seen": 280144896 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004621664994984955, + "loss": 3.2318, + "theoretical_loss": 4.179941387499563, + "tokens_seen": 280210432 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004621564694082247, + "loss": 3.244, + "theoretical_loss": 4.1798254249484685, + "tokens_seen": 280275968 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046214643931795384, + "loss": 3.3193, + "theoretical_loss": 4.179709497099559, + "tokens_seen": 280341504 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004621364092276831, + "loss": 3.3982, + "theoretical_loss": 4.17959360393434, + "tokens_seen": 280407040 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004621263791374122, + "loss": 3.4021, + "theoretical_loss": 4.179477745434333, + "tokens_seen": 280472576 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046211634904714144, + "loss": 3.538, + "theoretical_loss": 4.179361921581073, + "tokens_seen": 280538112 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004621063189568706, + "loss": 3.3163, + "theoretical_loss": 4.179246132356109, + "tokens_seen": 280603648 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004620962888665998, + "loss": 3.3942, + "theoretical_loss": 4.179130377741004, + "tokens_seen": 280669184 + }, + { + "epoch": 3.01, + "learning_rate": 0.000462086258776329, + "loss": 3.4309, + "theoretical_loss": 4.1790146577173335, + "tokens_seen": 280734720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004620762286860582, + "loss": 3.3042, + "theoretical_loss": 4.1788989722666905, + "tokens_seen": 280800256 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046206619859578735, + "loss": 3.3487, + "theoretical_loss": 4.178783321370681, + "tokens_seen": 280865792 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004620561685055166, + "loss": 3.4001, + "theoretical_loss": 4.178667705010923, + "tokens_seen": 280931328 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004620461384152457, + "loss": 3.4619, + "theoretical_loss": 4.178552123169052, + "tokens_seen": 280996864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046203610832497495, + "loss": 3.4041, + "theoretical_loss": 4.178436575826714, + "tokens_seen": 281062400 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004620260782347041, + "loss": 3.4398, + "theoretical_loss": 4.17832106296557, + "tokens_seen": 281127936 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004620160481444333, + "loss": 3.5361, + "theoretical_loss": 4.178205584567298, + "tokens_seen": 281193472 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004620060180541625, + "loss": 3.3819, + "theoretical_loss": 4.178090140613585, + "tokens_seen": 281259008 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046199598796389167, + "loss": 3.4299, + "theoretical_loss": 4.177974731086136, + "tokens_seen": 281324544 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046198595787362085, + "loss": 3.3696, + "theoretical_loss": 4.17785935596667, + "tokens_seen": 281390080 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004619759277833501, + "loss": 3.1975, + "theoretical_loss": 4.177744015236916, + "tokens_seen": 281455616 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004619658976930792, + "loss": 3.3496, + "theoretical_loss": 4.177628708878622, + "tokens_seen": 281521152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046195586760280845, + "loss": 3.4332, + "theoretical_loss": 4.177513436873546, + "tokens_seen": 281586688 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046194583751253763, + "loss": 3.3884, + "theoretical_loss": 4.177398199203461, + "tokens_seen": 281652224 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004619358074222668, + "loss": 3.354, + "theoretical_loss": 4.177282995850156, + "tokens_seen": 281717760 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 706893, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5023720264434814, + "objective/train/theoretical_loss": 4.17716782679543, + "objective/train/tokens_used": 302243296, + "theoretical_loss": 4.17716782679543, + "tokens_seen": 281783296 + }, + { + "epoch": 3.01, + "learning_rate": 0.000461925777331996, + "loss": 3.4066, + "theoretical_loss": 4.17716782679543, + "tokens_seen": 281783296 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004619157472417252, + "loss": 3.4239, + "theoretical_loss": 4.177052692021101, + "tokens_seen": 281848832 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046190571715145435, + "loss": 3.3167, + "theoretical_loss": 4.176937591508994, + "tokens_seen": 281914368 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004618956870611836, + "loss": 3.4208, + "theoretical_loss": 4.176822525240956, + "tokens_seen": 281979904 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004618856569709127, + "loss": 3.338, + "theoretical_loss": 4.176707493198841, + "tokens_seen": 282045440 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046187562688064195, + "loss": 3.4151, + "theoretical_loss": 4.17659249536452, + "tokens_seen": 282110976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004618655967903711, + "loss": 3.3439, + "theoretical_loss": 4.176477531719879, + "tokens_seen": 282176512 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004618555667001003, + "loss": 3.295, + "theoretical_loss": 4.176362602246813, + "tokens_seen": 282242048 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004618455366098295, + "loss": 3.2927, + "theoretical_loss": 4.176247706927237, + "tokens_seen": 282307584 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004618355065195587, + "loss": 3.4233, + "theoretical_loss": 4.176132845743074, + "tokens_seen": 282373120 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046182547642928786, + "loss": 3.4423, + "theoretical_loss": 4.176018018676265, + "tokens_seen": 282438656 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046181544633901704, + "loss": 3.4645, + "theoretical_loss": 4.175903225708762, + "tokens_seen": 282504192 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004618054162487462, + "loss": 3.4065, + "theoretical_loss": 4.175788466822533, + "tokens_seen": 282569728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046179538615847545, + "loss": 3.3295, + "theoretical_loss": 4.175673741999558, + "tokens_seen": 282635264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004617853560682046, + "loss": 3.4097, + "theoretical_loss": 4.175559051221832, + "tokens_seen": 282700800 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004617753259779338, + "loss": 3.4406, + "theoretical_loss": 4.175444394471363, + "tokens_seen": 282766336 + }, + { + "epoch": 3.01, + "learning_rate": 0.000461765295887663, + "loss": 3.356, + "theoretical_loss": 4.175329771730171, + "tokens_seen": 282831872 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004617552657973922, + "loss": 3.4643, + "theoretical_loss": 4.175215182980292, + "tokens_seen": 282897408 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046174523570712136, + "loss": 3.4319, + "theoretical_loss": 4.175100628203777, + "tokens_seen": 282962944 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046173520561685054, + "loss": 3.4481, + "theoretical_loss": 4.1749861073826855, + "tokens_seen": 283028480 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004617251755265797, + "loss": 3.4374, + "theoretical_loss": 4.174871620499095, + "tokens_seen": 283094016 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046171514543630896, + "loss": 3.2416, + "theoretical_loss": 4.174757167535097, + "tokens_seen": 283159552 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046170511534603814, + "loss": 3.3021, + "theoretical_loss": 4.1746427484727935, + "tokens_seen": 283225088 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004616950852557673, + "loss": 3.3462, + "theoretical_loss": 4.1745283632943, + "tokens_seen": 283290624 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004616850551654965, + "loss": 3.4472, + "theoretical_loss": 4.17441401198175, + "tokens_seen": 283356160 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 710028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4669623374938965, + "objective/train/theoretical_loss": 4.174299694517286, + "objective/train/tokens_used": 303881696, + "theoretical_loss": 4.174299694517286, + "tokens_seen": 283421696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004616750250752257, + "loss": 3.3623, + "theoretical_loss": 4.174299694517286, + "tokens_seen": 283421696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004616649949849549, + "loss": 3.5019, + "theoretical_loss": 4.174185410883066, + "tokens_seen": 283487232 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046165496489468404, + "loss": 3.4116, + "theoretical_loss": 4.17407116106126, + "tokens_seen": 283552768 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004616449348044133, + "loss": 3.4596, + "theoretical_loss": 4.173956945034054, + "tokens_seen": 283618304 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004616349047141424, + "loss": 3.314, + "theoretical_loss": 4.1738427627836465, + "tokens_seen": 283683840 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046162487462387164, + "loss": 3.3755, + "theoretical_loss": 4.173728614292249, + "tokens_seen": 283749376 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004616148445336008, + "loss": 3.3551, + "theoretical_loss": 4.173614499542086, + "tokens_seen": 283814912 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046160481444333, + "loss": 3.442, + "theoretical_loss": 4.173500418515396, + "tokens_seen": 283880448 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004615947843530592, + "loss": 3.3052, + "theoretical_loss": 4.173386371194432, + "tokens_seen": 283945984 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004615847542627884, + "loss": 3.3455, + "theoretical_loss": 4.17327235756146, + "tokens_seen": 284011520 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046157472417251755, + "loss": 3.3373, + "theoretical_loss": 4.173158377598757, + "tokens_seen": 284077056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004615646940822468, + "loss": 3.3595, + "theoretical_loss": 4.173044431288616, + "tokens_seen": 284142592 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004615546639919759, + "loss": 3.327, + "theoretical_loss": 4.172930518613344, + "tokens_seen": 284208128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046154463390170515, + "loss": 3.2878, + "theoretical_loss": 4.1728166395552595, + "tokens_seen": 284273664 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004615346038114343, + "loss": 3.4025, + "theoretical_loss": 4.172702794096694, + "tokens_seen": 284339200 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004615245737211635, + "loss": 3.4835, + "theoretical_loss": 4.172588982219994, + "tokens_seen": 284404736 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004615145436308927, + "loss": 3.3875, + "theoretical_loss": 4.17247520390752, + "tokens_seen": 284470272 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046150451354062187, + "loss": 3.3403, + "theoretical_loss": 4.172361459141642, + "tokens_seen": 284535808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046149448345035105, + "loss": 3.3474, + "theoretical_loss": 4.172247747904747, + "tokens_seen": 284601344 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004614844533600803, + "loss": 3.3795, + "theoretical_loss": 4.172134070179235, + "tokens_seen": 284666880 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004614744232698094, + "loss": 3.3429, + "theoretical_loss": 4.172020425947517, + "tokens_seen": 284732416 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046146439317953865, + "loss": 3.3296, + "theoretical_loss": 4.17190681519202, + "tokens_seen": 284797952 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046145436308926783, + "loss": 3.2946, + "theoretical_loss": 4.171793237895181, + "tokens_seen": 284863488 + }, + { + "epoch": 3.01, + "learning_rate": 0.000461444332998997, + "loss": 3.464, + "theoretical_loss": 4.1716796940394545, + "tokens_seen": 284929024 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004614343029087262, + "loss": 3.4303, + "theoretical_loss": 4.171566183607305, + "tokens_seen": 284994560 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 714869, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1715900897979736, + "objective/train/theoretical_loss": 4.17145270658121, + "objective/train/tokens_used": 305520096, + "theoretical_loss": 4.17145270658121, + "tokens_seen": 285060096 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004614242728184554, + "loss": 3.3158, + "theoretical_loss": 4.17145270658121, + "tokens_seen": 285060096 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046141424272818455, + "loss": 3.3921, + "theoretical_loss": 4.171339262943663, + "tokens_seen": 285125632 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004614042126379138, + "loss": 3.3537, + "theoretical_loss": 4.1712258526771695, + "tokens_seen": 285191168 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004613941825476429, + "loss": 3.3677, + "theoretical_loss": 4.171112475764246, + "tokens_seen": 285256704 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046138415245737215, + "loss": 3.4244, + "theoretical_loss": 4.170999132187424, + "tokens_seen": 285322240 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004613741223671013, + "loss": 3.3352, + "theoretical_loss": 4.17088582192925, + "tokens_seen": 285387776 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004613640922768305, + "loss": 3.4179, + "theoretical_loss": 4.170772544972279, + "tokens_seen": 285453312 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004613540621865597, + "loss": 3.3737, + "theoretical_loss": 4.170659301299085, + "tokens_seen": 285518848 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004613440320962889, + "loss": 3.3917, + "theoretical_loss": 4.17054609089225, + "tokens_seen": 285584384 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046133400200601806, + "loss": 3.4428, + "theoretical_loss": 4.170432913734371, + "tokens_seen": 285649920 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046132397191574724, + "loss": 3.3946, + "theoretical_loss": 4.170319769808058, + "tokens_seen": 285715456 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004613139418254764, + "loss": 3.4102, + "theoretical_loss": 4.170206659095936, + "tokens_seen": 285780992 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046130391173520566, + "loss": 3.4647, + "theoretical_loss": 4.170093581580641, + "tokens_seen": 285846528 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004612938816449348, + "loss": 3.4005, + "theoretical_loss": 4.16998053724482, + "tokens_seen": 285912064 + }, + { + "epoch": 3.01, + "learning_rate": 0.000461283851554664, + "loss": 3.1956, + "theoretical_loss": 4.169867526071138, + "tokens_seen": 285977600 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004612738214643932, + "loss": 3.432, + "theoretical_loss": 4.1697545480422695, + "tokens_seen": 286043136 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004612637913741224, + "loss": 3.2251, + "theoretical_loss": 4.169641603140903, + "tokens_seen": 286108672 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046125376128385156, + "loss": 3.4034, + "theoretical_loss": 4.16952869134974, + "tokens_seen": 286174208 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046124373119358074, + "loss": 3.3274, + "theoretical_loss": 4.169415812651494, + "tokens_seen": 286239744 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004612337011033099, + "loss": 3.3991, + "theoretical_loss": 4.169302967028894, + "tokens_seen": 286305280 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046122367101303916, + "loss": 3.3915, + "theoretical_loss": 4.1691901544646806, + "tokens_seen": 286370816 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004612136409227683, + "loss": 3.319, + "theoretical_loss": 4.169077374941606, + "tokens_seen": 286436352 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004612036108324975, + "loss": 3.272, + "theoretical_loss": 4.168964628442437, + "tokens_seen": 286501888 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046119358074222665, + "loss": 3.3392, + "theoretical_loss": 4.168851914949952, + "tokens_seen": 286567424 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004611835506519559, + "loss": 3.3495, + "theoretical_loss": 4.168739234446946, + "tokens_seen": 286632960 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 717872, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4431254863739014, + "objective/train/theoretical_loss": 4.16862658691622, + "objective/train/tokens_used": 307158496, + "theoretical_loss": 4.16862658691622, + "tokens_seen": 286698496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046117352056168506, + "loss": 3.3076, + "theoretical_loss": 4.16862658691622, + "tokens_seen": 286698496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046116349047141425, + "loss": 3.4048, + "theoretical_loss": 4.1685139723405955, + "tokens_seen": 286764032 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004611534603811434, + "loss": 3.3884, + "theoretical_loss": 4.1684013907029005, + "tokens_seen": 286829568 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004611434302908726, + "loss": 3.3915, + "theoretical_loss": 4.168288841985981, + "tokens_seen": 286895104 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004611334002006018, + "loss": 3.2847, + "theoretical_loss": 4.168176326172692, + "tokens_seen": 286960640 + }, + { + "epoch": 3.01, + "learning_rate": 0.000461123370110331, + "loss": 3.4204, + "theoretical_loss": 4.168063843245904, + "tokens_seen": 287026176 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046111334002006015, + "loss": 3.4008, + "theoretical_loss": 4.167951393188498, + "tokens_seen": 287091712 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004611033099297894, + "loss": 3.411, + "theoretical_loss": 4.16783897598337, + "tokens_seen": 287157248 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046109327983951857, + "loss": 3.3578, + "theoretical_loss": 4.1677265916134285, + "tokens_seen": 287222784 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046108324974924775, + "loss": 3.4846, + "theoretical_loss": 4.167614240061592, + "tokens_seen": 287288320 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046107321965897693, + "loss": 3.3406, + "theoretical_loss": 4.167501921310795, + "tokens_seen": 287353856 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004610631895687061, + "loss": 3.502, + "theoretical_loss": 4.1673896353439845, + "tokens_seen": 287419392 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004610531594784353, + "loss": 3.4055, + "theoretical_loss": 4.1672773821441185, + "tokens_seen": 287484928 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004610431293881645, + "loss": 3.3871, + "theoretical_loss": 4.167165161694169, + "tokens_seen": 287550464 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046103309929789365, + "loss": 3.4343, + "theoretical_loss": 4.167052973977119, + "tokens_seen": 287616000 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004610230692076229, + "loss": 3.2835, + "theoretical_loss": 4.166940818975968, + "tokens_seen": 287681536 + }, + { + "epoch": 3.01, + "learning_rate": 0.000461013039117352, + "loss": 3.4809, + "theoretical_loss": 4.166828696673725, + "tokens_seen": 287747072 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046100300902708125, + "loss": 3.3331, + "theoretical_loss": 4.166716607053411, + "tokens_seen": 287812608 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046099297893681043, + "loss": 3.3455, + "theoretical_loss": 4.1666045500980635, + "tokens_seen": 287878144 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004609829488465396, + "loss": 3.3276, + "theoretical_loss": 4.166492525790728, + "tokens_seen": 287943680 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004609729187562688, + "loss": 3.3489, + "theoretical_loss": 4.166380534114467, + "tokens_seen": 288009216 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046096288866599803, + "loss": 3.4536, + "theoretical_loss": 4.166268575052352, + "tokens_seen": 288074752 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004609528585757272, + "loss": 3.3982, + "theoretical_loss": 4.166156648587471, + "tokens_seen": 288140288 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004609428284854564, + "loss": 3.4112, + "theoretical_loss": 4.166044754702919, + "tokens_seen": 288205824 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004609327983951856, + "loss": 3.3367, + "theoretical_loss": 4.16593289338181, + "tokens_seen": 288271360 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 721764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.353747606277466, + "objective/train/theoretical_loss": 4.165821064607266, + "objective/train/tokens_used": 308796896, + "theoretical_loss": 4.165821064607266, + "tokens_seen": 288336896 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046092276830491475, + "loss": 3.3469, + "theoretical_loss": 4.165821064607266, + "tokens_seen": 288336896 + }, + { + "epoch": 3.01, + "learning_rate": 0.000460912738214644, + "loss": 3.3564, + "theoretical_loss": 4.165709268362424, + "tokens_seen": 288402432 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004609027081243731, + "loss": 3.4568, + "theoretical_loss": 4.1655975046304325, + "tokens_seen": 288467968 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046089267803410235, + "loss": 3.3961, + "theoretical_loss": 4.165485773394453, + "tokens_seen": 288533504 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004608826479438315, + "loss": 3.4585, + "theoretical_loss": 4.165374074637658, + "tokens_seen": 288599040 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004608726178535607, + "loss": 3.4334, + "theoretical_loss": 4.165262408343235, + "tokens_seen": 288664576 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004608625877632899, + "loss": 3.3561, + "theoretical_loss": 4.165150774494383, + "tokens_seen": 288730112 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004608525576730191, + "loss": 3.3492, + "theoretical_loss": 4.165039173074313, + "tokens_seen": 288795648 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046084252758274826, + "loss": 3.3179, + "theoretical_loss": 4.164927604066249, + "tokens_seen": 288861184 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046083249749247744, + "loss": 3.4697, + "theoretical_loss": 4.164816067453428, + "tokens_seen": 288926720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004608224674022066, + "loss": 3.3677, + "theoretical_loss": 4.164704563219097, + "tokens_seen": 288992256 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046081243731193586, + "loss": 3.2421, + "theoretical_loss": 4.164593091346518, + "tokens_seen": 289057792 + }, + { + "epoch": 3.01, + "learning_rate": 0.000460802407221665, + "loss": 3.3825, + "theoretical_loss": 4.164481651818966, + "tokens_seen": 289123328 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004607923771313942, + "loss": 3.4709, + "theoretical_loss": 4.164370244619727, + "tokens_seen": 289188864 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004607823470411234, + "loss": 3.3175, + "theoretical_loss": 4.164258869732098, + "tokens_seen": 289254400 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004607723169508526, + "loss": 3.4654, + "theoretical_loss": 4.164147527139391, + "tokens_seen": 289319936 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046076228686058176, + "loss": 3.4231, + "theoretical_loss": 4.164036216824929, + "tokens_seen": 289385472 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046075225677031094, + "loss": 3.4195, + "theoretical_loss": 4.163924938772048, + "tokens_seen": 289451008 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004607422266800401, + "loss": 3.3472, + "theoretical_loss": 4.163813692964097, + "tokens_seen": 289516544 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046073219658976936, + "loss": 3.2932, + "theoretical_loss": 4.163702479384435, + "tokens_seen": 289582080 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004607221664994985, + "loss": 3.3832, + "theoretical_loss": 4.163591298016436, + "tokens_seen": 289647616 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004607121364092277, + "loss": 3.4775, + "theoretical_loss": 4.163480148843485, + "tokens_seen": 289713152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046070210631895685, + "loss": 3.4394, + "theoretical_loss": 4.163369031848978, + "tokens_seen": 289778688 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004606920762286861, + "loss": 3.4114, + "theoretical_loss": 4.163257947016327, + "tokens_seen": 289844224 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046068204613841526, + "loss": 3.4802, + "theoretical_loss": 4.163146894328953, + "tokens_seen": 289909760 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 726458, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.428697109222412, + "objective/train/theoretical_loss": 4.163035873770292, + "objective/train/tokens_used": 310435296, + "theoretical_loss": 4.163035873770292, + "tokens_seen": 289975296 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046067201604814445, + "loss": 3.3145, + "theoretical_loss": 4.163035873770292, + "tokens_seen": 289975296 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004606619859578736, + "loss": 3.342, + "theoretical_loss": 4.162924885323789, + "tokens_seen": 290040832 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004606519558676028, + "loss": 3.3588, + "theoretical_loss": 4.162813928972902, + "tokens_seen": 290106368 + }, + { + "epoch": 3.01, + "learning_rate": 0.000460641925777332, + "loss": 3.4012, + "theoretical_loss": 4.162703004701105, + "tokens_seen": 290171904 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004606318956870612, + "loss": 3.4174, + "theoretical_loss": 4.16259211249188, + "tokens_seen": 290237440 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046062186559679035, + "loss": 3.2819, + "theoretical_loss": 4.162481252328722, + "tokens_seen": 290302976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004606118355065196, + "loss": 3.4229, + "theoretical_loss": 4.1623704241951405, + "tokens_seen": 290368512 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046060180541624877, + "loss": 3.3934, + "theoretical_loss": 4.162259628074654, + "tokens_seen": 290434048 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046059177532597795, + "loss": 3.4413, + "theoretical_loss": 4.162148863950797, + "tokens_seen": 290499584 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046058174523570713, + "loss": 3.4269, + "theoretical_loss": 4.162038131807113, + "tokens_seen": 290565120 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004605717151454363, + "loss": 3.3542, + "theoretical_loss": 4.161927431627157, + "tokens_seen": 290630656 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004605616850551655, + "loss": 3.3607, + "theoretical_loss": 4.1618167633945, + "tokens_seen": 290696192 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004605516549648947, + "loss": 3.4056, + "theoretical_loss": 4.161706127092722, + "tokens_seen": 290761728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046054162487462385, + "loss": 3.2878, + "theoretical_loss": 4.161595522705416, + "tokens_seen": 290827264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004605315947843531, + "loss": 3.3566, + "theoretical_loss": 4.161484950216188, + "tokens_seen": 290892800 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004605215646940822, + "loss": 3.4357, + "theoretical_loss": 4.161374409608656, + "tokens_seen": 290958336 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046051153460381145, + "loss": 3.248, + "theoretical_loss": 4.161263900866447, + "tokens_seen": 291023872 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046050150451354063, + "loss": 3.382, + "theoretical_loss": 4.161153423973205, + "tokens_seen": 291089408 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004604914744232698, + "loss": 3.4262, + "theoretical_loss": 4.161042978912581, + "tokens_seen": 291154944 + }, + { + "epoch": 3.01, + "learning_rate": 0.000460481444332999, + "loss": 3.3635, + "theoretical_loss": 4.1609325656682445, + "tokens_seen": 291220480 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046047141424272823, + "loss": 3.4584, + "theoretical_loss": 4.16082218422387, + "tokens_seen": 291286016 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046046138415245736, + "loss": 3.2909, + "theoretical_loss": 4.160711834563148, + "tokens_seen": 291351552 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004604513540621866, + "loss": 3.3776, + "theoretical_loss": 4.160601516669782, + "tokens_seen": 291417088 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004604413239719157, + "loss": 3.3557, + "theoretical_loss": 4.160491230527484, + "tokens_seen": 291482624 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046043129388164495, + "loss": 3.4535, + "theoretical_loss": 4.16038097611998, + "tokens_seen": 291548160 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 729494, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5712475776672363, + "objective/train/theoretical_loss": 4.16027075343101, + "objective/train/tokens_used": 312073696, + "theoretical_loss": 4.16027075343101, + "tokens_seen": 291613696 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046042126379137414, + "loss": 3.4095, + "theoretical_loss": 4.16027075343101, + "tokens_seen": 291613696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004604112337011033, + "loss": 3.3175, + "theoretical_loss": 4.160160562444321, + "tokens_seen": 291679232 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004604012036108325, + "loss": 3.4952, + "theoretical_loss": 4.160050403143675, + "tokens_seen": 291744768 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004603911735205617, + "loss": 3.2868, + "theoretical_loss": 4.159940275512848, + "tokens_seen": 291810304 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046038114343029086, + "loss": 3.4458, + "theoretical_loss": 4.159830179535625, + "tokens_seen": 291875840 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004603711133400201, + "loss": 3.3675, + "theoretical_loss": 4.159720115195802, + "tokens_seen": 291941376 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004603610832497492, + "loss": 3.3404, + "theoretical_loss": 4.15961008247719, + "tokens_seen": 292006912 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046035105315947846, + "loss": 3.3419, + "theoretical_loss": 4.1595000813636105, + "tokens_seen": 292072448 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004603410230692076, + "loss": 3.3324, + "theoretical_loss": 4.159390111838897, + "tokens_seen": 292137984 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004603309929789368, + "loss": 3.4107, + "theoretical_loss": 4.159280173886895, + "tokens_seen": 292203520 + }, + { + "epoch": 3.01, + "learning_rate": 0.000460320962888666, + "loss": 3.4254, + "theoretical_loss": 4.15917026749146, + "tokens_seen": 292269056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004603109327983952, + "loss": 3.2928, + "theoretical_loss": 4.159060392636463, + "tokens_seen": 292334592 + }, + { + "epoch": 3.01, + "learning_rate": 0.00046030090270812436, + "loss": 3.2477, + "theoretical_loss": 4.158950549305784, + "tokens_seen": 292400128 + }, + { + "epoch": 3.01, + "learning_rate": 0.0004602908726178536, + "loss": 3.4016, + "theoretical_loss": 4.158840737483317, + "tokens_seen": 292465664 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004602808425275827, + "loss": 3.3299, + "theoretical_loss": 4.158730957152964, + "tokens_seen": 292531200 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046027081243731196, + "loss": 3.3354, + "theoretical_loss": 4.158621208298644, + "tokens_seen": 292596736 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004602607823470411, + "loss": 3.3827, + "theoretical_loss": 4.158511490904285, + "tokens_seen": 292662272 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004602507522567703, + "loss": 3.3937, + "theoretical_loss": 4.158401804953826, + "tokens_seen": 292727808 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004602407221664995, + "loss": 3.3925, + "theoretical_loss": 4.158292150431219, + "tokens_seen": 292793344 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004602306920762287, + "loss": 3.3606, + "theoretical_loss": 4.158182527320427, + "tokens_seen": 292858880 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046022066198595787, + "loss": 3.3285, + "theoretical_loss": 4.158072935605428, + "tokens_seen": 292924416 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046021063189568705, + "loss": 3.4145, + "theoretical_loss": 4.1579633752702065, + "tokens_seen": 292989952 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004602006018054163, + "loss": 3.4836, + "theoretical_loss": 4.157853846298762, + "tokens_seen": 293055488 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046019057171514546, + "loss": 3.4557, + "theoretical_loss": 4.157744348675106, + "tokens_seen": 293121024 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046018054162487465, + "loss": 3.313, + "theoretical_loss": 4.15763488238326, + "tokens_seen": 293186560 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 734202, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3168368339538574, + "objective/train/theoretical_loss": 4.1575254474072585, + "objective/train/tokens_used": 313712096, + "theoretical_loss": 4.1575254474072585, + "tokens_seen": 293252096 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004601705115346038, + "loss": 3.2961, + "theoretical_loss": 4.1575254474072585, + "tokens_seen": 293252096 + }, + { + "epoch": 3.02, + "learning_rate": 0.000460160481444333, + "loss": 3.2921, + "theoretical_loss": 4.157416043731147, + "tokens_seen": 293317632 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004601504513540622, + "loss": 3.3633, + "theoretical_loss": 4.157306671338984, + "tokens_seen": 293383168 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004601404212637914, + "loss": 3.4005, + "theoretical_loss": 4.157197330214837, + "tokens_seen": 293448704 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046013039117352055, + "loss": 3.3763, + "theoretical_loss": 4.157088020342788, + "tokens_seen": 293514240 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004601203610832498, + "loss": 3.438, + "theoretical_loss": 4.156978741706928, + "tokens_seen": 293579776 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046011033099297897, + "loss": 3.3548, + "theoretical_loss": 4.156869494291364, + "tokens_seen": 293645312 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046010030090270815, + "loss": 3.451, + "theoretical_loss": 4.156760278080209, + "tokens_seen": 293710848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046009027081243733, + "loss": 3.3676, + "theoretical_loss": 4.156651093057591, + "tokens_seen": 293776384 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004600802407221665, + "loss": 3.3348, + "theoretical_loss": 4.15654193920765, + "tokens_seen": 293841920 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004600702106318957, + "loss": 3.4598, + "theoretical_loss": 4.156432816514536, + "tokens_seen": 293907456 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046006018054162493, + "loss": 3.2717, + "theoretical_loss": 4.1563237249624105, + "tokens_seen": 293972992 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046005015045135405, + "loss": 3.3542, + "theoretical_loss": 4.156214664535447, + "tokens_seen": 294038528 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004600401203610833, + "loss": 3.3615, + "theoretical_loss": 4.156105635217833, + "tokens_seen": 294104064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004600300902708124, + "loss": 3.369, + "theoretical_loss": 4.155996636993764, + "tokens_seen": 294169600 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046002006018054165, + "loss": 3.3195, + "theoretical_loss": 4.1558876698474485, + "tokens_seen": 294235136 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046001003009027083, + "loss": 3.3821, + "theoretical_loss": 4.155778733763107, + "tokens_seen": 294300672 + }, + { + "epoch": 3.02, + "learning_rate": 0.00046, + "loss": 3.3277, + "theoretical_loss": 4.155669828724969, + "tokens_seen": 294366208 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004599899699097292, + "loss": 3.3783, + "theoretical_loss": 4.15556095471728, + "tokens_seen": 294431744 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045997993981945843, + "loss": 3.2749, + "theoretical_loss": 4.155452111724292, + "tokens_seen": 294497280 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045996990972918756, + "loss": 3.3454, + "theoretical_loss": 4.155343299730274, + "tokens_seen": 294562816 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004599598796389168, + "loss": 3.2791, + "theoretical_loss": 4.155234518719501, + "tokens_seen": 294628352 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004599498495486459, + "loss": 3.3759, + "theoretical_loss": 4.155125768676264, + "tokens_seen": 294693888 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045993981945837515, + "loss": 3.4042, + "theoretical_loss": 4.1550170495848615, + "tokens_seen": 294759424 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045992978936810434, + "loss": 3.368, + "theoretical_loss": 4.154908361429606, + "tokens_seen": 294824960 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 737236, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.435652017593384, + "objective/train/theoretical_loss": 4.15479970419482, + "objective/train/tokens_used": 315350496, + "theoretical_loss": 4.15479970419482, + "tokens_seen": 294890496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004599197592778335, + "loss": 3.443, + "theoretical_loss": 4.15479970419482, + "tokens_seen": 294890496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004599097291875627, + "loss": 3.3521, + "theoretical_loss": 4.154691077864841, + "tokens_seen": 294956032 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004598996990972919, + "loss": 3.3641, + "theoretical_loss": 4.154582482424012, + "tokens_seen": 295021568 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045988966900702106, + "loss": 3.3494, + "theoretical_loss": 4.154473917856692, + "tokens_seen": 295087104 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004598796389167503, + "loss": 3.4441, + "theoretical_loss": 4.15436538414725, + "tokens_seen": 295152640 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004598696088264794, + "loss": 3.4495, + "theoretical_loss": 4.154256881280066, + "tokens_seen": 295218176 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045985957873620866, + "loss": 3.4535, + "theoretical_loss": 4.154148409239532, + "tokens_seen": 295283712 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004598495486459378, + "loss": 3.3242, + "theoretical_loss": 4.15403996801005, + "tokens_seen": 295349248 + }, + { + "epoch": 3.02, + "learning_rate": 0.000459839518555667, + "loss": 3.3247, + "theoretical_loss": 4.153931557576035, + "tokens_seen": 295414784 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004598294884653962, + "loss": 3.4472, + "theoretical_loss": 4.153823177921914, + "tokens_seen": 295480320 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004598194583751254, + "loss": 3.43, + "theoretical_loss": 4.153714829032122, + "tokens_seen": 295545856 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045980942828485456, + "loss": 3.4742, + "theoretical_loss": 4.153606510891109, + "tokens_seen": 295611392 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004597993981945838, + "loss": 3.4022, + "theoretical_loss": 4.153498223483333, + "tokens_seen": 295676928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004597893681043129, + "loss": 3.3693, + "theoretical_loss": 4.1533899667932666, + "tokens_seen": 295742464 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045977933801404216, + "loss": 3.3811, + "theoretical_loss": 4.153281740805391, + "tokens_seen": 295808000 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004597693079237713, + "loss": 3.4142, + "theoretical_loss": 4.153173545504201, + "tokens_seen": 295873536 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004597592778335005, + "loss": 3.4184, + "theoretical_loss": 4.1530653808741995, + "tokens_seen": 295939072 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004597492477432297, + "loss": 3.3912, + "theoretical_loss": 4.152957246899904, + "tokens_seen": 296004608 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004597392176529589, + "loss": 3.4407, + "theoretical_loss": 4.152849143565842, + "tokens_seen": 296070144 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045972918756268807, + "loss": 3.3778, + "theoretical_loss": 4.152741070856551, + "tokens_seen": 296135680 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045971915747241725, + "loss": 3.4119, + "theoretical_loss": 4.152633028756581, + "tokens_seen": 296201216 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045970912738214643, + "loss": 3.3407, + "theoretical_loss": 4.152525017250493, + "tokens_seen": 296266752 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045969909729187566, + "loss": 3.2681, + "theoretical_loss": 4.152417036322859, + "tokens_seen": 296332288 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004596890672016048, + "loss": 3.3725, + "theoretical_loss": 4.152309085958263, + "tokens_seen": 296397824 + }, + { + "epoch": 3.02, + "learning_rate": 0.000459679037111334, + "loss": 3.2405, + "theoretical_loss": 4.1522011661413005, + "tokens_seen": 296463360 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 741211, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4141645431518555, + "objective/train/theoretical_loss": 4.152093276856575, + "objective/train/tokens_used": 316988896, + "theoretical_loss": 4.152093276856575, + "tokens_seen": 296528896 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045966900702106315, + "loss": 3.18, + "theoretical_loss": 4.152093276856575, + "tokens_seen": 296528896 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004596589769307924, + "loss": 3.2859, + "theoretical_loss": 4.151985418088705, + "tokens_seen": 296594432 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045964894684052157, + "loss": 3.3392, + "theoretical_loss": 4.151877589822316, + "tokens_seen": 296659968 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045963891675025075, + "loss": 3.3605, + "theoretical_loss": 4.15176979204205, + "tokens_seen": 296725504 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045962888665997993, + "loss": 3.4052, + "theoretical_loss": 4.151662024732557, + "tokens_seen": 296791040 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045961885656970917, + "loss": 3.4338, + "theoretical_loss": 4.151554287878497, + "tokens_seen": 296856576 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004596088264794383, + "loss": 3.3618, + "theoretical_loss": 4.151446581464542, + "tokens_seen": 296922112 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045959879638916753, + "loss": 3.3533, + "theoretical_loss": 4.151338905475378, + "tokens_seen": 296987648 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045958876629889666, + "loss": 3.2821, + "theoretical_loss": 4.151231259895699, + "tokens_seen": 297053184 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004595787362086259, + "loss": 3.3401, + "theoretical_loss": 4.151123644710209, + "tokens_seen": 297118720 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004595687061183551, + "loss": 3.3306, + "theoretical_loss": 4.151016059903625, + "tokens_seen": 297184256 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045955867602808425, + "loss": 3.4693, + "theoretical_loss": 4.150908505460677, + "tokens_seen": 297249792 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045954864593781344, + "loss": 3.275, + "theoretical_loss": 4.150800981366103, + "tokens_seen": 297315328 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004595386158475426, + "loss": 3.3286, + "theoretical_loss": 4.150693487604652, + "tokens_seen": 297380864 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004595285857572718, + "loss": 3.3319, + "theoretical_loss": 4.150586024161086, + "tokens_seen": 297446400 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045951855566700103, + "loss": 3.337, + "theoretical_loss": 4.150478591020176, + "tokens_seen": 297511936 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045950852557673016, + "loss": 3.2501, + "theoretical_loss": 4.150371188166705, + "tokens_seen": 297577472 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004594984954864594, + "loss": 3.3879, + "theoretical_loss": 4.150263815585468, + "tokens_seen": 297643008 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004594884653961885, + "loss": 3.3856, + "theoretical_loss": 4.15015647326127, + "tokens_seen": 297708544 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045947843530591776, + "loss": 3.2989, + "theoretical_loss": 4.150049161178926, + "tokens_seen": 297774080 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045946840521564694, + "loss": 3.4191, + "theoretical_loss": 4.149941879323263, + "tokens_seen": 297839616 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004594583751253761, + "loss": 3.3296, + "theoretical_loss": 4.14983462767912, + "tokens_seen": 297905152 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045944834503510536, + "loss": 3.3632, + "theoretical_loss": 4.149727406231343, + "tokens_seen": 297970688 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045943831494483454, + "loss": 3.2685, + "theoretical_loss": 4.149620214964795, + "tokens_seen": 298036224 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004594282848545637, + "loss": 3.3679, + "theoretical_loss": 4.149513053864344, + "tokens_seen": 298101760 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 744009, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.381294012069702, + "objective/train/theoretical_loss": 4.149405922914873, + "objective/train/tokens_used": 318627296, + "theoretical_loss": 4.149405922914873, + "tokens_seen": 298167296 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004594182547642929, + "loss": 3.3454, + "theoretical_loss": 4.149405922914873, + "tokens_seen": 298167296 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004594082246740221, + "loss": 3.3432, + "theoretical_loss": 4.149298822101274, + "tokens_seen": 298232832 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045939819458375126, + "loss": 3.2993, + "theoretical_loss": 4.14919175140845, + "tokens_seen": 298298368 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004593881644934805, + "loss": 3.3331, + "theoretical_loss": 4.149084710821317, + "tokens_seen": 298363904 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004593781344032096, + "loss": 3.3897, + "theoretical_loss": 4.148977700324797, + "tokens_seen": 298429440 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045936810431293886, + "loss": 3.3544, + "theoretical_loss": 4.148870719903828, + "tokens_seen": 298494976 + }, + { + "epoch": 3.02, + "learning_rate": 0.000459358074222668, + "loss": 3.2863, + "theoretical_loss": 4.148763769543355, + "tokens_seen": 298560512 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004593480441323972, + "loss": 3.3953, + "theoretical_loss": 4.148656849228338, + "tokens_seen": 298626048 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004593380140421264, + "loss": 3.3691, + "theoretical_loss": 4.148549958943744, + "tokens_seen": 298691584 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004593279839518556, + "loss": 3.3921, + "theoretical_loss": 4.148443098674552, + "tokens_seen": 298757120 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045931795386158476, + "loss": 3.3034, + "theoretical_loss": 4.148336268405753, + "tokens_seen": 298822656 + }, + { + "epoch": 3.02, + "learning_rate": 0.000459307923771314, + "loss": 3.3948, + "theoretical_loss": 4.148229468122347, + "tokens_seen": 298888192 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004592978936810431, + "loss": 3.4624, + "theoretical_loss": 4.148122697809345, + "tokens_seen": 298953728 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045928786359077236, + "loss": 3.3567, + "theoretical_loss": 4.148015957451772, + "tokens_seen": 299019264 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004592778335005015, + "loss": 3.417, + "theoretical_loss": 4.147909247034658, + "tokens_seen": 299084800 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004592678034102307, + "loss": 3.4001, + "theoretical_loss": 4.147802566543049, + "tokens_seen": 299150336 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004592577733199599, + "loss": 3.2872, + "theoretical_loss": 4.147695915961998, + "tokens_seen": 299215872 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004592477432296891, + "loss": 3.3536, + "theoretical_loss": 4.147589295276573, + "tokens_seen": 299281408 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045923771313941827, + "loss": 3.2934, + "theoretical_loss": 4.147482704471848, + "tokens_seen": 299346944 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045922768304914745, + "loss": 3.3833, + "theoretical_loss": 4.1473761435329095, + "tokens_seen": 299412480 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045921765295887663, + "loss": 3.2122, + "theoretical_loss": 4.147269612444857, + "tokens_seen": 299478016 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045920762286860586, + "loss": 3.3666, + "theoretical_loss": 4.147163111192797, + "tokens_seen": 299543552 + }, + { + "epoch": 3.02, + "learning_rate": 0.000459197592778335, + "loss": 3.3598, + "theoretical_loss": 4.147056639761849, + "tokens_seen": 299609088 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004591875626880642, + "loss": 3.3981, + "theoretical_loss": 4.146950198137143, + "tokens_seen": 299674624 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045917753259779335, + "loss": 3.4309, + "theoretical_loss": 4.146843786303818, + "tokens_seen": 299740160 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 748805, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5238001346588135, + "objective/train/theoretical_loss": 4.146737404247027, + "objective/train/tokens_used": 320265696, + "theoretical_loss": 4.146737404247027, + "tokens_seen": 299805696 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004591675025075226, + "loss": 3.4419, + "theoretical_loss": 4.146737404247027, + "tokens_seen": 299805696 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045915747241725177, + "loss": 3.3529, + "theoretical_loss": 4.146631051951929, + "tokens_seen": 299871232 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045914744232698095, + "loss": 3.2939, + "theoretical_loss": 4.146524729403699, + "tokens_seen": 299936768 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045913741223671013, + "loss": 3.2853, + "theoretical_loss": 4.146418436587517, + "tokens_seen": 300002304 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045912738214643937, + "loss": 3.3618, + "theoretical_loss": 4.146312173488578, + "tokens_seen": 300067840 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004591173520561685, + "loss": 3.3298, + "theoretical_loss": 4.146205940092087, + "tokens_seen": 300133376 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045910732196589773, + "loss": 3.4012, + "theoretical_loss": 4.146099736383256, + "tokens_seen": 300198912 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045909729187562686, + "loss": 3.2705, + "theoretical_loss": 4.145993562347313, + "tokens_seen": 300264448 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004590872617853561, + "loss": 3.4372, + "theoretical_loss": 4.145887417969492, + "tokens_seen": 300329984 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004590772316950853, + "loss": 3.2934, + "theoretical_loss": 4.14578130323504, + "tokens_seen": 300395520 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045906720160481445, + "loss": 3.1871, + "theoretical_loss": 4.145675218129213, + "tokens_seen": 300461056 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045905717151454364, + "loss": 3.4155, + "theoretical_loss": 4.145569162637281, + "tokens_seen": 300526592 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004590471414242728, + "loss": 3.3106, + "theoretical_loss": 4.145463136744518, + "tokens_seen": 300592128 + }, + { + "epoch": 3.02, + "learning_rate": 0.000459037111334002, + "loss": 3.2972, + "theoretical_loss": 4.145357140436216, + "tokens_seen": 300657664 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045902708124373123, + "loss": 3.3737, + "theoretical_loss": 4.145251173697673, + "tokens_seen": 300723200 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045901705115346036, + "loss": 3.3533, + "theoretical_loss": 4.145145236514198, + "tokens_seen": 300788736 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004590070210631896, + "loss": 3.3398, + "theoretical_loss": 4.145039328871111, + "tokens_seen": 300854272 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004589969909729187, + "loss": 3.323, + "theoretical_loss": 4.144933450753744, + "tokens_seen": 300919808 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045898696088264796, + "loss": 3.3243, + "theoretical_loss": 4.144827602147435, + "tokens_seen": 300985344 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045897693079237714, + "loss": 3.2535, + "theoretical_loss": 4.144721783037539, + "tokens_seen": 301050880 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004589669007021063, + "loss": 3.3681, + "theoretical_loss": 4.144615993409417, + "tokens_seen": 301116416 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004589568706118355, + "loss": 3.3463, + "theoretical_loss": 4.14451023324844, + "tokens_seen": 301181952 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045894684052156474, + "loss": 3.3289, + "theoretical_loss": 4.144404502539992, + "tokens_seen": 301247488 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045893681043129386, + "loss": 3.3844, + "theoretical_loss": 4.144298801269464, + "tokens_seen": 301313024 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004589267803410231, + "loss": 3.4952, + "theoretical_loss": 4.144193129422264, + "tokens_seen": 301378560 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 751621, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.337928056716919, + "objective/train/theoretical_loss": 4.144087486983802, + "objective/train/tokens_used": 321904096, + "theoretical_loss": 4.144087486983802, + "tokens_seen": 301444096 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004589167502507522, + "loss": 3.3749, + "theoretical_loss": 4.144087486983802, + "tokens_seen": 301444096 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045890672016048146, + "loss": 3.3472, + "theoretical_loss": 4.143981873939504, + "tokens_seen": 301509632 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045889669007021064, + "loss": 3.4432, + "theoretical_loss": 4.143876290274806, + "tokens_seen": 301575168 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004588866599799398, + "loss": 3.3312, + "theoretical_loss": 4.143770735975152, + "tokens_seen": 301640704 + }, + { + "epoch": 3.02, + "learning_rate": 0.000458876629889669, + "loss": 3.3696, + "theoretical_loss": 4.143665211025997, + "tokens_seen": 301706240 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004588665997993982, + "loss": 3.3794, + "theoretical_loss": 4.143559715412809, + "tokens_seen": 301771776 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045885656970912737, + "loss": 3.3845, + "theoretical_loss": 4.143454249121063, + "tokens_seen": 301837312 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004588465396188566, + "loss": 3.4021, + "theoretical_loss": 4.143348812136244, + "tokens_seen": 301902848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045883650952858573, + "loss": 3.3443, + "theoretical_loss": 4.143243404443853, + "tokens_seen": 301968384 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045882647943831496, + "loss": 3.3701, + "theoretical_loss": 4.143138026029394, + "tokens_seen": 302033920 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045881644934804415, + "loss": 3.3444, + "theoretical_loss": 4.143032676878386, + "tokens_seen": 302099456 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004588064192577733, + "loss": 3.3098, + "theoretical_loss": 4.142927356976357, + "tokens_seen": 302164992 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004587963891675025, + "loss": 3.3824, + "theoretical_loss": 4.142822066308845, + "tokens_seen": 302230528 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004587863590772317, + "loss": 3.3844, + "theoretical_loss": 4.142716804861399, + "tokens_seen": 302296064 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045877632898696087, + "loss": 3.3784, + "theoretical_loss": 4.142611572619577, + "tokens_seen": 302361600 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004587662988966901, + "loss": 3.3639, + "theoretical_loss": 4.142506369568949, + "tokens_seen": 302427136 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045875626880641923, + "loss": 3.3414, + "theoretical_loss": 4.142401195695092, + "tokens_seen": 302492672 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045874623871614847, + "loss": 3.4101, + "theoretical_loss": 4.142296050983599, + "tokens_seen": 302558208 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004587362086258776, + "loss": 3.3497, + "theoretical_loss": 4.142190935420067, + "tokens_seen": 302623744 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045872617853560683, + "loss": 3.4554, + "theoretical_loss": 4.142085848990108, + "tokens_seen": 302689280 + }, + { + "epoch": 3.02, + "learning_rate": 0.000458716148445336, + "loss": 3.2954, + "theoretical_loss": 4.141980791679341, + "tokens_seen": 302754816 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004587061183550652, + "loss": 3.3815, + "theoretical_loss": 4.141875763473397, + "tokens_seen": 302820352 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004586960882647944, + "loss": 3.4204, + "theoretical_loss": 4.141770764357916, + "tokens_seen": 302885888 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045868605817452355, + "loss": 3.4068, + "theoretical_loss": 4.141665794318549, + "tokens_seen": 302951424 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004586760280842528, + "loss": 3.3828, + "theoretical_loss": 4.141560853340959, + "tokens_seen": 303016960 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 755397, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2629525661468506, + "objective/train/theoretical_loss": 4.141455941410815, + "objective/train/tokens_used": 323542496, + "theoretical_loss": 4.141455941410815, + "tokens_seen": 303082496 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045866599799398197, + "loss": 3.1715, + "theoretical_loss": 4.141455941410815, + "tokens_seen": 303082496 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045865596790371115, + "loss": 3.2613, + "theoretical_loss": 4.141351058513798, + "tokens_seen": 303148032 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045864593781344033, + "loss": 3.288, + "theoretical_loss": 4.141246204635602, + "tokens_seen": 303213568 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045863590772316957, + "loss": 3.3515, + "theoretical_loss": 4.141141379761925, + "tokens_seen": 303279104 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004586258776328987, + "loss": 3.3191, + "theoretical_loss": 4.141036583878482, + "tokens_seen": 303344640 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045861584754262793, + "loss": 3.3776, + "theoretical_loss": 4.140931816970994, + "tokens_seen": 303410176 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045860581745235706, + "loss": 3.4229, + "theoretical_loss": 4.140827079025193, + "tokens_seen": 303475712 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004585957873620863, + "loss": 3.4553, + "theoretical_loss": 4.14072237002682, + "tokens_seen": 303541248 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004585857572718155, + "loss": 3.3487, + "theoretical_loss": 4.14061768996163, + "tokens_seen": 303606784 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045857572718154465, + "loss": 3.3642, + "theoretical_loss": 4.1405130388153815, + "tokens_seen": 303672320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045856569709127384, + "loss": 3.3268, + "theoretical_loss": 4.14040841657385, + "tokens_seen": 303737856 + }, + { + "epoch": 3.02, + "learning_rate": 0.000458555667001003, + "loss": 3.3279, + "theoretical_loss": 4.140303823222816, + "tokens_seen": 303803392 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004585456369107322, + "loss": 3.4392, + "theoretical_loss": 4.1401992587480745, + "tokens_seen": 303868928 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045853560682046143, + "loss": 3.4298, + "theoretical_loss": 4.140094723135425, + "tokens_seen": 303934464 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045852557673019056, + "loss": 3.3589, + "theoretical_loss": 4.139990216370681, + "tokens_seen": 304000000 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004585155466399198, + "loss": 3.4474, + "theoretical_loss": 4.139885738439667, + "tokens_seen": 304065536 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004585055165496489, + "loss": 3.3639, + "theoretical_loss": 4.139781289328214, + "tokens_seen": 304131072 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045849548645937816, + "loss": 3.3502, + "theoretical_loss": 4.139676869022164, + "tokens_seen": 304196608 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045848545636910734, + "loss": 3.4278, + "theoretical_loss": 4.139572477507372, + "tokens_seen": 304262144 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004584754262788365, + "loss": 3.5207, + "theoretical_loss": 4.139468114769699, + "tokens_seen": 304327680 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004584653961885657, + "loss": 3.335, + "theoretical_loss": 4.139363780795017, + "tokens_seen": 304393216 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045845536609829494, + "loss": 3.3648, + "theoretical_loss": 4.139259475569211, + "tokens_seen": 304458752 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045844533600802406, + "loss": 3.2343, + "theoretical_loss": 4.139155199078171, + "tokens_seen": 304524288 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004584353059177533, + "loss": 3.3856, + "theoretical_loss": 4.1390509513078015, + "tokens_seen": 304589824 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004584252758274824, + "loss": 3.3584, + "theoretical_loss": 4.138946732244014, + "tokens_seen": 304655360 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1678788661956787, + "objective/train/theoretical_loss": 4.13884254187273, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.13884254187273, + "tokens_seen": 304720896 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045841524573721166, + "loss": 3.2824, + "theoretical_loss": 4.13884254187273, + "tokens_seen": 304720896 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045840521564694084, + "loss": 3.4158, + "theoretical_loss": 4.138738380179884, + "tokens_seen": 304786432 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045839518555667, + "loss": 3.3171, + "theoretical_loss": 4.138634247151417, + "tokens_seen": 304851968 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004583851554663992, + "loss": 3.4214, + "theoretical_loss": 4.138530142773282, + "tokens_seen": 304917504 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004583751253761284, + "loss": 3.2771, + "theoretical_loss": 4.13842606703144, + "tokens_seen": 304983040 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045836509528585757, + "loss": 3.3176, + "theoretical_loss": 4.138322019911864, + "tokens_seen": 305048576 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004583550651955868, + "loss": 3.4111, + "theoretical_loss": 4.138218001400535, + "tokens_seen": 305114112 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045834503510531593, + "loss": 3.2666, + "theoretical_loss": 4.138114011483445, + "tokens_seen": 305179648 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045833500501504516, + "loss": 3.3685, + "theoretical_loss": 4.138010050146597, + "tokens_seen": 305245184 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045832497492477435, + "loss": 3.2955, + "theoretical_loss": 4.137906117376, + "tokens_seen": 305310720 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004583149448345035, + "loss": 3.4155, + "theoretical_loss": 4.137802213157677, + "tokens_seen": 305376256 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004583049147442327, + "loss": 3.3851, + "theoretical_loss": 4.137698337477659, + "tokens_seen": 305441792 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004582948846539619, + "loss": 3.4501, + "theoretical_loss": 4.137594490321986, + "tokens_seen": 305507328 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045828485456369107, + "loss": 3.3713, + "theoretical_loss": 4.137490671676709, + "tokens_seen": 305572864 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004582748244734203, + "loss": 3.4412, + "theoretical_loss": 4.13738688152789, + "tokens_seen": 305638400 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045826479438314943, + "loss": 3.4317, + "theoretical_loss": 4.137283119861598, + "tokens_seen": 305703936 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045825476429287867, + "loss": 3.3094, + "theoretical_loss": 4.137179386663914, + "tokens_seen": 305769472 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004582447342026078, + "loss": 3.4108, + "theoretical_loss": 4.137075681920928, + "tokens_seen": 305835008 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045823470411233703, + "loss": 3.3659, + "theoretical_loss": 4.136972005618739, + "tokens_seen": 305900544 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004582246740220662, + "loss": 3.3005, + "theoretical_loss": 4.136868357743458, + "tokens_seen": 305966080 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004582146439317954, + "loss": 3.3826, + "theoretical_loss": 4.136764738281202, + "tokens_seen": 306031616 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045820461384152457, + "loss": 3.3629, + "theoretical_loss": 4.136661147218102, + "tokens_seen": 306097152 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045819458375125375, + "loss": 3.3065, + "theoretical_loss": 4.136557584540297, + "tokens_seen": 306162688 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045818455366098294, + "loss": 3.358, + "theoretical_loss": 4.136454050233933, + "tokens_seen": 306228224 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045817452357071217, + "loss": 3.2498, + "theoretical_loss": 4.136350544285171, + "tokens_seen": 306293760 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.412407398223877, + "objective/train/theoretical_loss": 4.136247066680177, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.136247066680177, + "tokens_seen": 306359296 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004581644934804413, + "loss": 3.388, + "theoretical_loss": 4.136247066680177, + "tokens_seen": 306359296 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045815446339017053, + "loss": 3.392, + "theoretical_loss": 4.13614361740513, + "tokens_seen": 306424832 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004581444332998997, + "loss": 3.3976, + "theoretical_loss": 4.136040196446217, + "tokens_seen": 306490368 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004581344032096289, + "loss": 3.3626, + "theoretical_loss": 4.135936803789635, + "tokens_seen": 306555904 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004581243731193581, + "loss": 3.4091, + "theoretical_loss": 4.13583343942159, + "tokens_seen": 306621440 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045811434302908726, + "loss": 3.2545, + "theoretical_loss": 4.135730103328299, + "tokens_seen": 306686976 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045810431293881644, + "loss": 3.2406, + "theoretical_loss": 4.135626795495988, + "tokens_seen": 306752512 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004580942828485457, + "loss": 3.3161, + "theoretical_loss": 4.135523515910893, + "tokens_seen": 306818048 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004580842527582748, + "loss": 3.4415, + "theoretical_loss": 4.135420264559259, + "tokens_seen": 306883584 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045807422266800404, + "loss": 3.3173, + "theoretical_loss": 4.13531704142734, + "tokens_seen": 306949120 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045806419257773316, + "loss": 3.3255, + "theoretical_loss": 4.1352138465014034, + "tokens_seen": 307014656 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004580541624874624, + "loss": 3.3912, + "theoretical_loss": 4.13511067976772, + "tokens_seen": 307080192 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004580441323971916, + "loss": 3.3367, + "theoretical_loss": 4.135007541212576, + "tokens_seen": 307145728 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045803410230692076, + "loss": 3.2095, + "theoretical_loss": 4.134904430822264, + "tokens_seen": 307211264 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045802407221664994, + "loss": 3.0827, + "theoretical_loss": 4.134801348583086, + "tokens_seen": 307276800 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004580140421263791, + "loss": 3.2701, + "theoretical_loss": 4.134698294481357, + "tokens_seen": 307342336 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004580040120361083, + "loss": 3.3371, + "theoretical_loss": 4.134595268503396, + "tokens_seen": 307407872 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045799398194583754, + "loss": 3.4181, + "theoretical_loss": 4.134492270635538, + "tokens_seen": 307473408 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045798395185556667, + "loss": 3.3775, + "theoretical_loss": 4.134389300864122, + "tokens_seen": 307538944 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004579739217652959, + "loss": 3.3009, + "theoretical_loss": 4.1342863591754995, + "tokens_seen": 307604480 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004579638916750251, + "loss": 3.3925, + "theoretical_loss": 4.134183445556031, + "tokens_seen": 307670016 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045795386158475426, + "loss": 3.3653, + "theoretical_loss": 4.1340805599920865, + "tokens_seen": 307735552 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004579438314944835, + "loss": 3.3427, + "theoretical_loss": 4.133977702470045, + "tokens_seen": 307801088 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004579338014042126, + "loss": 3.2448, + "theoretical_loss": 4.133874872976294, + "tokens_seen": 307866624 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045792377131394186, + "loss": 3.325, + "theoretical_loss": 4.133772071497235, + "tokens_seen": 307932160 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.47104811668396, + "objective/train/theoretical_loss": 4.133669298019274, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.133669298019274, + "tokens_seen": 307997696 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045791374122367104, + "loss": 3.2704, + "theoretical_loss": 4.133669298019274, + "tokens_seen": 307997696 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004579037111334002, + "loss": 3.4602, + "theoretical_loss": 4.13356655252883, + "tokens_seen": 308063232 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004578936810431294, + "loss": 3.3614, + "theoretical_loss": 4.133463835012327, + "tokens_seen": 308128768 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004578836509528586, + "loss": 3.3557, + "theoretical_loss": 4.133361145456203, + "tokens_seen": 308194304 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045787362086258777, + "loss": 3.3316, + "theoretical_loss": 4.133258483846904, + "tokens_seen": 308259840 + }, + { + "epoch": 3.02, + "learning_rate": 0.000457863590772317, + "loss": 3.3309, + "theoretical_loss": 4.133155850170886, + "tokens_seen": 308325376 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045785356068204613, + "loss": 3.3573, + "theoretical_loss": 4.1330532444146115, + "tokens_seen": 308390912 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045784353059177536, + "loss": 3.3634, + "theoretical_loss": 4.132950666564557, + "tokens_seen": 308456448 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045783350050150455, + "loss": 3.4029, + "theoretical_loss": 4.132848116607203, + "tokens_seen": 308521984 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004578234704112337, + "loss": 3.4237, + "theoretical_loss": 4.132745594529045, + "tokens_seen": 308587520 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004578134403209629, + "loss": 3.3872, + "theoretical_loss": 4.132643100316585, + "tokens_seen": 308653056 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004578034102306921, + "loss": 3.3192, + "theoretical_loss": 4.132540633956334, + "tokens_seen": 308718592 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045779338014042127, + "loss": 3.3006, + "theoretical_loss": 4.132438195434815, + "tokens_seen": 308784128 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004577833500501505, + "loss": 3.39, + "theoretical_loss": 4.132335784738555, + "tokens_seen": 308849664 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045777331995987963, + "loss": 3.3416, + "theoretical_loss": 4.132233401854098, + "tokens_seen": 308915200 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045776328986960887, + "loss": 3.3765, + "theoretical_loss": 4.13213104676799, + "tokens_seen": 308980736 + }, + { + "epoch": 3.02, + "learning_rate": 0.000457753259779338, + "loss": 3.4426, + "theoretical_loss": 4.132028719466791, + "tokens_seen": 309046272 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045774322968906723, + "loss": 3.3839, + "theoretical_loss": 4.131926419937069, + "tokens_seen": 309111808 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004577331995987964, + "loss": 3.3309, + "theoretical_loss": 4.131824148165402, + "tokens_seen": 309177344 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004577231695085256, + "loss": 3.3499, + "theoretical_loss": 4.131721904138375, + "tokens_seen": 309242880 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004577131394182548, + "loss": 3.3951, + "theoretical_loss": 4.131619687842584, + "tokens_seen": 309308416 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045770310932798395, + "loss": 3.3863, + "theoretical_loss": 4.131517499264637, + "tokens_seen": 309373952 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045769307923771314, + "loss": 3.3697, + "theoretical_loss": 4.131415338391146, + "tokens_seen": 309439488 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045768304914744237, + "loss": 3.2944, + "theoretical_loss": 4.131313205208736, + "tokens_seen": 309505024 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004576730190571715, + "loss": 3.2817, + "theoretical_loss": 4.131211099704038, + "tokens_seen": 309570560 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.295409679412842, + "objective/train/theoretical_loss": 4.131109021863699, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.131109021863699, + "tokens_seen": 309636096 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045766298896690073, + "loss": 3.2591, + "theoretical_loss": 4.131109021863699, + "tokens_seen": 309636096 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004576529588766299, + "loss": 3.3528, + "theoretical_loss": 4.131006971674366, + "tokens_seen": 309701632 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004576429287863591, + "loss": 3.2869, + "theoretical_loss": 4.130904949122703, + "tokens_seen": 309767168 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004576328986960883, + "loss": 3.2036, + "theoretical_loss": 4.130802954195379, + "tokens_seen": 309832704 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045762286860581746, + "loss": 3.2742, + "theoretical_loss": 4.130700986879074, + "tokens_seen": 309898240 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045761283851554664, + "loss": 3.467, + "theoretical_loss": 4.130599047160475, + "tokens_seen": 309963776 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004576028084252759, + "loss": 3.3107, + "theoretical_loss": 4.130497135026283, + "tokens_seen": 310029312 + }, + { + "epoch": 3.02, + "learning_rate": 0.000457592778335005, + "loss": 3.3182, + "theoretical_loss": 4.130395250463203, + "tokens_seen": 310094848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045758274824473424, + "loss": 3.4367, + "theoretical_loss": 4.130293393457952, + "tokens_seen": 310160384 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045757271815446336, + "loss": 3.2606, + "theoretical_loss": 4.130191563997256, + "tokens_seen": 310225920 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004575626880641926, + "loss": 3.5343, + "theoretical_loss": 4.130089762067849, + "tokens_seen": 310291456 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004575526579739218, + "loss": 3.3743, + "theoretical_loss": 4.129987987656476, + "tokens_seen": 310356992 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045754262788365096, + "loss": 3.3118, + "theoretical_loss": 4.129886240749888, + "tokens_seen": 310422528 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045753259779338014, + "loss": 3.3919, + "theoretical_loss": 4.129784521334851, + "tokens_seen": 310488064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004575225677031093, + "loss": 3.2649, + "theoretical_loss": 4.129682829398134, + "tokens_seen": 310553600 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004575125376128385, + "loss": 3.3826, + "theoretical_loss": 4.129581164926518, + "tokens_seen": 310619136 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045750250752256774, + "loss": 3.278, + "theoretical_loss": 4.129479527906792, + "tokens_seen": 310684672 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045749247743229687, + "loss": 3.2714, + "theoretical_loss": 4.129377918325757, + "tokens_seen": 310750208 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004574824473420261, + "loss": 3.3861, + "theoretical_loss": 4.129276336170219, + "tokens_seen": 310815744 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004574724172517553, + "loss": 3.3373, + "theoretical_loss": 4.129174781426997, + "tokens_seen": 310881280 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045746238716148446, + "loss": 3.4, + "theoretical_loss": 4.129073254082916, + "tokens_seen": 310946816 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045745235707121364, + "loss": 3.3545, + "theoretical_loss": 4.128971754124811, + "tokens_seen": 311012352 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004574423269809428, + "loss": 3.3302, + "theoretical_loss": 4.128870281539529, + "tokens_seen": 311077888 + }, + { + "epoch": 3.02, + "learning_rate": 0.000457432296890672, + "loss": 3.2306, + "theoretical_loss": 4.12876883631392, + "tokens_seen": 311143424 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045742226680040124, + "loss": 3.2486, + "theoretical_loss": 4.12866741843485, + "tokens_seen": 311208960 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1544976234436035, + "objective/train/theoretical_loss": 4.12856602788919, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.12856602788919, + "tokens_seen": 311274496 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045741223671013037, + "loss": 3.3642, + "theoretical_loss": 4.12856602788919, + "tokens_seen": 311274496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004574022066198596, + "loss": 3.2603, + "theoretical_loss": 4.128464664663819, + "tokens_seen": 311340032 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045739217652958873, + "loss": 3.3362, + "theoretical_loss": 4.128363328745629, + "tokens_seen": 311405568 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045738214643931797, + "loss": 3.3349, + "theoretical_loss": 4.128262020121518, + "tokens_seen": 311471104 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045737211634904715, + "loss": 3.1808, + "theoretical_loss": 4.128160738778394, + "tokens_seen": 311536640 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045736208625877633, + "loss": 3.3371, + "theoretical_loss": 4.128059484703174, + "tokens_seen": 311602176 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004573520561685055, + "loss": 3.2603, + "theoretical_loss": 4.127958257882783, + "tokens_seen": 311667712 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045734202607823475, + "loss": 3.3413, + "theoretical_loss": 4.1278570583041585, + "tokens_seen": 311733248 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045733199598796387, + "loss": 3.4352, + "theoretical_loss": 4.127755885954243, + "tokens_seen": 311798784 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004573219658976931, + "loss": 3.3737, + "theoretical_loss": 4.127654740819989, + "tokens_seen": 311864320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045731193580742223, + "loss": 3.2605, + "theoretical_loss": 4.12755362288836, + "tokens_seen": 311929856 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045730190571715147, + "loss": 3.3538, + "theoretical_loss": 4.127452532146326, + "tokens_seen": 311995392 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045729187562688065, + "loss": 3.3362, + "theoretical_loss": 4.127351468580867, + "tokens_seen": 312060928 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045728184553660983, + "loss": 3.376, + "theoretical_loss": 4.127250432178973, + "tokens_seen": 312126464 + }, + { + "epoch": 3.02, + "learning_rate": 0.000457271815446339, + "loss": 3.3513, + "theoretical_loss": 4.12714942292764, + "tokens_seen": 312192000 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004572617853560682, + "loss": 3.4166, + "theoretical_loss": 4.127048440813876, + "tokens_seen": 312257536 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004572517552657974, + "loss": 3.388, + "theoretical_loss": 4.126947485824698, + "tokens_seen": 312323072 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004572417251755266, + "loss": 3.3376, + "theoretical_loss": 4.126846557947129, + "tokens_seen": 312388608 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045723169508525574, + "loss": 3.3338, + "theoretical_loss": 4.126745657168204, + "tokens_seen": 312454144 + }, + { + "epoch": 3.02, + "learning_rate": 0.000457221664994985, + "loss": 3.3635, + "theoretical_loss": 4.126644783474964, + "tokens_seen": 312519680 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004572116349047141, + "loss": 3.4336, + "theoretical_loss": 4.126543936854462, + "tokens_seen": 312585216 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045720160481444334, + "loss": 3.2245, + "theoretical_loss": 4.126443117293758, + "tokens_seen": 312650752 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045719157472417257, + "loss": 3.4048, + "theoretical_loss": 4.1263423247799205, + "tokens_seen": 312716288 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004571815446339017, + "loss": 3.3794, + "theoretical_loss": 4.126241559300029, + "tokens_seen": 312781824 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045717151454363093, + "loss": 3.3307, + "theoretical_loss": 4.12614082084117, + "tokens_seen": 312847360 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.241759777069092, + "objective/train/theoretical_loss": 4.126040109390439, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.126040109390439, + "tokens_seen": 312912896 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004571614844533601, + "loss": 3.2865, + "theoretical_loss": 4.126040109390439, + "tokens_seen": 312912896 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004571514543630893, + "loss": 3.4084, + "theoretical_loss": 4.12593942493494, + "tokens_seen": 312978432 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004571414242728185, + "loss": 3.3166, + "theoretical_loss": 4.12583876746179, + "tokens_seen": 313043968 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045713139418254766, + "loss": 3.3605, + "theoretical_loss": 4.125738136958108, + "tokens_seen": 313109504 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045712136409227684, + "loss": 3.3416, + "theoretical_loss": 4.125637533411028, + "tokens_seen": 313175040 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004571113340020061, + "loss": 3.3375, + "theoretical_loss": 4.125536956807688, + "tokens_seen": 313240576 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004571013039117352, + "loss": 3.3345, + "theoretical_loss": 4.125436407135238, + "tokens_seen": 313306112 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045709127382146444, + "loss": 3.3248, + "theoretical_loss": 4.125335884380836, + "tokens_seen": 313371648 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045708124373119356, + "loss": 3.4014, + "theoretical_loss": 4.125235388531649, + "tokens_seen": 313437184 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004570712136409228, + "loss": 3.2745, + "theoretical_loss": 4.125134919574851, + "tokens_seen": 313502720 + }, + { + "epoch": 3.02, + "learning_rate": 0.000457061183550652, + "loss": 3.271, + "theoretical_loss": 4.125034477497627, + "tokens_seen": 313568256 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045705115346038116, + "loss": 3.3104, + "theoretical_loss": 4.124934062287171, + "tokens_seen": 313633792 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045704112337011034, + "loss": 3.4652, + "theoretical_loss": 4.124833673930683, + "tokens_seen": 313699328 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004570310932798395, + "loss": 3.3168, + "theoretical_loss": 4.124733312415374, + "tokens_seen": 313764864 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004570210631895687, + "loss": 3.3656, + "theoretical_loss": 4.1246329777284645, + "tokens_seen": 313830400 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045701103309929794, + "loss": 3.3714, + "theoretical_loss": 4.124532669857182, + "tokens_seen": 313895936 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045700100300902707, + "loss": 3.3498, + "theoretical_loss": 4.1244323887887635, + "tokens_seen": 313961472 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004569909729187563, + "loss": 3.2655, + "theoretical_loss": 4.124332134510453, + "tokens_seen": 314027008 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004569809428284855, + "loss": 3.415, + "theoretical_loss": 4.124231907009507, + "tokens_seen": 314092544 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045697091273821466, + "loss": 3.3767, + "theoretical_loss": 4.124131706273187, + "tokens_seen": 314158080 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045696088264794384, + "loss": 3.3721, + "theoretical_loss": 4.124031532288765, + "tokens_seen": 314223616 + }, + { + "epoch": 3.02, + "learning_rate": 0.000456950852557673, + "loss": 3.2172, + "theoretical_loss": 4.123931385043522, + "tokens_seen": 314289152 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004569408224674022, + "loss": 3.2521, + "theoretical_loss": 4.123831264524747, + "tokens_seen": 314354688 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045693079237713144, + "loss": 3.2867, + "theoretical_loss": 4.123731170719737, + "tokens_seen": 314420224 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045692076228686057, + "loss": 3.3194, + "theoretical_loss": 4.123631103615799, + "tokens_seen": 314485760 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.434213161468506, + "objective/train/theoretical_loss": 4.123531063200248, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.123531063200248, + "tokens_seen": 314551296 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004569107321965898, + "loss": 3.4667, + "theoretical_loss": 4.123531063200248, + "tokens_seen": 314551296 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045690070210631893, + "loss": 3.3621, + "theoretical_loss": 4.123431049460409, + "tokens_seen": 314616832 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045689067201604817, + "loss": 3.3834, + "theoretical_loss": 4.123331062383614, + "tokens_seen": 314682368 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045688064192577735, + "loss": 3.4044, + "theoretical_loss": 4.123231101957203, + "tokens_seen": 314747904 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045687061183550653, + "loss": 3.2801, + "theoretical_loss": 4.123131168168526, + "tokens_seen": 314813440 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004568605817452357, + "loss": 3.3495, + "theoretical_loss": 4.123031261004943, + "tokens_seen": 314878976 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045685055165496495, + "loss": 3.3687, + "theoretical_loss": 4.122931380453819, + "tokens_seen": 314944512 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045684052156469407, + "loss": 3.2835, + "theoretical_loss": 4.122831526502532, + "tokens_seen": 315010048 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004568304914744233, + "loss": 3.3782, + "theoretical_loss": 4.1227316991384635, + "tokens_seen": 315075584 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045682046138415243, + "loss": 3.3787, + "theoretical_loss": 4.122631898349009, + "tokens_seen": 315141120 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045681043129388167, + "loss": 3.358, + "theoretical_loss": 4.122532124121568, + "tokens_seen": 315206656 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045680040120361085, + "loss": 3.3444, + "theoretical_loss": 4.122432376443552, + "tokens_seen": 315272192 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045679037111334003, + "loss": 3.324, + "theoretical_loss": 4.12233265530238, + "tokens_seen": 315337728 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004567803410230692, + "loss": 3.4069, + "theoretical_loss": 4.122232960685476, + "tokens_seen": 315403264 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004567703109327984, + "loss": 3.2971, + "theoretical_loss": 4.122133292580281, + "tokens_seen": 315468800 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004567602808425276, + "loss": 3.4008, + "theoretical_loss": 4.122033650974235, + "tokens_seen": 315534336 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004567502507522568, + "loss": 3.4044, + "theoretical_loss": 4.1219340358547925, + "tokens_seen": 315599872 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045674022066198594, + "loss": 3.26, + "theoretical_loss": 4.121834447209416, + "tokens_seen": 315665408 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004567301905717152, + "loss": 3.4434, + "theoretical_loss": 4.121734885025573, + "tokens_seen": 315730944 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004567201604814443, + "loss": 3.3618, + "theoretical_loss": 4.121635349290745, + "tokens_seen": 315796480 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045671013039117354, + "loss": 3.4387, + "theoretical_loss": 4.121535839992417, + "tokens_seen": 315862016 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004567001003009027, + "loss": 3.3078, + "theoretical_loss": 4.121436357118085, + "tokens_seen": 315927552 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004566900702106319, + "loss": 3.4526, + "theoretical_loss": 4.121336900655254, + "tokens_seen": 315993088 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004566800401203611, + "loss": 3.3368, + "theoretical_loss": 4.121237470591435, + "tokens_seen": 316058624 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004566700100300903, + "loss": 3.2617, + "theoretical_loss": 4.12113806691415, + "tokens_seen": 316124160 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2488391399383545, + "objective/train/theoretical_loss": 4.121038689610929, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.121038689610929, + "tokens_seen": 316189696 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045665997993981944, + "loss": 3.2687, + "theoretical_loss": 4.121038689610929, + "tokens_seen": 316189696 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004566499498495487, + "loss": 3.4185, + "theoretical_loss": 4.120939338669309, + "tokens_seen": 316255232 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004566399197592778, + "loss": 3.382, + "theoretical_loss": 4.120840014076838, + "tokens_seen": 316320768 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045662988966900704, + "loss": 3.4152, + "theoretical_loss": 4.120740715821069, + "tokens_seen": 316386304 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004566198595787362, + "loss": 3.3786, + "theoretical_loss": 4.120641443889566, + "tokens_seen": 316451840 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004566098294884654, + "loss": 3.3474, + "theoretical_loss": 4.120542198269902, + "tokens_seen": 316517376 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004565997993981946, + "loss": 3.3274, + "theoretical_loss": 4.120442978949657, + "tokens_seen": 316582912 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045658976930792376, + "loss": 3.525, + "theoretical_loss": 4.120343785916419, + "tokens_seen": 316648448 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045657973921765294, + "loss": 3.424, + "theoretical_loss": 4.120244619157786, + "tokens_seen": 316713984 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004565697091273822, + "loss": 3.3346, + "theoretical_loss": 4.120145478661362, + "tokens_seen": 316779520 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004565596790371113, + "loss": 3.264, + "theoretical_loss": 4.120046364414763, + "tokens_seen": 316845056 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045654964894684054, + "loss": 3.4168, + "theoretical_loss": 4.119947276405609, + "tokens_seen": 316910592 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045653961885656967, + "loss": 3.3276, + "theoretical_loss": 4.119848214621534, + "tokens_seen": 316976128 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004565295887662989, + "loss": 3.4607, + "theoretical_loss": 4.119749179050174, + "tokens_seen": 317041664 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004565195586760281, + "loss": 3.419, + "theoretical_loss": 4.119650169679179, + "tokens_seen": 317107200 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045650952858575727, + "loss": 3.312, + "theoretical_loss": 4.119551186496203, + "tokens_seen": 317172736 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045649949849548645, + "loss": 3.1434, + "theoretical_loss": 4.119452229488912, + "tokens_seen": 317238272 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004564894684052157, + "loss": 3.3367, + "theoretical_loss": 4.1193532986449775, + "tokens_seen": 317303808 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004564794383149448, + "loss": 3.4012, + "theoretical_loss": 4.1192543939520805, + "tokens_seen": 317369344 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045646940822467405, + "loss": 3.3725, + "theoretical_loss": 4.119155515397911, + "tokens_seen": 317434880 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045645937813440317, + "loss": 3.337, + "theoretical_loss": 4.119056662970166, + "tokens_seen": 317500416 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004564493480441324, + "loss": 3.3622, + "theoretical_loss": 4.118957836656553, + "tokens_seen": 317565952 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045643931795386164, + "loss": 3.4061, + "theoretical_loss": 4.118859036444784, + "tokens_seen": 317631488 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045642928786359077, + "loss": 3.2901, + "theoretical_loss": 4.118760262322583, + "tokens_seen": 317697024 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045641925777332, + "loss": 3.3909, + "theoretical_loss": 4.118661514277681, + "tokens_seen": 317762560 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4380221366882324, + "objective/train/theoretical_loss": 4.1185627922978165, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.1185627922978165, + "tokens_seen": 317828096 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045640922768304913, + "loss": 3.3856, + "theoretical_loss": 4.1185627922978165, + "tokens_seen": 317828096 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045639919759277837, + "loss": 3.3048, + "theoretical_loss": 4.118464096370738, + "tokens_seen": 317893632 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045638916750250755, + "loss": 3.357, + "theoretical_loss": 4.118365426484201, + "tokens_seen": 317959168 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045637913741223673, + "loss": 3.3702, + "theoretical_loss": 4.118266782625969, + "tokens_seen": 318024704 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004563691073219659, + "loss": 3.2639, + "theoretical_loss": 4.118168164783814, + "tokens_seen": 318090240 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045635907723169515, + "loss": 3.3794, + "theoretical_loss": 4.118069572945519, + "tokens_seen": 318155776 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045634904714142427, + "loss": 3.3825, + "theoretical_loss": 4.117971007098871, + "tokens_seen": 318221312 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004563390170511535, + "loss": 3.3655, + "theoretical_loss": 4.117872467231667, + "tokens_seen": 318286848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045632898696088263, + "loss": 3.3208, + "theoretical_loss": 4.117773953331712, + "tokens_seen": 318352384 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045631895687061187, + "loss": 3.3228, + "theoretical_loss": 4.117675465386821, + "tokens_seen": 318417920 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045630892678034105, + "loss": 3.4326, + "theoretical_loss": 4.117577003384816, + "tokens_seen": 318483456 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045629889669007023, + "loss": 3.4369, + "theoretical_loss": 4.117478567313525, + "tokens_seen": 318548992 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004562888665997994, + "loss": 3.4043, + "theoretical_loss": 4.117380157160788, + "tokens_seen": 318614528 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004562788365095286, + "loss": 3.2675, + "theoretical_loss": 4.117281772914452, + "tokens_seen": 318680064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004562688064192578, + "loss": 3.3596, + "theoretical_loss": 4.117183414562369, + "tokens_seen": 318745600 + }, + { + "epoch": 3.02, + "learning_rate": 0.000456258776328987, + "loss": 3.2678, + "theoretical_loss": 4.117085082092404, + "tokens_seen": 318811136 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045624874623871614, + "loss": 3.4215, + "theoretical_loss": 4.116986775492428, + "tokens_seen": 318876672 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004562387161484454, + "loss": 3.3686, + "theoretical_loss": 4.11688849475032, + "tokens_seen": 318942208 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004562286860581745, + "loss": 3.2992, + "theoretical_loss": 4.116790239853966, + "tokens_seen": 319007744 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045621865596790374, + "loss": 3.2969, + "theoretical_loss": 4.1166920107912635, + "tokens_seen": 319073280 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004562086258776329, + "loss": 3.3181, + "theoretical_loss": 4.116593807550115, + "tokens_seen": 319138816 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004561985957873621, + "loss": 3.2632, + "theoretical_loss": 4.116495630118433, + "tokens_seen": 319204352 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004561885656970913, + "loss": 3.3366, + "theoretical_loss": 4.116397478484136, + "tokens_seen": 319269888 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004561785356068205, + "loss": 3.4519, + "theoretical_loss": 4.116299352635155, + "tokens_seen": 319335424 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045616850551654964, + "loss": 3.4675, + "theoretical_loss": 4.1162012525594225, + "tokens_seen": 319400960 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2997679710388184, + "objective/train/theoretical_loss": 4.116103178244885, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.116103178244885, + "tokens_seen": 319466496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004561584754262789, + "loss": 3.3432, + "theoretical_loss": 4.116103178244885, + "tokens_seen": 319466496 + }, + { + "epoch": 3.02, + "learning_rate": 0.000456148445336008, + "loss": 3.4092, + "theoretical_loss": 4.116005129679496, + "tokens_seen": 319532032 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045613841524573724, + "loss": 3.309, + "theoretical_loss": 4.115907106851214, + "tokens_seen": 319597568 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004561283851554664, + "loss": 3.2965, + "theoretical_loss": 4.1158091097480085, + "tokens_seen": 319663104 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004561183550651956, + "loss": 3.4222, + "theoretical_loss": 4.115711138357857, + "tokens_seen": 319728640 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004561083249749248, + "loss": 3.3415, + "theoretical_loss": 4.115613192668743, + "tokens_seen": 319794176 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045609829488465396, + "loss": 3.2343, + "theoretical_loss": 4.1155152726686595, + "tokens_seen": 319859712 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045608826479438314, + "loss": 3.3253, + "theoretical_loss": 4.115417378345608, + "tokens_seen": 319925248 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004560782347041124, + "loss": 3.445, + "theoretical_loss": 4.115319509687597, + "tokens_seen": 319990784 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004560682046138415, + "loss": 3.3518, + "theoretical_loss": 4.115221666682645, + "tokens_seen": 320056320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045605817452357074, + "loss": 3.4073, + "theoretical_loss": 4.115123849318776, + "tokens_seen": 320121856 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045604814443329987, + "loss": 3.3503, + "theoretical_loss": 4.115026057584023, + "tokens_seen": 320187392 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004560381143430291, + "loss": 3.3411, + "theoretical_loss": 4.114928291466429, + "tokens_seen": 320252928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004560280842527583, + "loss": 3.4153, + "theoretical_loss": 4.11483055095404, + "tokens_seen": 320318464 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045601805416248747, + "loss": 3.3818, + "theoretical_loss": 4.114732836034916, + "tokens_seen": 320384000 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045600802407221665, + "loss": 3.354, + "theoretical_loss": 4.114635146697121, + "tokens_seen": 320449536 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004559979939819459, + "loss": 3.2954, + "theoretical_loss": 4.1145374829287285, + "tokens_seen": 320515072 + }, + { + "epoch": 3.02, + "learning_rate": 0.000455987963891675, + "loss": 3.3816, + "theoretical_loss": 4.11443984471782, + "tokens_seen": 320580608 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045597793380140425, + "loss": 3.533, + "theoretical_loss": 4.1143422320524845, + "tokens_seen": 320646144 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045596790371113337, + "loss": 3.353, + "theoretical_loss": 4.114244644920819, + "tokens_seen": 320711680 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004559578736208626, + "loss": 3.2532, + "theoretical_loss": 4.11414708331093, + "tokens_seen": 320777216 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004559478435305918, + "loss": 3.3886, + "theoretical_loss": 4.114049547210929, + "tokens_seen": 320842752 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045593781344032097, + "loss": 3.2745, + "theoretical_loss": 4.113952036608938, + "tokens_seen": 320908288 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045592778335005015, + "loss": 3.2049, + "theoretical_loss": 4.113854551493086, + "tokens_seen": 320973824 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045591775325977933, + "loss": 3.3117, + "theoretical_loss": 4.113757091851511, + "tokens_seen": 321039360 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.349116325378418, + "objective/train/theoretical_loss": 4.113659657672356, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.113659657672356, + "tokens_seen": 321104896 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004559077231695085, + "loss": 3.3043, + "theoretical_loss": 4.113659657672356, + "tokens_seen": 321104896 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045589769307923775, + "loss": 3.3935, + "theoretical_loss": 4.113562248943775, + "tokens_seen": 321170432 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004558876629889669, + "loss": 3.3858, + "theoretical_loss": 4.113464865653929, + "tokens_seen": 321235968 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004558776328986961, + "loss": 3.3606, + "theoretical_loss": 4.113367507790987, + "tokens_seen": 321301504 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045586760280842524, + "loss": 3.3949, + "theoretical_loss": 4.113270175343125, + "tokens_seen": 321367040 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004558575727181545, + "loss": 3.401, + "theoretical_loss": 4.113172868298529, + "tokens_seen": 321432576 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045584754262788365, + "loss": 3.3874, + "theoretical_loss": 4.1130755866453885, + "tokens_seen": 321498112 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045583751253761284, + "loss": 3.3678, + "theoretical_loss": 4.112978330371908, + "tokens_seen": 321563648 + }, + { + "epoch": 3.02, + "learning_rate": 0.000455827482447342, + "loss": 3.2563, + "theoretical_loss": 4.112881099466292, + "tokens_seen": 321629184 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045581745235707125, + "loss": 3.4085, + "theoretical_loss": 4.112783893916759, + "tokens_seen": 321694720 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004558074222668004, + "loss": 3.3726, + "theoretical_loss": 4.112686713711533, + "tokens_seen": 321760256 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004557973921765296, + "loss": 3.3302, + "theoretical_loss": 4.1125895588388435, + "tokens_seen": 321825792 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045578736208625874, + "loss": 3.2531, + "theoretical_loss": 4.112492429286933, + "tokens_seen": 321891328 + }, + { + "epoch": 3.02, + "learning_rate": 0.000455777331995988, + "loss": 3.2909, + "theoretical_loss": 4.112395325044048, + "tokens_seen": 321956864 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045576730190571716, + "loss": 3.3217, + "theoretical_loss": 4.112298246098445, + "tokens_seen": 322022400 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045575727181544634, + "loss": 3.2251, + "theoretical_loss": 4.112201192438385, + "tokens_seen": 322087936 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004557472417251755, + "loss": 3.1697, + "theoretical_loss": 4.112104164052141, + "tokens_seen": 322153472 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004557372116349047, + "loss": 3.3694, + "theoretical_loss": 4.112007160927992, + "tokens_seen": 322219008 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004557271815446339, + "loss": 3.298, + "theoretical_loss": 4.111910183054224, + "tokens_seen": 322284544 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004557171514543631, + "loss": 3.3054, + "theoretical_loss": 4.1118132304191315, + "tokens_seen": 322350080 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045570712136409224, + "loss": 3.3882, + "theoretical_loss": 4.1117163030110175, + "tokens_seen": 322415616 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004556970912738215, + "loss": 3.4058, + "theoretical_loss": 4.111619400818192, + "tokens_seen": 322481152 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004556870611835507, + "loss": 3.3417, + "theoretical_loss": 4.111522523828973, + "tokens_seen": 322546688 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045567703109327984, + "loss": 3.317, + "theoretical_loss": 4.111425672031686, + "tokens_seen": 322612224 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004556670010030091, + "loss": 3.346, + "theoretical_loss": 4.111328845414665, + "tokens_seen": 322677760 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 756352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4127137660980225, + "objective/train/theoretical_loss": 4.11123204396625, + "objective/train/tokens_used": 323961312, + "theoretical_loss": 4.11123204396625, + "tokens_seen": 322743296 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004556569709127382, + "loss": 3.3442, + "theoretical_loss": 4.11123204396625, + "tokens_seen": 322743296 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045564694082246744, + "loss": 3.3689, + "theoretical_loss": 4.111135267674792, + "tokens_seen": 322808832 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004556369107321966, + "loss": 3.3846, + "theoretical_loss": 4.111038516528647, + "tokens_seen": 322874368 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004556268806419258, + "loss": 3.3727, + "theoretical_loss": 4.110941790516179, + "tokens_seen": 322939904 + }, + { + "epoch": 3.02, + "learning_rate": 0.000455616850551655, + "loss": 3.1483, + "theoretical_loss": 4.110845089625761, + "tokens_seen": 323005440 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045560682046138416, + "loss": 3.3729, + "theoretical_loss": 4.110748413845773, + "tokens_seen": 323070976 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045559679037111334, + "loss": 3.4239, + "theoretical_loss": 4.110651763164603, + "tokens_seen": 323136512 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004555867602808426, + "loss": 3.4348, + "theoretical_loss": 4.110555137570646, + "tokens_seen": 323202048 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004555767301905717, + "loss": 3.2598, + "theoretical_loss": 4.110458537052306, + "tokens_seen": 323267584 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045556670010030094, + "loss": 3.4094, + "theoretical_loss": 4.110361961597993, + "tokens_seen": 323333120 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045555667001003007, + "loss": 3.2751, + "theoretical_loss": 4.110265411196126, + "tokens_seen": 323398656 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004555466399197593, + "loss": 3.3937, + "theoretical_loss": 4.110168885835131, + "tokens_seen": 323464192 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004555366098294885, + "loss": 3.2547, + "theoretical_loss": 4.110072385503443, + "tokens_seen": 323529728 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045552657973921767, + "loss": 3.2854, + "theoretical_loss": 4.109975910189504, + "tokens_seen": 323595264 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045551654964894685, + "loss": 3.2812, + "theoretical_loss": 4.109879459881761, + "tokens_seen": 323660800 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004555065195586761, + "loss": 3.3136, + "theoretical_loss": 4.109783034568673, + "tokens_seen": 323726336 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004554964894684052, + "loss": 3.4086, + "theoretical_loss": 4.109686634238706, + "tokens_seen": 323791872 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045548645937813445, + "loss": 3.441, + "theoretical_loss": 4.1095902588803295, + "tokens_seen": 323857408 + }, + { + "epoch": 3.02, + "learning_rate": 0.00045547642928786357, + "loss": 3.4201, + "theoretical_loss": 4.109493908482025, + "tokens_seen": 323922944 + }, + { + "epoch": 3.02, + "learning_rate": 0.0004554663991975928, + "loss": 3.3263, + "theoretical_loss": 4.109408117413553, + "tokens_seen": 323981312 + }, + { + "epoch": 4.0, + "learning_rate": 0.000455456369107322, + "loss": 3.2421, + "theoretical_loss": 4.109311814173933, + "tokens_seen": 324046848 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045544633901705117, + "loss": 3.1741, + "theoretical_loss": 4.109215535861127, + "tokens_seen": 324112384 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045543630892678035, + "loss": 3.28, + "theoretical_loss": 4.109119282463646, + "tokens_seen": 324177920 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045542627883650953, + "loss": 3.2013, + "theoretical_loss": 4.109023053970008, + "tokens_seen": 324243456 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004554162487462387, + "loss": 3.2305, + "theoretical_loss": 4.108926850368736, + "tokens_seen": 324308992 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 805027, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.26824688911438, + "objective/train/theoretical_loss": 4.108830671648363, + "objective/train/tokens_used": 344834528, + "theoretical_loss": 4.108830671648363, + "tokens_seen": 324374528 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045540621865596795, + "loss": 3.2289, + "theoretical_loss": 4.108830671648363, + "tokens_seen": 324374528 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004553961885656971, + "loss": 3.3296, + "theoretical_loss": 4.108734517797431, + "tokens_seen": 324440064 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004553861584754263, + "loss": 3.2057, + "theoretical_loss": 4.1086383888044855, + "tokens_seen": 324505600 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045537612838515544, + "loss": 3.1829, + "theoretical_loss": 4.108542284658084, + "tokens_seen": 324571136 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004553660982948847, + "loss": 3.267, + "theoretical_loss": 4.108446205346787, + "tokens_seen": 324636672 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045535606820461385, + "loss": 3.1336, + "theoretical_loss": 4.108350150859167, + "tokens_seen": 324702208 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045534603811434304, + "loss": 3.2729, + "theoretical_loss": 4.108254121183801, + "tokens_seen": 324767744 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004553360080240722, + "loss": 3.2655, + "theoretical_loss": 4.1081581163092755, + "tokens_seen": 324833280 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045532597793380145, + "loss": 3.2706, + "theoretical_loss": 4.1080621362241825, + "tokens_seen": 324898816 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004553159478435306, + "loss": 3.3267, + "theoretical_loss": 4.107966180917124, + "tokens_seen": 324964352 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004553059177532598, + "loss": 3.3241, + "theoretical_loss": 4.107870250376708, + "tokens_seen": 325029888 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045529588766298894, + "loss": 3.1469, + "theoretical_loss": 4.107774344591549, + "tokens_seen": 325095424 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004552858575727182, + "loss": 3.2018, + "theoretical_loss": 4.107678463550272, + "tokens_seen": 325160960 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045527582748244736, + "loss": 3.337, + "theoretical_loss": 4.107582607241507, + "tokens_seen": 325226496 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045526579739217654, + "loss": 3.1527, + "theoretical_loss": 4.107486775653891, + "tokens_seen": 325292032 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004552557673019057, + "loss": 3.3133, + "theoretical_loss": 4.107390968776073, + "tokens_seen": 325357568 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004552457372116349, + "loss": 3.2998, + "theoretical_loss": 4.107295186596705, + "tokens_seen": 325423104 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004552357071213641, + "loss": 3.3095, + "theoretical_loss": 4.107199429104447, + "tokens_seen": 325488640 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004552256770310933, + "loss": 3.2394, + "theoretical_loss": 4.107103696287967, + "tokens_seen": 325554176 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045521564694082244, + "loss": 3.1686, + "theoretical_loss": 4.107007988135943, + "tokens_seen": 325619712 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004552056168505517, + "loss": 3.2421, + "theoretical_loss": 4.106912304637056, + "tokens_seen": 325685248 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004551955867602808, + "loss": 3.3708, + "theoretical_loss": 4.1068166457799995, + "tokens_seen": 325750784 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045518555667001004, + "loss": 3.2549, + "theoretical_loss": 4.106721011553469, + "tokens_seen": 325816320 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004551755265797392, + "loss": 3.3068, + "theoretical_loss": 4.106625401946172, + "tokens_seen": 325881856 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004551654964894684, + "loss": 3.2767, + "theoretical_loss": 4.10652981694682, + "tokens_seen": 325947392 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 809942, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.446375846862793, + "objective/train/theoretical_loss": 4.106434256544134, + "objective/train/tokens_used": 346472928, + "theoretical_loss": 4.106434256544134, + "tokens_seen": 326012928 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004551554663991976, + "loss": 3.3296, + "theoretical_loss": 4.106434256544134, + "tokens_seen": 326012928 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004551454363089268, + "loss": 3.299, + "theoretical_loss": 4.106338720726844, + "tokens_seen": 326078464 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045513540621865595, + "loss": 3.3169, + "theoretical_loss": 4.106243209483683, + "tokens_seen": 326144000 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004551253761283852, + "loss": 3.3561, + "theoretical_loss": 4.106147722803394, + "tokens_seen": 326209536 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004551153460381143, + "loss": 3.2784, + "theoretical_loss": 4.10605226067473, + "tokens_seen": 326275072 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045510531594784354, + "loss": 3.2258, + "theoretical_loss": 4.105956823086445, + "tokens_seen": 326340608 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004550952858575727, + "loss": 3.2389, + "theoretical_loss": 4.105861410027307, + "tokens_seen": 326406144 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004550852557673019, + "loss": 3.2061, + "theoretical_loss": 4.105766021486087, + "tokens_seen": 326471680 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004550752256770311, + "loss": 3.1913, + "theoretical_loss": 4.105670657451565, + "tokens_seen": 326537216 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045506519558676027, + "loss": 3.1959, + "theoretical_loss": 4.105575317912529, + "tokens_seen": 326602752 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045505516549648945, + "loss": 3.3004, + "theoretical_loss": 4.105480002857774, + "tokens_seen": 326668288 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004550451354062187, + "loss": 3.259, + "theoretical_loss": 4.105384712276099, + "tokens_seen": 326733824 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004550351053159478, + "loss": 3.2223, + "theoretical_loss": 4.105289446156318, + "tokens_seen": 326799360 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045502507522567705, + "loss": 3.261, + "theoretical_loss": 4.105194204487244, + "tokens_seen": 326864896 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045501504513540623, + "loss": 3.1937, + "theoretical_loss": 4.105098987257703, + "tokens_seen": 326930432 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004550050150451354, + "loss": 3.2355, + "theoretical_loss": 4.105003794456525, + "tokens_seen": 326995968 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004549949849548646, + "loss": 3.3566, + "theoretical_loss": 4.104908626072551, + "tokens_seen": 327061504 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045498495486459377, + "loss": 3.3203, + "theoretical_loss": 4.104813482094626, + "tokens_seen": 327127040 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045497492477432295, + "loss": 3.2448, + "theoretical_loss": 4.104718362511603, + "tokens_seen": 327192576 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004549648946840522, + "loss": 3.4097, + "theoretical_loss": 4.104623267312342, + "tokens_seen": 327258112 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045495486459378137, + "loss": 3.2888, + "theoretical_loss": 4.104528196485713, + "tokens_seen": 327323648 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045494483450351055, + "loss": 3.2556, + "theoretical_loss": 4.104433150020592, + "tokens_seen": 327389184 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045493480441323973, + "loss": 3.1995, + "theoretical_loss": 4.104338127905859, + "tokens_seen": 327454720 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004549247743229689, + "loss": 3.4071, + "theoretical_loss": 4.1042431301304045, + "tokens_seen": 327520256 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045491474423269815, + "loss": 3.1668, + "theoretical_loss": 4.104148156683127, + "tokens_seen": 327585792 + }, + { + "debugging/Self-BLEU-5": 0.7033283543779171, + "debugging/distinct-1-grams": 0.738822861586569, + "debugging/distinct-2-grams": 0.9369885984147496, + "debugging/entropy-1-grams": 6.656809708360688, + "debugging/entropy-2-grams": 8.059063205471514, + "debugging/length": 533.204081632653, + "debugging/num_segments": 49, + "epoch": 4.0, + "objective/train/docs_used": 813615, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3166885375976562, + "objective/train/theoretical_loss": 4.104053207552932, + "objective/train/tokens_used": 348111328, + "theoretical_loss": 4.104053207552932, + "tokens_seen": 327651328 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004549047141424273, + "loss": 3.2396, + "theoretical_loss": 4.104053207552932, + "tokens_seen": 327651328 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004548946840521565, + "loss": 3.3488, + "theoretical_loss": 4.103958282728729, + "tokens_seen": 327716864 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045488465396188564, + "loss": 3.1992, + "theoretical_loss": 4.103863382199439, + "tokens_seen": 327782400 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004548746238716149, + "loss": 3.2192, + "theoretical_loss": 4.103768505953987, + "tokens_seen": 327847936 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045486459378134405, + "loss": 3.217, + "theoretical_loss": 4.103673653981307, + "tokens_seen": 327913472 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045485456369107324, + "loss": 3.2526, + "theoretical_loss": 4.103578826270341, + "tokens_seen": 327979008 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004548445336008024, + "loss": 3.3684, + "theoretical_loss": 4.1034840228100355, + "tokens_seen": 328044544 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045483450351053165, + "loss": 3.3617, + "theoretical_loss": 4.103389243589347, + "tokens_seen": 328110080 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004548244734202608, + "loss": 3.3339, + "theoretical_loss": 4.103294488597237, + "tokens_seen": 328175616 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045481444332999, + "loss": 3.3276, + "theoretical_loss": 4.1031997578226775, + "tokens_seen": 328241152 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045480441323971914, + "loss": 3.2438, + "theoretical_loss": 4.103105051254644, + "tokens_seen": 328306688 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004547943831494484, + "loss": 3.3567, + "theoretical_loss": 4.103010368882122, + "tokens_seen": 328372224 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045478435305917756, + "loss": 3.2345, + "theoretical_loss": 4.1029157106941, + "tokens_seen": 328437760 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045477432296890674, + "loss": 3.1394, + "theoretical_loss": 4.1028210766795805, + "tokens_seen": 328503296 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004547642928786359, + "loss": 3.2208, + "theoretical_loss": 4.102726466827567, + "tokens_seen": 328568832 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004547542627883651, + "loss": 3.0969, + "theoretical_loss": 4.102631881127074, + "tokens_seen": 328634368 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004547442326980943, + "loss": 3.2908, + "theoretical_loss": 4.102537319567121, + "tokens_seen": 328699904 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004547342026078235, + "loss": 3.307, + "theoretical_loss": 4.102442782136735, + "tokens_seen": 328765440 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045472417251755264, + "loss": 3.2889, + "theoretical_loss": 4.102348268824952, + "tokens_seen": 328830976 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004547141424272819, + "loss": 3.2388, + "theoretical_loss": 4.1022537796208125, + "tokens_seen": 328896512 + }, + { + "epoch": 4.0, + "learning_rate": 0.000454704112337011, + "loss": 3.2518, + "theoretical_loss": 4.102159314513367, + "tokens_seen": 328962048 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045469408224674024, + "loss": 3.1814, + "theoretical_loss": 4.102064873491669, + "tokens_seen": 329027584 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004546840521564694, + "loss": 3.3233, + "theoretical_loss": 4.101970456544785, + "tokens_seen": 329093120 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004546740220661986, + "loss": 3.3328, + "theoretical_loss": 4.101876063661782, + "tokens_seen": 329158656 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004546639919759278, + "loss": 3.3538, + "theoretical_loss": 4.101781694831741, + "tokens_seen": 329224192 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 816799, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.130906820297241, + "objective/train/theoretical_loss": 4.101687350043745, + "objective/train/tokens_used": 349749728, + "theoretical_loss": 4.101687350043745, + "tokens_seen": 329289728 + }, + { + "epoch": 4.0, + "learning_rate": 0.000454653961885657, + "loss": 3.2295, + "theoretical_loss": 4.101687350043745, + "tokens_seen": 329289728 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045464393179538615, + "loss": 3.4439, + "theoretical_loss": 4.101593029286886, + "tokens_seen": 329355264 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004546339017051154, + "loss": 3.3013, + "theoretical_loss": 4.1014987325502625, + "tokens_seen": 329420800 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004546238716148445, + "loss": 3.3434, + "theoretical_loss": 4.101404459822981, + "tokens_seen": 329486336 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045461384152457374, + "loss": 3.2789, + "theoretical_loss": 4.101310211094154, + "tokens_seen": 329551872 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004546038114343029, + "loss": 3.3591, + "theoretical_loss": 4.101215986352903, + "tokens_seen": 329617408 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004545937813440321, + "loss": 3.3039, + "theoretical_loss": 4.101121785588354, + "tokens_seen": 329682944 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004545837512537613, + "loss": 3.1425, + "theoretical_loss": 4.101027608789643, + "tokens_seen": 329748480 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045457372116349047, + "loss": 3.312, + "theoretical_loss": 4.10093345594591, + "tokens_seen": 329814016 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045456369107321965, + "loss": 3.2713, + "theoretical_loss": 4.1008393270463035, + "tokens_seen": 329879552 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004545536609829489, + "loss": 3.307, + "theoretical_loss": 4.10074522207998, + "tokens_seen": 329945088 + }, + { + "epoch": 4.0, + "learning_rate": 0.000454543630892678, + "loss": 3.3045, + "theoretical_loss": 4.100651141036103, + "tokens_seen": 330010624 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045453360080240725, + "loss": 3.1746, + "theoretical_loss": 4.1005570839038405, + "tokens_seen": 330076160 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045452357071213643, + "loss": 3.188, + "theoretical_loss": 4.100463050672371, + "tokens_seen": 330141696 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004545135406218656, + "loss": 3.2481, + "theoretical_loss": 4.100369041330876, + "tokens_seen": 330207232 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004545035105315948, + "loss": 3.3313, + "theoretical_loss": 4.100275055868549, + "tokens_seen": 330272768 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045449348044132397, + "loss": 3.2949, + "theoretical_loss": 4.100181094274587, + "tokens_seen": 330338304 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045448345035105315, + "loss": 3.3359, + "theoretical_loss": 4.100087156538194, + "tokens_seen": 330403840 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004544734202607824, + "loss": 3.0631, + "theoretical_loss": 4.0999932426485834, + "tokens_seen": 330469376 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004544633901705115, + "loss": 3.2408, + "theoretical_loss": 4.099899352594974, + "tokens_seen": 330534912 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045445336008024075, + "loss": 3.2117, + "theoretical_loss": 4.099805486366591, + "tokens_seen": 330600448 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004544433299899699, + "loss": 3.2972, + "theoretical_loss": 4.099711643952669, + "tokens_seen": 330665984 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004544332998996991, + "loss": 3.3019, + "theoretical_loss": 4.099617825342446, + "tokens_seen": 330731520 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004544232698094283, + "loss": 3.2039, + "theoretical_loss": 4.099524030525171, + "tokens_seen": 330797056 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004544132397191575, + "loss": 3.307, + "theoretical_loss": 4.099430259490096, + "tokens_seen": 330862592 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 821459, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.36027193069458, + "objective/train/theoretical_loss": 4.099336512226483, + "objective/train/tokens_used": 351388128, + "theoretical_loss": 4.099336512226483, + "tokens_seen": 330928128 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045440320962888666, + "loss": 3.1294, + "theoretical_loss": 4.099336512226483, + "tokens_seen": 330928128 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045439317953861584, + "loss": 3.383, + "theoretical_loss": 4.0992427887236005, + "tokens_seen": 330993664 + }, + { + "epoch": 4.0, + "learning_rate": 0.000454383149448345, + "loss": 3.2048, + "theoretical_loss": 4.099149088970723, + "tokens_seen": 331059200 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045437311935807425, + "loss": 3.3453, + "theoretical_loss": 4.099055412957132, + "tokens_seen": 331124736 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004543630892678034, + "loss": 3.2564, + "theoretical_loss": 4.098961760672117, + "tokens_seen": 331190272 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004543530591775326, + "loss": 3.2896, + "theoretical_loss": 4.098868132104974, + "tokens_seen": 331255808 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004543430290872618, + "loss": 3.3989, + "theoretical_loss": 4.098774527245004, + "tokens_seen": 331321344 + }, + { + "epoch": 4.0, + "learning_rate": 0.000454332998996991, + "loss": 3.1343, + "theoretical_loss": 4.098680946081519, + "tokens_seen": 331386880 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045432296890672016, + "loss": 3.2578, + "theoretical_loss": 4.098587388603834, + "tokens_seen": 331452416 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045431293881644934, + "loss": 3.3131, + "theoretical_loss": 4.098493854801273, + "tokens_seen": 331517952 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004543029087261785, + "loss": 3.3571, + "theoretical_loss": 4.098400344663167, + "tokens_seen": 331583488 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045429287863590776, + "loss": 3.2889, + "theoretical_loss": 4.098306858178853, + "tokens_seen": 331649024 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004542828485456369, + "loss": 3.3737, + "theoretical_loss": 4.098213395337675, + "tokens_seen": 331714560 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004542728184553661, + "loss": 3.2616, + "theoretical_loss": 4.098119956128985, + "tokens_seen": 331780096 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045426278836509525, + "loss": 3.2989, + "theoretical_loss": 4.098026540542141, + "tokens_seen": 331845632 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004542527582748245, + "loss": 3.3424, + "theoretical_loss": 4.097933148566506, + "tokens_seen": 331911168 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045424272818455366, + "loss": 3.2621, + "theoretical_loss": 4.097839780191455, + "tokens_seen": 331976704 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045423269809428284, + "loss": 3.2966, + "theoretical_loss": 4.097746435406364, + "tokens_seen": 332042240 + }, + { + "epoch": 4.0, + "learning_rate": 0.000454222668004012, + "loss": 3.2025, + "theoretical_loss": 4.097653114200622, + "tokens_seen": 332107776 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004542126379137412, + "loss": 3.3491, + "theoretical_loss": 4.097559816563617, + "tokens_seen": 332173312 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045420260782347044, + "loss": 3.1086, + "theoretical_loss": 4.097466542484752, + "tokens_seen": 332238848 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004541925777331996, + "loss": 3.3181, + "theoretical_loss": 4.097373291953431, + "tokens_seen": 332304384 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004541825476429288, + "loss": 3.3897, + "theoretical_loss": 4.09728006495907, + "tokens_seen": 332369920 + }, + { + "epoch": 4.0, + "learning_rate": 0.000454172517552658, + "loss": 3.2844, + "theoretical_loss": 4.097186861491085, + "tokens_seen": 332435456 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004541624874623872, + "loss": 3.3244, + "theoretical_loss": 4.097093681538906, + "tokens_seen": 332500992 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 824560, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1732494831085205, + "objective/train/theoretical_loss": 4.097000525091966, + "objective/train/tokens_used": 353026528, + "theoretical_loss": 4.097000525091966, + "tokens_seen": 332566528 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045415245737211635, + "loss": 3.2129, + "theoretical_loss": 4.097000525091966, + "tokens_seen": 332566528 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004541424272818456, + "loss": 3.3429, + "theoretical_loss": 4.096907392139704, + "tokens_seen": 332632064 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004541323971915747, + "loss": 3.1753, + "theoretical_loss": 4.096814282671567, + "tokens_seen": 332697600 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045412236710130395, + "loss": 3.3445, + "theoretical_loss": 4.096721196677011, + "tokens_seen": 332763136 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004541123370110331, + "loss": 3.2688, + "theoretical_loss": 4.096628134145497, + "tokens_seen": 332828672 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004541023069207623, + "loss": 3.3341, + "theoretical_loss": 4.096535095066491, + "tokens_seen": 332894208 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004540922768304915, + "loss": 3.2137, + "theoretical_loss": 4.096442079429468, + "tokens_seen": 332959744 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045408224674022067, + "loss": 3.3247, + "theoretical_loss": 4.09634908722391, + "tokens_seen": 333025280 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045407221664994985, + "loss": 3.3213, + "theoretical_loss": 4.096256118439304, + "tokens_seen": 333090816 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004540621865596791, + "loss": 3.2929, + "theoretical_loss": 4.096163173065145, + "tokens_seen": 333156352 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004540521564694082, + "loss": 3.1567, + "theoretical_loss": 4.096070251090937, + "tokens_seen": 333221888 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045404212637913745, + "loss": 3.3253, + "theoretical_loss": 4.095977352506185, + "tokens_seen": 333287424 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045403209628886663, + "loss": 3.1639, + "theoretical_loss": 4.095884477300405, + "tokens_seen": 333352960 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004540220661985958, + "loss": 3.2506, + "theoretical_loss": 4.095791625463121, + "tokens_seen": 333418496 + }, + { + "epoch": 4.0, + "learning_rate": 0.000454012036108325, + "loss": 3.2341, + "theoretical_loss": 4.095698796983859, + "tokens_seen": 333484032 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045400200601805417, + "loss": 3.3066, + "theoretical_loss": 4.095605991852157, + "tokens_seen": 333549568 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045399197592778335, + "loss": 3.291, + "theoretical_loss": 4.095513210057556, + "tokens_seen": 333615104 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004539819458375126, + "loss": 3.1567, + "theoretical_loss": 4.095420451589604, + "tokens_seen": 333680640 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004539719157472417, + "loss": 3.2744, + "theoretical_loss": 4.095327716437857, + "tokens_seen": 333746176 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045396188565697095, + "loss": 3.351, + "theoretical_loss": 4.095235004591878, + "tokens_seen": 333811712 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004539518555667001, + "loss": 3.2999, + "theoretical_loss": 4.095142316041237, + "tokens_seen": 333877248 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004539418254764293, + "loss": 3.1483, + "theoretical_loss": 4.095049650775508, + "tokens_seen": 333942784 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004539317953861585, + "loss": 3.2271, + "theoretical_loss": 4.094957008784274, + "tokens_seen": 334008320 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004539217652958877, + "loss": 3.2797, + "theoretical_loss": 4.094864390057124, + "tokens_seen": 334073856 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045391173520561686, + "loss": 3.1847, + "theoretical_loss": 4.094771794583655, + "tokens_seen": 334139392 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 828411, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.463193893432617, + "objective/train/theoretical_loss": 4.094679222353469, + "objective/train/tokens_used": 354664928, + "theoretical_loss": 4.094679222353469, + "tokens_seen": 334204928 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045390170511534604, + "loss": 3.3378, + "theoretical_loss": 4.094679222353469, + "tokens_seen": 334204928 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004538916750250752, + "loss": 3.2884, + "theoretical_loss": 4.094586673356175, + "tokens_seen": 334270464 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045388164493480445, + "loss": 3.1926, + "theoretical_loss": 4.09449414758139, + "tokens_seen": 334336000 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004538716148445336, + "loss": 3.2317, + "theoretical_loss": 4.094401645018735, + "tokens_seen": 334401536 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004538615847542628, + "loss": 3.3062, + "theoretical_loss": 4.09430916565784, + "tokens_seen": 334467072 + }, + { + "epoch": 4.0, + "learning_rate": 0.000453851554663992, + "loss": 3.2975, + "theoretical_loss": 4.094216709488342, + "tokens_seen": 334532608 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004538415245737212, + "loss": 3.2848, + "theoretical_loss": 4.094124276499883, + "tokens_seen": 334598144 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045383149448345036, + "loss": 3.3223, + "theoretical_loss": 4.094031866682112, + "tokens_seen": 334663680 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045382146439317954, + "loss": 3.2419, + "theoretical_loss": 4.093939480024685, + "tokens_seen": 334729216 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004538114343029087, + "loss": 3.3198, + "theoretical_loss": 4.0938471165172645, + "tokens_seen": 334794752 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045380140421263796, + "loss": 3.245, + "theoretical_loss": 4.09375477614952, + "tokens_seen": 334860288 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004537913741223671, + "loss": 3.3396, + "theoretical_loss": 4.093662458911128, + "tokens_seen": 334925824 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004537813440320963, + "loss": 3.2462, + "theoretical_loss": 4.093570164791769, + "tokens_seen": 334991360 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045377131394182545, + "loss": 3.2604, + "theoretical_loss": 4.0934778937811345, + "tokens_seen": 335056896 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004537612838515547, + "loss": 3.3441, + "theoretical_loss": 4.093385645868919, + "tokens_seen": 335122432 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045375125376128386, + "loss": 3.3086, + "theoretical_loss": 4.093293421044824, + "tokens_seen": 335187968 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045374122367101304, + "loss": 3.3909, + "theoretical_loss": 4.093201219298559, + "tokens_seen": 335253504 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004537311935807422, + "loss": 3.3017, + "theoretical_loss": 4.093109040619842, + "tokens_seen": 335319040 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004537211634904714, + "loss": 3.1827, + "theoretical_loss": 4.0930168849983914, + "tokens_seen": 335384576 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004537111334002006, + "loss": 3.2416, + "theoretical_loss": 4.092924752423937, + "tokens_seen": 335450112 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004537011033099298, + "loss": 3.1987, + "theoretical_loss": 4.092832642886216, + "tokens_seen": 335515648 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045369107321965895, + "loss": 3.2704, + "theoretical_loss": 4.092740556374968, + "tokens_seen": 335581184 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004536810431293882, + "loss": 3.2319, + "theoretical_loss": 4.092648492879943, + "tokens_seen": 335646720 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045367101303911737, + "loss": 3.2986, + "theoretical_loss": 4.092556452390895, + "tokens_seen": 335712256 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045366098294884655, + "loss": 3.3095, + "theoretical_loss": 4.092464434897585, + "tokens_seen": 335777792 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 833011, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2089200019836426, + "objective/train/theoretical_loss": 4.0923724403897825, + "objective/train/tokens_used": 356303328, + "theoretical_loss": 4.0923724403897825, + "tokens_seen": 335843328 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045365095285857573, + "loss": 3.2759, + "theoretical_loss": 4.0923724403897825, + "tokens_seen": 335843328 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004536409227683049, + "loss": 3.1707, + "theoretical_loss": 4.092280468857261, + "tokens_seen": 335908864 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004536308926780341, + "loss": 3.357, + "theoretical_loss": 4.092188520289803, + "tokens_seen": 335974400 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004536208625877633, + "loss": 3.3306, + "theoretical_loss": 4.092096594677196, + "tokens_seen": 336039936 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045361083249749245, + "loss": 3.3981, + "theoretical_loss": 4.092004692009233, + "tokens_seen": 336105472 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004536008024072217, + "loss": 3.28, + "theoretical_loss": 4.091912812275716, + "tokens_seen": 336171008 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004535907723169508, + "loss": 3.3463, + "theoretical_loss": 4.091820955466451, + "tokens_seen": 336236544 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045358074222668005, + "loss": 3.4156, + "theoretical_loss": 4.0917291215712535, + "tokens_seen": 336302080 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045357071213640923, + "loss": 3.3669, + "theoretical_loss": 4.0916373105799435, + "tokens_seen": 336367616 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004535606820461384, + "loss": 3.3369, + "theoretical_loss": 4.091545522482347, + "tokens_seen": 336433152 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004535506519558676, + "loss": 3.3484, + "theoretical_loss": 4.091453757268297, + "tokens_seen": 336498688 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045354062186559683, + "loss": 3.3386, + "theoretical_loss": 4.091362014927634, + "tokens_seen": 336564224 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045353059177532596, + "loss": 3.4319, + "theoretical_loss": 4.091270295450206, + "tokens_seen": 336629760 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004535205616850552, + "loss": 3.3753, + "theoretical_loss": 4.091178598825864, + "tokens_seen": 336695296 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004535105315947843, + "loss": 3.3666, + "theoretical_loss": 4.091086925044467, + "tokens_seen": 336760832 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045350050150451355, + "loss": 3.3938, + "theoretical_loss": 4.090995274095881, + "tokens_seen": 336826368 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045349047141424274, + "loss": 3.4103, + "theoretical_loss": 4.090903645969978, + "tokens_seen": 336891904 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004534804413239719, + "loss": 3.3619, + "theoretical_loss": 4.090812040656638, + "tokens_seen": 336957440 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004534704112337011, + "loss": 3.3298, + "theoretical_loss": 4.090720458145745, + "tokens_seen": 337022976 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004534603811434303, + "loss": 3.2521, + "theoretical_loss": 4.090628898427191, + "tokens_seen": 337088512 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004534503510531595, + "loss": 3.3458, + "theoretical_loss": 4.090537361490874, + "tokens_seen": 337154048 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004534403209628887, + "loss": 3.2184, + "theoretical_loss": 4.0904458473266985, + "tokens_seen": 337219584 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004534302908726179, + "loss": 3.2619, + "theoretical_loss": 4.090354355924576, + "tokens_seen": 337285120 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045342026078234706, + "loss": 3.2383, + "theoretical_loss": 4.090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045341023069207624, + "loss": 3.3178, + "theoretical_loss": 4.090171441366162, + "tokens_seen": 337416192 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 836133, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.357266664505005, + "objective/train/theoretical_loss": 4.090080018189726, + "objective/train/tokens_used": 357941728, + "theoretical_loss": 4.090080018189726, + "tokens_seen": 337481728 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004534002006018054, + "loss": 3.2445, + "theoretical_loss": 4.090080018189726, + "tokens_seen": 337481728 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045339017051153465, + "loss": 3.334, + "theoretical_loss": 4.08998861773505, + "tokens_seen": 337547264 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004533801404212638, + "loss": 3.3064, + "theoretical_loss": 4.0898972399920765, + "tokens_seen": 337612800 + }, + { + "epoch": 4.0, + "learning_rate": 0.000453370110330993, + "loss": 3.4088, + "theoretical_loss": 4.089805884950757, + "tokens_seen": 337678336 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004533600802407222, + "loss": 3.3632, + "theoretical_loss": 4.089714552601045, + "tokens_seen": 337743872 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004533500501504514, + "loss": 3.3516, + "theoretical_loss": 4.0896232429329045, + "tokens_seen": 337809408 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045334002006018056, + "loss": 3.3392, + "theoretical_loss": 4.089531955936303, + "tokens_seen": 337874944 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045332998996990974, + "loss": 3.2858, + "theoretical_loss": 4.089440691601215, + "tokens_seen": 337940480 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004533199598796389, + "loss": 3.2165, + "theoretical_loss": 4.089349449917623, + "tokens_seen": 338006016 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045330992978936816, + "loss": 3.3435, + "theoretical_loss": 4.089258230875514, + "tokens_seen": 338071552 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004532998996990973, + "loss": 3.2892, + "theoretical_loss": 4.089167034464883, + "tokens_seen": 338137088 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004532898696088265, + "loss": 3.3453, + "theoretical_loss": 4.089075860675729, + "tokens_seen": 338202624 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045327983951855565, + "loss": 3.3698, + "theoretical_loss": 4.08898470949806, + "tokens_seen": 338268160 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004532698094282849, + "loss": 3.3851, + "theoretical_loss": 4.088893580921889, + "tokens_seen": 338333696 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045325977933801406, + "loss": 3.3425, + "theoretical_loss": 4.088802474937235, + "tokens_seen": 338399232 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045324974924774324, + "loss": 3.2356, + "theoretical_loss": 4.088711391534124, + "tokens_seen": 338464768 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004532397191574724, + "loss": 3.1961, + "theoretical_loss": 4.088620330702588, + "tokens_seen": 338530304 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004532296890672016, + "loss": 3.2443, + "theoretical_loss": 4.088529292432666, + "tokens_seen": 338595840 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004532196589769308, + "loss": 3.3496, + "theoretical_loss": 4.088438276714402, + "tokens_seen": 338661376 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045320962888666, + "loss": 3.2766, + "theoretical_loss": 4.088347283537848, + "tokens_seen": 338726912 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045319959879638915, + "loss": 3.2046, + "theoretical_loss": 4.088256312893061, + "tokens_seen": 338792448 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004531895687061184, + "loss": 3.2556, + "theoretical_loss": 4.088165364770104, + "tokens_seen": 338857984 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045317953861584757, + "loss": 3.2586, + "theoretical_loss": 4.088074439159049, + "tokens_seen": 338923520 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045316950852557675, + "loss": 3.3971, + "theoretical_loss": 4.087983536049971, + "tokens_seen": 338989056 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045315947843530593, + "loss": 3.3363, + "theoretical_loss": 4.087892655432952, + "tokens_seen": 339054592 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 840774, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.240633726119995, + "objective/train/theoretical_loss": 4.0878017972980825, + "objective/train/tokens_used": 359580128, + "theoretical_loss": 4.0878017972980825, + "tokens_seen": 339120128 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004531494483450351, + "loss": 3.2638, + "theoretical_loss": 4.0878017972980825, + "tokens_seen": 339120128 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004531394182547643, + "loss": 3.2089, + "theoretical_loss": 4.087710961635457, + "tokens_seen": 339185664 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004531293881644935, + "loss": 3.2319, + "theoretical_loss": 4.087620148435177, + "tokens_seen": 339251200 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045311935807422265, + "loss": 3.2501, + "theoretical_loss": 4.08752935768735, + "tokens_seen": 339316736 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004531093279839519, + "loss": 3.3258, + "theoretical_loss": 4.08743858938209, + "tokens_seen": 339382272 + }, + { + "epoch": 4.0, + "learning_rate": 0.000453099297893681, + "loss": 3.2647, + "theoretical_loss": 4.0873478435095185, + "tokens_seen": 339447808 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045308926780341025, + "loss": 3.2685, + "theoretical_loss": 4.0872571200597605, + "tokens_seen": 339513344 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045307923771313943, + "loss": 3.2745, + "theoretical_loss": 4.0871664190229495, + "tokens_seen": 339578880 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004530692076228686, + "loss": 3.3192, + "theoretical_loss": 4.087075740389224, + "tokens_seen": 339644416 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004530591775325978, + "loss": 3.3256, + "theoretical_loss": 4.08698508414873, + "tokens_seen": 339709952 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045304914744232703, + "loss": 3.203, + "theoretical_loss": 4.086894450291618, + "tokens_seen": 339775488 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045303911735205616, + "loss": 3.36, + "theoretical_loss": 4.086803838808047, + "tokens_seen": 339841024 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004530290872617854, + "loss": 3.2496, + "theoretical_loss": 4.086713249688181, + "tokens_seen": 339906560 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004530190571715145, + "loss": 3.276, + "theoretical_loss": 4.086622682922188, + "tokens_seen": 339972096 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045300902708124375, + "loss": 3.1935, + "theoretical_loss": 4.086532138500247, + "tokens_seen": 340037632 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045299899699097294, + "loss": 3.3799, + "theoretical_loss": 4.0864416164125394, + "tokens_seen": 340103168 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004529889669007021, + "loss": 3.3085, + "theoretical_loss": 4.086351116649254, + "tokens_seen": 340168704 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004529789368104313, + "loss": 3.2183, + "theoretical_loss": 4.086260639200587, + "tokens_seen": 340234240 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004529689067201605, + "loss": 3.206, + "theoretical_loss": 4.086170184056737, + "tokens_seen": 340299776 + }, + { + "epoch": 4.0, + "learning_rate": 0.00045295887662988966, + "loss": 3.329, + "theoretical_loss": 4.086079751207914, + "tokens_seen": 340365312 + }, + { + "epoch": 4.0, + "learning_rate": 0.0004529488465396189, + "loss": 3.257, + "theoretical_loss": 4.08598934064433, + "tokens_seen": 340430848 + }, + { + "epoch": 4.01, + "learning_rate": 0.000452938816449348, + "loss": 3.1652, + "theoretical_loss": 4.085898952356206, + "tokens_seen": 340496384 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045292878635907726, + "loss": 3.3319, + "theoretical_loss": 4.0858085863337665, + "tokens_seen": 340561920 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004529187562688064, + "loss": 3.3341, + "theoretical_loss": 4.085718242567245, + "tokens_seen": 340627456 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004529087261785356, + "loss": 3.265, + "theoretical_loss": 4.085627921046879, + "tokens_seen": 340692992 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 843760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2802767753601074, + "objective/train/theoretical_loss": 4.085537621762913, + "objective/train/tokens_used": 361218528, + "theoretical_loss": 4.085537621762913, + "tokens_seen": 340758528 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004528986960882648, + "loss": 3.2892, + "theoretical_loss": 4.085537621762913, + "tokens_seen": 340758528 + }, + { + "epoch": 4.01, + "learning_rate": 0.000452888665997994, + "loss": 3.2742, + "theoretical_loss": 4.085447344705598, + "tokens_seen": 340824064 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045287863590772316, + "loss": 3.3471, + "theoretical_loss": 4.085357089865189, + "tokens_seen": 340889600 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004528686058174524, + "loss": 3.3541, + "theoretical_loss": 4.085266857231951, + "tokens_seen": 340955136 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004528585757271815, + "loss": 3.2487, + "theoretical_loss": 4.085176646796152, + "tokens_seen": 341020672 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045284854563691076, + "loss": 3.1953, + "theoretical_loss": 4.085086458548068, + "tokens_seen": 341086208 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004528385155466399, + "loss": 3.3085, + "theoretical_loss": 4.084996292477979, + "tokens_seen": 341151744 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004528284854563691, + "loss": 3.3633, + "theoretical_loss": 4.084906148576174, + "tokens_seen": 341217280 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004528184553660983, + "loss": 3.2703, + "theoretical_loss": 4.084816026832945, + "tokens_seen": 341282816 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004528084252758275, + "loss": 3.2789, + "theoretical_loss": 4.0847259272385905, + "tokens_seen": 341348352 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045279839518555667, + "loss": 3.2419, + "theoretical_loss": 4.084635849783419, + "tokens_seen": 341413888 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045278836509528585, + "loss": 3.3148, + "theoretical_loss": 4.084545794457742, + "tokens_seen": 341479424 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045277833500501503, + "loss": 3.2151, + "theoretical_loss": 4.084455761251876, + "tokens_seen": 341544960 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045276830491474426, + "loss": 3.2825, + "theoretical_loss": 4.084365750156145, + "tokens_seen": 341610496 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004527582748244734, + "loss": 3.3909, + "theoretical_loss": 4.08427576116088, + "tokens_seen": 341676032 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004527482447342026, + "loss": 3.1569, + "theoretical_loss": 4.084185794256417, + "tokens_seen": 341741568 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045273821464393175, + "loss": 3.3453, + "theoretical_loss": 4.084095849433099, + "tokens_seen": 341807104 + }, + { + "epoch": 4.01, + "learning_rate": 0.000452728184553661, + "loss": 3.298, + "theoretical_loss": 4.0840059266812725, + "tokens_seen": 341872640 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045271815446339017, + "loss": 3.1241, + "theoretical_loss": 4.083916025991293, + "tokens_seen": 341938176 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045270812437311935, + "loss": 3.298, + "theoretical_loss": 4.083826147353522, + "tokens_seen": 342003712 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004526980942828486, + "loss": 3.243, + "theoretical_loss": 4.083736290758323, + "tokens_seen": 342069248 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045268806419257777, + "loss": 3.214, + "theoretical_loss": 4.083646456196071, + "tokens_seen": 342134784 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045267803410230695, + "loss": 3.3564, + "theoretical_loss": 4.083556643657145, + "tokens_seen": 342200320 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045266800401203613, + "loss": 3.207, + "theoretical_loss": 4.083466853131928, + "tokens_seen": 342265856 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004526579739217653, + "loss": 3.2761, + "theoretical_loss": 4.083377084610813, + "tokens_seen": 342331392 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 847490, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.997652292251587, + "objective/train/theoretical_loss": 4.083287338084194, + "objective/train/tokens_used": 362856928, + "theoretical_loss": 4.083287338084194, + "tokens_seen": 342396928 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004526479438314945, + "loss": 3.2484, + "theoretical_loss": 4.083287338084194, + "tokens_seen": 342396928 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004526379137412237, + "loss": 3.314, + "theoretical_loss": 4.083197613542475, + "tokens_seen": 342462464 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045262788365095285, + "loss": 3.21, + "theoretical_loss": 4.083107910976065, + "tokens_seen": 342528000 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004526178535606821, + "loss": 3.2925, + "theoretical_loss": 4.083018230375379, + "tokens_seen": 342593536 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004526078234704112, + "loss": 3.2722, + "theoretical_loss": 4.082928571730838, + "tokens_seen": 342659072 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045259779338014045, + "loss": 3.3612, + "theoretical_loss": 4.082838935032867, + "tokens_seen": 342724608 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045258776328986963, + "loss": 3.3501, + "theoretical_loss": 4.0827493202719, + "tokens_seen": 342790144 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004525777331995988, + "loss": 3.3146, + "theoretical_loss": 4.082659727438378, + "tokens_seen": 342855680 + }, + { + "epoch": 4.01, + "learning_rate": 0.000452567703109328, + "loss": 3.2245, + "theoretical_loss": 4.082570156522742, + "tokens_seen": 342921216 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045255767301905723, + "loss": 3.3849, + "theoretical_loss": 4.0824806075154445, + "tokens_seen": 342986752 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045254764292878636, + "loss": 3.2081, + "theoretical_loss": 4.082391080406944, + "tokens_seen": 343052288 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004525376128385156, + "loss": 3.2053, + "theoretical_loss": 4.0823015751877, + "tokens_seen": 343117824 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004525275827482447, + "loss": 3.3515, + "theoretical_loss": 4.082212091848184, + "tokens_seen": 343183360 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045251755265797395, + "loss": 3.2176, + "theoretical_loss": 4.08212263037887, + "tokens_seen": 343248896 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045250752256770314, + "loss": 3.2681, + "theoretical_loss": 4.082033190770238, + "tokens_seen": 343314432 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004524974924774323, + "loss": 3.2916, + "theoretical_loss": 4.081943773012775, + "tokens_seen": 343379968 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004524874623871615, + "loss": 3.3492, + "theoretical_loss": 4.081854377096974, + "tokens_seen": 343445504 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004524774322968907, + "loss": 3.218, + "theoretical_loss": 4.081765003013333, + "tokens_seen": 343511040 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045246740220661986, + "loss": 3.1371, + "theoretical_loss": 4.081675650752357, + "tokens_seen": 343576576 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004524573721163491, + "loss": 3.2186, + "theoretical_loss": 4.081586320304555, + "tokens_seen": 343642112 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004524473420260782, + "loss": 3.3674, + "theoretical_loss": 4.081497011660446, + "tokens_seen": 343707648 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045243731193580746, + "loss": 3.1755, + "theoretical_loss": 4.08140772481055, + "tokens_seen": 343773184 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004524272818455366, + "loss": 3.3316, + "theoretical_loss": 4.081318459745397, + "tokens_seen": 343838720 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004524172517552658, + "loss": 3.1884, + "theoretical_loss": 4.081229216455519, + "tokens_seen": 343904256 + }, + { + "epoch": 4.01, + "learning_rate": 0.000452407221664995, + "loss": 3.3053, + "theoretical_loss": 4.081139994931458, + "tokens_seen": 343969792 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 852400, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4030613899230957, + "objective/train/theoretical_loss": 4.081050795163759, + "objective/train/tokens_used": 364495328, + "theoretical_loss": 4.081050795163759, + "tokens_seen": 344035328 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004523971915747242, + "loss": 3.2319, + "theoretical_loss": 4.081050795163759, + "tokens_seen": 344035328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045238716148445336, + "loss": 3.2492, + "theoretical_loss": 4.080961617142975, + "tokens_seen": 344100864 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004523771313941826, + "loss": 3.2627, + "theoretical_loss": 4.080872460859663, + "tokens_seen": 344166400 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004523671013039117, + "loss": 3.3062, + "theoretical_loss": 4.080783326304386, + "tokens_seen": 344231936 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045235707121364096, + "loss": 3.2938, + "theoretical_loss": 4.080694213467714, + "tokens_seen": 344297472 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004523470411233701, + "loss": 3.167, + "theoretical_loss": 4.080605122340223, + "tokens_seen": 344363008 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004523370110330993, + "loss": 3.3402, + "theoretical_loss": 4.080516052912493, + "tokens_seen": 344428544 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004523269809428285, + "loss": 3.3119, + "theoretical_loss": 4.080427005175114, + "tokens_seen": 344494080 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004523169508525577, + "loss": 3.2768, + "theoretical_loss": 4.080337979118677, + "tokens_seen": 344559616 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045230692076228687, + "loss": 3.3255, + "theoretical_loss": 4.080248974733781, + "tokens_seen": 344625152 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045229689067201605, + "loss": 3.2255, + "theoretical_loss": 4.080159992011032, + "tokens_seen": 344690688 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045228686058174523, + "loss": 3.4124, + "theoretical_loss": 4.080071030941038, + "tokens_seen": 344756224 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045227683049147446, + "loss": 3.392, + "theoretical_loss": 4.0799820915144185, + "tokens_seen": 344821760 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004522668004012036, + "loss": 3.305, + "theoretical_loss": 4.0798931737217945, + "tokens_seen": 344887296 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004522567703109328, + "loss": 3.3367, + "theoretical_loss": 4.079804277553794, + "tokens_seen": 344952832 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045224674022066195, + "loss": 3.3804, + "theoretical_loss": 4.079715403001053, + "tokens_seen": 345018368 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004522367101303912, + "loss": 3.2233, + "theoretical_loss": 4.0796265500542095, + "tokens_seen": 345083904 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045222668004012037, + "loss": 3.3, + "theoretical_loss": 4.079537718703909, + "tokens_seen": 345149440 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045221664994984955, + "loss": 3.1918, + "theoretical_loss": 4.079448908940805, + "tokens_seen": 345214976 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045220661985957873, + "loss": 3.2952, + "theoretical_loss": 4.0793601207555525, + "tokens_seen": 345280512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045219658976930797, + "loss": 3.3033, + "theoretical_loss": 4.079271354138816, + "tokens_seen": 345346048 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004521865596790371, + "loss": 3.255, + "theoretical_loss": 4.079182609081265, + "tokens_seen": 345411584 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045217652958876633, + "loss": 3.3155, + "theoretical_loss": 4.079093885573573, + "tokens_seen": 345477120 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045216649949849546, + "loss": 3.1411, + "theoretical_loss": 4.079005183606422, + "tokens_seen": 345542656 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004521564694082247, + "loss": 3.3257, + "theoretical_loss": 4.078916503170497, + "tokens_seen": 345608192 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 855382, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.417637348175049, + "objective/train/theoretical_loss": 4.078827844256492, + "objective/train/tokens_used": 366133728, + "theoretical_loss": 4.078827844256492, + "tokens_seen": 345673728 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045214643931795387, + "loss": 3.2145, + "theoretical_loss": 4.078827844256492, + "tokens_seen": 345673728 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045213640922768305, + "loss": 3.2823, + "theoretical_loss": 4.0787392068551025, + "tokens_seen": 345739264 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045212637913741223, + "loss": 3.3106, + "theoretical_loss": 4.078650590957034, + "tokens_seen": 345804800 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004521163490471414, + "loss": 3.2921, + "theoretical_loss": 4.078561996552997, + "tokens_seen": 345870336 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004521063189568706, + "loss": 3.1305, + "theoretical_loss": 4.078473423633704, + "tokens_seen": 345935872 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045209628886659983, + "loss": 3.2461, + "theoretical_loss": 4.078384872189878, + "tokens_seen": 346001408 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045208625877632896, + "loss": 3.261, + "theoretical_loss": 4.078296342212246, + "tokens_seen": 346066944 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004520762286860582, + "loss": 3.2483, + "theoretical_loss": 4.07820783369154, + "tokens_seen": 346132480 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004520661985957873, + "loss": 3.2977, + "theoretical_loss": 4.078119346618498, + "tokens_seen": 346198016 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045205616850551656, + "loss": 3.1557, + "theoretical_loss": 4.0780308809838655, + "tokens_seen": 346263552 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045204613841524574, + "loss": 3.3235, + "theoretical_loss": 4.077942436778391, + "tokens_seen": 346329088 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004520361083249749, + "loss": 3.4421, + "theoretical_loss": 4.07785401399283, + "tokens_seen": 346394624 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004520260782347041, + "loss": 3.1675, + "theoretical_loss": 4.077765612617946, + "tokens_seen": 346460160 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045201604814443334, + "loss": 3.3018, + "theoretical_loss": 4.077677232644503, + "tokens_seen": 346525696 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045200601805416246, + "loss": 3.2782, + "theoretical_loss": 4.077588874063276, + "tokens_seen": 346591232 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004519959879638917, + "loss": 3.342, + "theoretical_loss": 4.077500536865044, + "tokens_seen": 346656768 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004519859578736208, + "loss": 3.3466, + "theoretical_loss": 4.077412221040588, + "tokens_seen": 346722304 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045197592778335006, + "loss": 3.3348, + "theoretical_loss": 4.077323926580701, + "tokens_seen": 346787840 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045196589769307924, + "loss": 3.302, + "theoretical_loss": 4.077235653476178, + "tokens_seen": 346853376 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004519558676028084, + "loss": 3.2819, + "theoretical_loss": 4.077147401717819, + "tokens_seen": 346918912 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045194583751253766, + "loss": 3.3431, + "theoretical_loss": 4.077059171296433, + "tokens_seen": 346984448 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004519358074222668, + "loss": 3.2515, + "theoretical_loss": 4.076970962202832, + "tokens_seen": 347049984 + }, + { + "epoch": 4.01, + "learning_rate": 0.000451925777331996, + "loss": 3.3471, + "theoretical_loss": 4.076882774427834, + "tokens_seen": 347115520 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004519157472417252, + "loss": 3.2838, + "theoretical_loss": 4.076794607962263, + "tokens_seen": 347181056 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004519057171514544, + "loss": 3.2228, + "theoretical_loss": 4.076706462796951, + "tokens_seen": 347246592 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 860292, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.901607036590576, + "objective/train/theoretical_loss": 4.07661833892273, + "objective/train/tokens_used": 367772128, + "theoretical_loss": 4.07661833892273, + "tokens_seen": 347312128 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045189568706118356, + "loss": 3.2443, + "theoretical_loss": 4.07661833892273, + "tokens_seen": 347312128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004518856569709128, + "loss": 3.2667, + "theoretical_loss": 4.076530236330444, + "tokens_seen": 347377664 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004518756268806419, + "loss": 3.3225, + "theoretical_loss": 4.076442155010938, + "tokens_seen": 347443200 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045186559679037116, + "loss": 3.3217, + "theoretical_loss": 4.076354094955066, + "tokens_seen": 347508736 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004518555667001003, + "loss": 3.3526, + "theoretical_loss": 4.076266056153685, + "tokens_seen": 347574272 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004518455366098295, + "loss": 3.3055, + "theoretical_loss": 4.076178038597659, + "tokens_seen": 347639808 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004518355065195587, + "loss": 3.3321, + "theoretical_loss": 4.076090042277858, + "tokens_seen": 347705344 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004518254764292879, + "loss": 3.3717, + "theoretical_loss": 4.076002067185158, + "tokens_seen": 347770880 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045181544633901707, + "loss": 3.3579, + "theoretical_loss": 4.075914113310437, + "tokens_seen": 347836416 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045180541624874625, + "loss": 3.2743, + "theoretical_loss": 4.075826180644583, + "tokens_seen": 347901952 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045179538615847543, + "loss": 3.2233, + "theoretical_loss": 4.075738269178488, + "tokens_seen": 347967488 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045178535606820466, + "loss": 3.1725, + "theoretical_loss": 4.075650378903049, + "tokens_seen": 348033024 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004517753259779338, + "loss": 3.2816, + "theoretical_loss": 4.075562509809171, + "tokens_seen": 348098560 + }, + { + "epoch": 4.01, + "learning_rate": 0.000451765295887663, + "loss": 3.1653, + "theoretical_loss": 4.07547466188776, + "tokens_seen": 348164096 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045175526579739215, + "loss": 3.3091, + "theoretical_loss": 4.075386835129734, + "tokens_seen": 348229632 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004517452357071214, + "loss": 3.3405, + "theoretical_loss": 4.075299029526009, + "tokens_seen": 348295168 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045173520561685057, + "loss": 3.3292, + "theoretical_loss": 4.075211245067515, + "tokens_seen": 348360704 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045172517552657975, + "loss": 3.3188, + "theoretical_loss": 4.07512348174518, + "tokens_seen": 348426240 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045171514543630893, + "loss": 3.2731, + "theoretical_loss": 4.075035739549943, + "tokens_seen": 348491776 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045170511534603817, + "loss": 3.2138, + "theoretical_loss": 4.074948018472744, + "tokens_seen": 348557312 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004516950852557673, + "loss": 3.2275, + "theoretical_loss": 4.074860318504534, + "tokens_seen": 348622848 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045168505516549653, + "loss": 3.2111, + "theoretical_loss": 4.074772639636264, + "tokens_seen": 348688384 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045167502507522566, + "loss": 3.213, + "theoretical_loss": 4.074684981858895, + "tokens_seen": 348753920 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004516649949849549, + "loss": 3.3533, + "theoretical_loss": 4.074597345163391, + "tokens_seen": 348819456 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045165496489468407, + "loss": 3.3545, + "theoretical_loss": 4.074509729540723, + "tokens_seen": 348884992 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 863190, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3143651485443115, + "objective/train/theoretical_loss": 4.074422134981866, + "objective/train/tokens_used": 369410528, + "theoretical_loss": 4.074422134981866, + "tokens_seen": 348950528 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045164493480441325, + "loss": 3.3084, + "theoretical_loss": 4.074422134981866, + "tokens_seen": 348950528 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045163490471414244, + "loss": 3.2324, + "theoretical_loss": 4.0743345614778015, + "tokens_seen": 349016064 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004516248746238716, + "loss": 3.2049, + "theoretical_loss": 4.074247009019517, + "tokens_seen": 349081600 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004516148445336008, + "loss": 3.2177, + "theoretical_loss": 4.0741594775980055, + "tokens_seen": 349147136 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045160481444333003, + "loss": 3.2679, + "theoretical_loss": 4.074071967204265, + "tokens_seen": 349212672 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045159478435305916, + "loss": 3.2789, + "theoretical_loss": 4.073984477829298, + "tokens_seen": 349278208 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004515847542627884, + "loss": 3.2713, + "theoretical_loss": 4.073897009464115, + "tokens_seen": 349343744 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004515747241725175, + "loss": 3.3582, + "theoretical_loss": 4.07380956209973, + "tokens_seen": 349409280 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045156469408224676, + "loss": 3.2896, + "theoretical_loss": 4.073722135727164, + "tokens_seen": 349474816 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045155466399197594, + "loss": 3.2173, + "theoretical_loss": 4.073634730337441, + "tokens_seen": 349540352 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004515446339017051, + "loss": 3.2938, + "theoretical_loss": 4.073547345921595, + "tokens_seen": 349605888 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004515346038114343, + "loss": 3.2741, + "theoretical_loss": 4.073459982470661, + "tokens_seen": 349671424 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045152457372116354, + "loss": 3.2804, + "theoretical_loss": 4.07337263997568, + "tokens_seen": 349736960 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045151454363089266, + "loss": 3.2614, + "theoretical_loss": 4.073285318427704, + "tokens_seen": 349802496 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004515045135406219, + "loss": 3.307, + "theoretical_loss": 4.0731980178177825, + "tokens_seen": 349868032 + }, + { + "epoch": 4.01, + "learning_rate": 0.000451494483450351, + "loss": 3.3654, + "theoretical_loss": 4.073110738136975, + "tokens_seen": 349933568 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045148445336008026, + "loss": 3.2297, + "theoretical_loss": 4.073023479376348, + "tokens_seen": 349999104 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045147442326980944, + "loss": 3.2235, + "theoretical_loss": 4.072936241526969, + "tokens_seen": 350064640 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004514643931795386, + "loss": 3.2887, + "theoretical_loss": 4.072849024579915, + "tokens_seen": 350130176 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004514543630892678, + "loss": 3.2858, + "theoretical_loss": 4.072761828526264, + "tokens_seen": 350195712 + }, + { + "epoch": 4.01, + "learning_rate": 0.000451444332998997, + "loss": 3.3768, + "theoretical_loss": 4.0726746533571045, + "tokens_seen": 350261248 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045143430290872617, + "loss": 3.2689, + "theoretical_loss": 4.072587499063529, + "tokens_seen": 350326784 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004514242728184554, + "loss": 3.1692, + "theoretical_loss": 4.072500365636632, + "tokens_seen": 350392320 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045141424272818453, + "loss": 3.2305, + "theoretical_loss": 4.072413253067518, + "tokens_seen": 350457856 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045140421263791376, + "loss": 3.2143, + "theoretical_loss": 4.072326161347295, + "tokens_seen": 350523392 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 866987, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2525362968444824, + "objective/train/theoretical_loss": 4.072239090467076, + "objective/train/tokens_used": 371048928, + "theoretical_loss": 4.072239090467076, + "tokens_seen": 350588928 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045139418254764294, + "loss": 3.2716, + "theoretical_loss": 4.072239090467076, + "tokens_seen": 350588928 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004513841524573721, + "loss": 3.37, + "theoretical_loss": 4.07215204041798, + "tokens_seen": 350654464 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004513741223671013, + "loss": 3.2447, + "theoretical_loss": 4.072065011191132, + "tokens_seen": 350720000 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004513640922768305, + "loss": 3.2739, + "theoretical_loss": 4.07197800277766, + "tokens_seen": 350785536 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045135406218655967, + "loss": 3.3291, + "theoretical_loss": 4.071891015168703, + "tokens_seen": 350851072 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004513440320962889, + "loss": 3.2343, + "theoretical_loss": 4.071804048355398, + "tokens_seen": 350916608 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045133400200601803, + "loss": 3.2703, + "theoretical_loss": 4.071717102328893, + "tokens_seen": 350982144 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045132397191574727, + "loss": 3.2719, + "theoretical_loss": 4.07163017708034, + "tokens_seen": 351047680 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004513139418254764, + "loss": 3.1767, + "theoretical_loss": 4.071543272600895, + "tokens_seen": 351113216 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045130391173520563, + "loss": 3.3209, + "theoretical_loss": 4.071456388881719, + "tokens_seen": 351178752 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004512938816449348, + "loss": 3.2611, + "theoretical_loss": 4.071369525913983, + "tokens_seen": 351244288 + }, + { + "epoch": 4.01, + "learning_rate": 0.000451283851554664, + "loss": 3.3135, + "theoretical_loss": 4.0712826836888585, + "tokens_seen": 351309824 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045127382146439317, + "loss": 3.3821, + "theoretical_loss": 4.071195862197525, + "tokens_seen": 351375360 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045126379137412235, + "loss": 3.3465, + "theoretical_loss": 4.071109061431165, + "tokens_seen": 351440896 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045125376128385153, + "loss": 3.3215, + "theoretical_loss": 4.071022281380969, + "tokens_seen": 351506432 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045124373119358077, + "loss": 3.2495, + "theoretical_loss": 4.070935522038132, + "tokens_seen": 351571968 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004512337011033099, + "loss": 3.2593, + "theoretical_loss": 4.070848783393853, + "tokens_seen": 351637504 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045122367101303913, + "loss": 3.2872, + "theoretical_loss": 4.0707620654393395, + "tokens_seen": 351703040 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004512136409227683, + "loss": 3.3113, + "theoretical_loss": 4.0706753681658, + "tokens_seen": 351768576 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004512036108324975, + "loss": 3.3245, + "theoretical_loss": 4.070588691564453, + "tokens_seen": 351834112 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045119358074222673, + "loss": 3.3145, + "theoretical_loss": 4.070502035626519, + "tokens_seen": 351899648 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045118355065195586, + "loss": 3.2414, + "theoretical_loss": 4.070415400343225, + "tokens_seen": 351965184 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004511735205616851, + "loss": 3.3436, + "theoretical_loss": 4.070328785705804, + "tokens_seen": 352030720 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004511634904714143, + "loss": 3.3237, + "theoretical_loss": 4.070242191705494, + "tokens_seen": 352096256 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045115346038114345, + "loss": 3.3193, + "theoretical_loss": 4.070155618333536, + "tokens_seen": 352161792 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 871782, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3596177101135254, + "objective/train/theoretical_loss": 4.0700690655811815, + "objective/train/tokens_used": 372687328, + "theoretical_loss": 4.0700690655811815, + "tokens_seen": 352227328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045114343029087264, + "loss": 3.2918, + "theoretical_loss": 4.0700690655811815, + "tokens_seen": 352227328 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004511334002006018, + "loss": 3.1415, + "theoretical_loss": 4.069982533439682, + "tokens_seen": 352292864 + }, + { + "epoch": 4.01, + "learning_rate": 0.000451123370110331, + "loss": 3.3401, + "theoretical_loss": 4.069896021900298, + "tokens_seen": 352358400 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045111334002006023, + "loss": 3.3969, + "theoretical_loss": 4.069809530954293, + "tokens_seen": 352423936 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045110330992978936, + "loss": 3.2774, + "theoretical_loss": 4.0697230605929375, + "tokens_seen": 352489472 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004510932798395186, + "loss": 3.1876, + "theoretical_loss": 4.069636610807506, + "tokens_seen": 352555008 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004510832497492477, + "loss": 3.2585, + "theoretical_loss": 4.069550181589278, + "tokens_seen": 352620544 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045107321965897696, + "loss": 3.4486, + "theoretical_loss": 4.069463772929542, + "tokens_seen": 352686080 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045106318956870614, + "loss": 3.2052, + "theoretical_loss": 4.069377384819587, + "tokens_seen": 352751616 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004510531594784353, + "loss": 3.1723, + "theoretical_loss": 4.06929101725071, + "tokens_seen": 352817152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004510431293881645, + "loss": 3.2444, + "theoretical_loss": 4.069204670214212, + "tokens_seen": 352882688 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045103309929789374, + "loss": 3.2627, + "theoretical_loss": 4.0691183437014, + "tokens_seen": 352948224 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045102306920762286, + "loss": 3.2991, + "theoretical_loss": 4.069032037703588, + "tokens_seen": 353013760 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004510130391173521, + "loss": 3.2367, + "theoretical_loss": 4.068945752212091, + "tokens_seen": 353079296 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004510030090270812, + "loss": 3.265, + "theoretical_loss": 4.0688594872182335, + "tokens_seen": 353144832 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045099297893681046, + "loss": 3.2273, + "theoretical_loss": 4.068773242713343, + "tokens_seen": 353210368 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045098294884653964, + "loss": 3.2803, + "theoretical_loss": 4.0686870186887525, + "tokens_seen": 353275904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004509729187562688, + "loss": 3.2099, + "theoretical_loss": 4.0686008151358015, + "tokens_seen": 353341440 + }, + { + "epoch": 4.01, + "learning_rate": 0.000450962888665998, + "loss": 3.2751, + "theoretical_loss": 4.068514632045833, + "tokens_seen": 353406976 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004509528585757272, + "loss": 3.3189, + "theoretical_loss": 4.068428469410198, + "tokens_seen": 353472512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045094282848545637, + "loss": 3.1793, + "theoretical_loss": 4.068342327220249, + "tokens_seen": 353538048 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004509327983951856, + "loss": 3.3359, + "theoretical_loss": 4.068256205467345, + "tokens_seen": 353603584 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045092276830491473, + "loss": 3.3189, + "theoretical_loss": 4.068170104142855, + "tokens_seen": 353669120 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045091273821464396, + "loss": 3.3045, + "theoretical_loss": 4.0680840232381446, + "tokens_seen": 353734656 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045090270812437314, + "loss": 3.301, + "theoretical_loss": 4.067997962744592, + "tokens_seen": 353800192 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 874750, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3350651264190674, + "objective/train/theoretical_loss": 4.067911922653576, + "objective/train/tokens_used": 374325728, + "theoretical_loss": 4.067911922653576, + "tokens_seen": 353865728 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004508926780341023, + "loss": 3.3422, + "theoretical_loss": 4.067911922653576, + "tokens_seen": 353865728 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004508826479438315, + "loss": 3.2408, + "theoretical_loss": 4.067825902956484, + "tokens_seen": 353931264 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004508726178535607, + "loss": 3.2559, + "theoretical_loss": 4.067739903644707, + "tokens_seen": 353996800 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045086258776328987, + "loss": 3.2442, + "theoretical_loss": 4.067653924709641, + "tokens_seen": 354062336 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004508525576730191, + "loss": 3.2594, + "theoretical_loss": 4.067567966142688, + "tokens_seen": 354127872 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045084252758274823, + "loss": 3.2205, + "theoretical_loss": 4.067482027935254, + "tokens_seen": 354193408 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045083249749247747, + "loss": 3.311, + "theoretical_loss": 4.067396110078752, + "tokens_seen": 354258944 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004508224674022066, + "loss": 3.2887, + "theoretical_loss": 4.067310212564599, + "tokens_seen": 354324480 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045081243731193583, + "loss": 3.3155, + "theoretical_loss": 4.067224335384217, + "tokens_seen": 354390016 + }, + { + "epoch": 4.01, + "learning_rate": 0.000450802407221665, + "loss": 3.3124, + "theoretical_loss": 4.067138478529033, + "tokens_seen": 354455552 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004507923771313942, + "loss": 3.2361, + "theoretical_loss": 4.0670526419904816, + "tokens_seen": 354521088 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045078234704112337, + "loss": 3.2741, + "theoretical_loss": 4.06696682576, + "tokens_seen": 354586624 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045077231695085255, + "loss": 3.3131, + "theoretical_loss": 4.066881029829031, + "tokens_seen": 354652160 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045076228686058173, + "loss": 3.255, + "theoretical_loss": 4.0667952541890235, + "tokens_seen": 354717696 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045075225677031097, + "loss": 3.2479, + "theoretical_loss": 4.066709498831431, + "tokens_seen": 354783232 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004507422266800401, + "loss": 3.3259, + "theoretical_loss": 4.066623763747713, + "tokens_seen": 354848768 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045073219658976933, + "loss": 3.1946, + "theoretical_loss": 4.066538048929332, + "tokens_seen": 354914304 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004507221664994985, + "loss": 3.185, + "theoretical_loss": 4.066452354367758, + "tokens_seen": 354979840 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004507121364092277, + "loss": 3.2356, + "theoretical_loss": 4.0663666800544656, + "tokens_seen": 355045376 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004507021063189569, + "loss": 3.2856, + "theoretical_loss": 4.066281025980933, + "tokens_seen": 355110912 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045069207622868606, + "loss": 3.2614, + "theoretical_loss": 4.0661953921386464, + "tokens_seen": 355176448 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045068204613841524, + "loss": 3.3279, + "theoretical_loss": 4.0661097785190945, + "tokens_seen": 355241984 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004506720160481445, + "loss": 3.2393, + "theoretical_loss": 4.066024185113772, + "tokens_seen": 355307520 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004506619859578736, + "loss": 3.2804, + "theoretical_loss": 4.06593861191418, + "tokens_seen": 355373056 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045065195586760284, + "loss": 3.2498, + "theoretical_loss": 4.065853058911824, + "tokens_seen": 355438592 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 879796, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.299773931503296, + "objective/train/theoretical_loss": 4.065767526098211, + "objective/train/tokens_used": 375964128, + "theoretical_loss": 4.065767526098211, + "tokens_seen": 355504128 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045064192577733196, + "loss": 3.3439, + "theoretical_loss": 4.065767526098211, + "tokens_seen": 355504128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004506318956870612, + "loss": 3.2306, + "theoretical_loss": 4.065682013464861, + "tokens_seen": 355569664 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004506218655967904, + "loss": 3.2124, + "theoretical_loss": 4.065596521003291, + "tokens_seen": 355635200 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045061183550651956, + "loss": 3.2832, + "theoretical_loss": 4.065511048705029, + "tokens_seen": 355700736 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045060180541624874, + "loss": 3.2056, + "theoretical_loss": 4.065425596561606, + "tokens_seen": 355766272 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004505917753259779, + "loss": 3.2644, + "theoretical_loss": 4.065340164564556, + "tokens_seen": 355831808 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004505817452357071, + "loss": 3.181, + "theoretical_loss": 4.065254752705421, + "tokens_seen": 355897344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045057171514543634, + "loss": 3.3075, + "theoretical_loss": 4.065169360975747, + "tokens_seen": 355962880 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045056168505516547, + "loss": 3.3528, + "theoretical_loss": 4.065083989367086, + "tokens_seen": 356028416 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004505516549648947, + "loss": 3.2264, + "theoretical_loss": 4.064998637870994, + "tokens_seen": 356093952 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004505416248746239, + "loss": 3.2897, + "theoretical_loss": 4.064913306479032, + "tokens_seen": 356159488 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045053159478435306, + "loss": 3.2405, + "theoretical_loss": 4.0648279951827675, + "tokens_seen": 356225024 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045052156469408224, + "loss": 3.2738, + "theoretical_loss": 4.064742703973771, + "tokens_seen": 356290560 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004505115346038114, + "loss": 3.3109, + "theoretical_loss": 4.06465743284362, + "tokens_seen": 356356096 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004505015045135406, + "loss": 3.17, + "theoretical_loss": 4.0645721817838965, + "tokens_seen": 356421632 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045049147442326984, + "loss": 3.2613, + "theoretical_loss": 4.064486950786186, + "tokens_seen": 356487168 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045048144433299897, + "loss": 3.2144, + "theoretical_loss": 4.064401739842083, + "tokens_seen": 356552704 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004504714142427282, + "loss": 3.3645, + "theoretical_loss": 4.064316548943181, + "tokens_seen": 356618240 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045046138415245733, + "loss": 3.2873, + "theoretical_loss": 4.064231378081086, + "tokens_seen": 356683776 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045045135406218657, + "loss": 3.2065, + "theoretical_loss": 4.064146227247402, + "tokens_seen": 356749312 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004504413239719158, + "loss": 3.3478, + "theoretical_loss": 4.064061096433744, + "tokens_seen": 356814848 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045043129388164493, + "loss": 3.323, + "theoretical_loss": 4.063975985631727, + "tokens_seen": 356880384 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045042126379137416, + "loss": 3.357, + "theoretical_loss": 4.063890894832975, + "tokens_seen": 356945920 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045041123370110334, + "loss": 3.1951, + "theoretical_loss": 4.063805824029113, + "tokens_seen": 357011456 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004504012036108325, + "loss": 3.219, + "theoretical_loss": 4.0637207732117755, + "tokens_seen": 357076992 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 882726, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2413079738616943, + "objective/train/theoretical_loss": 4.0636357423725995, + "objective/train/tokens_used": 377602528, + "theoretical_loss": 4.0636357423725995, + "tokens_seen": 357142528 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004503911735205617, + "loss": 3.2991, + "theoretical_loss": 4.0636357423725995, + "tokens_seen": 357142528 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004503811434302909, + "loss": 3.2118, + "theoretical_loss": 4.063550731503227, + "tokens_seen": 357208064 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045037111334002007, + "loss": 3.3068, + "theoretical_loss": 4.063465740595306, + "tokens_seen": 357273600 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004503610832497493, + "loss": 3.2626, + "theoretical_loss": 4.063380769640489, + "tokens_seen": 357339136 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045035105315947843, + "loss": 3.26, + "theoretical_loss": 4.063295818630434, + "tokens_seen": 357404672 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045034102306920767, + "loss": 3.2673, + "theoretical_loss": 4.063210887556801, + "tokens_seen": 357470208 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004503309929789368, + "loss": 3.2733, + "theoretical_loss": 4.0631259764112615, + "tokens_seen": 357535744 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045032096288866603, + "loss": 3.2814, + "theoretical_loss": 4.0630410851854855, + "tokens_seen": 357601280 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004503109327983952, + "loss": 3.2548, + "theoretical_loss": 4.062956213871151, + "tokens_seen": 357666816 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004503009027081244, + "loss": 3.2853, + "theoretical_loss": 4.062871362459941, + "tokens_seen": 357732352 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045029087261785357, + "loss": 3.1874, + "theoretical_loss": 4.0627865309435425, + "tokens_seen": 357797888 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045028084252758275, + "loss": 3.3257, + "theoretical_loss": 4.062701719313649, + "tokens_seen": 357863424 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045027081243731193, + "loss": 3.3151, + "theoretical_loss": 4.062616927561957, + "tokens_seen": 357928960 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045026078234704117, + "loss": 3.3046, + "theoretical_loss": 4.06253215568017, + "tokens_seen": 357994496 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004502507522567703, + "loss": 3.2917, + "theoretical_loss": 4.062447403659995, + "tokens_seen": 358060032 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045024072216649953, + "loss": 3.3046, + "theoretical_loss": 4.062362671493145, + "tokens_seen": 358125568 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004502306920762287, + "loss": 3.341, + "theoretical_loss": 4.062277959171337, + "tokens_seen": 358191104 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004502206619859579, + "loss": 3.3305, + "theoretical_loss": 4.062193266686293, + "tokens_seen": 358256640 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004502106318956871, + "loss": 3.2074, + "theoretical_loss": 4.062108594029742, + "tokens_seen": 358322176 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045020060180541626, + "loss": 3.3608, + "theoretical_loss": 4.062023941193415, + "tokens_seen": 358387712 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045019057171514544, + "loss": 3.3545, + "theoretical_loss": 4.06193930816905, + "tokens_seen": 358453248 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004501805416248747, + "loss": 3.2173, + "theoretical_loss": 4.061854694948389, + "tokens_seen": 358518784 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004501705115346038, + "loss": 3.28, + "theoretical_loss": 4.061770101523179, + "tokens_seen": 358584320 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045016048144433304, + "loss": 3.2356, + "theoretical_loss": 4.061685527885173, + "tokens_seen": 358649856 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045015045135406216, + "loss": 3.3218, + "theoretical_loss": 4.061600974026128, + "tokens_seen": 358715392 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 886223, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.363246202468872, + "objective/train/theoretical_loss": 4.061516439937805, + "objective/train/tokens_used": 379240928, + "theoretical_loss": 4.061516439937805, + "tokens_seen": 358780928 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004501404212637914, + "loss": 3.2986, + "theoretical_loss": 4.061516439937805, + "tokens_seen": 358780928 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004501303911735206, + "loss": 3.3245, + "theoretical_loss": 4.061431925611973, + "tokens_seen": 358846464 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045012036108324976, + "loss": 3.2865, + "theoretical_loss": 4.061347431040401, + "tokens_seen": 358912000 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045011033099297894, + "loss": 3.3843, + "theoretical_loss": 4.061262956214868, + "tokens_seen": 358977536 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004501003009027081, + "loss": 3.1806, + "theoretical_loss": 4.061178501127157, + "tokens_seen": 359043072 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004500902708124373, + "loss": 3.288, + "theoretical_loss": 4.061094065769051, + "tokens_seen": 359108608 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045008024072216654, + "loss": 3.2797, + "theoretical_loss": 4.061009650132346, + "tokens_seen": 359174144 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045007021063189567, + "loss": 3.1899, + "theoretical_loss": 4.0609252542088345, + "tokens_seen": 359239680 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004500601805416249, + "loss": 3.2545, + "theoretical_loss": 4.060840877990321, + "tokens_seen": 359305216 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004500501504513541, + "loss": 3.2507, + "theoretical_loss": 4.06075652146861, + "tokens_seen": 359370752 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045004012036108326, + "loss": 3.327, + "theoretical_loss": 4.060672184635513, + "tokens_seen": 359436288 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045003009027081244, + "loss": 3.2323, + "theoretical_loss": 4.060587867482847, + "tokens_seen": 359501824 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004500200601805416, + "loss": 3.3521, + "theoretical_loss": 4.0605035700024334, + "tokens_seen": 359567360 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004500100300902708, + "loss": 3.2845, + "theoretical_loss": 4.060419292186096, + "tokens_seen": 359632896 + }, + { + "epoch": 4.01, + "learning_rate": 0.00045000000000000004, + "loss": 3.2916, + "theoretical_loss": 4.060335034025668, + "tokens_seen": 359698432 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044998996990972917, + "loss": 3.2782, + "theoretical_loss": 4.060250795512983, + "tokens_seen": 359763968 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004499799398194584, + "loss": 3.1751, + "theoretical_loss": 4.060166576639883, + "tokens_seen": 359829504 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044996990972918753, + "loss": 3.3368, + "theoretical_loss": 4.0600823773982135, + "tokens_seen": 359895040 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044995987963891677, + "loss": 3.301, + "theoretical_loss": 4.059998197779825, + "tokens_seen": 359960576 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044994984954864595, + "loss": 3.2918, + "theoretical_loss": 4.059914037776571, + "tokens_seen": 360026112 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044993981945837513, + "loss": 3.2888, + "theoretical_loss": 4.059829897380313, + "tokens_seen": 360091648 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004499297893681043, + "loss": 3.3133, + "theoretical_loss": 4.059745776582916, + "tokens_seen": 360157184 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044991975927783355, + "loss": 3.2845, + "theoretical_loss": 4.059661675376249, + "tokens_seen": 360222720 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044990972918756267, + "loss": 3.17, + "theoretical_loss": 4.059577593752187, + "tokens_seen": 360288256 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004498996990972919, + "loss": 3.3347, + "theoretical_loss": 4.059493531702609, + "tokens_seen": 360353792 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 889631, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3475241661071777, + "objective/train/theoretical_loss": 4.059409489219401, + "objective/train/tokens_used": 380879328, + "theoretical_loss": 4.059409489219401, + "tokens_seen": 360419328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044988966900702103, + "loss": 3.3637, + "theoretical_loss": 4.059409489219401, + "tokens_seen": 360419328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044987963891675027, + "loss": 3.2906, + "theoretical_loss": 4.05932546629445, + "tokens_seen": 360484864 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044986960882647945, + "loss": 3.3043, + "theoretical_loss": 4.05924146291965, + "tokens_seen": 360550400 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044985957873620863, + "loss": 3.3854, + "theoretical_loss": 4.0591574790869025, + "tokens_seen": 360615936 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004498495486459378, + "loss": 3.2447, + "theoretical_loss": 4.059073514788109, + "tokens_seen": 360681472 + }, + { + "epoch": 4.01, + "learning_rate": 0.000449839518555667, + "loss": 3.1937, + "theoretical_loss": 4.058989570015177, + "tokens_seen": 360747008 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004498294884653962, + "loss": 3.2748, + "theoretical_loss": 4.0589056447600225, + "tokens_seen": 360812544 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004498194583751254, + "loss": 3.303, + "theoretical_loss": 4.058821739014562, + "tokens_seen": 360878080 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044980942828485454, + "loss": 3.2729, + "theoretical_loss": 4.058737852770718, + "tokens_seen": 360943616 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044979939819458377, + "loss": 3.3502, + "theoretical_loss": 4.058653986020419, + "tokens_seen": 361009152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004497893681043129, + "loss": 3.3174, + "theoretical_loss": 4.058570138755598, + "tokens_seen": 361074688 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044977933801404213, + "loss": 3.2051, + "theoretical_loss": 4.05848631096819, + "tokens_seen": 361140224 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004497693079237713, + "loss": 3.2743, + "theoretical_loss": 4.058402502650141, + "tokens_seen": 361205760 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004497592778335005, + "loss": 3.3108, + "theoretical_loss": 4.058318713793395, + "tokens_seen": 361271296 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004497492477432297, + "loss": 3.2509, + "theoretical_loss": 4.058234944389905, + "tokens_seen": 361336832 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004497392176529589, + "loss": 3.3796, + "theoretical_loss": 4.058151194431626, + "tokens_seen": 361402368 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044972918756268804, + "loss": 3.1399, + "theoretical_loss": 4.058067463910521, + "tokens_seen": 361467904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004497191574724173, + "loss": 3.1163, + "theoretical_loss": 4.057983752818556, + "tokens_seen": 361533440 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004497091273821464, + "loss": 3.2255, + "theoretical_loss": 4.0579000611477, + "tokens_seen": 361598976 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044969909729187564, + "loss": 3.3241, + "theoretical_loss": 4.057816388889931, + "tokens_seen": 361664512 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004496890672016049, + "loss": 3.2396, + "theoretical_loss": 4.057732736037228, + "tokens_seen": 361730048 + }, + { + "epoch": 4.01, + "learning_rate": 0.000449679037111334, + "loss": 3.3226, + "theoretical_loss": 4.057649102581577, + "tokens_seen": 361795584 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044966900702106324, + "loss": 3.3037, + "theoretical_loss": 4.057565488514966, + "tokens_seen": 361861120 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044965897693079236, + "loss": 3.3059, + "theoretical_loss": 4.057481893829392, + "tokens_seen": 361926656 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004496489468405216, + "loss": 3.3187, + "theoretical_loss": 4.057398318516853, + "tokens_seen": 361992192 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 894341, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.271829605102539, + "objective/train/theoretical_loss": 4.057314762569354, + "objective/train/tokens_used": 382517728, + "theoretical_loss": 4.057314762569354, + "tokens_seen": 362057728 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004496389167502508, + "loss": 3.3641, + "theoretical_loss": 4.057314762569354, + "tokens_seen": 362057728 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044962888665997996, + "loss": 3.2336, + "theoretical_loss": 4.057231225978904, + "tokens_seen": 362123264 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044961885656970914, + "loss": 3.3603, + "theoretical_loss": 4.057147708737515, + "tokens_seen": 362188800 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004496088264794383, + "loss": 3.3011, + "theoretical_loss": 4.057064210837208, + "tokens_seen": 362254336 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004495987963891675, + "loss": 3.2017, + "theoretical_loss": 4.056980732270004, + "tokens_seen": 362319872 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044958876629889674, + "loss": 3.3358, + "theoretical_loss": 4.056897273027932, + "tokens_seen": 362385408 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044957873620862587, + "loss": 3.263, + "theoretical_loss": 4.056813833103025, + "tokens_seen": 362450944 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004495687061183551, + "loss": 3.1561, + "theoretical_loss": 4.056730412487319, + "tokens_seen": 362516480 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004495586760280843, + "loss": 3.2369, + "theoretical_loss": 4.056647011172856, + "tokens_seen": 362582016 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044954864593781346, + "loss": 3.3313, + "theoretical_loss": 4.056563629151684, + "tokens_seen": 362647552 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044953861584754264, + "loss": 3.2746, + "theoretical_loss": 4.056480266415854, + "tokens_seen": 362713088 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004495285857572718, + "loss": 3.2169, + "theoretical_loss": 4.056396922957423, + "tokens_seen": 362778624 + }, + { + "epoch": 4.01, + "learning_rate": 0.000449518555667001, + "loss": 3.1915, + "theoretical_loss": 4.056313598768449, + "tokens_seen": 362844160 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044950852557673024, + "loss": 3.288, + "theoretical_loss": 4.0562302938410015, + "tokens_seen": 362909696 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044949849548645937, + "loss": 3.2296, + "theoretical_loss": 4.056147008167148, + "tokens_seen": 362975232 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004494884653961886, + "loss": 3.2625, + "theoretical_loss": 4.056063741738965, + "tokens_seen": 363040768 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044947843530591773, + "loss": 3.3203, + "theoretical_loss": 4.055980494548532, + "tokens_seen": 363106304 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044946840521564697, + "loss": 3.3332, + "theoretical_loss": 4.055897266587932, + "tokens_seen": 363171840 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044945837512537615, + "loss": 3.3325, + "theoretical_loss": 4.055814057849255, + "tokens_seen": 363237376 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044944834503510533, + "loss": 3.2858, + "theoretical_loss": 4.055730868324595, + "tokens_seen": 363302912 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004494383149448345, + "loss": 3.2744, + "theoretical_loss": 4.055647698006051, + "tokens_seen": 363368448 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044942828485456375, + "loss": 3.1819, + "theoretical_loss": 4.055564546885725, + "tokens_seen": 363433984 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044941825476429287, + "loss": 3.2828, + "theoretical_loss": 4.055481414955724, + "tokens_seen": 363499520 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004494082246740221, + "loss": 3.197, + "theoretical_loss": 4.055398302208163, + "tokens_seen": 363565056 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044939819458375123, + "loss": 3.1428, + "theoretical_loss": 4.0553152086351565, + "tokens_seen": 363630592 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 897281, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0730271339416504, + "objective/train/theoretical_loss": 4.055232134228827, + "objective/train/tokens_used": 384156128, + "theoretical_loss": 4.055232134228827, + "tokens_seen": 363696128 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044938816449348047, + "loss": 3.3135, + "theoretical_loss": 4.055232134228827, + "tokens_seen": 363696128 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044937813440320965, + "loss": 3.2145, + "theoretical_loss": 4.055149078981303, + "tokens_seen": 363761664 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044936810431293883, + "loss": 3.2846, + "theoretical_loss": 4.055066042884713, + "tokens_seen": 363827200 + }, + { + "epoch": 4.01, + "learning_rate": 0.000449358074222668, + "loss": 3.1584, + "theoretical_loss": 4.054983025931193, + "tokens_seen": 363892736 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004493480441323972, + "loss": 3.3398, + "theoretical_loss": 4.054900028112884, + "tokens_seen": 363958272 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004493380140421264, + "loss": 3.2893, + "theoretical_loss": 4.054817049421931, + "tokens_seen": 364023808 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004493279839518556, + "loss": 3.1768, + "theoretical_loss": 4.054734089850484, + "tokens_seen": 364089344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044931795386158474, + "loss": 3.2472, + "theoretical_loss": 4.054651149390697, + "tokens_seen": 364154880 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044930792377131397, + "loss": 3.2644, + "theoretical_loss": 4.054568228034729, + "tokens_seen": 364220416 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004492978936810431, + "loss": 3.2581, + "theoretical_loss": 4.054485325774742, + "tokens_seen": 364285952 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044928786359077234, + "loss": 3.223, + "theoretical_loss": 4.054402442602905, + "tokens_seen": 364351488 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004492778335005015, + "loss": 3.3706, + "theoretical_loss": 4.054319578511392, + "tokens_seen": 364417024 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004492678034102307, + "loss": 3.3189, + "theoretical_loss": 4.05423673349238, + "tokens_seen": 364482560 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004492577733199599, + "loss": 3.158, + "theoretical_loss": 4.0541539075380495, + "tokens_seen": 364548096 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004492477432296891, + "loss": 3.2799, + "theoretical_loss": 4.054071100640589, + "tokens_seen": 364613632 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044923771313941824, + "loss": 3.3496, + "theoretical_loss": 4.053988312792189, + "tokens_seen": 364679168 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004492276830491475, + "loss": 3.2205, + "theoretical_loss": 4.053905543985044, + "tokens_seen": 364744704 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004492176529588766, + "loss": 3.3568, + "theoretical_loss": 4.0538227942113565, + "tokens_seen": 364810240 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044920762286860584, + "loss": 3.2822, + "theoretical_loss": 4.05374006346333, + "tokens_seen": 364875776 + }, + { + "epoch": 4.01, + "learning_rate": 0.000449197592778335, + "loss": 3.3517, + "theoretical_loss": 4.053657351733175, + "tokens_seen": 364941312 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004491875626880642, + "loss": 3.2644, + "theoretical_loss": 4.053574659013106, + "tokens_seen": 365006848 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004491775325977934, + "loss": 3.2801, + "theoretical_loss": 4.053491985295341, + "tokens_seen": 365072384 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044916750250752256, + "loss": 3.2568, + "theoretical_loss": 4.053409330572102, + "tokens_seen": 365137920 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044915747241725174, + "loss": 3.3061, + "theoretical_loss": 4.05332669483562, + "tokens_seen": 365203456 + }, + { + "epoch": 4.01, + "learning_rate": 0.000449147442326981, + "loss": 3.3352, + "theoretical_loss": 4.053244078078126, + "tokens_seen": 365268992 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 900914, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2744319438934326, + "objective/train/theoretical_loss": 4.053161480291855, + "objective/train/tokens_used": 385794528, + "theoretical_loss": 4.053161480291855, + "tokens_seen": 365334528 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004491374122367101, + "loss": 3.238, + "theoretical_loss": 4.053161480291855, + "tokens_seen": 365334528 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044912738214643934, + "loss": 3.1988, + "theoretical_loss": 4.053078901469053, + "tokens_seen": 365400064 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044911735205616847, + "loss": 3.2358, + "theoretical_loss": 4.052996341601963, + "tokens_seen": 365465600 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004491073219658977, + "loss": 3.3214, + "theoretical_loss": 4.052913800682838, + "tokens_seen": 365531136 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004490972918756269, + "loss": 3.3133, + "theoretical_loss": 4.0528312787039305, + "tokens_seen": 365596672 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044908726178535607, + "loss": 3.1812, + "theoretical_loss": 4.052748775657502, + "tokens_seen": 365662208 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044907723169508525, + "loss": 3.2983, + "theoretical_loss": 4.052666291535816, + "tokens_seen": 365727744 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004490672016048145, + "loss": 3.3073, + "theoretical_loss": 4.052583826331144, + "tokens_seen": 365793280 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004490571715145436, + "loss": 3.2196, + "theoretical_loss": 4.052501380035757, + "tokens_seen": 365858816 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044904714142427284, + "loss": 3.3301, + "theoretical_loss": 4.052418952641934, + "tokens_seen": 365924352 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044903711133400197, + "loss": 3.2283, + "theoretical_loss": 4.052336544141957, + "tokens_seen": 365989888 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004490270812437312, + "loss": 3.2348, + "theoretical_loss": 4.052254154528114, + "tokens_seen": 366055424 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004490170511534604, + "loss": 3.3075, + "theoretical_loss": 4.052171783792696, + "tokens_seen": 366120960 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044900702106318957, + "loss": 3.3095, + "theoretical_loss": 4.052089431928, + "tokens_seen": 366186496 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044899699097291875, + "loss": 3.256, + "theoretical_loss": 4.0520070989263255, + "tokens_seen": 366252032 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044898696088264793, + "loss": 3.2836, + "theoretical_loss": 4.051924784779978, + "tokens_seen": 366317568 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004489769307923771, + "loss": 3.2975, + "theoretical_loss": 4.051842489481269, + "tokens_seen": 366383104 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044896690070210635, + "loss": 3.3802, + "theoretical_loss": 4.05176021302251, + "tokens_seen": 366448640 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004489568706118355, + "loss": 3.3353, + "theoretical_loss": 4.051677955396022, + "tokens_seen": 366514176 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004489468405215647, + "loss": 3.2797, + "theoretical_loss": 4.051595716594126, + "tokens_seen": 366579712 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044893681043129395, + "loss": 3.369, + "theoretical_loss": 4.051513496609152, + "tokens_seen": 366645248 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044892678034102307, + "loss": 3.3452, + "theoretical_loss": 4.051431295433431, + "tokens_seen": 366710784 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004489167502507523, + "loss": 3.1608, + "theoretical_loss": 4.0513491130593, + "tokens_seen": 366776320 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044890672016048143, + "loss": 3.2557, + "theoretical_loss": 4.051266949479101, + "tokens_seen": 366841856 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044889669007021067, + "loss": 3.3129, + "theoretical_loss": 4.051184804685178, + "tokens_seen": 366907392 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 905854, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.288435697555542, + "objective/train/theoretical_loss": 4.051102678669883, + "objective/train/tokens_used": 387432928, + "theoretical_loss": 4.051102678669883, + "tokens_seen": 366972928 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044888665997993985, + "loss": 3.3232, + "theoretical_loss": 4.051102678669883, + "tokens_seen": 366972928 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044887662988966903, + "loss": 3.2797, + "theoretical_loss": 4.051020571425569, + "tokens_seen": 367038464 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004488665997993982, + "loss": 3.2938, + "theoretical_loss": 4.050938482944598, + "tokens_seen": 367104000 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004488565697091274, + "loss": 3.2875, + "theoretical_loss": 4.05085641321933, + "tokens_seen": 367169536 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004488465396188566, + "loss": 3.2739, + "theoretical_loss": 4.0507743622421355, + "tokens_seen": 367235072 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004488365095285858, + "loss": 3.2601, + "theoretical_loss": 4.050692330005386, + "tokens_seen": 367300608 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044882647943831494, + "loss": 3.2262, + "theoretical_loss": 4.05061031650146, + "tokens_seen": 367366144 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004488164493480442, + "loss": 3.2389, + "theoretical_loss": 4.050528321722737, + "tokens_seen": 367431680 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004488064192577733, + "loss": 3.3609, + "theoretical_loss": 4.050446345661605, + "tokens_seen": 367497216 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044879638916750254, + "loss": 3.2839, + "theoretical_loss": 4.050364388310452, + "tokens_seen": 367562752 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004487863590772317, + "loss": 3.3147, + "theoretical_loss": 4.050282449661675, + "tokens_seen": 367628288 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004487763289869609, + "loss": 3.329, + "theoretical_loss": 4.0502005297076735, + "tokens_seen": 367693824 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004487662988966901, + "loss": 3.2503, + "theoretical_loss": 4.050118628440849, + "tokens_seen": 367759360 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004487562688064193, + "loss": 3.2159, + "theoretical_loss": 4.050036745853611, + "tokens_seen": 367824896 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044874623871614844, + "loss": 3.2092, + "theoretical_loss": 4.049954881938373, + "tokens_seen": 367890432 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004487362086258777, + "loss": 3.2995, + "theoretical_loss": 4.049873036687551, + "tokens_seen": 367955968 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004487261785356068, + "loss": 3.2499, + "theoretical_loss": 4.049791210093566, + "tokens_seen": 368021504 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044871614844533604, + "loss": 3.222, + "theoretical_loss": 4.049709402148845, + "tokens_seen": 368087040 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004487061183550652, + "loss": 3.1177, + "theoretical_loss": 4.049627612845818, + "tokens_seen": 368152576 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004486960882647944, + "loss": 3.2401, + "theoretical_loss": 4.049545842176919, + "tokens_seen": 368218112 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004486860581745236, + "loss": 3.3567, + "theoretical_loss": 4.049464090134588, + "tokens_seen": 368283648 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044867602808425276, + "loss": 3.2273, + "theoretical_loss": 4.049382356711269, + "tokens_seen": 368349184 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044866599799398194, + "loss": 3.3656, + "theoretical_loss": 4.049300641899409, + "tokens_seen": 368414720 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004486559679037112, + "loss": 3.2398, + "theoretical_loss": 4.049218945691461, + "tokens_seen": 368480256 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004486459378134403, + "loss": 3.276, + "theoretical_loss": 4.049137268079882, + "tokens_seen": 368545792 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 908908, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.310276508331299, + "objective/train/theoretical_loss": 4.049055609057133, + "objective/train/tokens_used": 389071328, + "theoretical_loss": 4.049055609057133, + "tokens_seen": 368611328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044863590772316954, + "loss": 3.2338, + "theoretical_loss": 4.049055609057133, + "tokens_seen": 368611328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044862587763289867, + "loss": 3.2917, + "theoretical_loss": 4.0489739686156785, + "tokens_seen": 368676864 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004486158475426279, + "loss": 3.2765, + "theoretical_loss": 4.04889234674799, + "tokens_seen": 368742400 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004486058174523571, + "loss": 3.2553, + "theoretical_loss": 4.048810743446541, + "tokens_seen": 368807936 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044859578736208627, + "loss": 3.2416, + "theoretical_loss": 4.048729158703811, + "tokens_seen": 368873472 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044858575727181545, + "loss": 3.1814, + "theoretical_loss": 4.048647592512282, + "tokens_seen": 368939008 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004485757271815447, + "loss": 3.2482, + "theoretical_loss": 4.048566044864443, + "tokens_seen": 369004544 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004485656970912738, + "loss": 3.3919, + "theoretical_loss": 4.048484515752785, + "tokens_seen": 369070080 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044855566700100304, + "loss": 3.2895, + "theoretical_loss": 4.048403005169805, + "tokens_seen": 369135616 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044854563691073217, + "loss": 3.3462, + "theoretical_loss": 4.048321513108002, + "tokens_seen": 369201152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004485356068204614, + "loss": 3.2483, + "theoretical_loss": 4.048240039559882, + "tokens_seen": 369266688 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004485255767301906, + "loss": 3.1507, + "theoretical_loss": 4.048158584517955, + "tokens_seen": 369332224 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044851554663991977, + "loss": 3.3375, + "theoretical_loss": 4.048077147974735, + "tokens_seen": 369397760 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044850551654964895, + "loss": 3.2785, + "theoretical_loss": 4.047995729922738, + "tokens_seen": 369463296 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044849548645937813, + "loss": 3.2296, + "theoretical_loss": 4.047914330354488, + "tokens_seen": 369528832 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004484854563691073, + "loss": 3.262, + "theoretical_loss": 4.047832949262512, + "tokens_seen": 369594368 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044847542627883655, + "loss": 3.3501, + "theoretical_loss": 4.0477515866393405, + "tokens_seen": 369659904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004484653961885657, + "loss": 3.109, + "theoretical_loss": 4.047670242477508, + "tokens_seen": 369725440 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004484553660982949, + "loss": 3.2621, + "theoretical_loss": 4.047588916769557, + "tokens_seen": 369790976 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044844533600802404, + "loss": 3.2331, + "theoretical_loss": 4.04750760950803, + "tokens_seen": 369856512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044843530591775327, + "loss": 3.2698, + "theoretical_loss": 4.047426320685475, + "tokens_seen": 369922048 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044842527582748245, + "loss": 3.1773, + "theoretical_loss": 4.047345050294446, + "tokens_seen": 369987584 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044841524573721163, + "loss": 3.3285, + "theoretical_loss": 4.047263798327499, + "tokens_seen": 370053120 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004484052156469408, + "loss": 3.1599, + "theoretical_loss": 4.047182564777196, + "tokens_seen": 370118656 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044839518555667005, + "loss": 3.2508, + "theoretical_loss": 4.047101349636103, + "tokens_seen": 370184192 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 913676, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2894537448883057, + "objective/train/theoretical_loss": 4.04702015289679, + "objective/train/tokens_used": 390709728, + "theoretical_loss": 4.04702015289679, + "tokens_seen": 370249728 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004483851554663992, + "loss": 3.2831, + "theoretical_loss": 4.04702015289679, + "tokens_seen": 370249728 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004483751253761284, + "loss": 3.2096, + "theoretical_loss": 4.046938974551831, + "tokens_seen": 370315264 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044836509528585754, + "loss": 3.3772, + "theoretical_loss": 4.046857814593805, + "tokens_seen": 370380800 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004483550651955868, + "loss": 3.3221, + "theoretical_loss": 4.046776673015295, + "tokens_seen": 370446336 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044834503510531596, + "loss": 3.2285, + "theoretical_loss": 4.046695549808889, + "tokens_seen": 370511872 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044833500501504514, + "loss": 3.2832, + "theoretical_loss": 4.046614444967178, + "tokens_seen": 370577408 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004483249749247743, + "loss": 3.212, + "theoretical_loss": 4.046533358482757, + "tokens_seen": 370642944 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004483149448345035, + "loss": 3.2562, + "theoretical_loss": 4.0464522903482285, + "tokens_seen": 370708480 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004483049147442327, + "loss": 3.2401, + "theoretical_loss": 4.046371240556195, + "tokens_seen": 370774016 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004482948846539619, + "loss": 3.4092, + "theoretical_loss": 4.046290209099267, + "tokens_seen": 370839552 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044828485456369104, + "loss": 3.2464, + "theoretical_loss": 4.0462091959700555, + "tokens_seen": 370905088 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004482748244734203, + "loss": 3.3093, + "theoretical_loss": 4.046128201161179, + "tokens_seen": 370970624 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004482647943831494, + "loss": 3.2203, + "theoretical_loss": 4.0460472246652595, + "tokens_seen": 371036160 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044825476429287864, + "loss": 3.1614, + "theoretical_loss": 4.045966266474922, + "tokens_seen": 371101696 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004482447342026078, + "loss": 3.3201, + "theoretical_loss": 4.045885326582798, + "tokens_seen": 371167232 + }, + { + "epoch": 4.01, + "learning_rate": 0.000448234704112337, + "loss": 3.2511, + "theoretical_loss": 4.045804404981521, + "tokens_seen": 371232768 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004482246740220662, + "loss": 3.2597, + "theoretical_loss": 4.04572350166373, + "tokens_seen": 371298304 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004482146439317954, + "loss": 3.271, + "theoretical_loss": 4.045642616622067, + "tokens_seen": 371363840 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044820461384152455, + "loss": 3.3022, + "theoretical_loss": 4.045561749849181, + "tokens_seen": 371429376 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004481945837512538, + "loss": 3.3362, + "theoretical_loss": 4.045480901337722, + "tokens_seen": 371494912 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044818455366098296, + "loss": 3.3299, + "theoretical_loss": 4.045400071080347, + "tokens_seen": 371560448 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044817452357071214, + "loss": 3.2125, + "theoretical_loss": 4.045319259069715, + "tokens_seen": 371625984 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004481644934804414, + "loss": 3.2021, + "theoretical_loss": 4.045238465298491, + "tokens_seen": 371691520 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004481544633901705, + "loss": 3.3085, + "theoretical_loss": 4.045157689759343, + "tokens_seen": 371757056 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044814443329989974, + "loss": 3.1857, + "theoretical_loss": 4.045076932444943, + "tokens_seen": 371822592 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 916622, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.277189016342163, + "objective/train/theoretical_loss": 4.04499619334797, + "objective/train/tokens_used": 392348128, + "theoretical_loss": 4.04499619334797, + "tokens_seen": 371888128 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044813440320962887, + "loss": 3.2697, + "theoretical_loss": 4.04499619334797, + "tokens_seen": 371888128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004481243731193581, + "loss": 3.205, + "theoretical_loss": 4.044915472461104, + "tokens_seen": 371953664 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004481143430290873, + "loss": 3.2641, + "theoretical_loss": 4.0448347697770295, + "tokens_seen": 372019200 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044810431293881647, + "loss": 3.2551, + "theoretical_loss": 4.044754085288437, + "tokens_seen": 372084736 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044809428284854565, + "loss": 3.3388, + "theoretical_loss": 4.044673418988021, + "tokens_seen": 372150272 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004480842527582749, + "loss": 3.2913, + "theoretical_loss": 4.044592770868478, + "tokens_seen": 372215808 + }, + { + "epoch": 4.01, + "learning_rate": 0.000448074222668004, + "loss": 3.2311, + "theoretical_loss": 4.044512140922512, + "tokens_seen": 372281344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044806419257773324, + "loss": 3.3541, + "theoretical_loss": 4.044431529142829, + "tokens_seen": 372346880 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044805416248746237, + "loss": 3.2014, + "theoretical_loss": 4.044350935522139, + "tokens_seen": 372412416 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004480441323971916, + "loss": 3.2107, + "theoretical_loss": 4.044270360053158, + "tokens_seen": 372477952 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004480341023069208, + "loss": 3.3043, + "theoretical_loss": 4.044189802728605, + "tokens_seen": 372543488 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044802407221664997, + "loss": 3.2336, + "theoretical_loss": 4.044109263541202, + "tokens_seen": 372609024 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044801404212637915, + "loss": 3.1751, + "theoretical_loss": 4.044028742483678, + "tokens_seen": 372674560 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044800401203610833, + "loss": 3.2689, + "theoretical_loss": 4.043948239548763, + "tokens_seen": 372740096 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004479939819458375, + "loss": 3.1695, + "theoretical_loss": 4.043867754729196, + "tokens_seen": 372805632 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044798395185556675, + "loss": 3.3237, + "theoretical_loss": 4.043787288017715, + "tokens_seen": 372871168 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004479739217652959, + "loss": 3.2516, + "theoretical_loss": 4.043706839407063, + "tokens_seen": 372936704 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004479638916750251, + "loss": 3.289, + "theoretical_loss": 4.043626408889991, + "tokens_seen": 373002240 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044795386158475424, + "loss": 3.2889, + "theoretical_loss": 4.043545996459251, + "tokens_seen": 373067776 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044794383149448347, + "loss": 3.2828, + "theoretical_loss": 4.043465602107599, + "tokens_seen": 373133312 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044793380140421265, + "loss": 3.3363, + "theoretical_loss": 4.043385225827796, + "tokens_seen": 373198848 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044792377131394183, + "loss": 3.1961, + "theoretical_loss": 4.043304867612608, + "tokens_seen": 373264384 + }, + { + "epoch": 4.01, + "learning_rate": 0.000447913741223671, + "loss": 3.2544, + "theoretical_loss": 4.043224527454805, + "tokens_seen": 373329920 + }, + { + "epoch": 4.01, + "learning_rate": 0.00044790371113340025, + "loss": 3.2727, + "theoretical_loss": 4.043144205347159, + "tokens_seen": 373395456 + }, + { + "epoch": 4.01, + "learning_rate": 0.0004478936810431294, + "loss": 3.1993, + "theoretical_loss": 4.043063901282447, + "tokens_seen": 373460992 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 920534, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3052444458007812, + "objective/train/theoretical_loss": 4.042983615253453, + "objective/train/tokens_used": 393986528, + "theoretical_loss": 4.042983615253453, + "tokens_seen": 373526528 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004478836509528586, + "loss": 3.3101, + "theoretical_loss": 4.042983615253453, + "tokens_seen": 373526528 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044787362086258774, + "loss": 3.2399, + "theoretical_loss": 4.042903347252961, + "tokens_seen": 373592064 + }, + { + "epoch": 4.02, + "learning_rate": 0.000447863590772317, + "loss": 3.204, + "theoretical_loss": 4.0428230972737635, + "tokens_seen": 373657600 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044785356068204616, + "loss": 3.2392, + "theoretical_loss": 4.042742865308653, + "tokens_seen": 373723136 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044784353059177534, + "loss": 3.1516, + "theoretical_loss": 4.042662651350428, + "tokens_seen": 373788672 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004478335005015045, + "loss": 3.2899, + "theoretical_loss": 4.042582455391891, + "tokens_seen": 373854208 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004478234704112337, + "loss": 3.2852, + "theoretical_loss": 4.042502277425849, + "tokens_seen": 373919744 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004478134403209629, + "loss": 3.2785, + "theoretical_loss": 4.042422117445113, + "tokens_seen": 373985280 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004478034102306921, + "loss": 3.2641, + "theoretical_loss": 4.042341975442498, + "tokens_seen": 374050816 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044779338014042124, + "loss": 3.2496, + "theoretical_loss": 4.042261851410823, + "tokens_seen": 374116352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004477833500501505, + "loss": 3.1378, + "theoretical_loss": 4.04218174534291, + "tokens_seen": 374181888 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004477733199598796, + "loss": 3.1857, + "theoretical_loss": 4.042101657231588, + "tokens_seen": 374247424 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044776328986960884, + "loss": 3.2096, + "theoretical_loss": 4.042021587069688, + "tokens_seen": 374312960 + }, + { + "epoch": 4.02, + "learning_rate": 0.000447753259779338, + "loss": 3.2544, + "theoretical_loss": 4.041941534850046, + "tokens_seen": 374378496 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004477432296890672, + "loss": 3.2591, + "theoretical_loss": 4.041861500565499, + "tokens_seen": 374444032 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004477331995987964, + "loss": 3.2745, + "theoretical_loss": 4.041781484208895, + "tokens_seen": 374509568 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004477231695085256, + "loss": 3.3391, + "theoretical_loss": 4.041701485773079, + "tokens_seen": 374575104 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044771313941825475, + "loss": 3.3008, + "theoretical_loss": 4.0416215052509035, + "tokens_seen": 374640640 + }, + { + "epoch": 4.02, + "learning_rate": 0.000447703109327984, + "loss": 3.3379, + "theoretical_loss": 4.041541542635226, + "tokens_seen": 374706176 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004476930792377131, + "loss": 3.2382, + "theoretical_loss": 4.041461597918905, + "tokens_seen": 374771712 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044768304914744234, + "loss": 3.3025, + "theoretical_loss": 4.041381671094805, + "tokens_seen": 374837248 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004476730190571715, + "loss": 3.2276, + "theoretical_loss": 4.041301762155795, + "tokens_seen": 374902784 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004476629889669007, + "loss": 3.2388, + "theoretical_loss": 4.041221871094747, + "tokens_seen": 374968320 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004476529588766299, + "loss": 3.3319, + "theoretical_loss": 4.041141997904538, + "tokens_seen": 375033856 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044764292878635907, + "loss": 3.2258, + "theoretical_loss": 4.041062142578049, + "tokens_seen": 375099392 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 925043, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1430866718292236, + "objective/train/theoretical_loss": 4.040982305108164, + "objective/train/tokens_used": 395624928, + "theoretical_loss": 4.040982305108164, + "tokens_seen": 375164928 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044763289869608825, + "loss": 3.2574, + "theoretical_loss": 4.040982305108164, + "tokens_seen": 375164928 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004476228686058175, + "loss": 3.3149, + "theoretical_loss": 4.040902485487772, + "tokens_seen": 375230464 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004476128385155466, + "loss": 3.2851, + "theoretical_loss": 4.040822683709765, + "tokens_seen": 375296000 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044760280842527585, + "loss": 3.2772, + "theoretical_loss": 4.040742899767041, + "tokens_seen": 375361536 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044759277833500503, + "loss": 3.2773, + "theoretical_loss": 4.040663133652502, + "tokens_seen": 375427072 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004475827482447342, + "loss": 3.2682, + "theoretical_loss": 4.040583385359051, + "tokens_seen": 375492608 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004475727181544634, + "loss": 3.1755, + "theoretical_loss": 4.040503654879598, + "tokens_seen": 375558144 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044756268806419257, + "loss": 3.2995, + "theoretical_loss": 4.040423942207056, + "tokens_seen": 375623680 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044755265797392175, + "loss": 3.1801, + "theoretical_loss": 4.040344247334343, + "tokens_seen": 375689216 + }, + { + "epoch": 4.02, + "learning_rate": 0.000447542627883651, + "loss": 3.2179, + "theoretical_loss": 4.04026457025438, + "tokens_seen": 375754752 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004475325977933801, + "loss": 3.0811, + "theoretical_loss": 4.040184910960091, + "tokens_seen": 375820288 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044752256770310935, + "loss": 3.3103, + "theoretical_loss": 4.040105269444408, + "tokens_seen": 375885824 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004475125376128385, + "loss": 3.2202, + "theoretical_loss": 4.040025645700262, + "tokens_seen": 375951360 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004475025075225677, + "loss": 3.2858, + "theoretical_loss": 4.039946039720592, + "tokens_seen": 376016896 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004474924774322969, + "loss": 3.191, + "theoretical_loss": 4.039866451498339, + "tokens_seen": 376082432 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004474824473420261, + "loss": 3.2673, + "theoretical_loss": 4.039786881026448, + "tokens_seen": 376147968 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044747241725175526, + "loss": 3.2223, + "theoretical_loss": 4.039707328297869, + "tokens_seen": 376213504 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044746238716148444, + "loss": 3.1669, + "theoretical_loss": 4.039627793305556, + "tokens_seen": 376279040 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004474523570712136, + "loss": 3.292, + "theoretical_loss": 4.039548276042466, + "tokens_seen": 376344576 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044744232698094285, + "loss": 3.2698, + "theoretical_loss": 4.039468776501561, + "tokens_seen": 376410112 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044743229689067204, + "loss": 3.3125, + "theoretical_loss": 4.039389294675807, + "tokens_seen": 376475648 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004474222668004012, + "loss": 3.2064, + "theoretical_loss": 4.039309830558174, + "tokens_seen": 376541184 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044741223671013045, + "loss": 3.2138, + "theoretical_loss": 4.039230384141634, + "tokens_seen": 376606720 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004474022066198596, + "loss": 3.2613, + "theoretical_loss": 4.039150955419166, + "tokens_seen": 376672256 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004473921765295888, + "loss": 3.2237, + "theoretical_loss": 4.039071544383752, + "tokens_seen": 376737792 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 928311, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0586836338043213, + "objective/train/theoretical_loss": 4.038992151028377, + "objective/train/tokens_used": 397263328, + "theoretical_loss": 4.038992151028377, + "tokens_seen": 376803328 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044738214643931794, + "loss": 3.1577, + "theoretical_loss": 4.038992151028377, + "tokens_seen": 376803328 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004473721163490472, + "loss": 3.2695, + "theoretical_loss": 4.038912775346031, + "tokens_seen": 376868864 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044736208625877636, + "loss": 3.2627, + "theoretical_loss": 4.038833417329707, + "tokens_seen": 376934400 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044735205616850554, + "loss": 3.2657, + "theoretical_loss": 4.038754076972404, + "tokens_seen": 376999936 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004473420260782347, + "loss": 3.0816, + "theoretical_loss": 4.038674754267124, + "tokens_seen": 377065472 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004473319959879639, + "loss": 3.2867, + "theoretical_loss": 4.038595449206871, + "tokens_seen": 377131008 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004473219658976931, + "loss": 3.3238, + "theoretical_loss": 4.038516161784655, + "tokens_seen": 377196544 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004473119358074223, + "loss": 3.333, + "theoretical_loss": 4.0384368919934905, + "tokens_seen": 377262080 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044730190571715144, + "loss": 3.3022, + "theoretical_loss": 4.038357639826395, + "tokens_seen": 377327616 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004472918756268807, + "loss": 3.3477, + "theoretical_loss": 4.03827840527639, + "tokens_seen": 377393152 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004472818455366098, + "loss": 3.3, + "theoretical_loss": 4.038199188336501, + "tokens_seen": 377458688 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044727181544633904, + "loss": 3.1931, + "theoretical_loss": 4.038119988999758, + "tokens_seen": 377524224 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004472617853560682, + "loss": 3.242, + "theoretical_loss": 4.038040807259193, + "tokens_seen": 377589760 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004472517552657974, + "loss": 3.2026, + "theoretical_loss": 4.037961643107845, + "tokens_seen": 377655296 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004472417251755266, + "loss": 3.2666, + "theoretical_loss": 4.0378824965387565, + "tokens_seen": 377720832 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004472316950852558, + "loss": 3.1859, + "theoretical_loss": 4.037803367544971, + "tokens_seen": 377786368 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044722166499498495, + "loss": 3.1544, + "theoretical_loss": 4.037724256119537, + "tokens_seen": 377851904 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004472116349047142, + "loss": 3.2758, + "theoretical_loss": 4.037645162255511, + "tokens_seen": 377917440 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004472016048144433, + "loss": 3.1428, + "theoretical_loss": 4.037566085945948, + "tokens_seen": 377982976 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044719157472417254, + "loss": 3.2325, + "theoretical_loss": 4.03748702718391, + "tokens_seen": 378048512 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004471815446339017, + "loss": 3.2295, + "theoretical_loss": 4.037407985962462, + "tokens_seen": 378114048 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004471715145436309, + "loss": 3.3105, + "theoretical_loss": 4.037328962274673, + "tokens_seen": 378179584 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004471614844533601, + "loss": 3.2556, + "theoretical_loss": 4.037249956113616, + "tokens_seen": 378245120 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044715145436308927, + "loss": 3.2881, + "theoretical_loss": 4.037170967472369, + "tokens_seen": 378310656 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044714142427281845, + "loss": 3.2335, + "theoretical_loss": 4.037091996344011, + "tokens_seen": 378376192 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 933097, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9963252544403076, + "objective/train/theoretical_loss": 4.037013042721629, + "objective/train/tokens_used": 398901728, + "theoretical_loss": 4.037013042721629, + "tokens_seen": 378441728 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004471313941825477, + "loss": 3.1708, + "theoretical_loss": 4.037013042721629, + "tokens_seen": 378441728 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004471213640922768, + "loss": 3.3436, + "theoretical_loss": 4.03693410659831, + "tokens_seen": 378507264 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044711133400200605, + "loss": 3.2466, + "theoretical_loss": 4.036855187967149, + "tokens_seen": 378572800 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044710130391173523, + "loss": 3.2507, + "theoretical_loss": 4.036776286821239, + "tokens_seen": 378638336 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004470912738214644, + "loss": 3.3125, + "theoretical_loss": 4.036697403153685, + "tokens_seen": 378703872 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004470812437311936, + "loss": 3.341, + "theoretical_loss": 4.0366185369575875, + "tokens_seen": 378769408 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044707121364092277, + "loss": 3.2724, + "theoretical_loss": 4.036539688226057, + "tokens_seen": 378834944 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044706118355065195, + "loss": 3.3381, + "theoretical_loss": 4.036460856952205, + "tokens_seen": 378900480 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004470511534603812, + "loss": 3.3301, + "theoretical_loss": 4.03638204312915, + "tokens_seen": 378966016 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004470411233701103, + "loss": 3.3257, + "theoretical_loss": 4.036303246750008, + "tokens_seen": 379031552 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044703109327983955, + "loss": 3.2869, + "theoretical_loss": 4.036224467807906, + "tokens_seen": 379097088 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004470210631895687, + "loss": 3.2953, + "theoretical_loss": 4.036145706295971, + "tokens_seen": 379162624 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004470110330992979, + "loss": 3.1736, + "theoretical_loss": 4.036066962207335, + "tokens_seen": 379228160 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004470010030090271, + "loss": 3.2711, + "theoretical_loss": 4.035988235535134, + "tokens_seen": 379293696 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004469909729187563, + "loss": 3.2547, + "theoretical_loss": 4.035909526272507, + "tokens_seen": 379359232 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044698094282848546, + "loss": 3.3263, + "theoretical_loss": 4.035830834412597, + "tokens_seen": 379424768 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044697091273821464, + "loss": 3.2611, + "theoretical_loss": 4.035752159948553, + "tokens_seen": 379490304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004469608826479438, + "loss": 3.342, + "theoretical_loss": 4.035673502873524, + "tokens_seen": 379555840 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044695085255767305, + "loss": 3.1847, + "theoretical_loss": 4.035594863180667, + "tokens_seen": 379621376 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004469408224674022, + "loss": 3.2691, + "theoretical_loss": 4.03551624086314, + "tokens_seen": 379686912 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004469307923771314, + "loss": 3.3222, + "theoretical_loss": 4.035437635914107, + "tokens_seen": 379752448 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004469207622868606, + "loss": 3.2328, + "theoretical_loss": 4.035359048326734, + "tokens_seen": 379817984 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004469107321965898, + "loss": 3.2885, + "theoretical_loss": 4.035280478094192, + "tokens_seen": 379883520 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044690070210631896, + "loss": 3.2808, + "theoretical_loss": 4.035201925209654, + "tokens_seen": 379949056 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044689067201604814, + "loss": 3.351, + "theoretical_loss": 4.035123389666299, + "tokens_seen": 380014592 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 936054, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2913036346435547, + "objective/train/theoretical_loss": 4.03504487145731, + "objective/train/tokens_used": 400540128, + "theoretical_loss": 4.03504487145731, + "tokens_seen": 380080128 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004468806419257773, + "loss": 3.167, + "theoretical_loss": 4.03504487145731, + "tokens_seen": 380080128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044687061183550656, + "loss": 3.2129, + "theoretical_loss": 4.034966370575873, + "tokens_seen": 380145664 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004468605817452357, + "loss": 3.2047, + "theoretical_loss": 4.034887887015177, + "tokens_seen": 380211200 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004468505516549649, + "loss": 3.3047, + "theoretical_loss": 4.034809420768417, + "tokens_seen": 380276736 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044684052156469405, + "loss": 3.1396, + "theoretical_loss": 4.034730971828788, + "tokens_seen": 380342272 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004468304914744233, + "loss": 3.2839, + "theoretical_loss": 4.034652540189494, + "tokens_seen": 380407808 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044682046138415246, + "loss": 3.284, + "theoretical_loss": 4.03457412584374, + "tokens_seen": 380473344 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044681043129388164, + "loss": 3.2722, + "theoretical_loss": 4.0344957287847345, + "tokens_seen": 380538880 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004468004012036108, + "loss": 3.2596, + "theoretical_loss": 4.034417349005691, + "tokens_seen": 380604416 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044679037111334, + "loss": 3.2002, + "theoretical_loss": 4.034338986499825, + "tokens_seen": 380669952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004467803410230692, + "loss": 3.2184, + "theoretical_loss": 4.034260641260358, + "tokens_seen": 380735488 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004467703109327984, + "loss": 3.3317, + "theoretical_loss": 4.034182313280515, + "tokens_seen": 380801024 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044676028084252755, + "loss": 3.3149, + "theoretical_loss": 4.034104002553523, + "tokens_seen": 380866560 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004467502507522568, + "loss": 3.2237, + "theoretical_loss": 4.0340257090726155, + "tokens_seen": 380932096 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044674022066198597, + "loss": 3.2161, + "theoretical_loss": 4.0339474328310265, + "tokens_seen": 380997632 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044673019057171515, + "loss": 3.2113, + "theoretical_loss": 4.033869173821998, + "tokens_seen": 381063168 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044672016048144433, + "loss": 3.2789, + "theoretical_loss": 4.033790932038771, + "tokens_seen": 381128704 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004467101303911735, + "loss": 3.2499, + "theoretical_loss": 4.033712707474596, + "tokens_seen": 381194240 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004467001003009027, + "loss": 3.2653, + "theoretical_loss": 4.033634500122721, + "tokens_seen": 381259776 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004466900702106319, + "loss": 3.2551, + "theoretical_loss": 4.033556309976403, + "tokens_seen": 381325312 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004466800401203611, + "loss": 3.2572, + "theoretical_loss": 4.0334781370289, + "tokens_seen": 381390848 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004466700100300903, + "loss": 3.3254, + "theoretical_loss": 4.033399981273474, + "tokens_seen": 381456384 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044665997993981947, + "loss": 3.2398, + "theoretical_loss": 4.033321842703392, + "tokens_seen": 381521920 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044664994984954865, + "loss": 3.2991, + "theoretical_loss": 4.033243721311925, + "tokens_seen": 381587456 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004466399197592779, + "loss": 3.4042, + "theoretical_loss": 4.033165617092346, + "tokens_seen": 381652992 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 939749, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5047292709350586, + "objective/train/theoretical_loss": 4.033087530037932, + "objective/train/tokens_used": 402178528, + "theoretical_loss": 4.033087530037932, + "tokens_seen": 381718528 + }, + { + "epoch": 4.02, + "learning_rate": 0.000446629889669007, + "loss": 3.347, + "theoretical_loss": 4.033087530037932, + "tokens_seen": 381718528 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044661985957873625, + "loss": 3.291, + "theoretical_loss": 4.033009460141966, + "tokens_seen": 381784064 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044660982948846543, + "loss": 3.1598, + "theoretical_loss": 4.032931407397732, + "tokens_seen": 381849600 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004465997993981946, + "loss": 3.2943, + "theoretical_loss": 4.03285337179852, + "tokens_seen": 381915136 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004465897693079238, + "loss": 3.2821, + "theoretical_loss": 4.032775353337623, + "tokens_seen": 381980672 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044657973921765297, + "loss": 3.3326, + "theoretical_loss": 4.032697352008336, + "tokens_seen": 382046208 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044656970912738215, + "loss": 3.2689, + "theoretical_loss": 4.032619367803961, + "tokens_seen": 382111744 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004465596790371114, + "loss": 3.2767, + "theoretical_loss": 4.032541400717802, + "tokens_seen": 382177280 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004465496489468405, + "loss": 3.3098, + "theoretical_loss": 4.032463450743166, + "tokens_seen": 382242816 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044653961885656975, + "loss": 3.2429, + "theoretical_loss": 4.032385517873366, + "tokens_seen": 382308352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004465295887662989, + "loss": 3.3241, + "theoretical_loss": 4.032307602101716, + "tokens_seen": 382373888 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004465195586760281, + "loss": 3.3019, + "theoretical_loss": 4.032229703421536, + "tokens_seen": 382439424 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004465095285857573, + "loss": 3.3244, + "theoretical_loss": 4.032151821826149, + "tokens_seen": 382504960 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004464994984954865, + "loss": 3.2843, + "theoretical_loss": 4.032073957308882, + "tokens_seen": 382570496 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044648946840521566, + "loss": 3.3007, + "theoretical_loss": 4.031996109863066, + "tokens_seen": 382636032 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044647943831494484, + "loss": 3.2394, + "theoretical_loss": 4.031918279482032, + "tokens_seen": 382701568 + }, + { + "epoch": 4.02, + "learning_rate": 0.000446469408224674, + "loss": 3.3162, + "theoretical_loss": 4.031840466159122, + "tokens_seen": 382767104 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044645937813440325, + "loss": 3.2676, + "theoretical_loss": 4.031762669887675, + "tokens_seen": 382832640 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004464493480441324, + "loss": 3.2952, + "theoretical_loss": 4.031684890661038, + "tokens_seen": 382898176 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004464393179538616, + "loss": 3.2442, + "theoretical_loss": 4.031607128472559, + "tokens_seen": 382963712 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004464292878635908, + "loss": 3.277, + "theoretical_loss": 4.031529383315592, + "tokens_seen": 383029248 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044641925777332, + "loss": 3.2451, + "theoretical_loss": 4.031451655183492, + "tokens_seen": 383094784 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044640922768304916, + "loss": 3.3284, + "theoretical_loss": 4.031373944069621, + "tokens_seen": 383160320 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044639919759277834, + "loss": 3.3078, + "theoretical_loss": 4.031296249967343, + "tokens_seen": 383225856 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004463891675025075, + "loss": 3.2603, + "theoretical_loss": 4.031218572870025, + "tokens_seen": 383291392 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 944485, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2276928424835205, + "objective/train/theoretical_loss": 4.031140912771038, + "objective/train/tokens_used": 403816928, + "theoretical_loss": 4.031140912771038, + "tokens_seen": 383356928 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044637913741223676, + "loss": 3.1955, + "theoretical_loss": 4.031140912771038, + "tokens_seen": 383356928 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004463691073219659, + "loss": 3.1433, + "theoretical_loss": 4.031063269663759, + "tokens_seen": 383422464 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004463590772316951, + "loss": 3.1469, + "theoretical_loss": 4.030985643541566, + "tokens_seen": 383488000 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044634904714142425, + "loss": 3.1237, + "theoretical_loss": 4.030908034397841, + "tokens_seen": 383553536 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004463390170511535, + "loss": 3.2202, + "theoretical_loss": 4.030830442225971, + "tokens_seen": 383619072 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044632898696088266, + "loss": 3.195, + "theoretical_loss": 4.0307528670193475, + "tokens_seen": 383684608 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044631895687061184, + "loss": 3.2935, + "theoretical_loss": 4.030675308771362, + "tokens_seen": 383750144 + }, + { + "epoch": 4.02, + "learning_rate": 0.000446308926780341, + "loss": 3.2156, + "theoretical_loss": 4.030597767475413, + "tokens_seen": 383815680 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004462988966900702, + "loss": 3.3514, + "theoretical_loss": 4.030520243124901, + "tokens_seen": 383881216 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004462888665997994, + "loss": 3.1422, + "theoretical_loss": 4.030442735713232, + "tokens_seen": 383946752 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004462788365095286, + "loss": 3.2821, + "theoretical_loss": 4.0303652452338135, + "tokens_seen": 384012288 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044626880641925775, + "loss": 3.2667, + "theoretical_loss": 4.030287771680059, + "tokens_seen": 384077824 + }, + { + "epoch": 4.02, + "learning_rate": 0.000446258776328987, + "loss": 3.0996, + "theoretical_loss": 4.030210315045383, + "tokens_seen": 384143360 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044624874623871617, + "loss": 3.1988, + "theoretical_loss": 4.030132875323206, + "tokens_seen": 384208896 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044623871614844535, + "loss": 3.2456, + "theoretical_loss": 4.030055452506952, + "tokens_seen": 384274432 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044622868605817453, + "loss": 3.3257, + "theoretical_loss": 4.029978046590045, + "tokens_seen": 384339968 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004462186559679037, + "loss": 3.1689, + "theoretical_loss": 4.0299006575659195, + "tokens_seen": 384405504 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004462086258776329, + "loss": 3.2122, + "theoretical_loss": 4.029823285428008, + "tokens_seen": 384471040 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004461985957873621, + "loss": 3.1928, + "theoretical_loss": 4.029745930169748, + "tokens_seen": 384536576 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044618856569709125, + "loss": 3.289, + "theoretical_loss": 4.029668591784582, + "tokens_seen": 384602112 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004461785356068205, + "loss": 3.2802, + "theoretical_loss": 4.029591270265955, + "tokens_seen": 384667648 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004461685055165496, + "loss": 3.2207, + "theoretical_loss": 4.0295139656073165, + "tokens_seen": 384733184 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044615847542627885, + "loss": 3.1755, + "theoretical_loss": 4.029436677802118, + "tokens_seen": 384798720 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044614844533600803, + "loss": 3.331, + "theoretical_loss": 4.029359406843817, + "tokens_seen": 384864256 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004461384152457372, + "loss": 3.2615, + "theoretical_loss": 4.029282152725873, + "tokens_seen": 384929792 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.080122947692871, + "objective/train/theoretical_loss": 4.02920491544175, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.02920491544175, + "tokens_seen": 384995328 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004461283851554664, + "loss": 3.2118, + "theoretical_loss": 4.02920491544175, + "tokens_seen": 384995328 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044611835506519563, + "loss": 3.182, + "theoretical_loss": 4.029127694984914, + "tokens_seen": 385060864 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044610832497492476, + "loss": 3.1411, + "theoretical_loss": 4.029050491348837, + "tokens_seen": 385126400 + }, + { + "epoch": 4.02, + "learning_rate": 0.000446098294884654, + "loss": 3.3916, + "theoretical_loss": 4.028973304526994, + "tokens_seen": 385191936 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004460882647943831, + "loss": 3.2736, + "theoretical_loss": 4.028896134512863, + "tokens_seen": 385257472 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044607823470411235, + "loss": 3.2855, + "theoretical_loss": 4.028818981299924, + "tokens_seen": 385323008 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044606820461384153, + "loss": 3.3842, + "theoretical_loss": 4.028741844881665, + "tokens_seen": 385388544 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004460581745235707, + "loss": 3.2528, + "theoretical_loss": 4.028664725251574, + "tokens_seen": 385454080 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004460481444332999, + "loss": 3.1841, + "theoretical_loss": 4.028587622403144, + "tokens_seen": 385519616 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004460381143430291, + "loss": 3.2431, + "theoretical_loss": 4.02851053632987, + "tokens_seen": 385585152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044602808425275826, + "loss": 3.3261, + "theoretical_loss": 4.028433467025254, + "tokens_seen": 385650688 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004460180541624875, + "loss": 3.2754, + "theoretical_loss": 4.028356414482799, + "tokens_seen": 385716224 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004460080240722166, + "loss": 3.2324, + "theoretical_loss": 4.028279378696011, + "tokens_seen": 385781760 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044599799398194586, + "loss": 3.2508, + "theoretical_loss": 4.028202359658402, + "tokens_seen": 385847296 + }, + { + "epoch": 4.02, + "learning_rate": 0.000445987963891675, + "loss": 3.2463, + "theoretical_loss": 4.028125357363487, + "tokens_seen": 385912832 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004459779338014042, + "loss": 3.327, + "theoretical_loss": 4.028048371804783, + "tokens_seen": 385978368 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004459679037111334, + "loss": 3.1474, + "theoretical_loss": 4.027971402975812, + "tokens_seen": 386043904 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004459578736208626, + "loss": 3.2663, + "theoretical_loss": 4.027894450870099, + "tokens_seen": 386109440 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044594784353059176, + "loss": 3.416, + "theoretical_loss": 4.027817515481174, + "tokens_seen": 386174976 + }, + { + "epoch": 4.02, + "learning_rate": 0.000445937813440321, + "loss": 3.1516, + "theoretical_loss": 4.027740596802569, + "tokens_seen": 386240512 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004459277833500502, + "loss": 3.2949, + "theoretical_loss": 4.027663694827819, + "tokens_seen": 386306048 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044591775325977936, + "loss": 3.4113, + "theoretical_loss": 4.027586809550465, + "tokens_seen": 386371584 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044590772316950854, + "loss": 3.3295, + "theoretical_loss": 4.02750994096405, + "tokens_seen": 386437120 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004458976930792377, + "loss": 3.3294, + "theoretical_loss": 4.02743308906212, + "tokens_seen": 386502656 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044588766298896696, + "loss": 3.2579, + "theoretical_loss": 4.027356253838227, + "tokens_seen": 386568192 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.385573387145996, + "objective/train/theoretical_loss": 4.027279435285924, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.027279435285924, + "tokens_seen": 386633728 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004458776328986961, + "loss": 3.227, + "theoretical_loss": 4.027279435285924, + "tokens_seen": 386633728 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004458676028084253, + "loss": 3.3774, + "theoretical_loss": 4.027202633398769, + "tokens_seen": 386699264 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044585757271815445, + "loss": 3.235, + "theoretical_loss": 4.027125848170323, + "tokens_seen": 386764800 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004458475426278837, + "loss": 3.2659, + "theoretical_loss": 4.027049079594151, + "tokens_seen": 386830336 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044583751253761286, + "loss": 3.3, + "theoretical_loss": 4.02697232766382, + "tokens_seen": 386895872 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044582748244734204, + "loss": 3.221, + "theoretical_loss": 4.026895592372905, + "tokens_seen": 386961408 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004458174523570712, + "loss": 3.1999, + "theoretical_loss": 4.02681887371498, + "tokens_seen": 387026944 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004458074222668004, + "loss": 3.2995, + "theoretical_loss": 4.0267421716836225, + "tokens_seen": 387092480 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004457973921765296, + "loss": 3.1902, + "theoretical_loss": 4.026665486272417, + "tokens_seen": 387158016 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004457873620862588, + "loss": 3.1668, + "theoretical_loss": 4.026588817474949, + "tokens_seen": 387223552 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044577733199598795, + "loss": 3.2503, + "theoretical_loss": 4.026512165284809, + "tokens_seen": 387289088 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004457673019057172, + "loss": 3.3255, + "theoretical_loss": 4.02643552969559, + "tokens_seen": 387354624 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044575727181544637, + "loss": 3.2572, + "theoretical_loss": 4.026358910700888, + "tokens_seen": 387420160 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044574724172517555, + "loss": 3.1825, + "theoretical_loss": 4.026282308294305, + "tokens_seen": 387485696 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044573721163490473, + "loss": 3.2373, + "theoretical_loss": 4.026205722469443, + "tokens_seen": 387551232 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004457271815446339, + "loss": 3.2128, + "theoretical_loss": 4.0261291532199115, + "tokens_seen": 387616768 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004457171514543631, + "loss": 3.3174, + "theoretical_loss": 4.026052600539321, + "tokens_seen": 387682304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004457071213640923, + "loss": 3.217, + "theoretical_loss": 4.025976064421285, + "tokens_seen": 387747840 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044569709127382145, + "loss": 3.2261, + "theoretical_loss": 4.025899544859423, + "tokens_seen": 387813376 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004456870611835507, + "loss": 3.333, + "theoretical_loss": 4.025823041847357, + "tokens_seen": 387878912 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004456770310932798, + "loss": 3.2819, + "theoretical_loss": 4.02574655537871, + "tokens_seen": 387944448 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044566700100300905, + "loss": 3.2584, + "theoretical_loss": 4.025670085447113, + "tokens_seen": 388009984 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044565697091273823, + "loss": 3.2369, + "theoretical_loss": 4.025593632046197, + "tokens_seen": 388075520 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004456469408224674, + "loss": 3.2692, + "theoretical_loss": 4.025517195169599, + "tokens_seen": 388141056 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004456369107321966, + "loss": 3.3101, + "theoretical_loss": 4.025440774810958, + "tokens_seen": 388206592 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2409281730651855, + "objective/train/theoretical_loss": 4.025364370963916, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.025364370963916, + "tokens_seen": 388272128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044562688064192583, + "loss": 3.2982, + "theoretical_loss": 4.025364370963916, + "tokens_seen": 388272128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044561685055165496, + "loss": 3.3022, + "theoretical_loss": 4.02528798362212, + "tokens_seen": 388337664 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004456068204613842, + "loss": 3.2745, + "theoretical_loss": 4.02521161277922, + "tokens_seen": 388403200 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004455967903711133, + "loss": 3.2654, + "theoretical_loss": 4.025135258428869, + "tokens_seen": 388468736 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044558676028084255, + "loss": 3.2896, + "theoretical_loss": 4.0250589205647245, + "tokens_seen": 388534272 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044557673019057173, + "loss": 3.2368, + "theoretical_loss": 4.024982599180447, + "tokens_seen": 388599808 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004455667001003009, + "loss": 3.2776, + "theoretical_loss": 4.024906294269699, + "tokens_seen": 388665344 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004455566700100301, + "loss": 3.2434, + "theoretical_loss": 4.02483000582615, + "tokens_seen": 388730880 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004455466399197593, + "loss": 3.2501, + "theoretical_loss": 4.02475373384347, + "tokens_seen": 388796416 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044553660982948846, + "loss": 3.3292, + "theoretical_loss": 4.024677478315333, + "tokens_seen": 388861952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004455265797392177, + "loss": 3.113, + "theoretical_loss": 4.024601239235417, + "tokens_seen": 388927488 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004455165496489468, + "loss": 3.2517, + "theoretical_loss": 4.024525016597404, + "tokens_seen": 388993024 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044550651955867606, + "loss": 3.3109, + "theoretical_loss": 4.024448810394979, + "tokens_seen": 389058560 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004454964894684052, + "loss": 3.284, + "theoretical_loss": 4.024372620621831, + "tokens_seen": 389124096 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004454864593781344, + "loss": 3.3003, + "theoretical_loss": 4.02429644727165, + "tokens_seen": 389189632 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004454764292878636, + "loss": 3.2101, + "theoretical_loss": 4.024220290338132, + "tokens_seen": 389255168 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004454663991975928, + "loss": 3.3273, + "theoretical_loss": 4.024144149814977, + "tokens_seen": 389320704 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044545636910732196, + "loss": 3.2252, + "theoretical_loss": 4.024068025695887, + "tokens_seen": 389386240 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004454463390170512, + "loss": 3.2305, + "theoretical_loss": 4.023991917974567, + "tokens_seen": 389451776 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004454363089267803, + "loss": 3.1371, + "theoretical_loss": 4.023915826644727, + "tokens_seen": 389517312 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044542627883650956, + "loss": 3.2948, + "theoretical_loss": 4.023839751700079, + "tokens_seen": 389582848 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004454162487462387, + "loss": 3.2785, + "theoretical_loss": 4.023763693134341, + "tokens_seen": 389648384 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004454062186559679, + "loss": 3.2821, + "theoretical_loss": 4.0236876509412305, + "tokens_seen": 389713920 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004453961885656971, + "loss": 3.3252, + "theoretical_loss": 4.023611625114472, + "tokens_seen": 389779456 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004453861584754263, + "loss": 3.237, + "theoretical_loss": 4.023535615647791, + "tokens_seen": 389844992 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3238933086395264, + "objective/train/theoretical_loss": 4.02345962253492, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.02345962253492, + "tokens_seen": 389910528 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044537612838515547, + "loss": 3.3279, + "theoretical_loss": 4.02345962253492, + "tokens_seen": 389910528 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044536609829488465, + "loss": 3.1881, + "theoretical_loss": 4.02338364576959, + "tokens_seen": 389976064 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044535606820461383, + "loss": 3.1093, + "theoretical_loss": 4.0233076853455385, + "tokens_seen": 390041600 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044534603811434306, + "loss": 3.2733, + "theoretical_loss": 4.023231741256508, + "tokens_seen": 390107136 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004453360080240722, + "loss": 3.1099, + "theoretical_loss": 4.023155813496238, + "tokens_seen": 390172672 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004453259779338014, + "loss": 3.2735, + "theoretical_loss": 4.023079902058481, + "tokens_seen": 390238208 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044531594784353055, + "loss": 3.3269, + "theoretical_loss": 4.023004006936985, + "tokens_seen": 390303744 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004453059177532598, + "loss": 3.3142, + "theoretical_loss": 4.022928128125504, + "tokens_seen": 390369280 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044529588766298897, + "loss": 3.2491, + "theoretical_loss": 4.0228522656177965, + "tokens_seen": 390434816 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044528585757271815, + "loss": 3.2564, + "theoretical_loss": 4.022776419407624, + "tokens_seen": 390500352 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044527582748244733, + "loss": 3.2616, + "theoretical_loss": 4.02270058948875, + "tokens_seen": 390565888 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044526579739217657, + "loss": 3.2643, + "theoretical_loss": 4.0226247758549425, + "tokens_seen": 390631424 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004452557673019057, + "loss": 3.2914, + "theoretical_loss": 4.022548978499974, + "tokens_seen": 390696960 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044524573721163493, + "loss": 3.2634, + "theoretical_loss": 4.022473197417618, + "tokens_seen": 390762496 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044523570712136406, + "loss": 3.3259, + "theoretical_loss": 4.022397432601654, + "tokens_seen": 390828032 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004452256770310933, + "loss": 3.3547, + "theoretical_loss": 4.022321684045862, + "tokens_seen": 390893568 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044521564694082247, + "loss": 3.2886, + "theoretical_loss": 4.022245951744029, + "tokens_seen": 390959104 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044520561685055165, + "loss": 3.1626, + "theoretical_loss": 4.022170235689943, + "tokens_seen": 391024640 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044519558676028083, + "loss": 3.2048, + "theoretical_loss": 4.022094535877395, + "tokens_seen": 391090176 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044518555667001, + "loss": 3.234, + "theoretical_loss": 4.022018852300181, + "tokens_seen": 391155712 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044517552657973925, + "loss": 3.3504, + "theoretical_loss": 4.0219431849521, + "tokens_seen": 391221248 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044516549648946843, + "loss": 3.1728, + "theoretical_loss": 4.021867533826953, + "tokens_seen": 391286784 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004451554663991976, + "loss": 3.2408, + "theoretical_loss": 4.021791898918547, + "tokens_seen": 391352320 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004451454363089268, + "loss": 3.3056, + "theoretical_loss": 4.02171628022069, + "tokens_seen": 391417856 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044513540621865603, + "loss": 3.2375, + "theoretical_loss": 4.021640677727195, + "tokens_seen": 391483392 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3417718410491943, + "objective/train/theoretical_loss": 4.021565091431877, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.021565091431877, + "tokens_seen": 391548928 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044512537612838516, + "loss": 3.3417, + "theoretical_loss": 4.021565091431877, + "tokens_seen": 391548928 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004451153460381144, + "loss": 3.1351, + "theoretical_loss": 4.021489521328556, + "tokens_seen": 391614464 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004451053159478435, + "loss": 3.3384, + "theoretical_loss": 4.021413967411053, + "tokens_seen": 391680000 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044509528585757275, + "loss": 3.2507, + "theoretical_loss": 4.021338429673197, + "tokens_seen": 391745536 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044508525576730194, + "loss": 3.1259, + "theoretical_loss": 4.021262908108814, + "tokens_seen": 391811072 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004450752256770311, + "loss": 3.3971, + "theoretical_loss": 4.021187402711739, + "tokens_seen": 391876608 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004450651955867603, + "loss": 3.2729, + "theoretical_loss": 4.021111913475806, + "tokens_seen": 391942144 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004450551654964895, + "loss": 3.2385, + "theoretical_loss": 4.021036440394856, + "tokens_seen": 392007680 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044504513540621866, + "loss": 3.3042, + "theoretical_loss": 4.020960983462732, + "tokens_seen": 392073216 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004450351053159479, + "loss": 3.3121, + "theoretical_loss": 4.0208855426732795, + "tokens_seen": 392138752 + }, + { + "epoch": 4.02, + "learning_rate": 0.000445025075225677, + "loss": 3.2548, + "theoretical_loss": 4.020810118020348, + "tokens_seen": 392204288 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044501504513540626, + "loss": 3.2279, + "theoretical_loss": 4.02073470949779, + "tokens_seen": 392269824 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004450050150451354, + "loss": 3.2791, + "theoretical_loss": 4.020659317099463, + "tokens_seen": 392335360 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004449949849548646, + "loss": 3.1382, + "theoretical_loss": 4.020583940819227, + "tokens_seen": 392400896 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004449849548645938, + "loss": 3.2761, + "theoretical_loss": 4.020508580650944, + "tokens_seen": 392466432 + }, + { + "epoch": 4.02, + "learning_rate": 0.000444974924774323, + "loss": 3.1777, + "theoretical_loss": 4.020433236588481, + "tokens_seen": 392531968 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044496489468405216, + "loss": 3.2676, + "theoretical_loss": 4.020357908625707, + "tokens_seen": 392597504 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004449548645937814, + "loss": 3.2232, + "theoretical_loss": 4.020282596756496, + "tokens_seen": 392663040 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004449448345035105, + "loss": 3.243, + "theoretical_loss": 4.0202073009747235, + "tokens_seen": 392728576 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044493480441323976, + "loss": 3.2969, + "theoretical_loss": 4.02013202127427, + "tokens_seen": 392794112 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004449247743229689, + "loss": 3.3144, + "theoretical_loss": 4.020056757649019, + "tokens_seen": 392859648 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004449147442326981, + "loss": 3.197, + "theoretical_loss": 4.019981510092856, + "tokens_seen": 392925184 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004449047141424273, + "loss": 3.2753, + "theoretical_loss": 4.019906278599673, + "tokens_seen": 392990720 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004448946840521565, + "loss": 3.3146, + "theoretical_loss": 4.019831063163361, + "tokens_seen": 393056256 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044488465396188567, + "loss": 3.2838, + "theoretical_loss": 4.019755863777817, + "tokens_seen": 393121792 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0638654232025146, + "objective/train/theoretical_loss": 4.019680680436942, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.019680680436942, + "tokens_seen": 393187328 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044487462387161485, + "loss": 3.2414, + "theoretical_loss": 4.019680680436942, + "tokens_seen": 393187328 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044486459378134403, + "loss": 3.1863, + "theoretical_loss": 4.019605513134638, + "tokens_seen": 393252864 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044485456369107326, + "loss": 3.3021, + "theoretical_loss": 4.019530361864812, + "tokens_seen": 393318400 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004448445336008024, + "loss": 3.1166, + "theoretical_loss": 4.019455226621375, + "tokens_seen": 393383936 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004448345035105316, + "loss": 3.287, + "theoretical_loss": 4.019380107398238, + "tokens_seen": 393449472 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044482447342026075, + "loss": 3.2805, + "theoretical_loss": 4.019305004189318, + "tokens_seen": 393515008 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044481444332999, + "loss": 3.252, + "theoretical_loss": 4.019229916988537, + "tokens_seen": 393580544 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044480441323971917, + "loss": 3.2535, + "theoretical_loss": 4.019154845789816, + "tokens_seen": 393646080 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044479438314944835, + "loss": 3.2315, + "theoretical_loss": 4.0190797905870825, + "tokens_seen": 393711616 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044478435305917753, + "loss": 3.34, + "theoretical_loss": 4.019004751374267, + "tokens_seen": 393777152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044477432296890677, + "loss": 3.2223, + "theoretical_loss": 4.0189297281453005, + "tokens_seen": 393842688 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004447642928786359, + "loss": 3.3361, + "theoretical_loss": 4.0188547208941205, + "tokens_seen": 393908224 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044475426278836513, + "loss": 3.2629, + "theoretical_loss": 4.018779729614668, + "tokens_seen": 393973760 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044474423269809426, + "loss": 3.1664, + "theoretical_loss": 4.0187047543008845, + "tokens_seen": 394039296 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004447342026078235, + "loss": 3.2433, + "theoretical_loss": 4.018629794946717, + "tokens_seen": 394104832 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044472417251755267, + "loss": 3.2727, + "theoretical_loss": 4.0185548515461145, + "tokens_seen": 394170368 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044471414242728185, + "loss": 3.2453, + "theoretical_loss": 4.018479924093031, + "tokens_seen": 394235904 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044470411233701103, + "loss": 3.2964, + "theoretical_loss": 4.018405012581422, + "tokens_seen": 394301440 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004446940822467402, + "loss": 3.3251, + "theoretical_loss": 4.018330117005248, + "tokens_seen": 394366976 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004446840521564694, + "loss": 3.2017, + "theoretical_loss": 4.018255237358471, + "tokens_seen": 394432512 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044467402206619863, + "loss": 3.2275, + "theoretical_loss": 4.018180373635056, + "tokens_seen": 394498048 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044466399197592776, + "loss": 3.1986, + "theoretical_loss": 4.018105525828975, + "tokens_seen": 394563584 + }, + { + "epoch": 4.02, + "learning_rate": 0.000444653961885657, + "loss": 3.278, + "theoretical_loss": 4.0180306939341985, + "tokens_seen": 394629120 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004446439317953861, + "loss": 3.1524, + "theoretical_loss": 4.017955877944704, + "tokens_seen": 394694656 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044463390170511536, + "loss": 3.3293, + "theoretical_loss": 4.0178810778544705, + "tokens_seen": 394760192 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.220686197280884, + "objective/train/theoretical_loss": 4.017806293657481, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.017806293657481, + "tokens_seen": 394825728 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044462387161484454, + "loss": 3.2672, + "theoretical_loss": 4.017806293657481, + "tokens_seen": 394825728 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004446138415245737, + "loss": 3.2032, + "theoretical_loss": 4.017731525347719, + "tokens_seen": 394891264 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004446038114343029, + "loss": 3.3539, + "theoretical_loss": 4.017656772919176, + "tokens_seen": 394956800 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044459378134403214, + "loss": 3.3672, + "theoretical_loss": 4.017582036365843, + "tokens_seen": 395022336 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044458375125376126, + "loss": 3.151, + "theoretical_loss": 4.017507315681717, + "tokens_seen": 395087872 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004445737211634905, + "loss": 3.281, + "theoretical_loss": 4.017432610860796, + "tokens_seen": 395153408 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004445636910732196, + "loss": 3.2083, + "theoretical_loss": 4.017357921897082, + "tokens_seen": 395218944 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044455366098294886, + "loss": 3.3364, + "theoretical_loss": 4.01728324878458, + "tokens_seen": 395284480 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044454363089267804, + "loss": 3.3057, + "theoretical_loss": 4.017208591517301, + "tokens_seen": 395350016 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004445336008024072, + "loss": 3.3651, + "theoretical_loss": 4.017133950089255, + "tokens_seen": 395415552 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004445235707121364, + "loss": 3.2784, + "theoretical_loss": 4.017059324494457, + "tokens_seen": 395481088 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004445135406218656, + "loss": 3.194, + "theoretical_loss": 4.016984714726926, + "tokens_seen": 395546624 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044450351053159477, + "loss": 3.316, + "theoretical_loss": 4.016910120780684, + "tokens_seen": 395612160 + }, + { + "epoch": 4.02, + "learning_rate": 0.000444493480441324, + "loss": 3.2566, + "theoretical_loss": 4.016835542649757, + "tokens_seen": 395677696 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044448345035105313, + "loss": 3.3071, + "theoretical_loss": 4.01676098032817, + "tokens_seen": 395743232 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044447342026078236, + "loss": 3.237, + "theoretical_loss": 4.016686433809957, + "tokens_seen": 395808768 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004444633901705115, + "loss": 3.3236, + "theoretical_loss": 4.016611903089152, + "tokens_seen": 395874304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004444533600802407, + "loss": 3.2649, + "theoretical_loss": 4.016537388159794, + "tokens_seen": 395939840 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044444332998996996, + "loss": 3.3196, + "theoretical_loss": 4.016462889015922, + "tokens_seen": 396005376 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004444332998996991, + "loss": 3.1814, + "theoretical_loss": 4.016388405651582, + "tokens_seen": 396070912 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004444232698094283, + "loss": 3.2715, + "theoretical_loss": 4.01631393806082, + "tokens_seen": 396136448 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004444132397191575, + "loss": 3.2886, + "theoretical_loss": 4.01623948623769, + "tokens_seen": 396201984 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004444032096288867, + "loss": 3.3175, + "theoretical_loss": 4.016165050176243, + "tokens_seen": 396267520 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044439317953861587, + "loss": 3.3342, + "theoretical_loss": 4.016090629870537, + "tokens_seen": 396333056 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044438314944834505, + "loss": 3.1408, + "theoretical_loss": 4.016016225314634, + "tokens_seen": 396398592 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.349506378173828, + "objective/train/theoretical_loss": 4.015941836502597, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.015941836502597, + "tokens_seen": 396464128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044437311935807423, + "loss": 3.3071, + "theoretical_loss": 4.015941836502597, + "tokens_seen": 396464128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044436308926780346, + "loss": 3.2708, + "theoretical_loss": 4.015867463428491, + "tokens_seen": 396529664 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004443530591775326, + "loss": 3.1421, + "theoretical_loss": 4.01579310608639, + "tokens_seen": 396595200 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004443430290872618, + "loss": 3.2466, + "theoretical_loss": 4.015718764470364, + "tokens_seen": 396660736 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044433299899699095, + "loss": 3.3201, + "theoretical_loss": 4.0156444385744905, + "tokens_seen": 396726272 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004443229689067202, + "loss": 3.2406, + "theoretical_loss": 4.01557012839285, + "tokens_seen": 396791808 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044431293881644937, + "loss": 3.2142, + "theoretical_loss": 4.015495833919527, + "tokens_seen": 396857344 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044430290872617855, + "loss": 3.091, + "theoretical_loss": 4.015421555148604, + "tokens_seen": 396922880 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044429287863590773, + "loss": 3.2852, + "theoretical_loss": 4.015347292074173, + "tokens_seen": 396988416 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044428284854563697, + "loss": 3.3114, + "theoretical_loss": 4.015273044690326, + "tokens_seen": 397053952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004442728184553661, + "loss": 3.2023, + "theoretical_loss": 4.015198812991159, + "tokens_seen": 397119488 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044426278836509533, + "loss": 3.2189, + "theoretical_loss": 4.015124596970771, + "tokens_seen": 397185024 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044425275827482446, + "loss": 3.2614, + "theoretical_loss": 4.0150503966232645, + "tokens_seen": 397250560 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004442427281845537, + "loss": 3.2789, + "theoretical_loss": 4.014976211942743, + "tokens_seen": 397316096 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044423269809428287, + "loss": 3.374, + "theoretical_loss": 4.014902042923318, + "tokens_seen": 397381632 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044422266800401205, + "loss": 3.2465, + "theoretical_loss": 4.0148278895591005, + "tokens_seen": 397447168 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044421263791374123, + "loss": 3.2947, + "theoretical_loss": 4.0147537518442045, + "tokens_seen": 397512704 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004442026078234704, + "loss": 3.202, + "theoretical_loss": 4.014679629772748, + "tokens_seen": 397578240 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004441925777331996, + "loss": 3.2675, + "theoretical_loss": 4.014605523338853, + "tokens_seen": 397643776 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044418254764292883, + "loss": 3.2039, + "theoretical_loss": 4.014531432536645, + "tokens_seen": 397709312 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044417251755265796, + "loss": 3.1857, + "theoretical_loss": 4.014457357360249, + "tokens_seen": 397774848 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004441624874623872, + "loss": 3.2382, + "theoretical_loss": 4.014383297803799, + "tokens_seen": 397840384 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004441524573721163, + "loss": 3.2293, + "theoretical_loss": 4.014309253861427, + "tokens_seen": 397905920 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044414242728184556, + "loss": 3.3111, + "theoretical_loss": 4.01423522552727, + "tokens_seen": 397971456 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044413239719157474, + "loss": 3.2246, + "theoretical_loss": 4.01416121279547, + "tokens_seen": 398036992 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.390597105026245, + "objective/train/theoretical_loss": 4.014087215660169, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.014087215660169, + "tokens_seen": 398102528 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004441223671013039, + "loss": 3.3038, + "theoretical_loss": 4.014087215660169, + "tokens_seen": 398102528 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004441123370110331, + "loss": 3.2286, + "theoretical_loss": 4.014013234115515, + "tokens_seen": 398168064 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044410230692076234, + "loss": 3.2143, + "theoretical_loss": 4.0139392681556565, + "tokens_seen": 398233600 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044409227683049146, + "loss": 3.2795, + "theoretical_loss": 4.013865317774748, + "tokens_seen": 398299136 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004440822467402207, + "loss": 3.3162, + "theoretical_loss": 4.013791382966945, + "tokens_seen": 398364672 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004440722166499498, + "loss": 3.1897, + "theoretical_loss": 4.0137174637264055, + "tokens_seen": 398430208 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044406218655967906, + "loss": 3.2004, + "theoretical_loss": 4.013643560047294, + "tokens_seen": 398495744 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044405215646940824, + "loss": 3.1779, + "theoretical_loss": 4.013569671923775, + "tokens_seen": 398561280 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004440421263791374, + "loss": 3.0806, + "theoretical_loss": 4.013495799350017, + "tokens_seen": 398626816 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004440320962888666, + "loss": 3.2968, + "theoretical_loss": 4.013421942320193, + "tokens_seen": 398692352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004440220661985958, + "loss": 3.2483, + "theoretical_loss": 4.013348100828478, + "tokens_seen": 398757888 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044401203610832497, + "loss": 3.2474, + "theoretical_loss": 4.013274274869049, + "tokens_seen": 398823424 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004440020060180542, + "loss": 3.3285, + "theoretical_loss": 4.013200464436087, + "tokens_seen": 398888960 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044399197592778333, + "loss": 3.2752, + "theoretical_loss": 4.013126669523779, + "tokens_seen": 398954496 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044398194583751256, + "loss": 3.1803, + "theoretical_loss": 4.0130528901263105, + "tokens_seen": 399020032 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004439719157472417, + "loss": 3.278, + "theoretical_loss": 4.012979126237873, + "tokens_seen": 399085568 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004439618856569709, + "loss": 3.2901, + "theoretical_loss": 4.012905377852661, + "tokens_seen": 399151104 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004439518555667001, + "loss": 3.2555, + "theoretical_loss": 4.012831644964869, + "tokens_seen": 399216640 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004439418254764293, + "loss": 3.2511, + "theoretical_loss": 4.0127579275687, + "tokens_seen": 399282176 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044393179538615847, + "loss": 3.3102, + "theoretical_loss": 4.012684225658356, + "tokens_seen": 399347712 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004439217652958877, + "loss": 3.2055, + "theoretical_loss": 4.012610539228042, + "tokens_seen": 399413248 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044391173520561683, + "loss": 3.171, + "theoretical_loss": 4.01253686827197, + "tokens_seen": 399478784 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044390170511534607, + "loss": 3.3347, + "theoretical_loss": 4.0124632127843505, + "tokens_seen": 399544320 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004438916750250752, + "loss": 3.2206, + "theoretical_loss": 4.0123895727594014, + "tokens_seen": 399609856 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044388164493480443, + "loss": 3.172, + "theoretical_loss": 4.012315948191339, + "tokens_seen": 399675392 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1709775924682617, + "objective/train/theoretical_loss": 4.012242339074387, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.012242339074387, + "tokens_seen": 399740928 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004438716148445336, + "loss": 3.3026, + "theoretical_loss": 4.012242339074387, + "tokens_seen": 399740928 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004438615847542628, + "loss": 3.3502, + "theoretical_loss": 4.012168745402769, + "tokens_seen": 399806464 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044385155466399197, + "loss": 3.3175, + "theoretical_loss": 4.012095167170715, + "tokens_seen": 399872000 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044384152457372115, + "loss": 3.3292, + "theoretical_loss": 4.012021604372454, + "tokens_seen": 399937536 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044383149448345033, + "loss": 3.2851, + "theoretical_loss": 4.011948057002223, + "tokens_seen": 400003072 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044382146439317957, + "loss": 3.27, + "theoretical_loss": 4.011874525054257, + "tokens_seen": 400068608 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004438114343029087, + "loss": 3.2895, + "theoretical_loss": 4.011801008522797, + "tokens_seen": 400134144 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044380140421263793, + "loss": 3.139, + "theoretical_loss": 4.011727507402087, + "tokens_seen": 400199680 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004437913741223671, + "loss": 3.2366, + "theoretical_loss": 4.011654021686375, + "tokens_seen": 400265216 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004437813440320963, + "loss": 3.2393, + "theoretical_loss": 4.011580551369909, + "tokens_seen": 400330752 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004437713139418255, + "loss": 3.2342, + "theoretical_loss": 4.011507096446943, + "tokens_seen": 400396288 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044376128385155466, + "loss": 3.2571, + "theoretical_loss": 4.0114336569117315, + "tokens_seen": 400461824 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044375125376128384, + "loss": 3.3547, + "theoretical_loss": 4.011360232758535, + "tokens_seen": 400527360 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044374122367101307, + "loss": 3.2686, + "theoretical_loss": 4.011286823981615, + "tokens_seen": 400592896 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004437311935807422, + "loss": 3.2328, + "theoretical_loss": 4.0112134305752365, + "tokens_seen": 400658432 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044372116349047143, + "loss": 3.2848, + "theoretical_loss": 4.011140052533668, + "tokens_seen": 400723968 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044371113340020056, + "loss": 3.3189, + "theoretical_loss": 4.011066689851182, + "tokens_seen": 400789504 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004437011033099298, + "loss": 3.3021, + "theoretical_loss": 4.010993342522052, + "tokens_seen": 400855040 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044369107321965903, + "loss": 3.2667, + "theoretical_loss": 4.010920010540554, + "tokens_seen": 400920576 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044368104312938816, + "loss": 3.2112, + "theoretical_loss": 4.010846693900971, + "tokens_seen": 400986112 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004436710130391174, + "loss": 3.216, + "theoretical_loss": 4.010773392597585, + "tokens_seen": 401051648 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004436609829488465, + "loss": 3.3381, + "theoretical_loss": 4.010700106624684, + "tokens_seen": 401117184 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044365095285857576, + "loss": 3.3039, + "theoretical_loss": 4.010626835976558, + "tokens_seen": 401182720 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044364092276830494, + "loss": 3.1066, + "theoretical_loss": 4.010553580647497, + "tokens_seen": 401248256 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004436308926780341, + "loss": 3.2878, + "theoretical_loss": 4.0104803406317995, + "tokens_seen": 401313792 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.139869213104248, + "objective/train/theoretical_loss": 4.010407115923764, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.010407115923764, + "tokens_seen": 401379328 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004436208625877633, + "loss": 3.2956, + "theoretical_loss": 4.010407115923764, + "tokens_seen": 401379328 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044361083249749254, + "loss": 3.3127, + "theoretical_loss": 4.010333906517692, + "tokens_seen": 401444864 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044360080240722166, + "loss": 3.2959, + "theoretical_loss": 4.01026071240789, + "tokens_seen": 401510400 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004435907723169509, + "loss": 3.2291, + "theoretical_loss": 4.010187533588663, + "tokens_seen": 401575936 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044358074222668, + "loss": 3.2389, + "theoretical_loss": 4.010114370054326, + "tokens_seen": 401641472 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044357071213640926, + "loss": 3.2338, + "theoretical_loss": 4.01004122179919, + "tokens_seen": 401707008 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044356068204613844, + "loss": 3.2072, + "theoretical_loss": 4.009968088817573, + "tokens_seen": 401772544 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004435506519558676, + "loss": 3.3205, + "theoretical_loss": 4.009894971103797, + "tokens_seen": 401838080 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004435406218655968, + "loss": 3.2594, + "theoretical_loss": 4.009821868652183, + "tokens_seen": 401903616 + }, + { + "epoch": 4.02, + "learning_rate": 0.000443530591775326, + "loss": 3.3099, + "theoretical_loss": 4.009748781457057, + "tokens_seen": 401969152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044352056168505517, + "loss": 3.3148, + "theoretical_loss": 4.009675709512752, + "tokens_seen": 402034688 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004435105315947844, + "loss": 3.1935, + "theoretical_loss": 4.009602652813597, + "tokens_seen": 402100224 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044350050150451353, + "loss": 3.2829, + "theoretical_loss": 4.00952961135393, + "tokens_seen": 402165760 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044349047141424276, + "loss": 3.2457, + "theoretical_loss": 4.009456585128086, + "tokens_seen": 402231296 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004434804413239719, + "loss": 3.2205, + "theoretical_loss": 4.009383574130409, + "tokens_seen": 402296832 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004434704112337011, + "loss": 3.2875, + "theoretical_loss": 4.009310578355244, + "tokens_seen": 402362368 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004434603811434303, + "loss": 3.3182, + "theoretical_loss": 4.009237597796936, + "tokens_seen": 402427904 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004434503510531595, + "loss": 3.1075, + "theoretical_loss": 4.009164632449838, + "tokens_seen": 402493440 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044344032096288867, + "loss": 3.3115, + "theoretical_loss": 4.009091682308302, + "tokens_seen": 402558976 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004434302908726179, + "loss": 3.2947, + "theoretical_loss": 4.009018747366685, + "tokens_seen": 402624512 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044342026078234703, + "loss": 3.2509, + "theoretical_loss": 4.008945827619348, + "tokens_seen": 402690048 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044341023069207627, + "loss": 3.2695, + "theoretical_loss": 4.008872923060651, + "tokens_seen": 402755584 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004434002006018054, + "loss": 3.2034, + "theoretical_loss": 4.008800033684962, + "tokens_seen": 402821120 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044339017051153463, + "loss": 3.2193, + "theoretical_loss": 4.008727159486648, + "tokens_seen": 402886656 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004433801404212638, + "loss": 3.1847, + "theoretical_loss": 4.008654300460082, + "tokens_seen": 402952192 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.026794195175171, + "objective/train/theoretical_loss": 4.008581456599638, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.008581456599638, + "tokens_seen": 403017728 + }, + { + "epoch": 4.02, + "learning_rate": 0.000443370110330993, + "loss": 3.2137, + "theoretical_loss": 4.008581456599638, + "tokens_seen": 403017728 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044336008024072217, + "loss": 3.382, + "theoretical_loss": 4.008508627899692, + "tokens_seen": 403083264 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044335005015045135, + "loss": 3.3215, + "theoretical_loss": 4.008435814354627, + "tokens_seen": 403148800 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044334002006018053, + "loss": 3.1305, + "theoretical_loss": 4.0083630159588255, + "tokens_seen": 403214336 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044332998996990977, + "loss": 3.3536, + "theoretical_loss": 4.008290232706674, + "tokens_seen": 403279872 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004433199598796389, + "loss": 3.2629, + "theoretical_loss": 4.008217464592563, + "tokens_seen": 403345408 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044330992978936813, + "loss": 3.3023, + "theoretical_loss": 4.008144711610884, + "tokens_seen": 403410944 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004432998996990973, + "loss": 3.2965, + "theoretical_loss": 4.008071973756033, + "tokens_seen": 403476480 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004432898696088265, + "loss": 3.1966, + "theoretical_loss": 4.00799925102241, + "tokens_seen": 403542016 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004432798395185557, + "loss": 3.2291, + "theoretical_loss": 4.007926543404413, + "tokens_seen": 403607552 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044326980942828486, + "loss": 3.2076, + "theoretical_loss": 4.007853850896451, + "tokens_seen": 403673088 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044325977933801404, + "loss": 3.2594, + "theoretical_loss": 4.007781173492928, + "tokens_seen": 403738624 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044324974924774327, + "loss": 3.2428, + "theoretical_loss": 4.007708511188255, + "tokens_seen": 403804160 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004432397191574724, + "loss": 3.3283, + "theoretical_loss": 4.007635863976848, + "tokens_seen": 403869696 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044322968906720163, + "loss": 3.2998, + "theoretical_loss": 4.007563231853121, + "tokens_seen": 403935232 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044321965897693076, + "loss": 3.3065, + "theoretical_loss": 4.007490614811493, + "tokens_seen": 404000768 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044320962888666, + "loss": 3.2752, + "theoretical_loss": 4.007418012846388, + "tokens_seen": 404066304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004431995987963892, + "loss": 3.2138, + "theoretical_loss": 4.007345425952232, + "tokens_seen": 404131840 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044318956870611836, + "loss": 3.2283, + "theoretical_loss": 4.007272854123451, + "tokens_seen": 404197376 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044317953861584754, + "loss": 3.2026, + "theoretical_loss": 4.007200297354476, + "tokens_seen": 404262912 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004431695085255767, + "loss": 3.3654, + "theoretical_loss": 4.007127755639744, + "tokens_seen": 404328448 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004431594784353059, + "loss": 3.2164, + "theoretical_loss": 4.007055228973691, + "tokens_seen": 404393984 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044314944834503514, + "loss": 3.3189, + "theoretical_loss": 4.006982717350756, + "tokens_seen": 404459520 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044313941825476427, + "loss": 3.2973, + "theoretical_loss": 4.006910220765384, + "tokens_seen": 404525056 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004431293881644935, + "loss": 3.2873, + "theoretical_loss": 4.00683773921202, + "tokens_seen": 404590592 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 945440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.295445680618286, + "objective/train/theoretical_loss": 4.0067652726851115, + "objective/train/tokens_used": 404956640, + "theoretical_loss": 4.0067652726851115, + "tokens_seen": 404656128 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004431193580742227, + "loss": 3.2375, + "theoretical_loss": 4.0067652726851115, + "tokens_seen": 404656128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044310932798395186, + "loss": 3.2365, + "theoretical_loss": 4.006692821179113, + "tokens_seen": 404721664 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044309929789368104, + "loss": 3.2844, + "theoretical_loss": 4.006620384688478, + "tokens_seen": 404787200 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004430892678034102, + "loss": 3.1618, + "theoretical_loss": 4.006547963207666, + "tokens_seen": 404852736 + }, + { + "epoch": 4.02, + "learning_rate": 0.0004430792377131394, + "loss": 3.3092, + "theoretical_loss": 4.006475556731136, + "tokens_seen": 404918272 + }, + { + "epoch": 4.02, + "learning_rate": 0.00044306920762286864, + "loss": 3.2718, + "theoretical_loss": 4.0064110823408745, + "tokens_seen": 404976640 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044305917753259777, + "loss": 3.1147, + "theoretical_loss": 4.00633870421669, + "tokens_seen": 405042176 + }, + { + "epoch": 5.0, + "learning_rate": 0.000443049147442327, + "loss": 3.1525, + "theoretical_loss": 4.006266341080792, + "tokens_seen": 405107712 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044303911735205613, + "loss": 3.1688, + "theoretical_loss": 4.006193992927654, + "tokens_seen": 405173248 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044302908726178537, + "loss": 3.1825, + "theoretical_loss": 4.006121659751752, + "tokens_seen": 405238784 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044301905717151455, + "loss": 3.1895, + "theoretical_loss": 4.006049341547562, + "tokens_seen": 405304320 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044300902708124373, + "loss": 3.1451, + "theoretical_loss": 4.005977038309565, + "tokens_seen": 405369856 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004429989969909729, + "loss": 3.0995, + "theoretical_loss": 4.005904750032245, + "tokens_seen": 405435392 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004429889669007021, + "loss": 3.1974, + "theoretical_loss": 4.005832476710091, + "tokens_seen": 405500928 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044297893681043127, + "loss": 3.0955, + "theoretical_loss": 4.00576021833759, + "tokens_seen": 405566464 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004429689067201605, + "loss": 3.3098, + "theoretical_loss": 4.005687974909238, + "tokens_seen": 405632000 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044295887662988963, + "loss": 3.3265, + "theoretical_loss": 4.005615746419526, + "tokens_seen": 405697536 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044294884653961887, + "loss": 3.1112, + "theoretical_loss": 4.005543532862957, + "tokens_seen": 405763072 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004429388164493481, + "loss": 3.277, + "theoretical_loss": 4.005471334234029, + "tokens_seen": 405828608 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044292878635907723, + "loss": 3.2142, + "theoretical_loss": 4.005399150527249, + "tokens_seen": 405894144 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044291875626880647, + "loss": 3.2578, + "theoretical_loss": 4.005326981737123, + "tokens_seen": 405959680 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004429087261785356, + "loss": 3.2103, + "theoretical_loss": 4.005254827858161, + "tokens_seen": 406025216 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044289869608826483, + "loss": 3.163, + "theoretical_loss": 4.005182688884877, + "tokens_seen": 406090752 + }, + { + "epoch": 5.0, + "learning_rate": 0.000442888665997994, + "loss": 3.1538, + "theoretical_loss": 4.005110564811785, + "tokens_seen": 406156288 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004428786359077232, + "loss": 3.1675, + "theoretical_loss": 4.005038455633407, + "tokens_seen": 406221824 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 997770, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.058662176132202, + "objective/train/theoretical_loss": 4.0049663613442625, + "objective/train/tokens_used": 426747360, + "theoretical_loss": 4.0049663613442625, + "tokens_seen": 406287360 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044286860581745237, + "loss": 3.1939, + "theoretical_loss": 4.0049663613442625, + "tokens_seen": 406287360 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044285857572718155, + "loss": 3.1778, + "theoretical_loss": 4.004894281938876, + "tokens_seen": 406352896 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044284854563691073, + "loss": 3.2543, + "theoretical_loss": 4.0048222174117765, + "tokens_seen": 406418432 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044283851554663997, + "loss": 3.1747, + "theoretical_loss": 4.004750167757495, + "tokens_seen": 406483968 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004428284854563691, + "loss": 3.1942, + "theoretical_loss": 4.004678132970562, + "tokens_seen": 406549504 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044281845536609833, + "loss": 3.1313, + "theoretical_loss": 4.004606113045516, + "tokens_seen": 406615040 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004428084252758275, + "loss": 3.1256, + "theoretical_loss": 4.0045341079768955, + "tokens_seen": 406680576 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004427983951855567, + "loss": 3.2892, + "theoretical_loss": 4.0044621177592425, + "tokens_seen": 406746112 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004427883650952859, + "loss": 3.1608, + "theoretical_loss": 4.004390142387102, + "tokens_seen": 406811648 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044277833500501506, + "loss": 3.1773, + "theoretical_loss": 4.004318181855022, + "tokens_seen": 406877184 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044276830491474424, + "loss": 3.2197, + "theoretical_loss": 4.004246236157554, + "tokens_seen": 406942720 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044275827482447347, + "loss": 3.1179, + "theoretical_loss": 4.004174305289249, + "tokens_seen": 407008256 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004427482447342026, + "loss": 3.2674, + "theoretical_loss": 4.004102389244666, + "tokens_seen": 407073792 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044273821464393184, + "loss": 3.1133, + "theoretical_loss": 4.004030488018364, + "tokens_seen": 407139328 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044272818455366096, + "loss": 3.1599, + "theoretical_loss": 4.003958601604903, + "tokens_seen": 407204864 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004427181544633902, + "loss": 3.0056, + "theoretical_loss": 4.00388672999885, + "tokens_seen": 407270400 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004427081243731194, + "loss": 3.2042, + "theoretical_loss": 4.003814873194773, + "tokens_seen": 407335936 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044269809428284856, + "loss": 3.1863, + "theoretical_loss": 4.003743031187242, + "tokens_seen": 407401472 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044268806419257774, + "loss": 3.1663, + "theoretical_loss": 4.00367120397083, + "tokens_seen": 407467008 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004426780341023069, + "loss": 3.1984, + "theoretical_loss": 4.003599391540115, + "tokens_seen": 407532544 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004426680040120361, + "loss": 3.278, + "theoretical_loss": 4.003527593889677, + "tokens_seen": 407598080 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044265797392176534, + "loss": 3.1469, + "theoretical_loss": 4.0034558110140965, + "tokens_seen": 407663616 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044264794383149447, + "loss": 3.164, + "theoretical_loss": 4.003384042907959, + "tokens_seen": 407729152 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004426379137412237, + "loss": 3.1784, + "theoretical_loss": 4.0033122895658515, + "tokens_seen": 407794688 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004426278836509529, + "loss": 3.0708, + "theoretical_loss": 4.003240550982367, + "tokens_seen": 407860224 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1000849, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2294225692749023, + "objective/train/theoretical_loss": 4.003168827152098, + "objective/train/tokens_used": 428385760, + "theoretical_loss": 4.003168827152098, + "tokens_seen": 407925760 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044261785356068206, + "loss": 3.1767, + "theoretical_loss": 4.003168827152098, + "tokens_seen": 407925760 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044260782347041124, + "loss": 3.0735, + "theoretical_loss": 4.003097118069641, + "tokens_seen": 407991296 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004425977933801404, + "loss": 3.1409, + "theoretical_loss": 4.003025423729595, + "tokens_seen": 408056832 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004425877632898696, + "loss": 3.2391, + "theoretical_loss": 4.002953744126562, + "tokens_seen": 408122368 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044257773319959884, + "loss": 3.0822, + "theoretical_loss": 4.002882079255148, + "tokens_seen": 408187904 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044256770310932797, + "loss": 3.0982, + "theoretical_loss": 4.002810429109961, + "tokens_seen": 408253440 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004425576730190572, + "loss": 3.2273, + "theoretical_loss": 4.0027387936856105, + "tokens_seen": 408318976 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044254764292878633, + "loss": 3.2893, + "theoretical_loss": 4.002667172976711, + "tokens_seen": 408384512 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044253761283851557, + "loss": 3.1583, + "theoretical_loss": 4.002595566977878, + "tokens_seen": 408450048 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044252758274824475, + "loss": 3.2202, + "theoretical_loss": 4.002523975683732, + "tokens_seen": 408515584 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044251755265797393, + "loss": 3.149, + "theoretical_loss": 4.002452399088893, + "tokens_seen": 408581120 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004425075225677031, + "loss": 3.2274, + "theoretical_loss": 4.002380837187989, + "tokens_seen": 408646656 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004424974924774323, + "loss": 3.1384, + "theoretical_loss": 4.002309289975645, + "tokens_seen": 408712192 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044248746238716147, + "loss": 3.0659, + "theoretical_loss": 4.002237757446492, + "tokens_seen": 408777728 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004424774322968907, + "loss": 3.1697, + "theoretical_loss": 4.002166239595164, + "tokens_seen": 408843264 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044246740220661983, + "loss": 3.1846, + "theoretical_loss": 4.002094736416297, + "tokens_seen": 408908800 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044245737211634907, + "loss": 3.215, + "theoretical_loss": 4.00202324790453, + "tokens_seen": 408974336 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044244734202607825, + "loss": 3.2383, + "theoretical_loss": 4.001951774054505, + "tokens_seen": 409039872 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044243731193580743, + "loss": 3.1717, + "theoretical_loss": 4.001880314860866, + "tokens_seen": 409105408 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004424272818455366, + "loss": 3.1997, + "theoretical_loss": 4.001808870318261, + "tokens_seen": 409170944 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004424172517552658, + "loss": 3.195, + "theoretical_loss": 4.001737440421339, + "tokens_seen": 409236480 + }, + { + "epoch": 5.0, + "learning_rate": 0.000442407221664995, + "loss": 3.1144, + "theoretical_loss": 4.001666025164755, + "tokens_seen": 409302016 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004423971915747242, + "loss": 3.1095, + "theoretical_loss": 4.001594624543164, + "tokens_seen": 409367552 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044238716148445334, + "loss": 3.2332, + "theoretical_loss": 4.001523238551224, + "tokens_seen": 409433088 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044237713139418257, + "loss": 3.2375, + "theoretical_loss": 4.001451867183597, + "tokens_seen": 409498624 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1003967, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2710623741149902, + "objective/train/theoretical_loss": 4.001380510434949, + "objective/train/tokens_used": 430024160, + "theoretical_loss": 4.001380510434949, + "tokens_seen": 409564160 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004423671013039117, + "loss": 3.1644, + "theoretical_loss": 4.001380510434949, + "tokens_seen": 409564160 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044235707121364093, + "loss": 3.0736, + "theoretical_loss": 4.001309168299944, + "tokens_seen": 409629696 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004423470411233701, + "loss": 3.2258, + "theoretical_loss": 4.001237840773253, + "tokens_seen": 409695232 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004423370110330993, + "loss": 3.2221, + "theoretical_loss": 4.00116652784955, + "tokens_seen": 409760768 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004423269809428285, + "loss": 3.1822, + "theoretical_loss": 4.001095229523509, + "tokens_seen": 409826304 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004423169508525577, + "loss": 3.2475, + "theoretical_loss": 4.001023945789808, + "tokens_seen": 409891840 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044230692076228684, + "loss": 3.1045, + "theoretical_loss": 4.00095267664313, + "tokens_seen": 409957376 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004422968906720161, + "loss": 3.2774, + "theoretical_loss": 4.000881422078157, + "tokens_seen": 410022912 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004422868605817452, + "loss": 3.1671, + "theoretical_loss": 4.000810182089577, + "tokens_seen": 410088448 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044227683049147444, + "loss": 3.1683, + "theoretical_loss": 4.000738956672077, + "tokens_seen": 410153984 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004422668004012036, + "loss": 3.1646, + "theoretical_loss": 4.000667745820351, + "tokens_seen": 410219520 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004422567703109328, + "loss": 3.3691, + "theoretical_loss": 4.000596549529095, + "tokens_seen": 410285056 + }, + { + "epoch": 5.0, + "learning_rate": 0.000442246740220662, + "loss": 3.2401, + "theoretical_loss": 4.000525367793005, + "tokens_seen": 410350592 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044223671013039116, + "loss": 3.193, + "theoretical_loss": 4.000454200606782, + "tokens_seen": 410416128 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044222668004012034, + "loss": 3.1569, + "theoretical_loss": 4.000383047965129, + "tokens_seen": 410481664 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004422166499498496, + "loss": 3.2967, + "theoretical_loss": 4.000311909862753, + "tokens_seen": 410547200 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004422066198595787, + "loss": 3.2329, + "theoretical_loss": 4.000240786294363, + "tokens_seen": 410612736 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044219658976930794, + "loss": 3.1624, + "theoretical_loss": 4.000169677254668, + "tokens_seen": 410678272 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004421865596790371, + "loss": 3.2541, + "theoretical_loss": 4.000098582738386, + "tokens_seen": 410743808 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004421765295887663, + "loss": 3.1364, + "theoretical_loss": 4.000027502740231, + "tokens_seen": 410809344 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044216649949849554, + "loss": 3.1388, + "theoretical_loss": 3.9999564372549257, + "tokens_seen": 410874880 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044215646940822467, + "loss": 3.1872, + "theoretical_loss": 3.9998853862771906, + "tokens_seen": 410940416 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004421464393179539, + "loss": 3.1428, + "theoretical_loss": 3.9998143498017527, + "tokens_seen": 411005952 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004421364092276831, + "loss": 3.147, + "theoretical_loss": 3.999743327823339, + "tokens_seen": 411071488 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044212637913741226, + "loss": 3.2207, + "theoretical_loss": 3.999672320336681, + "tokens_seen": 411137024 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1007850, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.348348379135132, + "objective/train/theoretical_loss": 3.9996013273365123, + "objective/train/tokens_used": 431662560, + "theoretical_loss": 3.9996013273365123, + "tokens_seen": 411202560 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044211634904714144, + "loss": 3.3297, + "theoretical_loss": 3.9996013273365123, + "tokens_seen": 411202560 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004421063189568706, + "loss": 3.2131, + "theoretical_loss": 3.9995303488175695, + "tokens_seen": 411268096 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004420962888665998, + "loss": 3.1579, + "theoretical_loss": 3.999459384774592, + "tokens_seen": 411333632 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044208625877632904, + "loss": 3.2027, + "theoretical_loss": 3.999388435202321, + "tokens_seen": 411399168 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044207622868605817, + "loss": 3.2036, + "theoretical_loss": 3.9993175000955024, + "tokens_seen": 411464704 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004420661985957874, + "loss": 3.2988, + "theoretical_loss": 3.999246579448882, + "tokens_seen": 411530240 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044205616850551653, + "loss": 3.3284, + "theoretical_loss": 3.999175673257211, + "tokens_seen": 411595776 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044204613841524577, + "loss": 3.2293, + "theoretical_loss": 3.999104781515243, + "tokens_seen": 411661312 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044203610832497495, + "loss": 3.2879, + "theoretical_loss": 3.999033904217733, + "tokens_seen": 411726848 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044202607823470413, + "loss": 3.0682, + "theoretical_loss": 3.998963041359439, + "tokens_seen": 411792384 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004420160481444333, + "loss": 3.2346, + "theoretical_loss": 3.9988921929351227, + "tokens_seen": 411857920 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004420060180541625, + "loss": 3.2244, + "theoretical_loss": 3.9988213589395474, + "tokens_seen": 411923456 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044199598796389167, + "loss": 3.2832, + "theoretical_loss": 3.9987505393674803, + "tokens_seen": 411988992 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004419859578736209, + "loss": 3.138, + "theoretical_loss": 3.998679734213691, + "tokens_seen": 412054528 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044197592778335003, + "loss": 3.3091, + "theoretical_loss": 3.9986089434729504, + "tokens_seen": 412120064 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044196589769307927, + "loss": 3.1792, + "theoretical_loss": 3.9985381671400346, + "tokens_seen": 412185600 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044195586760280845, + "loss": 3.2147, + "theoretical_loss": 3.9984674052097207, + "tokens_seen": 412251136 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044194583751253763, + "loss": 3.1735, + "theoretical_loss": 3.998396657676788, + "tokens_seen": 412316672 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004419358074222668, + "loss": 3.2109, + "theoretical_loss": 3.998325924536021, + "tokens_seen": 412382208 + }, + { + "epoch": 5.0, + "learning_rate": 0.000441925777331996, + "loss": 3.1404, + "theoretical_loss": 3.998255205782204, + "tokens_seen": 412447744 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004419157472417252, + "loss": 3.1939, + "theoretical_loss": 3.998184501410127, + "tokens_seen": 412513280 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004419057171514544, + "loss": 3.2223, + "theoretical_loss": 3.99811381141458, + "tokens_seen": 412578816 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044189568706118354, + "loss": 3.1761, + "theoretical_loss": 3.9980431357903568, + "tokens_seen": 412644352 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044188565697091277, + "loss": 3.1873, + "theoretical_loss": 3.9979724745322547, + "tokens_seen": 412709888 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004418756268806419, + "loss": 3.1685, + "theoretical_loss": 3.997901827635072, + "tokens_seen": 412775424 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1012423, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2475807666778564, + "objective/train/theoretical_loss": 3.997831195093611, + "objective/train/tokens_used": 433300960, + "theoretical_loss": 3.997831195093611, + "tokens_seen": 412840960 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044186559679037113, + "loss": 3.2829, + "theoretical_loss": 3.997831195093611, + "tokens_seen": 412840960 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004418555667001003, + "loss": 3.2492, + "theoretical_loss": 3.997760576902677, + "tokens_seen": 412906496 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004418455366098295, + "loss": 3.2193, + "theoretical_loss": 3.9976899730570765, + "tokens_seen": 412972032 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004418355065195587, + "loss": 3.0732, + "theoretical_loss": 3.99761938355162, + "tokens_seen": 413037568 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004418254764292879, + "loss": 3.1389, + "theoretical_loss": 3.9975488083811204, + "tokens_seen": 413103104 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044181544633901704, + "loss": 3.1243, + "theoretical_loss": 3.997478247540393, + "tokens_seen": 413168640 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004418054162487463, + "loss": 3.2496, + "theoretical_loss": 3.997407701024256, + "tokens_seen": 413234176 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004417953861584754, + "loss": 3.2536, + "theoretical_loss": 3.99733716882753, + "tokens_seen": 413299712 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044178535606820464, + "loss": 3.3436, + "theoretical_loss": 3.997266650945039, + "tokens_seen": 413365248 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004417753259779338, + "loss": 3.2064, + "theoretical_loss": 3.9971961473716098, + "tokens_seen": 413430784 + }, + { + "epoch": 5.0, + "learning_rate": 0.000441765295887663, + "loss": 3.2, + "theoretical_loss": 3.9971256581020693, + "tokens_seen": 413496320 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004417552657973922, + "loss": 3.1865, + "theoretical_loss": 3.997055183131252, + "tokens_seen": 413561856 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044174523570712136, + "loss": 3.1685, + "theoretical_loss": 3.9969847224539894, + "tokens_seen": 413627392 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044173520561685054, + "loss": 3.2626, + "theoretical_loss": 3.9969142760651204, + "tokens_seen": 413692928 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004417251755265798, + "loss": 3.2041, + "theoretical_loss": 3.9968438439594838, + "tokens_seen": 413758464 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004417151454363089, + "loss": 3.2069, + "theoretical_loss": 3.9967734261319228, + "tokens_seen": 413824000 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044170511534603814, + "loss": 3.2254, + "theoretical_loss": 3.9967030225772815, + "tokens_seen": 413889536 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044169508525576727, + "loss": 3.1817, + "theoretical_loss": 3.9966326332904076, + "tokens_seen": 413955072 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004416850551654965, + "loss": 3.23, + "theoretical_loss": 3.9965622582661524, + "tokens_seen": 414020608 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004416750250752257, + "loss": 3.1492, + "theoretical_loss": 3.9964918974993684, + "tokens_seen": 414086144 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044166499498495487, + "loss": 3.3292, + "theoretical_loss": 3.9964215509849117, + "tokens_seen": 414151680 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044165496489468405, + "loss": 3.1312, + "theoretical_loss": 3.99635121871764, + "tokens_seen": 414217216 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004416449348044133, + "loss": 3.1461, + "theoretical_loss": 3.9962809006924154, + "tokens_seen": 414282752 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004416349047141424, + "loss": 3.2117, + "theoretical_loss": 3.996210596904101, + "tokens_seen": 414348288 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044162487462387164, + "loss": 3.2735, + "theoretical_loss": 3.9961403073475625, + "tokens_seen": 414413824 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1015608, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.248241424560547, + "objective/train/theoretical_loss": 3.9960700320176703, + "objective/train/tokens_used": 434939360, + "theoretical_loss": 3.9960700320176703, + "tokens_seen": 414479360 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044161484453360077, + "loss": 3.2785, + "theoretical_loss": 3.9960700320176703, + "tokens_seen": 414479360 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044160481444333, + "loss": 3.2147, + "theoretical_loss": 3.9959997709092954, + "tokens_seen": 414544896 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004415947843530592, + "loss": 3.17, + "theoretical_loss": 3.9959295240173125, + "tokens_seen": 414610432 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044158475426278837, + "loss": 3.154, + "theoretical_loss": 3.9958592913365987, + "tokens_seen": 414675968 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044157472417251755, + "loss": 3.2501, + "theoretical_loss": 3.995789072862034, + "tokens_seen": 414741504 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044156469408224673, + "loss": 3.2557, + "theoretical_loss": 3.9957188685885, + "tokens_seen": 414807040 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004415546639919759, + "loss": 3.3114, + "theoretical_loss": 3.9956486785108813, + "tokens_seen": 414872576 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044154463390170515, + "loss": 3.1777, + "theoretical_loss": 3.995578502624067, + "tokens_seen": 414938112 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004415346038114343, + "loss": 3.2004, + "theoretical_loss": 3.9955083409229464, + "tokens_seen": 415003648 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004415245737211635, + "loss": 3.1583, + "theoretical_loss": 3.995438193402413, + "tokens_seen": 415069184 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044151454363089264, + "loss": 3.3056, + "theoretical_loss": 3.995368060057362, + "tokens_seen": 415134720 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044150451354062187, + "loss": 3.2358, + "theoretical_loss": 3.9952979408826916, + "tokens_seen": 415200256 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044149448345035105, + "loss": 3.271, + "theoretical_loss": 3.995227835873303, + "tokens_seen": 415265792 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044148445336008023, + "loss": 3.2329, + "theoretical_loss": 3.9951577450241, + "tokens_seen": 415331328 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004414744232698094, + "loss": 3.2003, + "theoretical_loss": 3.9950876683299876, + "tokens_seen": 415396864 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044146439317953865, + "loss": 3.287, + "theoretical_loss": 3.995017605785876, + "tokens_seen": 415462400 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004414543630892678, + "loss": 3.2754, + "theoretical_loss": 3.994947557386676, + "tokens_seen": 415527936 + }, + { + "epoch": 5.0, + "learning_rate": 0.000441444332998997, + "loss": 3.1423, + "theoretical_loss": 3.9948775231273013, + "tokens_seen": 415593472 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004414343029087262, + "loss": 3.2407, + "theoretical_loss": 3.9948075030026695, + "tokens_seen": 415659008 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004414242728184554, + "loss": 3.2727, + "theoretical_loss": 3.9947374970076988, + "tokens_seen": 415724544 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004414142427281846, + "loss": 3.2316, + "theoretical_loss": 3.994667505137312, + "tokens_seen": 415790080 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044140421263791374, + "loss": 3.1649, + "theoretical_loss": 3.9945975273864334, + "tokens_seen": 415855616 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044139418254764297, + "loss": 3.184, + "theoretical_loss": 3.99452756374999, + "tokens_seen": 415921152 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004413841524573721, + "loss": 3.2672, + "theoretical_loss": 3.994457614222912, + "tokens_seen": 415986688 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044137412236710133, + "loss": 3.2189, + "theoretical_loss": 3.9943876788001313, + "tokens_seen": 416052224 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1020299, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2148020267486572, + "objective/train/theoretical_loss": 3.994317757476584, + "objective/train/tokens_used": 436577760, + "theoretical_loss": 3.994317757476584, + "tokens_seen": 416117760 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004413640922768305, + "loss": 3.1481, + "theoretical_loss": 3.994317757476584, + "tokens_seen": 416117760 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004413540621865597, + "loss": 3.2107, + "theoretical_loss": 3.9942478502472065, + "tokens_seen": 416183296 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004413440320962889, + "loss": 3.1833, + "theoretical_loss": 3.99417795710694, + "tokens_seen": 416248832 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004413340020060181, + "loss": 3.2138, + "theoretical_loss": 3.994108078050727, + "tokens_seen": 416314368 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044132397191574724, + "loss": 3.1936, + "theoretical_loss": 3.9940382130735133, + "tokens_seen": 416379904 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004413139418254765, + "loss": 3.2573, + "theoretical_loss": 3.993968362170246, + "tokens_seen": 416445440 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004413039117352056, + "loss": 3.2598, + "theoretical_loss": 3.993898525335877, + "tokens_seen": 416510976 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044129388164493484, + "loss": 3.301, + "theoretical_loss": 3.9938287025653594, + "tokens_seen": 416576512 + }, + { + "epoch": 5.0, + "learning_rate": 0.000441283851554664, + "loss": 3.2819, + "theoretical_loss": 3.9937588938536495, + "tokens_seen": 416642048 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004412738214643932, + "loss": 3.2206, + "theoretical_loss": 3.993689099195705, + "tokens_seen": 416707584 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004412637913741224, + "loss": 3.2574, + "theoretical_loss": 3.9936193185864868, + "tokens_seen": 416773120 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044125376128385156, + "loss": 3.2507, + "theoretical_loss": 3.9935495520209594, + "tokens_seen": 416838656 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044124373119358074, + "loss": 3.269, + "theoretical_loss": 3.9934797994940894, + "tokens_seen": 416904192 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044123370110331, + "loss": 3.192, + "theoretical_loss": 3.9934100610008447, + "tokens_seen": 416969728 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004412236710130391, + "loss": 3.2046, + "theoretical_loss": 3.993340336536198, + "tokens_seen": 417035264 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044121364092276834, + "loss": 3.2249, + "theoretical_loss": 3.9932706260951223, + "tokens_seen": 417100800 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044120361083249747, + "loss": 3.1304, + "theoretical_loss": 3.993200929672595, + "tokens_seen": 417166336 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004411935807422267, + "loss": 3.2411, + "theoretical_loss": 3.9931312472635954, + "tokens_seen": 417231872 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004411835506519559, + "loss": 3.2144, + "theoretical_loss": 3.993061578863105, + "tokens_seen": 417297408 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044117352056168507, + "loss": 3.1745, + "theoretical_loss": 3.9929919244661085, + "tokens_seen": 417362944 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044116349047141425, + "loss": 3.182, + "theoretical_loss": 3.992922284067593, + "tokens_seen": 417428480 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004411534603811435, + "loss": 3.2655, + "theoretical_loss": 3.9928526576625476, + "tokens_seen": 417494016 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004411434302908726, + "loss": 3.2575, + "theoretical_loss": 3.992783045245966, + "tokens_seen": 417559552 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044113340020060184, + "loss": 3.2785, + "theoretical_loss": 3.992713446812841, + "tokens_seen": 417625088 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044112337011033097, + "loss": 3.1987, + "theoretical_loss": 3.992643862358171, + "tokens_seen": 417690624 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1023255, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8714888095855713, + "objective/train/theoretical_loss": 3.992574291876956, + "objective/train/tokens_used": 438216160, + "theoretical_loss": 3.992574291876956, + "tokens_seen": 417756160 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004411133400200602, + "loss": 3.1546, + "theoretical_loss": 3.992574291876956, + "tokens_seen": 417756160 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004411033099297894, + "loss": 3.2675, + "theoretical_loss": 3.9925047353641983, + "tokens_seen": 417821696 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044109327983951857, + "loss": 3.1827, + "theoretical_loss": 3.9924351928149027, + "tokens_seen": 417887232 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044108324974924775, + "loss": 3.1383, + "theoretical_loss": 3.992365664224077, + "tokens_seen": 417952768 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044107321965897693, + "loss": 3.2895, + "theoretical_loss": 3.992296149586732, + "tokens_seen": 418018304 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004410631895687061, + "loss": 3.224, + "theoretical_loss": 3.99222664889788, + "tokens_seen": 418083840 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044105315947843535, + "loss": 3.2478, + "theoretical_loss": 3.9921571621525365, + "tokens_seen": 418149376 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004410431293881645, + "loss": 3.2636, + "theoretical_loss": 3.992087689345719, + "tokens_seen": 418214912 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004410330992978937, + "loss": 3.17, + "theoretical_loss": 3.9920182304724485, + "tokens_seen": 418280448 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044102306920762284, + "loss": 3.0166, + "theoretical_loss": 3.991948785527748, + "tokens_seen": 418345984 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044101303911735207, + "loss": 3.2179, + "theoretical_loss": 3.9918793545066427, + "tokens_seen": 418411520 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044100300902708125, + "loss": 2.9816, + "theoretical_loss": 3.9918099374041605, + "tokens_seen": 418477056 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044099297893681043, + "loss": 3.213, + "theoretical_loss": 3.991740534215333, + "tokens_seen": 418542592 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004409829488465396, + "loss": 3.193, + "theoretical_loss": 3.991671144935193, + "tokens_seen": 418608128 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044097291875626885, + "loss": 3.366, + "theoretical_loss": 3.991601769558776, + "tokens_seen": 418673664 + }, + { + "epoch": 5.0, + "learning_rate": 0.000440962888665998, + "loss": 3.2313, + "theoretical_loss": 3.9915324080811203, + "tokens_seen": 418739200 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004409528585757272, + "loss": 3.2978, + "theoretical_loss": 3.991463060497267, + "tokens_seen": 418804736 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044094282848545634, + "loss": 3.2169, + "theoretical_loss": 3.9913937268022606, + "tokens_seen": 418870272 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004409327983951856, + "loss": 3.1744, + "theoretical_loss": 3.9913244069911453, + "tokens_seen": 418935808 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044092276830491476, + "loss": 3.1561, + "theoretical_loss": 3.9912551010589707, + "tokens_seen": 419001344 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044091273821464394, + "loss": 3.1312, + "theoretical_loss": 3.9911858090007875, + "tokens_seen": 419066880 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004409027081243731, + "loss": 3.2191, + "theoretical_loss": 3.991116530811649, + "tokens_seen": 419132416 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004408926780341023, + "loss": 3.1702, + "theoretical_loss": 3.9910472664866123, + "tokens_seen": 419197952 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004408826479438315, + "loss": 3.1609, + "theoretical_loss": 3.990978016020735, + "tokens_seen": 419263488 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004408726178535607, + "loss": 3.1839, + "theoretical_loss": 3.990908779409079, + "tokens_seen": 419329024 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1026958, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1033482551574707, + "objective/train/theoretical_loss": 3.990839556646708, + "objective/train/tokens_used": 439854560, + "theoretical_loss": 3.990839556646708, + "tokens_seen": 419394560 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044086258776328984, + "loss": 3.1431, + "theoretical_loss": 3.990839556646708, + "tokens_seen": 419394560 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004408525576730191, + "loss": 3.2248, + "theoretical_loss": 3.990770347728688, + "tokens_seen": 419460096 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004408425275827482, + "loss": 3.1179, + "theoretical_loss": 3.9907011526500877, + "tokens_seen": 419525632 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044083249749247744, + "loss": 3.3422, + "theoretical_loss": 3.9906319714059784, + "tokens_seen": 419591168 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004408224674022066, + "loss": 3.3154, + "theoretical_loss": 3.990562803991434, + "tokens_seen": 419656704 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004408124373119358, + "loss": 3.2481, + "theoretical_loss": 3.990493650401532, + "tokens_seen": 419722240 + }, + { + "epoch": 5.0, + "learning_rate": 0.000440802407221665, + "loss": 3.1136, + "theoretical_loss": 3.9904245106313496, + "tokens_seen": 419787776 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004407923771313942, + "loss": 3.1867, + "theoretical_loss": 3.990355384675969, + "tokens_seen": 419853312 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044078234704112335, + "loss": 3.2422, + "theoretical_loss": 3.990286272530474, + "tokens_seen": 419918848 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004407723169508526, + "loss": 3.1599, + "theoretical_loss": 3.990217174189951, + "tokens_seen": 419984384 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004407622868605817, + "loss": 3.1941, + "theoretical_loss": 3.9901480896494883, + "tokens_seen": 420049920 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044075225677031094, + "loss": 3.2542, + "theoretical_loss": 3.9900790189041793, + "tokens_seen": 420115456 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004407422266800401, + "loss": 3.1154, + "theoretical_loss": 3.9900099619491165, + "tokens_seen": 420180992 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004407321965897693, + "loss": 3.1507, + "theoretical_loss": 3.9899409187793964, + "tokens_seen": 420246528 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004407221664994985, + "loss": 3.1613, + "theoretical_loss": 3.989871889390118, + "tokens_seen": 420312064 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044071213640922767, + "loss": 3.1852, + "theoretical_loss": 3.989802873776383, + "tokens_seen": 420377600 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044070210631895685, + "loss": 3.11, + "theoretical_loss": 3.989733871933296, + "tokens_seen": 420443136 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004406920762286861, + "loss": 3.1958, + "theoretical_loss": 3.989664883855963, + "tokens_seen": 420508672 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044068204613841527, + "loss": 3.1982, + "theoretical_loss": 3.9895959095394926, + "tokens_seen": 420574208 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044067201604814445, + "loss": 3.1197, + "theoretical_loss": 3.9895269489789964, + "tokens_seen": 420639744 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004406619859578737, + "loss": 3.2095, + "theoretical_loss": 3.98945800216959, + "tokens_seen": 420705280 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004406519558676028, + "loss": 3.2803, + "theoretical_loss": 3.9893890691063874, + "tokens_seen": 420770816 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044064192577733204, + "loss": 3.0689, + "theoretical_loss": 3.989320149784509, + "tokens_seen": 420836352 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044063189568706117, + "loss": 3.1538, + "theoretical_loss": 3.9892512441990764, + "tokens_seen": 420901888 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004406218655967904, + "loss": 3.1535, + "theoretical_loss": 3.989182352345213, + "tokens_seen": 420967424 + }, + { + "epoch": 5.0, + "objective/train/docs_used": 1031818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2083065509796143, + "objective/train/theoretical_loss": 3.989113474218046, + "objective/train/tokens_used": 441492960, + "theoretical_loss": 3.989113474218046, + "tokens_seen": 421032960 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004406118355065196, + "loss": 3.1356, + "theoretical_loss": 3.989113474218046, + "tokens_seen": 421032960 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044060180541624877, + "loss": 3.172, + "theoretical_loss": 3.989044609812704, + "tokens_seen": 421098496 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044059177532597795, + "loss": 3.1983, + "theoretical_loss": 3.988975759124318, + "tokens_seen": 421164032 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044058174523570713, + "loss": 3.132, + "theoretical_loss": 3.988906922148022, + "tokens_seen": 421229568 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004405717151454363, + "loss": 3.1339, + "theoretical_loss": 3.9888380988789534, + "tokens_seen": 421295104 + }, + { + "epoch": 5.0, + "learning_rate": 0.00044056168505516555, + "loss": 3.2355, + "theoretical_loss": 3.98876928931225, + "tokens_seen": 421360640 + }, + { + "epoch": 5.0, + "learning_rate": 0.0004405516549648947, + "loss": 3.2331, + "theoretical_loss": 3.988700493443054, + "tokens_seen": 421426176 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004405416248746239, + "loss": 3.2237, + "theoretical_loss": 3.9886317112665086, + "tokens_seen": 421491712 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044053159478435304, + "loss": 3.1186, + "theoretical_loss": 3.9885629427777607, + "tokens_seen": 421557248 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044052156469408227, + "loss": 3.136, + "theoretical_loss": 3.9884941879719586, + "tokens_seen": 421622784 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044051153460381145, + "loss": 3.128, + "theoretical_loss": 3.9884254468442535, + "tokens_seen": 421688320 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044050150451354063, + "loss": 3.1118, + "theoretical_loss": 3.9883567193897997, + "tokens_seen": 421753856 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004404914744232698, + "loss": 3.0565, + "theoretical_loss": 3.9882880056037533, + "tokens_seen": 421819392 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044048144433299905, + "loss": 3.2073, + "theoretical_loss": 3.988219305481273, + "tokens_seen": 421884928 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004404714142427282, + "loss": 3.2724, + "theoretical_loss": 3.988150619017519, + "tokens_seen": 421950464 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004404613841524574, + "loss": 3.1853, + "theoretical_loss": 3.9880819462076564, + "tokens_seen": 422016000 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044045135406218654, + "loss": 3.1771, + "theoretical_loss": 3.9880132870468508, + "tokens_seen": 422081536 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004404413239719158, + "loss": 3.2619, + "theoretical_loss": 3.9879446415302704, + "tokens_seen": 422147072 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044043129388164496, + "loss": 3.2236, + "theoretical_loss": 3.987876009653086, + "tokens_seen": 422212608 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044042126379137414, + "loss": 3.1471, + "theoretical_loss": 3.9878073914104712, + "tokens_seen": 422278144 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004404112337011033, + "loss": 3.1915, + "theoretical_loss": 3.987738786797603, + "tokens_seen": 422343680 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004404012036108325, + "loss": 3.1323, + "theoretical_loss": 3.9876701958096583, + "tokens_seen": 422409216 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004403911735205617, + "loss": 3.0392, + "theoretical_loss": 3.9876016184418184, + "tokens_seen": 422474752 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004403811434302909, + "loss": 3.2741, + "theoretical_loss": 3.9875330546892673, + "tokens_seen": 422540288 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044037111334002004, + "loss": 3.1561, + "theoretical_loss": 3.98746450454719, + "tokens_seen": 422605824 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1034737, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2565081119537354, + "objective/train/theoretical_loss": 3.9873959680107744, + "objective/train/tokens_used": 443131360, + "theoretical_loss": 3.9873959680107744, + "tokens_seen": 422671360 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004403610832497493, + "loss": 3.2652, + "theoretical_loss": 3.9873959680107744, + "tokens_seen": 422671360 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004403510531594784, + "loss": 3.1218, + "theoretical_loss": 3.987327445075212, + "tokens_seen": 422736896 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044034102306920764, + "loss": 3.2551, + "theoretical_loss": 3.987258935735696, + "tokens_seen": 422802432 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004403309929789368, + "loss": 3.1953, + "theoretical_loss": 3.98719043998742, + "tokens_seen": 422867968 + }, + { + "epoch": 5.01, + "learning_rate": 0.000440320962888666, + "loss": 3.2179, + "theoretical_loss": 3.987121957825584, + "tokens_seen": 422933504 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004403109327983952, + "loss": 3.1702, + "theoretical_loss": 3.9870534892453877, + "tokens_seen": 422999040 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004403009027081244, + "loss": 3.2727, + "theoretical_loss": 3.986985034242034, + "tokens_seen": 423064576 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044029087261785355, + "loss": 3.1904, + "theoretical_loss": 3.9869165928107275, + "tokens_seen": 423130112 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004402808425275828, + "loss": 3.2887, + "theoretical_loss": 3.986848164946678, + "tokens_seen": 423195648 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004402708124373119, + "loss": 3.2856, + "theoretical_loss": 3.9867797506450926, + "tokens_seen": 423261184 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044026078234704114, + "loss": 3.1569, + "theoretical_loss": 3.986711349901186, + "tokens_seen": 423326720 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004402507522567703, + "loss": 3.2746, + "theoretical_loss": 3.9866429627101727, + "tokens_seen": 423392256 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004402407221664995, + "loss": 3.3148, + "theoretical_loss": 3.9865745890672706, + "tokens_seen": 423457792 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004402306920762287, + "loss": 3.2052, + "theoretical_loss": 3.9865062289676985, + "tokens_seen": 423523328 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044022066198595787, + "loss": 3.1365, + "theoretical_loss": 3.986437882406679, + "tokens_seen": 423588864 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044021063189568705, + "loss": 3.149, + "theoretical_loss": 3.9863695493794373, + "tokens_seen": 423654400 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004402006018054163, + "loss": 3.2547, + "theoretical_loss": 3.9863012298812004, + "tokens_seen": 423719936 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004401905717151454, + "loss": 3.2223, + "theoretical_loss": 3.9862329239071976, + "tokens_seen": 423785472 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044018054162487465, + "loss": 3.1376, + "theoretical_loss": 3.986164631452661, + "tokens_seen": 423851008 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044017051153460383, + "loss": 3.1743, + "theoretical_loss": 3.986096352512825, + "tokens_seen": 423916544 + }, + { + "epoch": 5.01, + "learning_rate": 0.000440160481444333, + "loss": 3.1646, + "theoretical_loss": 3.9860280870829263, + "tokens_seen": 423982080 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004401504513540622, + "loss": 3.2015, + "theoretical_loss": 3.9859598351582046, + "tokens_seen": 424047616 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044014042126379137, + "loss": 3.1969, + "theoretical_loss": 3.985891596733901, + "tokens_seen": 424113152 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044013039117352055, + "loss": 3.1704, + "theoretical_loss": 3.98582337180526, + "tokens_seen": 424178688 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004401203610832498, + "loss": 3.1923, + "theoretical_loss": 3.9857551603675274, + "tokens_seen": 424244224 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1039568, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1633660793304443, + "objective/train/theoretical_loss": 3.9856869624159526, + "objective/train/tokens_used": 444769760, + "theoretical_loss": 3.9856869624159526, + "tokens_seen": 424309760 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004401103309929789, + "loss": 3.1806, + "theoretical_loss": 3.9856869624159526, + "tokens_seen": 424309760 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044010030090270815, + "loss": 3.199, + "theoretical_loss": 3.9856187779457866, + "tokens_seen": 424375296 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004400902708124373, + "loss": 3.2004, + "theoretical_loss": 3.985550606952283, + "tokens_seen": 424440832 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004400802407221665, + "loss": 3.134, + "theoretical_loss": 3.985482449430699, + "tokens_seen": 424506368 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004400702106318957, + "loss": 3.1071, + "theoretical_loss": 3.9854143053762914, + "tokens_seen": 424571904 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004400601805416249, + "loss": 3.0992, + "theoretical_loss": 3.985346174784322, + "tokens_seen": 424637440 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044005015045135406, + "loss": 3.2132, + "theoretical_loss": 3.9852780576500537, + "tokens_seen": 424702976 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044004012036108324, + "loss": 3.1826, + "theoretical_loss": 3.985209953968753, + "tokens_seen": 424768512 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004400300902708124, + "loss": 3.1011, + "theoretical_loss": 3.985141863735687, + "tokens_seen": 424834048 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044002006018054165, + "loss": 3.1502, + "theoretical_loss": 3.985073786946127, + "tokens_seen": 424899584 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004400100300902708, + "loss": 3.2027, + "theoretical_loss": 3.985005723595345, + "tokens_seen": 424965120 + }, + { + "epoch": 5.01, + "learning_rate": 0.00044, + "loss": 3.2323, + "theoretical_loss": 3.9849376736786173, + "tokens_seen": 425030656 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004399899699097292, + "loss": 3.144, + "theoretical_loss": 3.9848696371912204, + "tokens_seen": 425096192 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004399799398194584, + "loss": 3.0779, + "theoretical_loss": 3.984801614128435, + "tokens_seen": 425161728 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043996990972918756, + "loss": 3.2262, + "theoretical_loss": 3.9847336044855437, + "tokens_seen": 425227264 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043995987963891674, + "loss": 3.2918, + "theoretical_loss": 3.9846656082578304, + "tokens_seen": 425292800 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004399498495486459, + "loss": 3.1138, + "theoretical_loss": 3.984597625440583, + "tokens_seen": 425358336 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043993981945837516, + "loss": 3.2489, + "theoretical_loss": 3.984529656029091, + "tokens_seen": 425423872 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043992978936810434, + "loss": 3.1598, + "theoretical_loss": 3.9844617000186466, + "tokens_seen": 425489408 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004399197592778335, + "loss": 3.2633, + "theoretical_loss": 3.9843937574045434, + "tokens_seen": 425554944 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004399097291875627, + "loss": 3.1541, + "theoretical_loss": 3.9843258281820786, + "tokens_seen": 425620480 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004398996990972919, + "loss": 3.1541, + "theoretical_loss": 3.9842579123465516, + "tokens_seen": 425686016 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004398896690070211, + "loss": 3.0841, + "theoretical_loss": 3.984190009893263, + "tokens_seen": 425751552 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043987963891675024, + "loss": 3.3107, + "theoretical_loss": 3.9841221208175175, + "tokens_seen": 425817088 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004398696088264795, + "loss": 3.1933, + "theoretical_loss": 3.9840542451146206, + "tokens_seen": 425882624 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1042557, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.264849901199341, + "objective/train/theoretical_loss": 3.9839863827798814, + "objective/train/tokens_used": 446408160, + "theoretical_loss": 3.9839863827798814, + "tokens_seen": 425948160 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004398595787362086, + "loss": 3.2217, + "theoretical_loss": 3.9839863827798814, + "tokens_seen": 425948160 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043984954864593784, + "loss": 3.1908, + "theoretical_loss": 3.9839185338086103, + "tokens_seen": 426013696 + }, + { + "epoch": 5.01, + "learning_rate": 0.000439839518555667, + "loss": 3.1, + "theoretical_loss": 3.983850698196121, + "tokens_seen": 426079232 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004398294884653962, + "loss": 3.2457, + "theoretical_loss": 3.9837828759377283, + "tokens_seen": 426144768 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004398194583751254, + "loss": 3.2751, + "theoretical_loss": 3.9837150670287516, + "tokens_seen": 426210304 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004398094282848546, + "loss": 3.1936, + "theoretical_loss": 3.983647271464511, + "tokens_seen": 426275840 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043979939819458375, + "loss": 3.2384, + "theoretical_loss": 3.9835794892403276, + "tokens_seen": 426341376 + }, + { + "epoch": 5.01, + "learning_rate": 0.000439789368104313, + "loss": 3.24, + "theoretical_loss": 3.9835117203515287, + "tokens_seen": 426406912 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004397793380140421, + "loss": 3.1258, + "theoretical_loss": 3.98344396479344, + "tokens_seen": 426472448 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043976930792377134, + "loss": 3.1964, + "theoretical_loss": 3.983376222561393, + "tokens_seen": 426537984 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004397592778335005, + "loss": 3.031, + "theoretical_loss": 3.9833084936507186, + "tokens_seen": 426603520 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004397492477432297, + "loss": 3.152, + "theoretical_loss": 3.983240778056752, + "tokens_seen": 426669056 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004397392176529589, + "loss": 3.0163, + "theoretical_loss": 3.9831730757748294, + "tokens_seen": 426734592 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043972918756268807, + "loss": 3.243, + "theoretical_loss": 3.9831053868002906, + "tokens_seen": 426800128 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043971915747241725, + "loss": 3.1881, + "theoretical_loss": 3.9830377111284765, + "tokens_seen": 426865664 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004397091273821465, + "loss": 3.1248, + "theoretical_loss": 3.9829700487547326, + "tokens_seen": 426931200 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004396990972918756, + "loss": 3.1933, + "theoretical_loss": 3.982902399674403, + "tokens_seen": 426996736 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043968906720160485, + "loss": 3.1581, + "theoretical_loss": 3.9828347638828374, + "tokens_seen": 427062272 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043967903711133403, + "loss": 3.2506, + "theoretical_loss": 3.982767141375387, + "tokens_seen": 427127808 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004396690070210632, + "loss": 3.2308, + "theoretical_loss": 3.9826995321474046, + "tokens_seen": 427193344 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004396589769307924, + "loss": 3.1882, + "theoretical_loss": 3.9826319361942453, + "tokens_seen": 427258880 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043964894684052157, + "loss": 3.3343, + "theoretical_loss": 3.9825643535112683, + "tokens_seen": 427324416 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043963891675025075, + "loss": 3.1962, + "theoretical_loss": 3.982496784093833, + "tokens_seen": 427389952 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043962888665998, + "loss": 3.2231, + "theoretical_loss": 3.9824292279373026, + "tokens_seen": 427455488 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004396188565697091, + "loss": 3.2285, + "theoretical_loss": 3.9823616850370414, + "tokens_seen": 427521024 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1046403, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3761165142059326, + "objective/train/theoretical_loss": 3.9822941553884172, + "objective/train/tokens_used": 448046560, + "theoretical_loss": 3.9822941553884172, + "tokens_seen": 427586560 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043960882647943835, + "loss": 3.3305, + "theoretical_loss": 3.9822941553884172, + "tokens_seen": 427586560 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004395987963891675, + "loss": 3.222, + "theoretical_loss": 3.9822266389867997, + "tokens_seen": 427652096 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004395887662988967, + "loss": 3.0922, + "theoretical_loss": 3.98215913582756, + "tokens_seen": 427717632 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004395787362086259, + "loss": 3.1341, + "theoretical_loss": 3.982091645906073, + "tokens_seen": 427783168 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004395687061183551, + "loss": 3.1849, + "theoretical_loss": 3.9820241692177154, + "tokens_seen": 427848704 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043955867602808426, + "loss": 3.2217, + "theoretical_loss": 3.981956705757866, + "tokens_seen": 427914240 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043954864593781344, + "loss": 3.1731, + "theoretical_loss": 3.981889255521906, + "tokens_seen": 427979776 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004395386158475426, + "loss": 3.223, + "theoretical_loss": 3.9818218185052183, + "tokens_seen": 428045312 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043952858575727185, + "loss": 3.1491, + "theoretical_loss": 3.98175439470319, + "tokens_seen": 428110848 + }, + { + "epoch": 5.01, + "learning_rate": 0.000439518555667001, + "loss": 3.1742, + "theoretical_loss": 3.981686984111209, + "tokens_seen": 428176384 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004395085255767302, + "loss": 3.2573, + "theoretical_loss": 3.9816195867246647, + "tokens_seen": 428241920 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004394984954864594, + "loss": 3.2308, + "theoretical_loss": 3.9815522025389507, + "tokens_seen": 428307456 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004394884653961886, + "loss": 3.2401, + "theoretical_loss": 3.981484831549462, + "tokens_seen": 428372992 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043947843530591776, + "loss": 3.1223, + "theoretical_loss": 3.9814174737515966, + "tokens_seen": 428438528 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043946840521564694, + "loss": 3.1807, + "theoretical_loss": 3.981350129140753, + "tokens_seen": 428504064 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004394583751253761, + "loss": 3.178, + "theoretical_loss": 3.9812827977123346, + "tokens_seen": 428569600 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043944834503510536, + "loss": 3.181, + "theoretical_loss": 3.981215479461745, + "tokens_seen": 428635136 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004394383149448345, + "loss": 3.2586, + "theoretical_loss": 3.9811481743843915, + "tokens_seen": 428700672 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004394282848545637, + "loss": 3.1318, + "theoretical_loss": 3.9810808824756823, + "tokens_seen": 428766208 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043941825476429285, + "loss": 3.1715, + "theoretical_loss": 3.9810136037310286, + "tokens_seen": 428831744 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004394082246740221, + "loss": 3.1637, + "theoretical_loss": 3.9809463381458445, + "tokens_seen": 428897280 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043939819458375126, + "loss": 3.2506, + "theoretical_loss": 3.980879085715546, + "tokens_seen": 428962816 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043938816449348044, + "loss": 3.1449, + "theoretical_loss": 3.98081184643555, + "tokens_seen": 429028352 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004393781344032096, + "loss": 3.2733, + "theoretical_loss": 3.9807446203012793, + "tokens_seen": 429093888 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004393681043129388, + "loss": 3.2069, + "theoretical_loss": 3.9806774073081543, + "tokens_seen": 429159424 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1051156, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8677589893341064, + "objective/train/theoretical_loss": 3.9806102074516017, + "objective/train/tokens_used": 449684960, + "theoretical_loss": 3.9806102074516017, + "tokens_seen": 429224960 + }, + { + "epoch": 5.01, + "learning_rate": 0.000439358074222668, + "loss": 3.1615, + "theoretical_loss": 3.9806102074516017, + "tokens_seen": 429224960 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004393480441323972, + "loss": 3.2367, + "theoretical_loss": 3.9805430207270476, + "tokens_seen": 429290496 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043933801404212635, + "loss": 3.1125, + "theoretical_loss": 3.980475847129922, + "tokens_seen": 429356032 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004393279839518556, + "loss": 3.1633, + "theoretical_loss": 3.980408686655658, + "tokens_seen": 429421568 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043931795386158477, + "loss": 3.2254, + "theoretical_loss": 3.9803415392996877, + "tokens_seen": 429487104 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043930792377131395, + "loss": 3.2589, + "theoretical_loss": 3.9802744050574494, + "tokens_seen": 429552640 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043929789368104313, + "loss": 3.1272, + "theoretical_loss": 3.980207283924381, + "tokens_seen": 429618176 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004392878635907723, + "loss": 3.1924, + "theoretical_loss": 3.980140175895923, + "tokens_seen": 429683712 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004392778335005015, + "loss": 3.1923, + "theoretical_loss": 3.9800730809675198, + "tokens_seen": 429749248 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004392678034102307, + "loss": 3.076, + "theoretical_loss": 3.980005999134617, + "tokens_seen": 429814784 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043925777331995985, + "loss": 3.1612, + "theoretical_loss": 3.9799389303926618, + "tokens_seen": 429880320 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004392477432296891, + "loss": 3.137, + "theoretical_loss": 3.9798718747371042, + "tokens_seen": 429945856 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004392377131394182, + "loss": 3.0934, + "theoretical_loss": 3.9798048321633983, + "tokens_seen": 430011392 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043922768304914745, + "loss": 3.165, + "theoretical_loss": 3.9797378026669965, + "tokens_seen": 430076928 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043921765295887663, + "loss": 3.3046, + "theoretical_loss": 3.979670786243357, + "tokens_seen": 430142464 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004392076228686058, + "loss": 3.1979, + "theoretical_loss": 3.9796037828879394, + "tokens_seen": 430208000 + }, + { + "epoch": 5.01, + "learning_rate": 0.000439197592778335, + "loss": 3.2164, + "theoretical_loss": 3.979536792596205, + "tokens_seen": 430273536 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043918756268806423, + "loss": 3.2101, + "theoretical_loss": 3.9794698153636165, + "tokens_seen": 430339072 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004391775325977934, + "loss": 3.1954, + "theoretical_loss": 3.9794028511856414, + "tokens_seen": 430404608 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004391675025075226, + "loss": 3.2606, + "theoretical_loss": 3.9793359000577473, + "tokens_seen": 430470144 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043915747241725177, + "loss": 3.1952, + "theoretical_loss": 3.9792689619754045, + "tokens_seen": 430535680 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043914744232698095, + "loss": 3.225, + "theoretical_loss": 3.979202036934087, + "tokens_seen": 430601216 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004391374122367102, + "loss": 3.3642, + "theoretical_loss": 3.9791351249292686, + "tokens_seen": 430666752 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004391273821464393, + "loss": 3.2299, + "theoretical_loss": 3.9790682259564276, + "tokens_seen": 430732288 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043911735205616855, + "loss": 3.1822, + "theoretical_loss": 3.9790013400110427, + "tokens_seen": 430797824 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1054236, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2270164489746094, + "objective/train/theoretical_loss": 3.978934467088597, + "objective/train/tokens_used": 451323360, + "theoretical_loss": 3.978934467088597, + "tokens_seen": 430863360 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004391073219658977, + "loss": 3.1923, + "theoretical_loss": 3.978934467088597, + "tokens_seen": 430863360 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004390972918756269, + "loss": 3.1989, + "theoretical_loss": 3.978867607184573, + "tokens_seen": 430928896 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004390872617853561, + "loss": 3.2723, + "theoretical_loss": 3.978800760294458, + "tokens_seen": 430994432 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004390772316950853, + "loss": 3.1814, + "theoretical_loss": 3.9787339264137414, + "tokens_seen": 431059968 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043906720160481446, + "loss": 3.0865, + "theoretical_loss": 3.9786671055379133, + "tokens_seen": 431125504 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043905717151454364, + "loss": 3.3137, + "theoretical_loss": 3.9786002976624664, + "tokens_seen": 431191040 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004390471414242728, + "loss": 3.1426, + "theoretical_loss": 3.9785335027828963, + "tokens_seen": 431256576 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043903711133400205, + "loss": 3.2968, + "theoretical_loss": 3.978466720894701, + "tokens_seen": 431322112 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004390270812437312, + "loss": 3.1225, + "theoretical_loss": 3.9783999519933806, + "tokens_seen": 431387648 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004390170511534604, + "loss": 3.2214, + "theoretical_loss": 3.978333196074436, + "tokens_seen": 431453184 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004390070210631896, + "loss": 3.2269, + "theoretical_loss": 3.978266453133373, + "tokens_seen": 431518720 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004389969909729188, + "loss": 3.0399, + "theoretical_loss": 3.978199723165697, + "tokens_seen": 431584256 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043898696088264796, + "loss": 3.195, + "theoretical_loss": 3.978133006166918, + "tokens_seen": 431649792 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043897693079237714, + "loss": 3.29, + "theoretical_loss": 3.9780663021325458, + "tokens_seen": 431715328 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004389669007021063, + "loss": 3.0299, + "theoretical_loss": 3.9779996110580944, + "tokens_seen": 431780864 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043895687061183556, + "loss": 3.189, + "theoretical_loss": 3.977932932939079, + "tokens_seen": 431846400 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004389468405215647, + "loss": 3.161, + "theoretical_loss": 3.9778662677710175, + "tokens_seen": 431911936 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004389368104312939, + "loss": 3.1689, + "theoretical_loss": 3.97779961554943, + "tokens_seen": 431977472 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043892678034102305, + "loss": 3.139, + "theoretical_loss": 3.977732976269839, + "tokens_seen": 432043008 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004389167502507523, + "loss": 3.2115, + "theoretical_loss": 3.9776663499277687, + "tokens_seen": 432108544 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043890672016048146, + "loss": 3.2044, + "theoretical_loss": 3.977599736518745, + "tokens_seen": 432174080 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043889669007021064, + "loss": 3.254, + "theoretical_loss": 3.977533136038298, + "tokens_seen": 432239616 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004388866599799398, + "loss": 3.1738, + "theoretical_loss": 3.9774665484819582, + "tokens_seen": 432305152 + }, + { + "epoch": 5.01, + "learning_rate": 0.000438876629889669, + "loss": 3.2082, + "theoretical_loss": 3.9773999738452592, + "tokens_seen": 432370688 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004388665997993982, + "loss": 3.2264, + "theoretical_loss": 3.977333412123736, + "tokens_seen": 432436224 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1059038, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.291698694229126, + "objective/train/theoretical_loss": 3.977266863312927, + "objective/train/tokens_used": 452961760, + "theoretical_loss": 3.977266863312927, + "tokens_seen": 432501760 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004388565697091274, + "loss": 3.299, + "theoretical_loss": 3.977266863312927, + "tokens_seen": 432501760 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043884653961885655, + "loss": 3.2203, + "theoretical_loss": 3.977200327408372, + "tokens_seen": 432567296 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004388365095285858, + "loss": 3.2221, + "theoretical_loss": 3.9771338044056135, + "tokens_seen": 432632832 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043882647943831497, + "loss": 3.2127, + "theoretical_loss": 3.977067294300195, + "tokens_seen": 432698368 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043881644934804415, + "loss": 3.1853, + "theoretical_loss": 3.9770007970876637, + "tokens_seen": 432763904 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043880641925777333, + "loss": 3.1247, + "theoretical_loss": 3.976934312763569, + "tokens_seen": 432829440 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004387963891675025, + "loss": 3.0583, + "theoretical_loss": 3.976867841323461, + "tokens_seen": 432894976 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004387863590772317, + "loss": 3.2978, + "theoretical_loss": 3.976801382762894, + "tokens_seen": 432960512 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004387763289869609, + "loss": 3.1995, + "theoretical_loss": 3.9767349370774223, + "tokens_seen": 433026048 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043876629889669005, + "loss": 3.2369, + "theoretical_loss": 3.9766685042626047, + "tokens_seen": 433091584 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004387562688064193, + "loss": 3.1849, + "theoretical_loss": 3.976602084314001, + "tokens_seen": 433157120 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004387462387161484, + "loss": 3.1844, + "theoretical_loss": 3.976535677227172, + "tokens_seen": 433222656 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043873620862587765, + "loss": 3.1218, + "theoretical_loss": 3.976469282997683, + "tokens_seen": 433288192 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043872617853560683, + "loss": 3.0543, + "theoretical_loss": 3.976402901621101, + "tokens_seen": 433353728 + }, + { + "epoch": 5.01, + "learning_rate": 0.000438716148445336, + "loss": 3.1121, + "theoretical_loss": 3.9763365330929936, + "tokens_seen": 433419264 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004387061183550652, + "loss": 3.2083, + "theoretical_loss": 3.9762701774089324, + "tokens_seen": 433484800 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043869608826479443, + "loss": 3.2401, + "theoretical_loss": 3.9762038345644903, + "tokens_seen": 433550336 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043868605817452356, + "loss": 3.1556, + "theoretical_loss": 3.9761375045552425, + "tokens_seen": 433615872 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004386760280842528, + "loss": 3.1476, + "theoretical_loss": 3.9760711873767667, + "tokens_seen": 433681408 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004386659979939819, + "loss": 3.2451, + "theoretical_loss": 3.9760048830246424, + "tokens_seen": 433746944 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043865596790371115, + "loss": 3.2049, + "theoretical_loss": 3.975938591494451, + "tokens_seen": 433812480 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043864593781344033, + "loss": 3.1787, + "theoretical_loss": 3.975872312781777, + "tokens_seen": 433878016 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004386359077231695, + "loss": 3.1637, + "theoretical_loss": 3.975806046882207, + "tokens_seen": 433943552 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004386258776328987, + "loss": 3.241, + "theoretical_loss": 3.9757397937913295, + "tokens_seen": 434009088 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004386158475426279, + "loss": 3.1476, + "theoretical_loss": 3.975673553504734, + "tokens_seen": 434074624 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1062011, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3372769355773926, + "objective/train/theoretical_loss": 3.9756073260180136, + "objective/train/tokens_used": 454600160, + "theoretical_loss": 3.9756073260180136, + "tokens_seen": 434140160 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043860581745235706, + "loss": 3.2062, + "theoretical_loss": 3.9756073260180136, + "tokens_seen": 434140160 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004385957873620863, + "loss": 3.2944, + "theoretical_loss": 3.9755411113267645, + "tokens_seen": 434205696 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004385857572718154, + "loss": 3.2206, + "theoretical_loss": 3.9754749094265827, + "tokens_seen": 434271232 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043857572718154466, + "loss": 3.2364, + "theoretical_loss": 3.975408720313068, + "tokens_seen": 434336768 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004385656970912738, + "loss": 3.2778, + "theoretical_loss": 3.9753425439818217, + "tokens_seen": 434402304 + }, + { + "epoch": 5.01, + "learning_rate": 0.000438555667001003, + "loss": 3.0775, + "theoretical_loss": 3.975276380428447, + "tokens_seen": 434467840 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004385456369107322, + "loss": 3.2798, + "theoretical_loss": 3.975210229648551, + "tokens_seen": 434533376 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004385356068204614, + "loss": 3.0386, + "theoretical_loss": 3.9751440916377407, + "tokens_seen": 434598912 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043852557673019056, + "loss": 3.1349, + "theoretical_loss": 3.9750779663916265, + "tokens_seen": 434664448 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004385155466399198, + "loss": 3.263, + "theoretical_loss": 3.975011853905821, + "tokens_seen": 434729984 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004385055165496489, + "loss": 3.1675, + "theoretical_loss": 3.9749457541759385, + "tokens_seen": 434795520 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043849548645937816, + "loss": 3.2635, + "theoretical_loss": 3.9748796671975963, + "tokens_seen": 434861056 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004384854563691073, + "loss": 3.1956, + "theoretical_loss": 3.9748135929664126, + "tokens_seen": 434926592 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004384754262788365, + "loss": 3.2542, + "theoretical_loss": 3.974747531478009, + "tokens_seen": 434992128 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004384653961885657, + "loss": 3.2076, + "theoretical_loss": 3.974681482728008, + "tokens_seen": 435057664 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004384553660982949, + "loss": 3.1993, + "theoretical_loss": 3.9746154467120354, + "tokens_seen": 435123200 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043844533600802407, + "loss": 3.195, + "theoretical_loss": 3.974549423425719, + "tokens_seen": 435188736 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043843530591775325, + "loss": 3.1632, + "theoretical_loss": 3.974483412864688, + "tokens_seen": 435254272 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004384252758274825, + "loss": 3.2349, + "theoretical_loss": 3.974417415024574, + "tokens_seen": 435319808 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043841524573721166, + "loss": 3.1481, + "theoretical_loss": 3.974351429901012, + "tokens_seen": 435385344 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043840521564694084, + "loss": 3.2304, + "theoretical_loss": 3.974285457489638, + "tokens_seen": 435450880 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043839518555667, + "loss": 3.1465, + "theoretical_loss": 3.9742194977860894, + "tokens_seen": 435516416 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004383851554663992, + "loss": 3.1396, + "theoretical_loss": 3.9741535507860073, + "tokens_seen": 435581952 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004383751253761284, + "loss": 3.2408, + "theoretical_loss": 3.974087616485034, + "tokens_seen": 435647488 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004383650952858576, + "loss": 3.2129, + "theoretical_loss": 3.9740216948788145, + "tokens_seen": 435713024 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1065680, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2247962951660156, + "objective/train/theoretical_loss": 3.973955785962996, + "objective/train/tokens_used": 456238560, + "theoretical_loss": 3.973955785962996, + "tokens_seen": 435778560 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043835506519558675, + "loss": 3.1903, + "theoretical_loss": 3.973955785962996, + "tokens_seen": 435778560 + }, + { + "epoch": 5.01, + "learning_rate": 0.000438345035105316, + "loss": 3.1, + "theoretical_loss": 3.9738898897332273, + "tokens_seen": 435844096 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043833500501504517, + "loss": 3.1559, + "theoretical_loss": 3.9738240061851604, + "tokens_seen": 435909632 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043832497492477435, + "loss": 3.2467, + "theoretical_loss": 3.973758135314447, + "tokens_seen": 435975168 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043831494483450353, + "loss": 3.263, + "theoretical_loss": 3.9736922771167436, + "tokens_seen": 436040704 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004383049147442327, + "loss": 3.1367, + "theoretical_loss": 3.9736264315877072, + "tokens_seen": 436106240 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004382948846539619, + "loss": 3.1368, + "theoretical_loss": 3.9735605987229987, + "tokens_seen": 436171776 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004382848545636911, + "loss": 3.144, + "theoretical_loss": 3.973494778518279, + "tokens_seen": 436237312 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043827482447342025, + "loss": 3.2179, + "theoretical_loss": 3.9734289709692128, + "tokens_seen": 436302848 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004382647943831495, + "loss": 3.2214, + "theoretical_loss": 3.973363176071466, + "tokens_seen": 436368384 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004382547642928786, + "loss": 3.2289, + "theoretical_loss": 3.9732973938207072, + "tokens_seen": 436433920 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043824473420260785, + "loss": 3.1612, + "theoretical_loss": 3.9732316242126062, + "tokens_seen": 436499456 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043823470411233703, + "loss": 3.2566, + "theoretical_loss": 3.9731658672428365, + "tokens_seen": 436564992 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004382246740220662, + "loss": 3.2457, + "theoretical_loss": 3.9731001229070717, + "tokens_seen": 436630528 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004382146439317954, + "loss": 3.2, + "theoretical_loss": 3.97303439120099, + "tokens_seen": 436696064 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043820461384152463, + "loss": 3.1928, + "theoretical_loss": 3.9729686721202695, + "tokens_seen": 436761600 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043819458375125376, + "loss": 3.2747, + "theoretical_loss": 3.972902965660591, + "tokens_seen": 436827136 + }, + { + "epoch": 5.01, + "learning_rate": 0.000438184553660983, + "loss": 3.0498, + "theoretical_loss": 3.972837271817639, + "tokens_seen": 436892672 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004381745235707121, + "loss": 3.2856, + "theoretical_loss": 3.9727715905870973, + "tokens_seen": 436958208 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043816449348044135, + "loss": 3.1976, + "theoretical_loss": 3.9727059219646548, + "tokens_seen": 437023744 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043815446339017053, + "loss": 3.2072, + "theoretical_loss": 3.9726402659460005, + "tokens_seen": 437089280 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004381444332998997, + "loss": 3.0541, + "theoretical_loss": 3.9725746225268255, + "tokens_seen": 437154816 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004381344032096289, + "loss": 3.2668, + "theoretical_loss": 3.972508991702825, + "tokens_seen": 437220352 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004381243731193581, + "loss": 3.1503, + "theoretical_loss": 3.9724433734696936, + "tokens_seen": 437285888 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043811434302908726, + "loss": 3.0773, + "theoretical_loss": 3.97237776782313, + "tokens_seen": 437351424 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1070828, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.179380416870117, + "objective/train/theoretical_loss": 3.9723121747588346, + "objective/train/tokens_used": 457876960, + "theoretical_loss": 3.9723121747588346, + "tokens_seen": 437416960 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004381043129388165, + "loss": 3.2341, + "theoretical_loss": 3.9723121747588346, + "tokens_seen": 437416960 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004380942828485456, + "loss": 3.1196, + "theoretical_loss": 3.9722465942725087, + "tokens_seen": 437482496 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043808425275827486, + "loss": 3.1581, + "theoretical_loss": 3.9721810263598583, + "tokens_seen": 437548032 + }, + { + "epoch": 5.01, + "learning_rate": 0.000438074222668004, + "loss": 3.1917, + "theoretical_loss": 3.972115471016589, + "tokens_seen": 437613568 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004380641925777332, + "loss": 3.2205, + "theoretical_loss": 3.972049928238409, + "tokens_seen": 437679104 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004380541624874624, + "loss": 3.2129, + "theoretical_loss": 3.9719843980210294, + "tokens_seen": 437744640 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004380441323971916, + "loss": 3.1253, + "theoretical_loss": 3.971918880360164, + "tokens_seen": 437810176 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043803410230692076, + "loss": 3.216, + "theoretical_loss": 3.971853375251526, + "tokens_seen": 437875712 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043802407221665, + "loss": 3.2792, + "theoretical_loss": 3.9717878826908333, + "tokens_seen": 437941248 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004380140421263791, + "loss": 3.1389, + "theoretical_loss": 3.971722402673806, + "tokens_seen": 438006784 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043800401203610836, + "loss": 3.1373, + "theoretical_loss": 3.9716569351961635, + "tokens_seen": 438072320 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004379939819458375, + "loss": 3.1349, + "theoretical_loss": 3.9715914802536307, + "tokens_seen": 438137856 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004379839518555667, + "loss": 3.2484, + "theoretical_loss": 3.971526037841932, + "tokens_seen": 438203392 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004379739217652959, + "loss": 3.1833, + "theoretical_loss": 3.9714606079567956, + "tokens_seen": 438268928 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004379638916750251, + "loss": 3.2916, + "theoretical_loss": 3.971395190593951, + "tokens_seen": 438334464 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043795386158475427, + "loss": 3.1717, + "theoretical_loss": 3.97132978574913, + "tokens_seen": 438400000 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043794383149448345, + "loss": 3.1943, + "theoretical_loss": 3.971264393418066, + "tokens_seen": 438465536 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043793380140421263, + "loss": 3.1673, + "theoretical_loss": 3.9711990135964954, + "tokens_seen": 438531072 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043792377131394186, + "loss": 3.1558, + "theoretical_loss": 3.971133646280156, + "tokens_seen": 438596608 + }, + { + "epoch": 5.01, + "learning_rate": 0.000437913741223671, + "loss": 3.1817, + "theoretical_loss": 3.971068291464788, + "tokens_seen": 438662144 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004379037111334002, + "loss": 3.1448, + "theoretical_loss": 3.971002949146133, + "tokens_seen": 438727680 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043789368104312935, + "loss": 3.1943, + "theoretical_loss": 3.970937619319937, + "tokens_seen": 438793216 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004378836509528586, + "loss": 3.1882, + "theoretical_loss": 3.9708723019819434, + "tokens_seen": 438858752 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043787362086258777, + "loss": 3.1539, + "theoretical_loss": 3.9708069971279034, + "tokens_seen": 438924288 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043786359077231695, + "loss": 3.206, + "theoretical_loss": 3.9707417047535665, + "tokens_seen": 438989824 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1073513, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1781198978424072, + "objective/train/theoretical_loss": 3.970676424854685, + "objective/train/tokens_used": 459515360, + "theoretical_loss": 3.970676424854685, + "tokens_seen": 439055360 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043785356068204613, + "loss": 3.2143, + "theoretical_loss": 3.970676424854685, + "tokens_seen": 439055360 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043784353059177537, + "loss": 3.2744, + "theoretical_loss": 3.970611157427014, + "tokens_seen": 439120896 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004378335005015045, + "loss": 3.0553, + "theoretical_loss": 3.97054590246631, + "tokens_seen": 439186432 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043782347041123373, + "loss": 3.2465, + "theoretical_loss": 3.970480659968332, + "tokens_seen": 439251968 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043781344032096286, + "loss": 3.261, + "theoretical_loss": 3.970415429928841, + "tokens_seen": 439317504 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004378034102306921, + "loss": 3.1927, + "theoretical_loss": 3.9703502123436, + "tokens_seen": 439383040 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043779338014042127, + "loss": 3.0905, + "theoretical_loss": 3.9702850072083735, + "tokens_seen": 439448576 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043778335005015045, + "loss": 3.2427, + "theoretical_loss": 3.970219814518929, + "tokens_seen": 439514112 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043777331995987963, + "loss": 3.3016, + "theoretical_loss": 3.9701546342710357, + "tokens_seen": 439579648 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004377632898696088, + "loss": 3.2045, + "theoretical_loss": 3.9700894664604656, + "tokens_seen": 439645184 + }, + { + "epoch": 5.01, + "learning_rate": 0.000437753259779338, + "loss": 3.2166, + "theoretical_loss": 3.970024311082991, + "tokens_seen": 439710720 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043774322968906723, + "loss": 3.2524, + "theoretical_loss": 3.9699591681343875, + "tokens_seen": 439776256 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043773319959879636, + "loss": 3.2243, + "theoretical_loss": 3.969894037610432, + "tokens_seen": 439841792 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004377231695085256, + "loss": 3.2066, + "theoretical_loss": 3.969828919506906, + "tokens_seen": 439907328 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004377131394182547, + "loss": 3.1723, + "theoretical_loss": 3.969763813819589, + "tokens_seen": 439972864 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043770310932798396, + "loss": 3.2363, + "theoretical_loss": 3.969698720544266, + "tokens_seen": 440038400 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043769307923771314, + "loss": 3.2015, + "theoretical_loss": 3.9696336396767222, + "tokens_seen": 440103936 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004376830491474423, + "loss": 3.2455, + "theoretical_loss": 3.969568571212745, + "tokens_seen": 440169472 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043767301905717155, + "loss": 3.2446, + "theoretical_loss": 3.9695035151481246, + "tokens_seen": 440235008 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043766298896690073, + "loss": 3.2053, + "theoretical_loss": 3.969438471478653, + "tokens_seen": 440300544 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004376529588766299, + "loss": 3.2386, + "theoretical_loss": 3.9693734402001235, + "tokens_seen": 440366080 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004376429287863591, + "loss": 3.1746, + "theoretical_loss": 3.969308421308333, + "tokens_seen": 440431616 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004376328986960883, + "loss": 3.2339, + "theoretical_loss": 3.969243414799079, + "tokens_seen": 440497152 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043762286860581746, + "loss": 3.2705, + "theoretical_loss": 3.969178420668162, + "tokens_seen": 440562688 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004376128385155467, + "loss": 3.0612, + "theoretical_loss": 3.969113438911384, + "tokens_seen": 440628224 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1078719, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.25472092628479, + "objective/train/theoretical_loss": 3.969048469524548, + "objective/train/tokens_used": 461153760, + "theoretical_loss": 3.969048469524548, + "tokens_seen": 440693760 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004376028084252758, + "loss": 3.2349, + "theoretical_loss": 3.969048469524548, + "tokens_seen": 440693760 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043759277833500506, + "loss": 3.2428, + "theoretical_loss": 3.968983512503462, + "tokens_seen": 440759296 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004375827482447342, + "loss": 3.2296, + "theoretical_loss": 3.968918567843933, + "tokens_seen": 440824832 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004375727181544634, + "loss": 3.2239, + "theoretical_loss": 3.968853635541773, + "tokens_seen": 440890368 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004375626880641926, + "loss": 3.2527, + "theoretical_loss": 3.9687887155927926, + "tokens_seen": 440955904 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004375526579739218, + "loss": 3.0284, + "theoretical_loss": 3.9687238079928067, + "tokens_seen": 441021440 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043754262788365096, + "loss": 3.2279, + "theoretical_loss": 3.968658912737632, + "tokens_seen": 441086976 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004375325977933802, + "loss": 3.2858, + "theoretical_loss": 3.968594029823087, + "tokens_seen": 441152512 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004375225677031093, + "loss": 3.1872, + "theoretical_loss": 3.9685291592449916, + "tokens_seen": 441218048 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043751253761283856, + "loss": 3.2637, + "theoretical_loss": 3.9684643009991696, + "tokens_seen": 441283584 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004375025075225677, + "loss": 3.2286, + "theoretical_loss": 3.9683994550814443, + "tokens_seen": 441349120 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004374924774322969, + "loss": 3.1594, + "theoretical_loss": 3.968334621487643, + "tokens_seen": 441414656 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004374824473420261, + "loss": 3.2065, + "theoretical_loss": 3.9682698002135943, + "tokens_seen": 441480192 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004374724172517553, + "loss": 3.225, + "theoretical_loss": 3.9682049912551287, + "tokens_seen": 441545728 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043746238716148447, + "loss": 3.1456, + "theoretical_loss": 3.9681401946080794, + "tokens_seen": 441611264 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043745235707121365, + "loss": 3.264, + "theoretical_loss": 3.9680754102682805, + "tokens_seen": 441676800 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043744232698094283, + "loss": 3.1649, + "theoretical_loss": 3.9680106382315694, + "tokens_seen": 441742336 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043743229689067206, + "loss": 3.1757, + "theoretical_loss": 3.967945878493784, + "tokens_seen": 441807872 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004374222668004012, + "loss": 3.1698, + "theoretical_loss": 3.967881131050766, + "tokens_seen": 441873408 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004374122367101304, + "loss": 3.2469, + "theoretical_loss": 3.9678163958983577, + "tokens_seen": 441938944 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043740220661985955, + "loss": 3.2787, + "theoretical_loss": 3.9677516730324043, + "tokens_seen": 442004480 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004373921765295888, + "loss": 3.148, + "theoretical_loss": 3.9676869624487527, + "tokens_seen": 442070016 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043738214643931797, + "loss": 3.2316, + "theoretical_loss": 3.9676222641432517, + "tokens_seen": 442135552 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043737211634904715, + "loss": 3.2301, + "theoretical_loss": 3.967557578111752, + "tokens_seen": 442201088 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043736208625877633, + "loss": 3.088, + "theoretical_loss": 3.967492904350107, + "tokens_seen": 442266624 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1081597, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2449395656585693, + "objective/train/theoretical_loss": 3.9674282428541714, + "objective/train/tokens_used": 462792160, + "theoretical_loss": 3.9674282428541714, + "tokens_seen": 442332160 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043735205616850557, + "loss": 3.2035, + "theoretical_loss": 3.9674282428541714, + "tokens_seen": 442332160 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004373420260782347, + "loss": 3.2512, + "theoretical_loss": 3.967363593619802, + "tokens_seen": 442397696 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043733199598796393, + "loss": 3.1566, + "theoretical_loss": 3.9672989566428583, + "tokens_seen": 442463232 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043732196589769306, + "loss": 3.1658, + "theoretical_loss": 3.967234331919201, + "tokens_seen": 442528768 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004373119358074223, + "loss": 3.1888, + "theoretical_loss": 3.967169719444693, + "tokens_seen": 442594304 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043730190571715147, + "loss": 3.2043, + "theoretical_loss": 3.9671051192151987, + "tokens_seen": 442659840 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043729187562688065, + "loss": 3.2344, + "theoretical_loss": 3.967040531226587, + "tokens_seen": 442725376 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043728184553660983, + "loss": 3.2574, + "theoretical_loss": 3.966975955474725, + "tokens_seen": 442790912 + }, + { + "epoch": 5.01, + "learning_rate": 0.000437271815446339, + "loss": 3.2089, + "theoretical_loss": 3.9669113919554846, + "tokens_seen": 442856448 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004372617853560682, + "loss": 3.1264, + "theoretical_loss": 3.966846840664739, + "tokens_seen": 442921984 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043725175526579743, + "loss": 3.1577, + "theoretical_loss": 3.9667823015983625, + "tokens_seen": 442987520 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043724172517552656, + "loss": 3.2167, + "theoretical_loss": 3.966717774752233, + "tokens_seen": 443053056 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004372316950852558, + "loss": 3.0641, + "theoretical_loss": 3.966653260122229, + "tokens_seen": 443118592 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004372216649949849, + "loss": 3.1813, + "theoretical_loss": 3.966588757704232, + "tokens_seen": 443184128 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043721163490471416, + "loss": 3.0922, + "theoretical_loss": 3.966524267494125, + "tokens_seen": 443249664 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043720160481444334, + "loss": 3.1375, + "theoretical_loss": 3.966459789487792, + "tokens_seen": 443315200 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004371915747241725, + "loss": 3.2024, + "theoretical_loss": 3.9663953236811214, + "tokens_seen": 443380736 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004371815446339017, + "loss": 3.0805, + "theoretical_loss": 3.9663308700700015, + "tokens_seen": 443446272 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043717151454363093, + "loss": 3.1148, + "theoretical_loss": 3.9662664286503233, + "tokens_seen": 443511808 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043716148445336006, + "loss": 3.1739, + "theoretical_loss": 3.9662019994179802, + "tokens_seen": 443577344 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004371514543630893, + "loss": 3.1848, + "theoretical_loss": 3.9661375823688667, + "tokens_seen": 443642880 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004371414242728184, + "loss": 3.2279, + "theoretical_loss": 3.96607317749888, + "tokens_seen": 443708416 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043713139418254766, + "loss": 3.2894, + "theoretical_loss": 3.96600878480392, + "tokens_seen": 443773952 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043712136409227684, + "loss": 3.2259, + "theoretical_loss": 3.965944404279886, + "tokens_seen": 443839488 + }, + { + "epoch": 5.01, + "learning_rate": 0.000437111334002006, + "loss": 3.189, + "theoretical_loss": 3.965880035922682, + "tokens_seen": 443905024 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1085069, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1931025981903076, + "objective/train/theoretical_loss": 3.965815679728213, + "objective/train/tokens_used": 464430560, + "theoretical_loss": 3.965815679728213, + "tokens_seen": 443970560 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004371013039117352, + "loss": 3.2443, + "theoretical_loss": 3.965815679728213, + "tokens_seen": 443970560 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004370912738214644, + "loss": 3.1384, + "theoretical_loss": 3.965751335692385, + "tokens_seen": 444036096 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043708124373119356, + "loss": 3.1943, + "theoretical_loss": 3.965687003811108, + "tokens_seen": 444101632 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004370712136409228, + "loss": 3.3071, + "theoretical_loss": 3.9656226840802917, + "tokens_seen": 444167168 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004370611835506519, + "loss": 3.155, + "theoretical_loss": 3.96555837649585, + "tokens_seen": 444232704 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043705115346038116, + "loss": 3.1708, + "theoretical_loss": 3.965494081053698, + "tokens_seen": 444298240 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004370411233701103, + "loss": 3.1651, + "theoretical_loss": 3.965429797749751, + "tokens_seen": 444363776 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004370310932798395, + "loss": 3.0596, + "theoretical_loss": 3.9653655265799292, + "tokens_seen": 444429312 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004370210631895687, + "loss": 3.2004, + "theoretical_loss": 3.965301267540153, + "tokens_seen": 444494848 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004370110330992979, + "loss": 3.1497, + "theoretical_loss": 3.9652370206263443, + "tokens_seen": 444560384 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043700100300902707, + "loss": 3.2486, + "theoretical_loss": 3.9651727858344286, + "tokens_seen": 444625920 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004369909729187563, + "loss": 3.1578, + "theoretical_loss": 3.9651085631603324, + "tokens_seen": 444691456 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043698094282848543, + "loss": 3.1689, + "theoretical_loss": 3.965044352599985, + "tokens_seen": 444756992 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043697091273821467, + "loss": 3.1948, + "theoretical_loss": 3.964980154149316, + "tokens_seen": 444822528 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004369608826479438, + "loss": 3.077, + "theoretical_loss": 3.964915967804258, + "tokens_seen": 444888064 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043695085255767303, + "loss": 3.1945, + "theoretical_loss": 3.964851793560746, + "tokens_seen": 444953600 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004369408224674022, + "loss": 3.1278, + "theoretical_loss": 3.9647876314147164, + "tokens_seen": 445019136 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004369307923771314, + "loss": 3.203, + "theoretical_loss": 3.9647234813621077, + "tokens_seen": 445084672 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004369207622868606, + "loss": 3.2416, + "theoretical_loss": 3.9646593433988606, + "tokens_seen": 445150208 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043691073219658975, + "loss": 3.1438, + "theoretical_loss": 3.964595217520917, + "tokens_seen": 445215744 + }, + { + "epoch": 5.01, + "learning_rate": 0.000436900702106319, + "loss": 3.2602, + "theoretical_loss": 3.964531103724221, + "tokens_seen": 445281280 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043689067201604817, + "loss": 3.2825, + "theoretical_loss": 3.9644670020047195, + "tokens_seen": 445346816 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043688064192577735, + "loss": 3.2254, + "theoretical_loss": 3.9644029123583606, + "tokens_seen": 445412352 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043687061183550653, + "loss": 3.136, + "theoretical_loss": 3.9643388347810946, + "tokens_seen": 445477888 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043686058174523577, + "loss": 3.1925, + "theoretical_loss": 3.9642747692688736, + "tokens_seen": 445543424 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1088204, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.172414541244507, + "objective/train/theoretical_loss": 3.964210715817651, + "objective/train/tokens_used": 466068960, + "theoretical_loss": 3.964210715817651, + "tokens_seen": 445608960 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004368505516549649, + "loss": 3.1675, + "theoretical_loss": 3.964210715817651, + "tokens_seen": 445608960 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043684052156469413, + "loss": 3.2227, + "theoretical_loss": 3.964146674423384, + "tokens_seen": 445674496 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043683049147442326, + "loss": 3.0079, + "theoretical_loss": 3.964082645082031, + "tokens_seen": 445740032 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004368204613841525, + "loss": 3.1984, + "theoretical_loss": 3.9640186277895504, + "tokens_seen": 445805568 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043681043129388167, + "loss": 3.2706, + "theoretical_loss": 3.9639546225419044, + "tokens_seen": 445871104 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043680040120361085, + "loss": 3.18, + "theoretical_loss": 3.963890629335058, + "tokens_seen": 445936640 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043679037111334003, + "loss": 3.2347, + "theoretical_loss": 3.963826648164976, + "tokens_seen": 446002176 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004367803410230692, + "loss": 3.2334, + "theoretical_loss": 3.9637626790276266, + "tokens_seen": 446067712 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004367703109327984, + "loss": 3.1772, + "theoretical_loss": 3.9636987219189797, + "tokens_seen": 446133248 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043676028084252763, + "loss": 3.2051, + "theoretical_loss": 3.963634776835006, + "tokens_seen": 446198784 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043675025075225676, + "loss": 3.2379, + "theoretical_loss": 3.9635708437716803, + "tokens_seen": 446264320 + }, + { + "epoch": 5.01, + "learning_rate": 0.000436740220661986, + "loss": 3.232, + "theoretical_loss": 3.963506922724977, + "tokens_seen": 446329856 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004367301905717151, + "loss": 3.1091, + "theoretical_loss": 3.963443013690875, + "tokens_seen": 446395392 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043672016048144436, + "loss": 3.2068, + "theoretical_loss": 3.963379116665352, + "tokens_seen": 446460928 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043671013039117354, + "loss": 3.1776, + "theoretical_loss": 3.9633152316443905, + "tokens_seen": 446526464 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004367001003009027, + "loss": 3.1748, + "theoretical_loss": 3.963251358623973, + "tokens_seen": 446592000 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004366900702106319, + "loss": 3.0775, + "theoretical_loss": 3.9631874976000856, + "tokens_seen": 446657536 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043668004012036113, + "loss": 3.1148, + "theoretical_loss": 3.9631236485687147, + "tokens_seen": 446723072 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043667001003009026, + "loss": 3.2482, + "theoretical_loss": 3.9630598115258495, + "tokens_seen": 446788608 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004366599799398195, + "loss": 3.1716, + "theoretical_loss": 3.9629959864674813, + "tokens_seen": 446854144 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004366499498495486, + "loss": 3.1812, + "theoretical_loss": 3.9629321733896026, + "tokens_seen": 446919680 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043663991975927786, + "loss": 3.0381, + "theoretical_loss": 3.9628683722882085, + "tokens_seen": 446985216 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043662988966900704, + "loss": 3.2627, + "theoretical_loss": 3.9628045831592953, + "tokens_seen": 447050752 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004366198595787362, + "loss": 3.2077, + "theoretical_loss": 3.962740805998863, + "tokens_seen": 447116288 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004366098294884654, + "loss": 3.1715, + "theoretical_loss": 3.9626770408029106, + "tokens_seen": 447181824 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1093045, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0211753845214844, + "objective/train/theoretical_loss": 3.962613287567442, + "objective/train/tokens_used": 467707360, + "theoretical_loss": 3.962613287567442, + "tokens_seen": 447247360 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004365997993981946, + "loss": 2.9833, + "theoretical_loss": 3.962613287567442, + "tokens_seen": 447247360 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043658976930792376, + "loss": 3.2536, + "theoretical_loss": 3.962549546288461, + "tokens_seen": 447312896 + }, + { + "epoch": 5.01, + "learning_rate": 0.000436579739217653, + "loss": 3.2015, + "theoretical_loss": 3.9624858169619737, + "tokens_seen": 447378432 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043656970912738213, + "loss": 3.1755, + "theoretical_loss": 3.9624220995839883, + "tokens_seen": 447443968 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043655967903711136, + "loss": 3.249, + "theoretical_loss": 3.962358394150516, + "tokens_seen": 447509504 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004365496489468405, + "loss": 3.2928, + "theoretical_loss": 3.9622947006575684, + "tokens_seen": 447575040 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004365396188565697, + "loss": 3.0787, + "theoretical_loss": 3.9622310191011594, + "tokens_seen": 447640576 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004365295887662989, + "loss": 3.1878, + "theoretical_loss": 3.962167349477305, + "tokens_seen": 447706112 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004365195586760281, + "loss": 3.1625, + "theoretical_loss": 3.9621036917820236, + "tokens_seen": 447771648 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043650952858575727, + "loss": 3.2131, + "theoretical_loss": 3.9620400460113343, + "tokens_seen": 447837184 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004364994984954865, + "loss": 3.219, + "theoretical_loss": 3.961976412161259, + "tokens_seen": 447902720 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043648946840521563, + "loss": 3.2025, + "theoretical_loss": 3.9619127902278217, + "tokens_seen": 447968256 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043647943831494487, + "loss": 3.1505, + "theoretical_loss": 3.9618491802070475, + "tokens_seen": 448033792 + }, + { + "epoch": 5.01, + "learning_rate": 0.000436469408224674, + "loss": 3.1685, + "theoretical_loss": 3.9617855820949637, + "tokens_seen": 448099328 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043645937813440323, + "loss": 3.2085, + "theoretical_loss": 3.9617219958876, + "tokens_seen": 448164864 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004364493480441324, + "loss": 3.2443, + "theoretical_loss": 3.961658421580988, + "tokens_seen": 448230400 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004364393179538616, + "loss": 3.1625, + "theoretical_loss": 3.96159485917116, + "tokens_seen": 448295936 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043642928786359077, + "loss": 3.1345, + "theoretical_loss": 3.9615313086541506, + "tokens_seen": 448361472 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043641925777331995, + "loss": 3.2079, + "theoretical_loss": 3.9614677700259984, + "tokens_seen": 448427008 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043640922768304913, + "loss": 3.1936, + "theoretical_loss": 3.961404243282741, + "tokens_seen": 448492544 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043639919759277837, + "loss": 3.1498, + "theoretical_loss": 3.9613407284204194, + "tokens_seen": 448558080 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004363891675025075, + "loss": 3.1854, + "theoretical_loss": 3.9612772254350763, + "tokens_seen": 448623616 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043637913741223673, + "loss": 3.1531, + "theoretical_loss": 3.9612137343227567, + "tokens_seen": 448689152 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004363691073219659, + "loss": 3.1897, + "theoretical_loss": 3.9611502550795064, + "tokens_seen": 448754688 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004363590772316951, + "loss": 3.1523, + "theoretical_loss": 3.9610867877013733, + "tokens_seen": 448820224 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1096048, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1479835510253906, + "objective/train/theoretical_loss": 3.961023332184409, + "objective/train/tokens_used": 469345760, + "theoretical_loss": 3.961023332184409, + "tokens_seen": 448885760 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004363490471414243, + "loss": 3.2738, + "theoretical_loss": 3.961023332184409, + "tokens_seen": 448885760 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043633901705115346, + "loss": 3.2665, + "theoretical_loss": 3.9609598885246644, + "tokens_seen": 448951296 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043632898696088264, + "loss": 3.168, + "theoretical_loss": 3.960896456718194, + "tokens_seen": 449016832 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043631895687061187, + "loss": 3.2044, + "theoretical_loss": 3.960833036761054, + "tokens_seen": 449082368 + }, + { + "epoch": 5.01, + "learning_rate": 0.000436308926780341, + "loss": 3.1184, + "theoretical_loss": 3.9607696286493015, + "tokens_seen": 449147904 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043629889669007023, + "loss": 3.0419, + "theoretical_loss": 3.960706232378996, + "tokens_seen": 449213440 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043628886659979936, + "loss": 3.3184, + "theoretical_loss": 3.9606428479461995, + "tokens_seen": 449278976 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004362788365095286, + "loss": 3.176, + "theoretical_loss": 3.960579475346975, + "tokens_seen": 449344512 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004362688064192578, + "loss": 3.1088, + "theoretical_loss": 3.9605161145773895, + "tokens_seen": 449410048 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043625877632898696, + "loss": 3.0244, + "theoretical_loss": 3.960452765633508, + "tokens_seen": 449475584 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043624874623871614, + "loss": 3.1816, + "theoretical_loss": 3.9603894285114007, + "tokens_seen": 449541120 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004362387161484453, + "loss": 3.139, + "theoretical_loss": 3.960326103207138, + "tokens_seen": 449606656 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004362286860581745, + "loss": 3.2731, + "theoretical_loss": 3.9602627897167935, + "tokens_seen": 449672192 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043621865596790374, + "loss": 3.1538, + "theoretical_loss": 3.960199488036441, + "tokens_seen": 449737728 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043620862587763286, + "loss": 3.2604, + "theoretical_loss": 3.9601361981621572, + "tokens_seen": 449803264 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004361985957873621, + "loss": 3.1421, + "theoretical_loss": 3.9600729200900218, + "tokens_seen": 449868800 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004361885656970913, + "loss": 3.183, + "theoretical_loss": 3.9600096538161136, + "tokens_seen": 449934336 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043617853560682046, + "loss": 3.1124, + "theoretical_loss": 3.9599463993365154, + "tokens_seen": 449999872 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004361685055165497, + "loss": 3.0959, + "theoretical_loss": 3.9598831566473116, + "tokens_seen": 450065408 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004361584754262788, + "loss": 3.1385, + "theoretical_loss": 3.9598199257445876, + "tokens_seen": 450130944 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043614844533600806, + "loss": 3.1835, + "theoretical_loss": 3.9597567066244315, + "tokens_seen": 450196480 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043613841524573724, + "loss": 3.1798, + "theoretical_loss": 3.959693499282933, + "tokens_seen": 450262016 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004361283851554664, + "loss": 3.188, + "theoretical_loss": 3.9596303037161835, + "tokens_seen": 450327552 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004361183550651956, + "loss": 3.1557, + "theoretical_loss": 3.959567119920276, + "tokens_seen": 450393088 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004361083249749248, + "loss": 3.1129, + "theoretical_loss": 3.9595039478913066, + "tokens_seen": 450458624 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1099940, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3130602836608887, + "objective/train/theoretical_loss": 3.9594407876253728, + "objective/train/tokens_used": 470984160, + "theoretical_loss": 3.9594407876253728, + "tokens_seen": 450524160 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043609829488465397, + "loss": 3.2035, + "theoretical_loss": 3.9594407876253728, + "tokens_seen": 450524160 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004360882647943832, + "loss": 3.013, + "theoretical_loss": 3.9593776391185718, + "tokens_seen": 450589696 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043607823470411233, + "loss": 3.2039, + "theoretical_loss": 3.9593145023670067, + "tokens_seen": 450655232 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043606820461384156, + "loss": 3.2455, + "theoretical_loss": 3.959251377366778, + "tokens_seen": 450720768 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004360581745235707, + "loss": 3.1246, + "theoretical_loss": 3.9591882641139917, + "tokens_seen": 450786304 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004360481444332999, + "loss": 3.2222, + "theoretical_loss": 3.959125162604754, + "tokens_seen": 450851840 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004360381143430291, + "loss": 3.1693, + "theoretical_loss": 3.9590620728351733, + "tokens_seen": 450917376 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004360280842527583, + "loss": 3.2609, + "theoretical_loss": 3.9589989948013593, + "tokens_seen": 450982912 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043601805416248747, + "loss": 3.2244, + "theoretical_loss": 3.9589359284994243, + "tokens_seen": 451048448 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004360080240722167, + "loss": 3.2792, + "theoretical_loss": 3.9588728739254826, + "tokens_seen": 451113984 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043599799398194583, + "loss": 3.1368, + "theoretical_loss": 3.9588098310756488, + "tokens_seen": 451179520 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043598796389167507, + "loss": 3.1582, + "theoretical_loss": 3.958746799946041, + "tokens_seen": 451245056 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004359779338014042, + "loss": 3.1626, + "theoretical_loss": 3.958683780532779, + "tokens_seen": 451310592 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043596790371113343, + "loss": 3.1353, + "theoretical_loss": 3.9586207728319835, + "tokens_seen": 451376128 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004359578736208626, + "loss": 3.2544, + "theoretical_loss": 3.958557776839778, + "tokens_seen": 451441664 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004359478435305918, + "loss": 3.1605, + "theoretical_loss": 3.9584947925522873, + "tokens_seen": 451507200 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043593781344032097, + "loss": 3.2035, + "theoretical_loss": 3.9584318199656385, + "tokens_seen": 451572736 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043592778335005015, + "loss": 3.2044, + "theoretical_loss": 3.9583688590759594, + "tokens_seen": 451638272 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043591775325977933, + "loss": 3.1901, + "theoretical_loss": 3.9583059098793814, + "tokens_seen": 451703808 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043590772316950857, + "loss": 3.2315, + "theoretical_loss": 3.9582429723720365, + "tokens_seen": 451769344 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004358976930792377, + "loss": 3.1986, + "theoretical_loss": 3.9581800465500585, + "tokens_seen": 451834880 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043588766298896693, + "loss": 3.1318, + "theoretical_loss": 3.9581171324095834, + "tokens_seen": 451900416 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004358776328986961, + "loss": 3.2783, + "theoretical_loss": 3.9580542299467503, + "tokens_seen": 451965952 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004358676028084253, + "loss": 3.2294, + "theoretical_loss": 3.9579913391576973, + "tokens_seen": 452031488 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004358575727181545, + "loss": 3.1572, + "theoretical_loss": 3.9579284600385662, + "tokens_seen": 452097024 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1104634, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3362302780151367, + "objective/train/theoretical_loss": 3.957865592585501, + "objective/train/tokens_used": 472622560, + "theoretical_loss": 3.957865592585501, + "tokens_seen": 452162560 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043584754262788366, + "loss": 3.1513, + "theoretical_loss": 3.957865592585501, + "tokens_seen": 452162560 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043583751253761284, + "loss": 3.1652, + "theoretical_loss": 3.9578027367946467, + "tokens_seen": 452228096 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043582748244734207, + "loss": 3.1983, + "theoretical_loss": 3.95773989266215, + "tokens_seen": 452293632 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004358174523570712, + "loss": 3.1936, + "theoretical_loss": 3.9576770601841598, + "tokens_seen": 452359168 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043580742226680043, + "loss": 3.0218, + "theoretical_loss": 3.957614239356827, + "tokens_seen": 452424704 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043579739217652956, + "loss": 3.0337, + "theoretical_loss": 3.957551430176304, + "tokens_seen": 452490240 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004357873620862588, + "loss": 3.1875, + "theoretical_loss": 3.9574886326387446, + "tokens_seen": 452555776 + }, + { + "epoch": 5.01, + "learning_rate": 0.000435777331995988, + "loss": 3.0866, + "theoretical_loss": 3.957425846740306, + "tokens_seen": 452621312 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043576730190571716, + "loss": 3.2482, + "theoretical_loss": 3.957363072477146, + "tokens_seen": 452686848 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043575727181544634, + "loss": 3.1654, + "theoretical_loss": 3.957300309845423, + "tokens_seen": 452752384 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004357472417251755, + "loss": 3.1683, + "theoretical_loss": 3.9572375588413005, + "tokens_seen": 452817920 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004357372116349047, + "loss": 3.2099, + "theoretical_loss": 3.9571748194609406, + "tokens_seen": 452883456 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043572718154463394, + "loss": 3.218, + "theoretical_loss": 3.957112091700509, + "tokens_seen": 452948992 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043571715145436306, + "loss": 3.243, + "theoretical_loss": 3.9570493755561733, + "tokens_seen": 453014528 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004357071213640923, + "loss": 3.1552, + "theoretical_loss": 3.9569866710241017, + "tokens_seen": 453080064 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004356970912738215, + "loss": 3.278, + "theoretical_loss": 3.9569239781004653, + "tokens_seen": 453145600 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043568706118355066, + "loss": 3.2005, + "theoretical_loss": 3.9568612967814367, + "tokens_seen": 453211136 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043567703109327984, + "loss": 3.1473, + "theoretical_loss": 3.95679862706319, + "tokens_seen": 453276672 + }, + { + "epoch": 5.01, + "learning_rate": 0.000435667001003009, + "loss": 3.1069, + "theoretical_loss": 3.956735968941901, + "tokens_seen": 453342208 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004356569709127382, + "loss": 3.3213, + "theoretical_loss": 3.9566733224137485, + "tokens_seen": 453407744 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043564694082246744, + "loss": 3.1011, + "theoretical_loss": 3.956610687474912, + "tokens_seen": 453473280 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043563691073219657, + "loss": 3.1449, + "theoretical_loss": 3.956548064121573, + "tokens_seen": 453538816 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004356268806419258, + "loss": 3.2418, + "theoretical_loss": 3.956485452349915, + "tokens_seen": 453604352 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043561685055165493, + "loss": 3.22, + "theoretical_loss": 3.9564228521561233, + "tokens_seen": 453669888 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043560682046138417, + "loss": 3.2489, + "theoretical_loss": 3.956360263536385, + "tokens_seen": 453735424 + }, + { + "epoch": 5.01, + "objective/train/docs_used": 1107670, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.32326602935791, + "objective/train/theoretical_loss": 3.956297686486888, + "objective/train/tokens_used": 474260960, + "theoretical_loss": 3.956297686486888, + "tokens_seen": 453800960 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043559679037111335, + "loss": 3.3246, + "theoretical_loss": 3.956297686486888, + "tokens_seen": 453800960 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043558676028084253, + "loss": 3.0624, + "theoretical_loss": 3.9562351210038242, + "tokens_seen": 453866496 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004355767301905717, + "loss": 3.1611, + "theoretical_loss": 3.956172567083385, + "tokens_seen": 453932032 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004355667001003009, + "loss": 3.1885, + "theoretical_loss": 3.956110024721766, + "tokens_seen": 453997568 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043555667001003007, + "loss": 3.1395, + "theoretical_loss": 3.956047493915162, + "tokens_seen": 454063104 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004355466399197593, + "loss": 3.0318, + "theoretical_loss": 3.9559849746597715, + "tokens_seen": 454128640 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043553660982948843, + "loss": 3.2159, + "theoretical_loss": 3.955922466951794, + "tokens_seen": 454194176 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043552657973921767, + "loss": 3.0798, + "theoretical_loss": 3.955859970787431, + "tokens_seen": 454259712 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043551654964894685, + "loss": 3.2662, + "theoretical_loss": 3.9557974861628855, + "tokens_seen": 454325248 + }, + { + "epoch": 5.01, + "learning_rate": 0.00043550651955867603, + "loss": 3.2364, + "theoretical_loss": 3.9557350130743623, + "tokens_seen": 454390784 + }, + { + "epoch": 5.01, + "learning_rate": 0.0004354964894684052, + "loss": 3.2758, + "theoretical_loss": 3.9556725515180693, + "tokens_seen": 454456320 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004354864593781344, + "loss": 3.1996, + "theoretical_loss": 3.955610101490214, + "tokens_seen": 454521856 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004354764292878636, + "loss": 3.2389, + "theoretical_loss": 3.955547662987007, + "tokens_seen": 454587392 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004354663991975928, + "loss": 3.3012, + "theoretical_loss": 3.9554852360046615, + "tokens_seen": 454652928 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043545636910732194, + "loss": 3.1733, + "theoretical_loss": 3.9554228205393906, + "tokens_seen": 454718464 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043544633901705117, + "loss": 3.2052, + "theoretical_loss": 3.9553604165874106, + "tokens_seen": 454784000 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004354363089267803, + "loss": 3.1845, + "theoretical_loss": 3.9552980241449385, + "tokens_seen": 454849536 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043542627883650953, + "loss": 3.2301, + "theoretical_loss": 3.9552356432081934, + "tokens_seen": 454915072 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043541624874623877, + "loss": 3.1594, + "theoretical_loss": 3.955173273773398, + "tokens_seen": 454980608 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004354062186559679, + "loss": 3.2208, + "theoretical_loss": 3.955110915836774, + "tokens_seen": 455046144 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043539618856569713, + "loss": 3.2031, + "theoretical_loss": 3.9550485693945463, + "tokens_seen": 455111680 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004353861584754263, + "loss": 3.2028, + "theoretical_loss": 3.9549862344429414, + "tokens_seen": 455177216 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004353761283851555, + "loss": 3.1691, + "theoretical_loss": 3.954923910978188, + "tokens_seen": 455242752 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004353660982948847, + "loss": 3.1274, + "theoretical_loss": 3.9548615989965157, + "tokens_seen": 455308288 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043535606820461386, + "loss": 3.2445, + "theoretical_loss": 3.9547992984941565, + "tokens_seen": 455373824 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1112378, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.274557590484619, + "objective/train/theoretical_loss": 3.9547370094673444, + "objective/train/tokens_used": 475899360, + "theoretical_loss": 3.9547370094673444, + "tokens_seen": 455439360 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043534603811434304, + "loss": 3.1536, + "theoretical_loss": 3.9547370094673444, + "tokens_seen": 455439360 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043533600802407227, + "loss": 3.1937, + "theoretical_loss": 3.9546747319123146, + "tokens_seen": 455504896 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004353259779338014, + "loss": 3.3096, + "theoretical_loss": 3.9546124658253032, + "tokens_seen": 455570432 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043531594784353063, + "loss": 3.1289, + "theoretical_loss": 3.954550211202551, + "tokens_seen": 455635968 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043530591775325976, + "loss": 3.2651, + "theoretical_loss": 3.9544879680402976, + "tokens_seen": 455701504 + }, + { + "epoch": 5.02, + "learning_rate": 0.000435295887662989, + "loss": 3.1862, + "theoretical_loss": 3.9544257363347857, + "tokens_seen": 455767040 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004352858575727182, + "loss": 3.1241, + "theoretical_loss": 3.9543635160822603, + "tokens_seen": 455832576 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043527582748244736, + "loss": 3.2464, + "theoretical_loss": 3.954301307278966, + "tokens_seen": 455898112 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043526579739217654, + "loss": 3.2886, + "theoretical_loss": 3.9542391099211516, + "tokens_seen": 455963648 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004352557673019057, + "loss": 3.2326, + "theoretical_loss": 3.954176924005067, + "tokens_seen": 456029184 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004352457372116349, + "loss": 3.1831, + "theoretical_loss": 3.954114749526963, + "tokens_seen": 456094720 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043523570712136414, + "loss": 3.1496, + "theoretical_loss": 3.954052586483092, + "tokens_seen": 456160256 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043522567703109326, + "loss": 3.0958, + "theoretical_loss": 3.9539904348697106, + "tokens_seen": 456225792 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004352156469408225, + "loss": 3.2608, + "theoretical_loss": 3.9539282946830747, + "tokens_seen": 456291328 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004352056168505517, + "loss": 3.0927, + "theoretical_loss": 3.953866165919442, + "tokens_seen": 456356864 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043519558676028086, + "loss": 3.2794, + "theoretical_loss": 3.9538040485750736, + "tokens_seen": 456422400 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043518555667001004, + "loss": 3.106, + "theoretical_loss": 3.9537419426462304, + "tokens_seen": 456487936 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004351755265797392, + "loss": 3.1377, + "theoretical_loss": 3.9536798481291777, + "tokens_seen": 456553472 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004351654964894684, + "loss": 3.0292, + "theoretical_loss": 3.95361776502018, + "tokens_seen": 456619008 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043515546639919764, + "loss": 3.2132, + "theoretical_loss": 3.9535556933155043, + "tokens_seen": 456684544 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043514543630892677, + "loss": 3.2255, + "theoretical_loss": 3.95349363301142, + "tokens_seen": 456750080 + }, + { + "epoch": 5.02, + "learning_rate": 0.000435135406218656, + "loss": 3.1243, + "theoretical_loss": 3.9534315841041976, + "tokens_seen": 456815616 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043512537612838513, + "loss": 3.1442, + "theoretical_loss": 3.9533695465901095, + "tokens_seen": 456881152 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043511534603811437, + "loss": 3.218, + "theoretical_loss": 3.9533075204654304, + "tokens_seen": 456946688 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043510531594784355, + "loss": 3.1889, + "theoretical_loss": 3.9532455057264357, + "tokens_seen": 457012224 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1115412, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2774972915649414, + "objective/train/theoretical_loss": 3.953183502369404, + "objective/train/tokens_used": 477537760, + "theoretical_loss": 3.953183502369404, + "tokens_seen": 457077760 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043509528585757273, + "loss": 3.1855, + "theoretical_loss": 3.953183502369404, + "tokens_seen": 457077760 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004350852557673019, + "loss": 3.0928, + "theoretical_loss": 3.9531215103906137, + "tokens_seen": 457143296 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004350752256770311, + "loss": 3.1266, + "theoretical_loss": 3.953059529786347, + "tokens_seen": 457208832 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043506519558676027, + "loss": 3.1855, + "theoretical_loss": 3.9529975605528866, + "tokens_seen": 457274368 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004350551654964895, + "loss": 3.2022, + "theoretical_loss": 3.952935602686517, + "tokens_seen": 457339904 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043504513540621863, + "loss": 3.2532, + "theoretical_loss": 3.9528736561835243, + "tokens_seen": 457405440 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043503510531594787, + "loss": 3.1899, + "theoretical_loss": 3.952811721040198, + "tokens_seen": 457470976 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043502507522567705, + "loss": 3.2174, + "theoretical_loss": 3.9527497972528263, + "tokens_seen": 457536512 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043501504513540623, + "loss": 3.1558, + "theoretical_loss": 3.9526878848177027, + "tokens_seen": 457602048 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004350050150451354, + "loss": 3.1414, + "theoretical_loss": 3.9526259837311195, + "tokens_seen": 457667584 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004349949849548646, + "loss": 3.1506, + "theoretical_loss": 3.9525640939893725, + "tokens_seen": 457733120 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004349849548645938, + "loss": 3.2685, + "theoretical_loss": 3.9525022155887584, + "tokens_seen": 457798656 + }, + { + "epoch": 5.02, + "learning_rate": 0.000434974924774323, + "loss": 3.0653, + "theoretical_loss": 3.952440348525576, + "tokens_seen": 457864192 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043496489468405214, + "loss": 3.192, + "theoretical_loss": 3.952378492796126, + "tokens_seen": 457929728 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043495486459378137, + "loss": 3.2731, + "theoretical_loss": 3.9523166483967094, + "tokens_seen": 457995264 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004349448345035105, + "loss": 3.1867, + "theoretical_loss": 3.9522548153236317, + "tokens_seen": 458060800 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043493480441323973, + "loss": 3.2075, + "theoretical_loss": 3.952192993573197, + "tokens_seen": 458126336 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004349247743229689, + "loss": 3.171, + "theoretical_loss": 3.9521311831417134, + "tokens_seen": 458191872 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004349147442326981, + "loss": 3.1924, + "theoretical_loss": 3.9520693840254904, + "tokens_seen": 458257408 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004349047141424273, + "loss": 3.0925, + "theoretical_loss": 3.952007596220838, + "tokens_seen": 458322944 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004348946840521565, + "loss": 3.2133, + "theoretical_loss": 3.9519458197240693, + "tokens_seen": 458388480 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043488465396188564, + "loss": 3.3107, + "theoretical_loss": 3.9518840545314977, + "tokens_seen": 458454016 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004348746238716149, + "loss": 3.2107, + "theoretical_loss": 3.9518223006394404, + "tokens_seen": 458519552 + }, + { + "epoch": 5.02, + "learning_rate": 0.000434864593781344, + "loss": 3.2278, + "theoretical_loss": 3.9517605580442146, + "tokens_seen": 458585088 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043485456369107324, + "loss": 3.1719, + "theoretical_loss": 3.9516988267421396, + "tokens_seen": 458650624 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1119387, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.098986864089966, + "objective/train/theoretical_loss": 3.951637106729537, + "objective/train/tokens_used": 479176160, + "theoretical_loss": 3.951637106729537, + "tokens_seen": 458716160 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004348445336008024, + "loss": 3.127, + "theoretical_loss": 3.951637106729537, + "tokens_seen": 458716160 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004348345035105316, + "loss": 3.2234, + "theoretical_loss": 3.9515753980027295, + "tokens_seen": 458781696 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004348244734202608, + "loss": 3.115, + "theoretical_loss": 3.951513700558041, + "tokens_seen": 458847232 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043481444332998996, + "loss": 3.1549, + "theoretical_loss": 3.9514520143917995, + "tokens_seen": 458912768 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043480441323971914, + "loss": 3.2221, + "theoretical_loss": 3.951390339500332, + "tokens_seen": 458978304 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004347943831494484, + "loss": 3.1574, + "theoretical_loss": 3.9513286758799677, + "tokens_seen": 459043840 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004347843530591775, + "loss": 3.1301, + "theoretical_loss": 3.951267023527039, + "tokens_seen": 459109376 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043477432296890674, + "loss": 3.2328, + "theoretical_loss": 3.951205382437879, + "tokens_seen": 459174912 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043476429287863587, + "loss": 3.1657, + "theoretical_loss": 3.951143752608823, + "tokens_seen": 459240448 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004347542627883651, + "loss": 3.2364, + "theoretical_loss": 3.9510821340362066, + "tokens_seen": 459305984 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004347442326980943, + "loss": 3.021, + "theoretical_loss": 3.951020526716369, + "tokens_seen": 459371520 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043473420260782346, + "loss": 3.2468, + "theoretical_loss": 3.9509589306456503, + "tokens_seen": 459437056 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043472417251755265, + "loss": 3.1378, + "theoretical_loss": 3.950897345820392, + "tokens_seen": 459502592 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004347141424272819, + "loss": 3.0811, + "theoretical_loss": 3.9508357722369376, + "tokens_seen": 459568128 + }, + { + "epoch": 5.02, + "learning_rate": 0.000434704112337011, + "loss": 3.1497, + "theoretical_loss": 3.9507742098916325, + "tokens_seen": 459633664 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043469408224674024, + "loss": 3.1234, + "theoretical_loss": 3.9507126587808234, + "tokens_seen": 459699200 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004346840521564694, + "loss": 3.2258, + "theoretical_loss": 3.9506511189008595, + "tokens_seen": 459764736 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004346740220661986, + "loss": 3.2419, + "theoretical_loss": 3.9505895902480903, + "tokens_seen": 459830272 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043466399197592784, + "loss": 3.2911, + "theoretical_loss": 3.9505280728188685, + "tokens_seen": 459895808 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043465396188565697, + "loss": 3.2607, + "theoretical_loss": 3.9504665666095473, + "tokens_seen": 459961344 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004346439317953862, + "loss": 3.237, + "theoretical_loss": 3.950405071616483, + "tokens_seen": 460026880 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043463390170511533, + "loss": 3.3046, + "theoretical_loss": 3.9503435878360316, + "tokens_seen": 460092416 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043462387161484457, + "loss": 3.208, + "theoretical_loss": 3.950282115264553, + "tokens_seen": 460157952 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043461384152457375, + "loss": 3.1001, + "theoretical_loss": 3.9502206538984077, + "tokens_seen": 460223488 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043460381143430293, + "loss": 3.2498, + "theoretical_loss": 3.9501592037339575, + "tokens_seen": 460289024 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1124182, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2406134605407715, + "objective/train/theoretical_loss": 3.950097764767566, + "objective/train/tokens_used": 480814560, + "theoretical_loss": 3.950097764767566, + "tokens_seen": 460354560 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004345937813440321, + "loss": 3.2101, + "theoretical_loss": 3.950097764767566, + "tokens_seen": 460354560 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004345837512537613, + "loss": 3.1879, + "theoretical_loss": 3.9500363369956, + "tokens_seen": 460420096 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043457372116349047, + "loss": 3.0965, + "theoretical_loss": 3.949974920414426, + "tokens_seen": 460485632 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004345636910732197, + "loss": 3.2208, + "theoretical_loss": 3.9499135150204134, + "tokens_seen": 460551168 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043455366098294883, + "loss": 3.1561, + "theoretical_loss": 3.9498521208099326, + "tokens_seen": 460616704 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043454363089267807, + "loss": 3.2554, + "theoretical_loss": 3.949790737779356, + "tokens_seen": 460682240 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043453360080240725, + "loss": 3.1825, + "theoretical_loss": 3.9497293659250587, + "tokens_seen": 460747776 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043452357071213643, + "loss": 3.1788, + "theoretical_loss": 3.9496680052434154, + "tokens_seen": 460813312 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004345135406218656, + "loss": 3.2308, + "theoretical_loss": 3.949606655730804, + "tokens_seen": 460878848 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004345035105315948, + "loss": 3.1744, + "theoretical_loss": 3.949545317383604, + "tokens_seen": 460944384 + }, + { + "epoch": 5.02, + "learning_rate": 0.000434493480441324, + "loss": 3.1663, + "theoretical_loss": 3.949483990198196, + "tokens_seen": 461009920 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004344834503510532, + "loss": 3.2258, + "theoretical_loss": 3.949422674170963, + "tokens_seen": 461075456 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043447342026078234, + "loss": 3.2189, + "theoretical_loss": 3.949361369298288, + "tokens_seen": 461140992 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043446339017051157, + "loss": 3.0924, + "theoretical_loss": 3.9493000755765584, + "tokens_seen": 461206528 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004344533600802407, + "loss": 3.3214, + "theoretical_loss": 3.9492387930021606, + "tokens_seen": 461272064 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043444332998996993, + "loss": 3.2489, + "theoretical_loss": 3.9491775215714853, + "tokens_seen": 461337600 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004344332998996991, + "loss": 3.2566, + "theoretical_loss": 3.9491162612809223, + "tokens_seen": 461403136 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004344232698094283, + "loss": 3.2455, + "theoretical_loss": 3.9490550121268653, + "tokens_seen": 461468672 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004344132397191575, + "loss": 3.1543, + "theoretical_loss": 3.948993774105708, + "tokens_seen": 461534208 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004344032096288867, + "loss": 3.152, + "theoretical_loss": 3.948932547213847, + "tokens_seen": 461599744 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043439317953861584, + "loss": 3.1658, + "theoretical_loss": 3.9488713314476787, + "tokens_seen": 461665280 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004343831494483451, + "loss": 3.1093, + "theoretical_loss": 3.948810126803604, + "tokens_seen": 461730816 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004343731193580742, + "loss": 3.2096, + "theoretical_loss": 3.948748933278023, + "tokens_seen": 461796352 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043436308926780344, + "loss": 3.2331, + "theoretical_loss": 3.948687750867339, + "tokens_seen": 461861888 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004343530591775326, + "loss": 3.1749, + "theoretical_loss": 3.9486265795679563, + "tokens_seen": 461927424 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1126981, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7000696659088135, + "objective/train/theoretical_loss": 3.9485654193762807, + "objective/train/tokens_used": 482452960, + "theoretical_loss": 3.9485654193762807, + "tokens_seen": 461992960 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004343430290872618, + "loss": 3.0943, + "theoretical_loss": 3.9485654193762807, + "tokens_seen": 461992960 + }, + { + "epoch": 5.02, + "learning_rate": 0.000434332998996991, + "loss": 3.251, + "theoretical_loss": 3.94850427028872, + "tokens_seen": 462058496 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043432296890672016, + "loss": 3.1895, + "theoretical_loss": 3.9484431323016844, + "tokens_seen": 462124032 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043431293881644934, + "loss": 3.0958, + "theoretical_loss": 3.9483820054115846, + "tokens_seen": 462189568 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004343029087261786, + "loss": 3.0429, + "theoretical_loss": 3.9483208896148327, + "tokens_seen": 462255104 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004342928786359077, + "loss": 3.1646, + "theoretical_loss": 3.948259784907844, + "tokens_seen": 462320640 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043428284854563694, + "loss": 3.13, + "theoretical_loss": 3.9481986912870335, + "tokens_seen": 462386176 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043427281845536607, + "loss": 3.2005, + "theoretical_loss": 3.948137608748821, + "tokens_seen": 462451712 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004342627883650953, + "loss": 3.1972, + "theoretical_loss": 3.948076537289624, + "tokens_seen": 462517248 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004342527582748245, + "loss": 3.1883, + "theoretical_loss": 3.9480154769058644, + "tokens_seen": 462582784 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043424272818455367, + "loss": 3.1937, + "theoretical_loss": 3.947954427593965, + "tokens_seen": 462648320 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043423269809428285, + "loss": 3.1159, + "theoretical_loss": 3.94789338935035, + "tokens_seen": 462713856 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004342226680040121, + "loss": 3.2042, + "theoretical_loss": 3.9478323621714457, + "tokens_seen": 462779392 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004342126379137412, + "loss": 3.154, + "theoretical_loss": 3.94777134605368, + "tokens_seen": 462844928 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043420260782347044, + "loss": 3.1712, + "theoretical_loss": 3.9477103409934813, + "tokens_seen": 462910464 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043419257773319957, + "loss": 3.1592, + "theoretical_loss": 3.9476493469872818, + "tokens_seen": 462976000 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004341825476429288, + "loss": 3.2706, + "theoretical_loss": 3.9475883640315144, + "tokens_seen": 463041536 + }, + { + "epoch": 5.02, + "learning_rate": 0.000434172517552658, + "loss": 3.1578, + "theoretical_loss": 3.9475273921226126, + "tokens_seen": 463107072 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043416248746238717, + "loss": 3.2751, + "theoretical_loss": 3.9474664312570127, + "tokens_seen": 463172608 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043415245737211635, + "loss": 3.2183, + "theoretical_loss": 3.9474054814311526, + "tokens_seen": 463238144 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043414242728184553, + "loss": 3.1353, + "theoretical_loss": 3.9473445426414715, + "tokens_seen": 463303680 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004341323971915747, + "loss": 3.1697, + "theoretical_loss": 3.9472836148844106, + "tokens_seen": 463369216 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043412236710130395, + "loss": 3.1895, + "theoretical_loss": 3.947222698156412, + "tokens_seen": 463434752 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004341123370110331, + "loss": 3.1805, + "theoretical_loss": 3.9471617924539206, + "tokens_seen": 463500288 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004341023069207623, + "loss": 3.1294, + "theoretical_loss": 3.9471008977733826, + "tokens_seen": 463565824 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1131603, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2926547527313232, + "objective/train/theoretical_loss": 3.947040014111244, + "objective/train/tokens_used": 484091360, + "theoretical_loss": 3.947040014111244, + "tokens_seen": 463631360 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043409227683049144, + "loss": 3.1811, + "theoretical_loss": 3.947040014111244, + "tokens_seen": 463631360 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043408224674022067, + "loss": 3.1342, + "theoretical_loss": 3.946979141463956, + "tokens_seen": 463696896 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043407221664994985, + "loss": 3.2009, + "theoretical_loss": 3.9469182798279685, + "tokens_seen": 463762432 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043406218655967903, + "loss": 3.2899, + "theoretical_loss": 3.946857429199734, + "tokens_seen": 463827968 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004340521564694082, + "loss": 3.3206, + "theoretical_loss": 3.9467965895757064, + "tokens_seen": 463893504 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043404212637913745, + "loss": 3.206, + "theoretical_loss": 3.946735760952343, + "tokens_seen": 463959040 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004340320962888666, + "loss": 3.2311, + "theoretical_loss": 3.9466749433260997, + "tokens_seen": 464024576 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004340220661985958, + "loss": 3.1307, + "theoretical_loss": 3.946614136693436, + "tokens_seen": 464090112 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043401203610832494, + "loss": 3.1838, + "theoretical_loss": 3.946553341050813, + "tokens_seen": 464155648 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004340020060180542, + "loss": 3.2612, + "theoretical_loss": 3.946492556394692, + "tokens_seen": 464221184 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043399197592778336, + "loss": 3.1408, + "theoretical_loss": 3.946431782721539, + "tokens_seen": 464286720 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043398194583751254, + "loss": 3.2322, + "theoretical_loss": 3.9463710200278186, + "tokens_seen": 464352256 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004339719157472417, + "loss": 3.2612, + "theoretical_loss": 3.946310268309997, + "tokens_seen": 464417792 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004339618856569709, + "loss": 3.2623, + "theoretical_loss": 3.946249527564545, + "tokens_seen": 464483328 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004339518555667001, + "loss": 3.2056, + "theoretical_loss": 3.946188797787932, + "tokens_seen": 464548864 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004339418254764293, + "loss": 3.324, + "theoretical_loss": 3.9461280789766304, + "tokens_seen": 464614400 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004339317953861585, + "loss": 3.1108, + "theoretical_loss": 3.946067371127114, + "tokens_seen": 464679936 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004339217652958877, + "loss": 3.1811, + "theoretical_loss": 3.946006674235859, + "tokens_seen": 464745472 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004339117352056169, + "loss": 3.1736, + "theoretical_loss": 3.945945988299341, + "tokens_seen": 464811008 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043390170511534604, + "loss": 3.1345, + "theoretical_loss": 3.9458853133140397, + "tokens_seen": 464876544 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004338916750250753, + "loss": 3.1938, + "theoretical_loss": 3.9458246492764357, + "tokens_seen": 464942080 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004338816449348044, + "loss": 3.2263, + "theoretical_loss": 3.94576399618301, + "tokens_seen": 465007616 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043387161484453364, + "loss": 3.238, + "theoretical_loss": 3.9457033540302477, + "tokens_seen": 465073152 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004338615847542628, + "loss": 3.1659, + "theoretical_loss": 3.9456427228146325, + "tokens_seen": 465138688 + }, + { + "epoch": 5.02, + "learning_rate": 0.000433851554663992, + "loss": 3.1488, + "theoretical_loss": 3.945582102532652, + "tokens_seen": 465204224 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.178684711456299, + "objective/train/theoretical_loss": 3.9455214931807943, + "objective/train/tokens_used": 485729760, + "theoretical_loss": 3.9455214931807943, + "tokens_seen": 465269760 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004338415245737212, + "loss": 3.2553, + "theoretical_loss": 3.9455214931807943, + "tokens_seen": 465269760 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043383149448345036, + "loss": 3.1953, + "theoretical_loss": 3.94546089475555, + "tokens_seen": 465335296 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043382146439317954, + "loss": 3.1162, + "theoretical_loss": 3.9454003072534105, + "tokens_seen": 465400832 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004338114343029088, + "loss": 3.2392, + "theoretical_loss": 3.945339730670869, + "tokens_seen": 465466368 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004338014042126379, + "loss": 3.1638, + "theoretical_loss": 3.94527916500442, + "tokens_seen": 465531904 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043379137412236714, + "loss": 3.26, + "theoretical_loss": 3.9452186102505618, + "tokens_seen": 465597440 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043378134403209627, + "loss": 3.1344, + "theoretical_loss": 3.9451580664057913, + "tokens_seen": 465662976 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004337713139418255, + "loss": 3.2079, + "theoretical_loss": 3.945097533466608, + "tokens_seen": 465728512 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004337612838515547, + "loss": 3.2441, + "theoretical_loss": 3.9450370114295135, + "tokens_seen": 465794048 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043375125376128387, + "loss": 3.2097, + "theoretical_loss": 3.9449765002910118, + "tokens_seen": 465859584 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043374122367101305, + "loss": 3.2239, + "theoretical_loss": 3.9449160000476065, + "tokens_seen": 465925120 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004337311935807423, + "loss": 3.1397, + "theoretical_loss": 3.9448555106958043, + "tokens_seen": 465990656 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004337211634904714, + "loss": 3.1119, + "theoretical_loss": 3.944795032232113, + "tokens_seen": 466056192 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043371113340020064, + "loss": 3.1809, + "theoretical_loss": 3.944734564653042, + "tokens_seen": 466121728 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043370110330992977, + "loss": 3.2241, + "theoretical_loss": 3.9446741079551026, + "tokens_seen": 466187264 + }, + { + "epoch": 5.02, + "learning_rate": 0.000433691073219659, + "loss": 3.1438, + "theoretical_loss": 3.944613662134808, + "tokens_seen": 466252800 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004336810431293882, + "loss": 3.1796, + "theoretical_loss": 3.9445532271886705, + "tokens_seen": 466318336 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043367101303911737, + "loss": 3.1997, + "theoretical_loss": 3.9444928031132083, + "tokens_seen": 466383872 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043366098294884655, + "loss": 3.1751, + "theoretical_loss": 3.9444323899049376, + "tokens_seen": 466449408 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043365095285857573, + "loss": 3.2262, + "theoretical_loss": 3.9443719875603787, + "tokens_seen": 466514944 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004336409227683049, + "loss": 3.185, + "theoretical_loss": 3.9443115960760506, + "tokens_seen": 466580480 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043363089267803415, + "loss": 3.1579, + "theoretical_loss": 3.9442512154484772, + "tokens_seen": 466646016 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004336208625877633, + "loss": 3.0585, + "theoretical_loss": 3.944190845674181, + "tokens_seen": 466711552 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004336108324974925, + "loss": 3.1058, + "theoretical_loss": 3.944130486749689, + "tokens_seen": 466777088 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043360080240722164, + "loss": 3.141, + "theoretical_loss": 3.9440701386715276, + "tokens_seen": 466842624 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0058560371398926, + "objective/train/theoretical_loss": 3.9440098014362257, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.9440098014362257, + "tokens_seen": 466908160 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043359077231695087, + "loss": 3.1862, + "theoretical_loss": 3.9440098014362257, + "tokens_seen": 466908160 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043358074222668005, + "loss": 3.1573, + "theoretical_loss": 3.9439494750403137, + "tokens_seen": 466973696 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043357071213640923, + "loss": 3.1337, + "theoretical_loss": 3.9438891594803227, + "tokens_seen": 467039232 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004335606820461384, + "loss": 3.1885, + "theoretical_loss": 3.943828854752788, + "tokens_seen": 467104768 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043355065195586765, + "loss": 3.1059, + "theoretical_loss": 3.9437685608542425, + "tokens_seen": 467170304 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004335406218655968, + "loss": 3.1731, + "theoretical_loss": 3.9437082777812247, + "tokens_seen": 467235840 + }, + { + "epoch": 5.02, + "learning_rate": 0.000433530591775326, + "loss": 3.1248, + "theoretical_loss": 3.943648005530272, + "tokens_seen": 467301376 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043352056168505514, + "loss": 3.2804, + "theoretical_loss": 3.9435877440979246, + "tokens_seen": 467366912 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004335105315947844, + "loss": 3.2367, + "theoretical_loss": 3.9435274934807243, + "tokens_seen": 467432448 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043350050150451356, + "loss": 3.0515, + "theoretical_loss": 3.9434672536752133, + "tokens_seen": 467497984 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043349047141424274, + "loss": 3.1463, + "theoretical_loss": 3.9434070246779367, + "tokens_seen": 467563520 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004334804413239719, + "loss": 3.14, + "theoretical_loss": 3.943346806485441, + "tokens_seen": 467629056 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004334704112337011, + "loss": 3.2516, + "theoretical_loss": 3.9432865990942743, + "tokens_seen": 467694592 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004334603811434303, + "loss": 3.1937, + "theoretical_loss": 3.9432264025009856, + "tokens_seen": 467760128 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004334503510531595, + "loss": 3.0962, + "theoretical_loss": 3.943166216702125, + "tokens_seen": 467825664 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043344032096288864, + "loss": 3.2083, + "theoretical_loss": 3.9431060416942465, + "tokens_seen": 467891200 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004334302908726179, + "loss": 3.1871, + "theoretical_loss": 3.943045877473904, + "tokens_seen": 467956736 + }, + { + "epoch": 5.02, + "learning_rate": 0.000433420260782347, + "loss": 3.211, + "theoretical_loss": 3.942985724037653, + "tokens_seen": 468022272 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043341023069207624, + "loss": 3.2322, + "theoretical_loss": 3.9429255813820507, + "tokens_seen": 468087808 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004334002006018054, + "loss": 3.1956, + "theoretical_loss": 3.942865449503657, + "tokens_seen": 468153344 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004333901705115346, + "loss": 3.194, + "theoretical_loss": 3.9428053283990305, + "tokens_seen": 468218880 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004333801404212638, + "loss": 3.2011, + "theoretical_loss": 3.9427452180647355, + "tokens_seen": 468284416 + }, + { + "epoch": 5.02, + "learning_rate": 0.000433370110330993, + "loss": 3.1793, + "theoretical_loss": 3.9426851184973346, + "tokens_seen": 468349952 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043336008024072215, + "loss": 3.1971, + "theoretical_loss": 3.9426250296933922, + "tokens_seen": 468415488 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004333500501504514, + "loss": 3.3139, + "theoretical_loss": 3.9425649516494765, + "tokens_seen": 468481024 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2659475803375244, + "objective/train/theoretical_loss": 3.9425048843621555, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.9425048843621555, + "tokens_seen": 468546560 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004333400200601805, + "loss": 3.2157, + "theoretical_loss": 3.9425048843621555, + "tokens_seen": 468546560 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043332998996990974, + "loss": 3.2606, + "theoretical_loss": 3.942444827827999, + "tokens_seen": 468612096 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004333199598796389, + "loss": 3.2086, + "theoretical_loss": 3.9423847820435785, + "tokens_seen": 468677632 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004333099297893681, + "loss": 3.1997, + "theoretical_loss": 3.942324747005467, + "tokens_seen": 468743168 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004332998996990973, + "loss": 3.1745, + "theoretical_loss": 3.94226472271024, + "tokens_seen": 468808704 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043328986960882647, + "loss": 3.1667, + "theoretical_loss": 3.9422047091544727, + "tokens_seen": 468874240 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043327983951855565, + "loss": 3.134, + "theoretical_loss": 3.9421447063347435, + "tokens_seen": 468939776 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004332698094282849, + "loss": 3.157, + "theoretical_loss": 3.9420847142476316, + "tokens_seen": 469005312 + }, + { + "epoch": 5.02, + "learning_rate": 0.000433259779338014, + "loss": 3.3016, + "theoretical_loss": 3.9420247328897178, + "tokens_seen": 469070848 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043324974924774325, + "loss": 3.2231, + "theoretical_loss": 3.9419647622575855, + "tokens_seen": 469136384 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043323971915747243, + "loss": 3.1663, + "theoretical_loss": 3.9419048023478176, + "tokens_seen": 469201920 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004332296890672016, + "loss": 3.2415, + "theoretical_loss": 3.9418448531570007, + "tokens_seen": 469267456 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004332196589769308, + "loss": 3.2615, + "theoretical_loss": 3.9417849146817217, + "tokens_seen": 469332992 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043320962888665997, + "loss": 3.1908, + "theoretical_loss": 3.941724986918569, + "tokens_seen": 469398528 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043319959879638915, + "loss": 3.27, + "theoretical_loss": 3.9416650698641336, + "tokens_seen": 469464064 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004331895687061184, + "loss": 3.1517, + "theoretical_loss": 3.941605163515007, + "tokens_seen": 469529600 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043317953861584757, + "loss": 3.1823, + "theoretical_loss": 3.9415452678677827, + "tokens_seen": 469595136 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043316950852557675, + "loss": 3.1153, + "theoretical_loss": 3.9414853829190557, + "tokens_seen": 469660672 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043315947843530593, + "loss": 3.1931, + "theoretical_loss": 3.9414255086654233, + "tokens_seen": 469726208 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004331494483450351, + "loss": 3.2603, + "theoretical_loss": 3.9413656451034824, + "tokens_seen": 469791744 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043313941825476435, + "loss": 3.1532, + "theoretical_loss": 3.9413057922298336, + "tokens_seen": 469857280 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004331293881644935, + "loss": 3.2976, + "theoretical_loss": 3.9412459500410773, + "tokens_seen": 469922816 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004331193580742227, + "loss": 3.2783, + "theoretical_loss": 3.9411861185338175, + "tokens_seen": 469988352 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043310932798395184, + "loss": 3.2215, + "theoretical_loss": 3.941126297704658, + "tokens_seen": 470053888 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043309929789368107, + "loss": 3.1992, + "theoretical_loss": 3.941066487550205, + "tokens_seen": 470119424 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2432987689971924, + "objective/train/theoretical_loss": 3.941006688067065, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.941006688067065, + "tokens_seen": 470184960 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043308926780341025, + "loss": 3.2378, + "theoretical_loss": 3.941006688067065, + "tokens_seen": 470184960 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043307923771313943, + "loss": 3.2413, + "theoretical_loss": 3.9409468992518475, + "tokens_seen": 470250496 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004330692076228686, + "loss": 3.1897, + "theoretical_loss": 3.940887121101164, + "tokens_seen": 470316032 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043305917753259785, + "loss": 3.1695, + "theoretical_loss": 3.940827353611625, + "tokens_seen": 470381568 + }, + { + "epoch": 5.02, + "learning_rate": 0.000433049147442327, + "loss": 3.1713, + "theoretical_loss": 3.940767596779846, + "tokens_seen": 470447104 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004330391173520562, + "loss": 3.1775, + "theoretical_loss": 3.9407078506024407, + "tokens_seen": 470512640 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043302908726178534, + "loss": 3.1937, + "theoretical_loss": 3.9406481150760264, + "tokens_seen": 470578176 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004330190571715146, + "loss": 3.319, + "theoretical_loss": 3.940588390197221, + "tokens_seen": 470643712 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043300902708124376, + "loss": 3.2116, + "theoretical_loss": 3.9405286759626454, + "tokens_seen": 470709248 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043299899699097294, + "loss": 3.1645, + "theoretical_loss": 3.9404689723689206, + "tokens_seen": 470774784 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004329889669007021, + "loss": 3.1104, + "theoretical_loss": 3.9404092794126684, + "tokens_seen": 470840320 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004329789368104313, + "loss": 3.2447, + "theoretical_loss": 3.9403495970905147, + "tokens_seen": 470905856 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004329689067201605, + "loss": 3.1647, + "theoretical_loss": 3.9402899253990853, + "tokens_seen": 470971392 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004329588766298897, + "loss": 3.1834, + "theoretical_loss": 3.940230264335007, + "tokens_seen": 471036928 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043294884653961884, + "loss": 3.113, + "theoretical_loss": 3.9401706138949093, + "tokens_seen": 471102464 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004329388164493481, + "loss": 3.1188, + "theoretical_loss": 3.9401109740754228, + "tokens_seen": 471168000 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004329287863590772, + "loss": 3.1753, + "theoretical_loss": 3.9400513448731798, + "tokens_seen": 471233536 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043291875626880644, + "loss": 3.2857, + "theoretical_loss": 3.939991726284814, + "tokens_seen": 471299072 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004329087261785356, + "loss": 3.1776, + "theoretical_loss": 3.939932118306961, + "tokens_seen": 471364608 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004328986960882648, + "loss": 3.1032, + "theoretical_loss": 3.9398725209362566, + "tokens_seen": 471430144 + }, + { + "epoch": 5.02, + "learning_rate": 0.000432888665997994, + "loss": 3.1851, + "theoretical_loss": 3.93981293416934, + "tokens_seen": 471495680 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004328786359077232, + "loss": 3.0663, + "theoretical_loss": 3.939753358002851, + "tokens_seen": 471561216 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043286860581745235, + "loss": 3.2437, + "theoretical_loss": 3.93969379243343, + "tokens_seen": 471626752 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004328585757271816, + "loss": 3.01, + "theoretical_loss": 3.9396342374577213, + "tokens_seen": 471692288 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004328485456369107, + "loss": 3.1644, + "theoretical_loss": 3.9395746930723683, + "tokens_seen": 471757824 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1450233459472656, + "objective/train/theoretical_loss": 3.939515159274017, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.939515159274017, + "tokens_seen": 471823360 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043283851554663994, + "loss": 3.1386, + "theoretical_loss": 3.939515159274017, + "tokens_seen": 471823360 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004328284854563691, + "loss": 3.2145, + "theoretical_loss": 3.939455636059316, + "tokens_seen": 471888896 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004328184553660983, + "loss": 3.2244, + "theoretical_loss": 3.9393961234249133, + "tokens_seen": 471954432 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004328084252758275, + "loss": 3.1084, + "theoretical_loss": 3.939336621367459, + "tokens_seen": 472019968 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043279839518555667, + "loss": 3.1928, + "theoretical_loss": 3.9392771298836067, + "tokens_seen": 472085504 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043278836509528585, + "loss": 3.0821, + "theoretical_loss": 3.939217648970009, + "tokens_seen": 472151040 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004327783350050151, + "loss": 3.2076, + "theoretical_loss": 3.939158178623321, + "tokens_seen": 472216576 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004327683049147442, + "loss": 3.2275, + "theoretical_loss": 3.9390987188401994, + "tokens_seen": 472282112 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043275827482447345, + "loss": 3.1979, + "theoretical_loss": 3.939039269617303, + "tokens_seen": 472347648 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043274824473420263, + "loss": 3.2167, + "theoretical_loss": 3.93897983095129, + "tokens_seen": 472413184 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004327382146439318, + "loss": 3.1954, + "theoretical_loss": 3.938920402838823, + "tokens_seen": 472478720 + }, + { + "epoch": 5.02, + "learning_rate": 0.000432728184553661, + "loss": 3.2047, + "theoretical_loss": 3.9388609852765644, + "tokens_seen": 472544256 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043271815446339017, + "loss": 3.1826, + "theoretical_loss": 3.9388015782611783, + "tokens_seen": 472609792 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043270812437311935, + "loss": 3.2497, + "theoretical_loss": 3.9387421817893307, + "tokens_seen": 472675328 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004326980942828486, + "loss": 3.2875, + "theoretical_loss": 3.938682795857688, + "tokens_seen": 472740864 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004326880641925777, + "loss": 3.2208, + "theoretical_loss": 3.9386234204629202, + "tokens_seen": 472806400 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043267803410230695, + "loss": 3.0866, + "theoretical_loss": 3.938564055601697, + "tokens_seen": 472871936 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004326680040120361, + "loss": 3.2415, + "theoretical_loss": 3.9385047012706895, + "tokens_seen": 472937472 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004326579739217653, + "loss": 3.1585, + "theoretical_loss": 3.9384453574665725, + "tokens_seen": 473003008 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004326479438314945, + "loss": 3.0469, + "theoretical_loss": 3.9383860241860202, + "tokens_seen": 473068544 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004326379137412237, + "loss": 3.1958, + "theoretical_loss": 3.9383267014257086, + "tokens_seen": 473134080 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043262788365095286, + "loss": 3.1599, + "theoretical_loss": 3.9382673891823163, + "tokens_seen": 473199616 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043261785356068204, + "loss": 3.2283, + "theoretical_loss": 3.9382080874525216, + "tokens_seen": 473265152 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004326078234704112, + "loss": 3.1974, + "theoretical_loss": 3.938148796233006, + "tokens_seen": 473330688 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043259779338014045, + "loss": 3.2252, + "theoretical_loss": 3.9380895155204523, + "tokens_seen": 473396224 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.243710517883301, + "objective/train/theoretical_loss": 3.938030245311544, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.938030245311544, + "tokens_seen": 473461760 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004325877632898696, + "loss": 3.2517, + "theoretical_loss": 3.938030245311544, + "tokens_seen": 473461760 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004325777331995988, + "loss": 3.1489, + "theoretical_loss": 3.937970985602966, + "tokens_seen": 473527296 + }, + { + "epoch": 5.02, + "learning_rate": 0.000432567703109328, + "loss": 3.1659, + "theoretical_loss": 3.937911736391406, + "tokens_seen": 473592832 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004325576730190572, + "loss": 3.1257, + "theoretical_loss": 3.9378524976735516, + "tokens_seen": 473658368 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043254764292878636, + "loss": 3.1546, + "theoretical_loss": 3.9377932694460935, + "tokens_seen": 473723904 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043253761283851554, + "loss": 3.2066, + "theoretical_loss": 3.937734051705723, + "tokens_seen": 473789440 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004325275827482447, + "loss": 3.1515, + "theoretical_loss": 3.9376748444491327, + "tokens_seen": 473854976 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043251755265797396, + "loss": 3.2176, + "theoretical_loss": 3.937615647673017, + "tokens_seen": 473920512 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004325075225677031, + "loss": 3.3013, + "theoretical_loss": 3.937556461374072, + "tokens_seen": 473986048 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004324974924774323, + "loss": 3.256, + "theoretical_loss": 3.9374972855489947, + "tokens_seen": 474051584 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043248746238716145, + "loss": 3.1639, + "theoretical_loss": 3.937438120194485, + "tokens_seen": 474117120 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004324774322968907, + "loss": 3.2473, + "theoretical_loss": 3.937378965307242, + "tokens_seen": 474182656 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043246740220661986, + "loss": 3.2215, + "theoretical_loss": 3.9373198208839684, + "tokens_seen": 474248192 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043245737211634904, + "loss": 3.3029, + "theoretical_loss": 3.937260686921368, + "tokens_seen": 474313728 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004324473420260782, + "loss": 3.107, + "theoretical_loss": 3.9372015634161444, + "tokens_seen": 474379264 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004324373119358074, + "loss": 3.2177, + "theoretical_loss": 3.9371424503650054, + "tokens_seen": 474444800 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043242728184553664, + "loss": 3.1421, + "theoretical_loss": 3.9370833477646574, + "tokens_seen": 474510336 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004324172517552658, + "loss": 3.1908, + "theoretical_loss": 3.937024255611811, + "tokens_seen": 474575872 + }, + { + "epoch": 5.02, + "learning_rate": 0.000432407221664995, + "loss": 3.1673, + "theoretical_loss": 3.936965173903176, + "tokens_seen": 474641408 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004323971915747242, + "loss": 3.1655, + "theoretical_loss": 3.936906102635466, + "tokens_seen": 474706944 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004323871614844534, + "loss": 3.2007, + "theoretical_loss": 3.936847041805394, + "tokens_seen": 474772480 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043237713139418255, + "loss": 3.2286, + "theoretical_loss": 3.9367879914096755, + "tokens_seen": 474838016 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004323671013039118, + "loss": 3.2052, + "theoretical_loss": 3.936728951445027, + "tokens_seen": 474903552 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004323570712136409, + "loss": 3.1038, + "theoretical_loss": 3.9366699219081673, + "tokens_seen": 474969088 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043234704112337014, + "loss": 3.2698, + "theoretical_loss": 3.936610902795816, + "tokens_seen": 475034624 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1999917030334473, + "objective/train/theoretical_loss": 3.936551894104694, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.936551894104694, + "tokens_seen": 475100160 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004323370110330993, + "loss": 3.1305, + "theoretical_loss": 3.936551894104694, + "tokens_seen": 475100160 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004323269809428285, + "loss": 3.2171, + "theoretical_loss": 3.936492895831525, + "tokens_seen": 475165696 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004323169508525577, + "loss": 3.2075, + "theoretical_loss": 3.936433907973032, + "tokens_seen": 475231232 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043230692076228687, + "loss": 3.0779, + "theoretical_loss": 3.9363749305259415, + "tokens_seen": 475296768 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043229689067201605, + "loss": 3.2787, + "theoretical_loss": 3.9363159634869804, + "tokens_seen": 475362304 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004322868605817453, + "loss": 3.192, + "theoretical_loss": 3.936257006852878, + "tokens_seen": 475427840 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004322768304914744, + "loss": 3.2175, + "theoretical_loss": 3.936198060620364, + "tokens_seen": 475493376 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043226680040120365, + "loss": 3.1368, + "theoretical_loss": 3.936139124786169, + "tokens_seen": 475558912 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043225677031093283, + "loss": 3.2265, + "theoretical_loss": 3.9360801993470274, + "tokens_seen": 475624448 + }, + { + "epoch": 5.02, + "learning_rate": 0.000432246740220662, + "loss": 3.1543, + "theoretical_loss": 3.936021284299674, + "tokens_seen": 475689984 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004322367101303912, + "loss": 3.0881, + "theoretical_loss": 3.935962379640843, + "tokens_seen": 475755520 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043222668004012037, + "loss": 3.182, + "theoretical_loss": 3.935903485367274, + "tokens_seen": 475821056 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043221664994984955, + "loss": 3.1616, + "theoretical_loss": 3.935844601475706, + "tokens_seen": 475886592 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004322066198595788, + "loss": 3.2662, + "theoretical_loss": 3.9357857279628776, + "tokens_seen": 475952128 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004321965897693079, + "loss": 3.1702, + "theoretical_loss": 3.935726864825532, + "tokens_seen": 476017664 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043218655967903715, + "loss": 3.2739, + "theoretical_loss": 3.935668012060413, + "tokens_seen": 476083200 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004321765295887663, + "loss": 3.1658, + "theoretical_loss": 3.9356091696642643, + "tokens_seen": 476148736 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004321664994984955, + "loss": 3.2249, + "theoretical_loss": 3.935550337633833, + "tokens_seen": 476214272 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004321564694082247, + "loss": 3.2786, + "theoretical_loss": 3.935491515965867, + "tokens_seen": 476279808 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004321464393179539, + "loss": 3.1984, + "theoretical_loss": 3.9354327046571154, + "tokens_seen": 476345344 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043213640922768306, + "loss": 3.1515, + "theoretical_loss": 3.935373903704329, + "tokens_seen": 476410880 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043212637913741224, + "loss": 3.185, + "theoretical_loss": 3.9353151131042603, + "tokens_seen": 476476416 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004321163490471414, + "loss": 3.0898, + "theoretical_loss": 3.9352563328536623, + "tokens_seen": 476541952 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043210631895687065, + "loss": 3.1828, + "theoretical_loss": 3.935197562949291, + "tokens_seen": 476607488 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004320962888665998, + "loss": 3.1635, + "theoretical_loss": 3.9351388033879022, + "tokens_seen": 476673024 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.226597547531128, + "objective/train/theoretical_loss": 3.9350800541662547, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.9350800541662547, + "tokens_seen": 476738560 + }, + { + "epoch": 5.02, + "learning_rate": 0.000432086258776329, + "loss": 3.2577, + "theoretical_loss": 3.9350800541662547, + "tokens_seen": 476738560 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004320762286860582, + "loss": 3.2808, + "theoretical_loss": 3.9350213152811073, + "tokens_seen": 476804096 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004320661985957874, + "loss": 3.2466, + "theoretical_loss": 3.9349625867292217, + "tokens_seen": 476869632 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043205616850551656, + "loss": 3.1238, + "theoretical_loss": 3.93490386850736, + "tokens_seen": 476935168 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043204613841524574, + "loss": 3.1445, + "theoretical_loss": 3.9348451606122863, + "tokens_seen": 477000704 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004320361083249749, + "loss": 3.1395, + "theoretical_loss": 3.9347864630407656, + "tokens_seen": 477066240 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043202607823470416, + "loss": 3.1643, + "theoretical_loss": 3.9347277757895656, + "tokens_seen": 477131776 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004320160481444333, + "loss": 3.11, + "theoretical_loss": 3.934669098855454, + "tokens_seen": 477197312 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004320060180541625, + "loss": 3.1731, + "theoretical_loss": 3.9346104322352002, + "tokens_seen": 477262848 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043199598796389165, + "loss": 3.2374, + "theoretical_loss": 3.934551775925576, + "tokens_seen": 477328384 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004319859578736209, + "loss": 3.0875, + "theoretical_loss": 3.9344931299233536, + "tokens_seen": 477393920 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043197592778335006, + "loss": 3.199, + "theoretical_loss": 3.9344344942253073, + "tokens_seen": 477459456 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043196589769307924, + "loss": 3.2191, + "theoretical_loss": 3.9343758688282136, + "tokens_seen": 477524992 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004319558676028084, + "loss": 3.1123, + "theoretical_loss": 3.9343172537288478, + "tokens_seen": 477590528 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004319458375125376, + "loss": 3.2414, + "theoretical_loss": 3.93425864892399, + "tokens_seen": 477656064 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004319358074222668, + "loss": 3.2414, + "theoretical_loss": 3.934200054410419, + "tokens_seen": 477721600 + }, + { + "epoch": 5.02, + "learning_rate": 0.000431925777331996, + "loss": 3.241, + "theoretical_loss": 3.9341414701849162, + "tokens_seen": 477787136 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043191574724172515, + "loss": 3.2357, + "theoretical_loss": 3.9340828962442647, + "tokens_seen": 477852672 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004319057171514544, + "loss": 3.0817, + "theoretical_loss": 3.934024332585249, + "tokens_seen": 477918208 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043189568706118357, + "loss": 2.9577, + "theoretical_loss": 3.9339657792046547, + "tokens_seen": 477983744 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043188565697091275, + "loss": 3.0948, + "theoretical_loss": 3.9339072360992686, + "tokens_seen": 478049280 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043187562688064193, + "loss": 3.2131, + "theoretical_loss": 3.9338487032658804, + "tokens_seen": 478114816 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004318655967903711, + "loss": 3.282, + "theoretical_loss": 3.9337901807012785, + "tokens_seen": 478180352 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004318555667001003, + "loss": 3.1494, + "theoretical_loss": 3.933731668402255, + "tokens_seen": 478245888 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004318455366098295, + "loss": 3.2825, + "theoretical_loss": 3.9336731663656037, + "tokens_seen": 478311424 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.117650270462036, + "objective/train/theoretical_loss": 3.9336146745881178, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.9336146745881178, + "tokens_seen": 478376960 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043183550651955865, + "loss": 3.1783, + "theoretical_loss": 3.9336146745881178, + "tokens_seen": 478376960 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004318254764292879, + "loss": 3.1277, + "theoretical_loss": 3.9335561930665937, + "tokens_seen": 478442496 + }, + { + "epoch": 5.02, + "learning_rate": 0.000431815446339017, + "loss": 3.2175, + "theoretical_loss": 3.9334977217978286, + "tokens_seen": 478508032 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043180541624874625, + "loss": 3.2769, + "theoretical_loss": 3.933439260778621, + "tokens_seen": 478573568 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043179538615847543, + "loss": 3.1986, + "theoretical_loss": 3.9333808100057714, + "tokens_seen": 478639104 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004317853560682046, + "loss": 3.1113, + "theoretical_loss": 3.9333223694760804, + "tokens_seen": 478704640 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004317753259779338, + "loss": 3.1553, + "theoretical_loss": 3.933263939186353, + "tokens_seen": 478770176 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043176529588766303, + "loss": 3.1531, + "theoretical_loss": 3.933205519133391, + "tokens_seen": 478835712 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043175526579739215, + "loss": 3.1558, + "theoretical_loss": 3.9331471093140022, + "tokens_seen": 478901248 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004317452357071214, + "loss": 3.1583, + "theoretical_loss": 3.933088709724993, + "tokens_seen": 478966784 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004317352056168505, + "loss": 3.2133, + "theoretical_loss": 3.933030320363173, + "tokens_seen": 479032320 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043172517552657975, + "loss": 3.1581, + "theoretical_loss": 3.932971941225351, + "tokens_seen": 479097856 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043171514543630893, + "loss": 3.2264, + "theoretical_loss": 3.93291357230834, + "tokens_seen": 479163392 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004317051153460381, + "loss": 3.1276, + "theoretical_loss": 3.9328552136089527, + "tokens_seen": 479228928 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004316950852557673, + "loss": 3.1445, + "theoretical_loss": 3.9327968651240024, + "tokens_seen": 479294464 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004316850551654965, + "loss": 3.1923, + "theoretical_loss": 3.932738526850306, + "tokens_seen": 479360000 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004316750250752257, + "loss": 3.1927, + "theoretical_loss": 3.9326801987846816, + "tokens_seen": 479425536 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004316649949849549, + "loss": 3.2157, + "theoretical_loss": 3.9326218809239464, + "tokens_seen": 479491072 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004316549648946841, + "loss": 3.1672, + "theoretical_loss": 3.932563573264921, + "tokens_seen": 479556608 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043164493480441326, + "loss": 3.1737, + "theoretical_loss": 3.932505275804427, + "tokens_seen": 479622144 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043163490471414244, + "loss": 3.185, + "theoretical_loss": 3.9324469885392883, + "tokens_seen": 479687680 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004316248746238716, + "loss": 3.1971, + "theoretical_loss": 3.932388711466328, + "tokens_seen": 479753216 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043161484453360085, + "loss": 3.222, + "theoretical_loss": 3.932330444582373, + "tokens_seen": 479818752 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043160481444333, + "loss": 3.1932, + "theoretical_loss": 3.9322721878842497, + "tokens_seen": 479884288 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004315947843530592, + "loss": 3.1847, + "theoretical_loss": 3.932213941368788, + "tokens_seen": 479949824 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2205681800842285, + "objective/train/theoretical_loss": 3.932155705032817, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.932155705032817, + "tokens_seen": 480015360 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004315847542627884, + "loss": 3.1979, + "theoretical_loss": 3.932155705032817, + "tokens_seen": 480015360 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004315747241725176, + "loss": 3.1596, + "theoretical_loss": 3.932097478873168, + "tokens_seen": 480080896 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043156469408224676, + "loss": 3.2569, + "theoretical_loss": 3.9320392628866747, + "tokens_seen": 480146432 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043155466399197594, + "loss": 3.1843, + "theoretical_loss": 3.9319810570701716, + "tokens_seen": 480211968 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004315446339017051, + "loss": 3.2513, + "theoretical_loss": 3.931922861420494, + "tokens_seen": 480277504 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043153460381143436, + "loss": 3.1517, + "theoretical_loss": 3.931864675934479, + "tokens_seen": 480343040 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004315245737211635, + "loss": 3.0926, + "theoretical_loss": 3.9318065006089657, + "tokens_seen": 480408576 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004315145436308927, + "loss": 3.2379, + "theoretical_loss": 3.9317483354407945, + "tokens_seen": 480474112 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043150451354062185, + "loss": 3.2239, + "theoretical_loss": 3.9316901804268056, + "tokens_seen": 480539648 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004314944834503511, + "loss": 3.2664, + "theoretical_loss": 3.931632035563843, + "tokens_seen": 480605184 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043148445336008026, + "loss": 3.2888, + "theoretical_loss": 3.9315739008487505, + "tokens_seen": 480670720 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043147442326980944, + "loss": 3.2166, + "theoretical_loss": 3.9315157762783737, + "tokens_seen": 480736256 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004314643931795386, + "loss": 3.0873, + "theoretical_loss": 3.93145766184956, + "tokens_seen": 480801792 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004314543630892678, + "loss": 3.3041, + "theoretical_loss": 3.931399557559158, + "tokens_seen": 480867328 + }, + { + "epoch": 5.02, + "learning_rate": 0.000431444332998997, + "loss": 3.0966, + "theoretical_loss": 3.931341463404017, + "tokens_seen": 480932864 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004314343029087262, + "loss": 3.2387, + "theoretical_loss": 3.9312833793809885, + "tokens_seen": 480998400 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043142427281845535, + "loss": 3.1325, + "theoretical_loss": 3.931225305486926, + "tokens_seen": 481063936 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004314142427281846, + "loss": 3.1577, + "theoretical_loss": 3.9311672417186827, + "tokens_seen": 481129472 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043140421263791377, + "loss": 3.2256, + "theoretical_loss": 3.9311091880731146, + "tokens_seen": 481195008 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043139418254764295, + "loss": 3.124, + "theoretical_loss": 3.9310511445470784, + "tokens_seen": 481260544 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043138415245737213, + "loss": 3.232, + "theoretical_loss": 3.9309931111374334, + "tokens_seen": 481326080 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004313741223671013, + "loss": 3.1657, + "theoretical_loss": 3.930935087841038, + "tokens_seen": 481391616 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004313640922768305, + "loss": 3.2058, + "theoretical_loss": 3.930877074654754, + "tokens_seen": 481457152 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004313540621865597, + "loss": 3.1302, + "theoretical_loss": 3.930819071575444, + "tokens_seen": 481522688 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043134403209628885, + "loss": 3.2152, + "theoretical_loss": 3.930761078599972, + "tokens_seen": 481588224 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2058184146881104, + "objective/train/theoretical_loss": 3.930703095725203, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.930703095725203, + "tokens_seen": 481653760 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004313340020060181, + "loss": 3.1469, + "theoretical_loss": 3.930703095725203, + "tokens_seen": 481653760 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004313239719157472, + "loss": 3.24, + "theoretical_loss": 3.9306451229480044, + "tokens_seen": 481719296 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043131394182547645, + "loss": 3.2216, + "theoretical_loss": 3.9305871602652434, + "tokens_seen": 481784832 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043130391173520563, + "loss": 3.2412, + "theoretical_loss": 3.93052920767379, + "tokens_seen": 481850368 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004312938816449348, + "loss": 3.1941, + "theoretical_loss": 3.9304712651705165, + "tokens_seen": 481915904 + }, + { + "epoch": 5.02, + "learning_rate": 0.000431283851554664, + "loss": 3.2407, + "theoretical_loss": 3.930413332752293, + "tokens_seen": 481981440 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043127382146439323, + "loss": 3.1383, + "theoretical_loss": 3.930355410415994, + "tokens_seen": 482046976 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043126379137412236, + "loss": 3.1415, + "theoretical_loss": 3.930297498158496, + "tokens_seen": 482112512 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004312537612838516, + "loss": 3.2459, + "theoretical_loss": 3.9302395959766736, + "tokens_seen": 482178048 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004312437311935807, + "loss": 3.151, + "theoretical_loss": 3.930181703867406, + "tokens_seen": 482243584 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043123370110330995, + "loss": 3.24, + "theoretical_loss": 3.930123821827572, + "tokens_seen": 482309120 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043122367101303913, + "loss": 3.1981, + "theoretical_loss": 3.9300659498540518, + "tokens_seen": 482374656 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004312136409227683, + "loss": 3.1719, + "theoretical_loss": 3.9300080879437287, + "tokens_seen": 482440192 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004312036108324975, + "loss": 3.2414, + "theoretical_loss": 3.929950236093485, + "tokens_seen": 482505728 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004311935807422267, + "loss": 3.1416, + "theoretical_loss": 3.9298923943002064, + "tokens_seen": 482571264 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043118355065195586, + "loss": 3.1593, + "theoretical_loss": 3.929834562560779, + "tokens_seen": 482636800 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004311735205616851, + "loss": 3.179, + "theoretical_loss": 3.9297767408720903, + "tokens_seen": 482702336 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004311634904714142, + "loss": 3.2283, + "theoretical_loss": 3.9297189292310293, + "tokens_seen": 482767872 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043115346038114346, + "loss": 3.3062, + "theoretical_loss": 3.9296611276344864, + "tokens_seen": 482833408 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004311434302908726, + "loss": 3.1072, + "theoretical_loss": 3.9296033360793534, + "tokens_seen": 482898944 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004311334002006018, + "loss": 3.1804, + "theoretical_loss": 3.929545554562524, + "tokens_seen": 482964480 + }, + { + "epoch": 5.02, + "learning_rate": 0.000431123370110331, + "loss": 3.1405, + "theoretical_loss": 3.9294877830808916, + "tokens_seen": 483030016 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004311133400200602, + "loss": 3.2724, + "theoretical_loss": 3.9294300216313536, + "tokens_seen": 483095552 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043110330992978936, + "loss": 3.1299, + "theoretical_loss": 3.929372270210806, + "tokens_seen": 483161088 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004310932798395186, + "loss": 3.0803, + "theoretical_loss": 3.929314528816148, + "tokens_seen": 483226624 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.277773141860962, + "objective/train/theoretical_loss": 3.929256797444281, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.929256797444281, + "tokens_seen": 483292160 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004310832497492477, + "loss": 3.2685, + "theoretical_loss": 3.929256797444281, + "tokens_seen": 483292160 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043107321965897696, + "loss": 3.1791, + "theoretical_loss": 3.929199076092104, + "tokens_seen": 483357696 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004310631895687061, + "loss": 3.3028, + "theoretical_loss": 3.9291413647565214, + "tokens_seen": 483423232 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004310531594784353, + "loss": 3.2329, + "theoretical_loss": 3.9290836634344375, + "tokens_seen": 483488768 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004310431293881645, + "loss": 3.2315, + "theoretical_loss": 3.9290259721227567, + "tokens_seen": 483554304 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004310330992978937, + "loss": 3.2393, + "theoretical_loss": 3.928968290818388, + "tokens_seen": 483619840 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043102306920762286, + "loss": 3.1263, + "theoretical_loss": 3.9289106195182377, + "tokens_seen": 483685376 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043101303911735205, + "loss": 3.2322, + "theoretical_loss": 3.928852958219217, + "tokens_seen": 483750912 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004310030090270812, + "loss": 3.0523, + "theoretical_loss": 3.9287953069182358, + "tokens_seen": 483816448 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043099297893681046, + "loss": 3.2325, + "theoretical_loss": 3.9287376656122075, + "tokens_seen": 483881984 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004309829488465396, + "loss": 3.2469, + "theoretical_loss": 3.9286800342980452, + "tokens_seen": 483947520 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004309729187562688, + "loss": 3.1622, + "theoretical_loss": 3.928622412972665, + "tokens_seen": 484013056 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043096288866599795, + "loss": 3.171, + "theoretical_loss": 3.928564801632983, + "tokens_seen": 484078592 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004309528585757272, + "loss": 3.2319, + "theoretical_loss": 3.9285072002759165, + "tokens_seen": 484144128 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043094282848545637, + "loss": 3.0469, + "theoretical_loss": 3.928449608898386, + "tokens_seen": 484209664 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043093279839518555, + "loss": 3.3017, + "theoretical_loss": 3.9283920274973116, + "tokens_seen": 484275200 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004309227683049148, + "loss": 3.2482, + "theoretical_loss": 3.928334456069615, + "tokens_seen": 484340736 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043091273821464397, + "loss": 3.1589, + "theoretical_loss": 3.92827689461222, + "tokens_seen": 484406272 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043090270812437315, + "loss": 3.2581, + "theoretical_loss": 3.928219343122052, + "tokens_seen": 484471808 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043089267803410233, + "loss": 3.2133, + "theoretical_loss": 3.928161801596036, + "tokens_seen": 484537344 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004308826479438315, + "loss": 3.2296, + "theoretical_loss": 3.9281042700311004, + "tokens_seen": 484602880 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004308726178535607, + "loss": 3.0855, + "theoretical_loss": 3.9280467484241735, + "tokens_seen": 484668416 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004308625877632899, + "loss": 3.133, + "theoretical_loss": 3.927989236772186, + "tokens_seen": 484733952 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043085255767301905, + "loss": 3.2684, + "theoretical_loss": 3.927931735072069, + "tokens_seen": 484799488 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004308425275827483, + "loss": 3.1746, + "theoretical_loss": 3.9278742433207556, + "tokens_seen": 484865024 + }, + { + "epoch": 5.02, + "objective/train/docs_used": 1134528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2358572483062744, + "objective/train/theoretical_loss": 3.9278167615151807, + "objective/train/tokens_used": 485951968, + "theoretical_loss": 3.9278167615151807, + "tokens_seen": 484930560 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004308324974924774, + "loss": 3.2022, + "theoretical_loss": 3.9278167615151807, + "tokens_seen": 484930560 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043082246740220665, + "loss": 3.1431, + "theoretical_loss": 3.927759289652279, + "tokens_seen": 484996096 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043081243731193583, + "loss": 3.0503, + "theoretical_loss": 3.9277018277289883, + "tokens_seen": 485061632 + }, + { + "epoch": 5.02, + "learning_rate": 0.000430802407221665, + "loss": 3.1692, + "theoretical_loss": 3.927644375742246, + "tokens_seen": 485127168 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004307923771313942, + "loss": 3.2683, + "theoretical_loss": 3.9275869336889935, + "tokens_seen": 485192704 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043078234704112343, + "loss": 3.2026, + "theoretical_loss": 3.927529501566171, + "tokens_seen": 485258240 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043077231695085256, + "loss": 3.1557, + "theoretical_loss": 3.9274720793707205, + "tokens_seen": 485323776 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004307622868605818, + "loss": 3.1537, + "theoretical_loss": 3.9274146670995864, + "tokens_seen": 485389312 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004307522567703109, + "loss": 3.121, + "theoretical_loss": 3.927357264749714, + "tokens_seen": 485454848 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043074222668004015, + "loss": 3.1244, + "theoretical_loss": 3.927299872318049, + "tokens_seen": 485520384 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043073219658976933, + "loss": 3.2359, + "theoretical_loss": 3.9272424898015403, + "tokens_seen": 485585920 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004307221664994985, + "loss": 3.2419, + "theoretical_loss": 3.927185117197136, + "tokens_seen": 485651456 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004307121364092277, + "loss": 3.3106, + "theoretical_loss": 3.9271277545017877, + "tokens_seen": 485716992 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004307021063189569, + "loss": 3.1608, + "theoretical_loss": 3.9270704017124474, + "tokens_seen": 485782528 + }, + { + "epoch": 5.02, + "learning_rate": 0.00043069207622868606, + "loss": 3.1755, + "theoretical_loss": 3.927013058826067, + "tokens_seen": 485848064 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004306820461384153, + "loss": 3.1854, + "theoretical_loss": 3.926955725839602, + "tokens_seen": 485913600 + }, + { + "epoch": 5.02, + "learning_rate": 0.0004306720160481444, + "loss": 3.2391, + "theoretical_loss": 3.9269046719809895, + "tokens_seen": 485971968 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043066198595787366, + "loss": 3.162, + "theoretical_loss": 3.9268473577032355, + "tokens_seen": 486037504 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004306519558676028, + "loss": 3.1877, + "theoretical_loss": 3.9267900533166014, + "tokens_seen": 486103040 + }, + { + "epoch": 6.0, + "learning_rate": 0.000430641925777332, + "loss": 2.9753, + "theoretical_loss": 3.9267327588180474, + "tokens_seen": 486168576 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004306318956870612, + "loss": 3.1574, + "theoretical_loss": 3.9266754742045338, + "tokens_seen": 486234112 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004306218655967904, + "loss": 3.1841, + "theoretical_loss": 3.9266181994730243, + "tokens_seen": 486299648 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043061183550651956, + "loss": 3.0804, + "theoretical_loss": 3.9265609346204817, + "tokens_seen": 486365184 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004306018054162488, + "loss": 3.0051, + "theoretical_loss": 3.926503679643872, + "tokens_seen": 486430720 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004305917753259779, + "loss": 3.1522, + "theoretical_loss": 3.926446434540162, + "tokens_seen": 486496256 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1184972, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.810680866241455, + "objective/train/theoretical_loss": 3.9263891993063185, + "objective/train/tokens_used": 507021792, + "theoretical_loss": 3.9263891993063185, + "tokens_seen": 486561792 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043058174523570716, + "loss": 3.0736, + "theoretical_loss": 3.9263891993063185, + "tokens_seen": 486561792 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004305717151454363, + "loss": 3.1864, + "theoretical_loss": 3.926331973939311, + "tokens_seen": 486627328 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004305616850551655, + "loss": 2.9994, + "theoretical_loss": 3.9262747584361115, + "tokens_seen": 486692864 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004305516549648947, + "loss": 3.1838, + "theoretical_loss": 3.92621755279369, + "tokens_seen": 486758400 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004305416248746239, + "loss": 3.1965, + "theoretical_loss": 3.926160357009021, + "tokens_seen": 486823936 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043053159478435306, + "loss": 3.1085, + "theoretical_loss": 3.926103171079078, + "tokens_seen": 486889472 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043052156469408225, + "loss": 3.0827, + "theoretical_loss": 3.9260459950008384, + "tokens_seen": 486955008 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004305115346038114, + "loss": 3.0815, + "theoretical_loss": 3.9259888287712785, + "tokens_seen": 487020544 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043050150451354066, + "loss": 3.1527, + "theoretical_loss": 3.925931672387377, + "tokens_seen": 487086080 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004304914744232698, + "loss": 3.0383, + "theoretical_loss": 3.925874525846114, + "tokens_seen": 487151616 + }, + { + "epoch": 6.0, + "learning_rate": 0.000430481444332999, + "loss": 3.1525, + "theoretical_loss": 3.9258173891444708, + "tokens_seen": 487217152 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043047141424272815, + "loss": 3.083, + "theoretical_loss": 3.9257602622794296, + "tokens_seen": 487282688 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004304613841524574, + "loss": 3.1267, + "theoretical_loss": 3.925703145247975, + "tokens_seen": 487348224 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043045135406218657, + "loss": 2.9632, + "theoretical_loss": 3.9256460380470917, + "tokens_seen": 487413760 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043044132397191575, + "loss": 3.161, + "theoretical_loss": 3.9255889406737667, + "tokens_seen": 487479296 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043043129388164493, + "loss": 3.118, + "theoretical_loss": 3.9255318531249874, + "tokens_seen": 487544832 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043042126379137417, + "loss": 3.1786, + "theoretical_loss": 3.925474775397743, + "tokens_seen": 487610368 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004304112337011033, + "loss": 3.1401, + "theoretical_loss": 3.925417707489025, + "tokens_seen": 487675904 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043040120361083253, + "loss": 3.115, + "theoretical_loss": 3.9253606493958246, + "tokens_seen": 487741440 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043039117352056165, + "loss": 3.1131, + "theoretical_loss": 3.9253036011151354, + "tokens_seen": 487806976 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004303811434302909, + "loss": 3.1494, + "theoretical_loss": 3.9252465626439514, + "tokens_seen": 487872512 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043037111334002007, + "loss": 3.0715, + "theoretical_loss": 3.9251895339792684, + "tokens_seen": 487938048 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043036108324974925, + "loss": 3.0727, + "theoretical_loss": 3.9251325151180847, + "tokens_seen": 488003584 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043035105315947843, + "loss": 3.2092, + "theoretical_loss": 3.9250755060573974, + "tokens_seen": 488069120 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004303410230692076, + "loss": 3.0441, + "theoretical_loss": 3.9250185067942076, + "tokens_seen": 488134656 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1188118, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.288266181945801, + "objective/train/theoretical_loss": 3.9249615173255155, + "objective/train/tokens_used": 508660192, + "theoretical_loss": 3.9249615173255155, + "tokens_seen": 488200192 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004303309929789368, + "loss": 3.1691, + "theoretical_loss": 3.9249615173255155, + "tokens_seen": 488200192 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043032096288866603, + "loss": 3.1123, + "theoretical_loss": 3.924904537648324, + "tokens_seen": 488265728 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043031093279839516, + "loss": 3.1903, + "theoretical_loss": 3.9248475677596373, + "tokens_seen": 488331264 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004303009027081244, + "loss": 3.0355, + "theoretical_loss": 3.9247906076564596, + "tokens_seen": 488396800 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004302908726178535, + "loss": 3.1854, + "theoretical_loss": 3.924733657335798, + "tokens_seen": 488462336 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043028084252758276, + "loss": 2.9547, + "theoretical_loss": 3.92467671679466, + "tokens_seen": 488527872 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043027081243731194, + "loss": 3.0247, + "theoretical_loss": 3.924619786030055, + "tokens_seen": 488593408 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004302607823470411, + "loss": 3.1202, + "theoretical_loss": 3.9245628650389928, + "tokens_seen": 488658944 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004302507522567703, + "loss": 3.0438, + "theoretical_loss": 3.9245059538184854, + "tokens_seen": 488724480 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043024072216649953, + "loss": 3.13, + "theoretical_loss": 3.9244490523655466, + "tokens_seen": 488790016 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043023069207622866, + "loss": 3.1311, + "theoretical_loss": 3.924392160677189, + "tokens_seen": 488855552 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004302206619859579, + "loss": 3.0716, + "theoretical_loss": 3.92433527875043, + "tokens_seen": 488921088 + }, + { + "epoch": 6.0, + "learning_rate": 0.000430210631895687, + "loss": 3.1157, + "theoretical_loss": 3.9242784065822853, + "tokens_seen": 488986624 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043020060180541626, + "loss": 3.0411, + "theoretical_loss": 3.924221544169774, + "tokens_seen": 489052160 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043019057171514544, + "loss": 3.1368, + "theoretical_loss": 3.9241646915099153, + "tokens_seen": 489117696 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004301805416248746, + "loss": 3.098, + "theoretical_loss": 3.9241078485997303, + "tokens_seen": 489183232 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043017051153460386, + "loss": 3.0042, + "theoretical_loss": 3.9240510154362407, + "tokens_seen": 489248768 + }, + { + "epoch": 6.0, + "learning_rate": 0.000430160481444333, + "loss": 3.0994, + "theoretical_loss": 3.9239941920164707, + "tokens_seen": 489314304 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004301504513540622, + "loss": 3.1451, + "theoretical_loss": 3.9239373783374445, + "tokens_seen": 489379840 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004301404212637914, + "loss": 3.1811, + "theoretical_loss": 3.923880574396188, + "tokens_seen": 489445376 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004301303911735206, + "loss": 2.8719, + "theoretical_loss": 3.9238237801897293, + "tokens_seen": 489510912 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043012036108324976, + "loss": 3.2061, + "theoretical_loss": 3.9237669957150976, + "tokens_seen": 489576448 + }, + { + "epoch": 6.0, + "learning_rate": 0.000430110330992979, + "loss": 3.075, + "theoretical_loss": 3.9237102209693218, + "tokens_seen": 489641984 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004301003009027081, + "loss": 3.0284, + "theoretical_loss": 3.9236534559494336, + "tokens_seen": 489707520 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043009027081243736, + "loss": 3.0543, + "theoretical_loss": 3.9235967006524657, + "tokens_seen": 489773056 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1191791, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1983702182769775, + "objective/train/theoretical_loss": 3.923539955075452, + "objective/train/tokens_used": 510298592, + "theoretical_loss": 3.923539955075452, + "tokens_seen": 489838592 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004300802407221665, + "loss": 3.161, + "theoretical_loss": 3.923539955075452, + "tokens_seen": 489838592 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004300702106318957, + "loss": 3.1309, + "theoretical_loss": 3.923483219215428, + "tokens_seen": 489904128 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004300601805416249, + "loss": 2.9954, + "theoretical_loss": 3.9234264930694303, + "tokens_seen": 489969664 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004300501504513541, + "loss": 3.1239, + "theoretical_loss": 3.923369776634496, + "tokens_seen": 490035200 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043004012036108326, + "loss": 3.1142, + "theoretical_loss": 3.923313069907665, + "tokens_seen": 490100736 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043003009027081245, + "loss": 3.0717, + "theoretical_loss": 3.9232563728859775, + "tokens_seen": 490166272 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043002006018054163, + "loss": 3.26, + "theoretical_loss": 3.9231996855664755, + "tokens_seen": 490231808 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043001003009027086, + "loss": 2.9898, + "theoretical_loss": 3.9231430079462015, + "tokens_seen": 490297344 + }, + { + "epoch": 6.0, + "learning_rate": 0.00043, + "loss": 3.1398, + "theoretical_loss": 3.9230863400222002, + "tokens_seen": 490362880 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004299899699097292, + "loss": 3.1097, + "theoretical_loss": 3.923029681791517, + "tokens_seen": 490428416 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042997993981945835, + "loss": 3.0986, + "theoretical_loss": 3.922973033251199, + "tokens_seen": 490493952 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004299699097291876, + "loss": 3.1672, + "theoretical_loss": 3.9229163943982943, + "tokens_seen": 490559488 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042995987963891677, + "loss": 3.233, + "theoretical_loss": 3.9228597652298527, + "tokens_seen": 490625024 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042994984954864595, + "loss": 3.0883, + "theoretical_loss": 3.922803145742925, + "tokens_seen": 490690560 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042993981945837513, + "loss": 3.1437, + "theoretical_loss": 3.9227465359345626, + "tokens_seen": 490756096 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042992978936810437, + "loss": 3.1187, + "theoretical_loss": 3.9226899358018197, + "tokens_seen": 490821632 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004299197592778335, + "loss": 2.9755, + "theoretical_loss": 3.922633345341751, + "tokens_seen": 490887168 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042990972918756273, + "loss": 3.1233, + "theoretical_loss": 3.922576764551412, + "tokens_seen": 490952704 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042989969909729185, + "loss": 3.1709, + "theoretical_loss": 3.922520193427859, + "tokens_seen": 491018240 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004298896690070211, + "loss": 3.0707, + "theoretical_loss": 3.922463631968153, + "tokens_seen": 491083776 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042987963891675027, + "loss": 3.1203, + "theoretical_loss": 3.922407080169352, + "tokens_seen": 491149312 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042986960882647945, + "loss": 3.0505, + "theoretical_loss": 3.9223505380285175, + "tokens_seen": 491214848 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042985957873620863, + "loss": 3.0242, + "theoretical_loss": 3.922294005542712, + "tokens_seen": 491280384 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004298495486459378, + "loss": 3.1489, + "theoretical_loss": 3.922237482708999, + "tokens_seen": 491345920 + }, + { + "epoch": 6.0, + "learning_rate": 0.000429839518555667, + "loss": 3.1258, + "theoretical_loss": 3.922180969524444, + "tokens_seen": 491411456 + }, + { + "debugging/Self-BLEU-5": 0.7221837933189033, + "debugging/distinct-1-grams": 0.7379945850539217, + "debugging/distinct-2-grams": 0.9465100387403372, + "debugging/entropy-1-grams": 6.54358751918253, + "debugging/entropy-2-grams": 7.959249981884544, + "debugging/length": 568.780487804878, + "debugging/num_segments": 41, + "epoch": 6.0, + "objective/train/docs_used": 1196938, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1798505783081055, + "objective/train/theoretical_loss": 3.922124465986113, + "objective/train/tokens_used": 511936992, + "theoretical_loss": 3.922124465986113, + "tokens_seen": 491476992 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042982948846539623, + "loss": 3.1755, + "theoretical_loss": 3.922124465986113, + "tokens_seen": 491476992 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042981945837512536, + "loss": 3.0978, + "theoretical_loss": 3.9220679720910727, + "tokens_seen": 491542528 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004298094282848546, + "loss": 3.0721, + "theoretical_loss": 3.9220114878363934, + "tokens_seen": 491608064 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004297993981945837, + "loss": 3.2235, + "theoretical_loss": 3.9219550132191445, + "tokens_seen": 491673600 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042978936810431296, + "loss": 3.2054, + "theoretical_loss": 3.9218985482363973, + "tokens_seen": 491739136 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042977933801404214, + "loss": 3.0728, + "theoretical_loss": 3.9218420928852242, + "tokens_seen": 491804672 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004297693079237713, + "loss": 3.1368, + "theoretical_loss": 3.9217856471626997, + "tokens_seen": 491870208 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004297592778335005, + "loss": 3.1373, + "theoretical_loss": 3.921729211065899, + "tokens_seen": 491935744 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042974924774322973, + "loss": 3.1148, + "theoretical_loss": 3.921672784591898, + "tokens_seen": 492001280 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042973921765295886, + "loss": 3.1066, + "theoretical_loss": 3.921616367737775, + "tokens_seen": 492066816 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004297291875626881, + "loss": 3.1142, + "theoretical_loss": 3.921559960500609, + "tokens_seen": 492132352 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004297191574724172, + "loss": 3.0215, + "theoretical_loss": 3.921503562877481, + "tokens_seen": 492197888 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042970912738214646, + "loss": 3.1025, + "theoretical_loss": 3.921447174865471, + "tokens_seen": 492263424 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042969909729187564, + "loss": 3.1671, + "theoretical_loss": 3.921390796461663, + "tokens_seen": 492328960 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004296890672016048, + "loss": 3.1395, + "theoretical_loss": 3.9213344276631408, + "tokens_seen": 492394496 + }, + { + "epoch": 6.0, + "learning_rate": 0.000429679037111334, + "loss": 3.167, + "theoretical_loss": 3.9212780684669903, + "tokens_seen": 492460032 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004296690070210632, + "loss": 3.1056, + "theoretical_loss": 3.9212217188702976, + "tokens_seen": 492525568 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042965897693079236, + "loss": 3.0545, + "theoretical_loss": 3.921165378870151, + "tokens_seen": 492591104 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004296489468405216, + "loss": 3.1569, + "theoretical_loss": 3.9211090484636397, + "tokens_seen": 492656640 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004296389167502507, + "loss": 3.0745, + "theoretical_loss": 3.921052727647854, + "tokens_seen": 492722176 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042962888665997996, + "loss": 3.1368, + "theoretical_loss": 3.920996416419886, + "tokens_seen": 492787712 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004296188565697091, + "loss": 3.0483, + "theoretical_loss": 3.920940114776829, + "tokens_seen": 492853248 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004296088264794383, + "loss": 3.1953, + "theoretical_loss": 3.9208838227157763, + "tokens_seen": 492918784 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004295987963891675, + "loss": 3.1899, + "theoretical_loss": 3.920827540233824, + "tokens_seen": 492984320 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004295887662988967, + "loss": 3.1566, + "theoretical_loss": 3.9207712673280692, + "tokens_seen": 493049856 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1199635, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0793819427490234, + "objective/train/theoretical_loss": 3.92071500399561, + "objective/train/tokens_used": 513575392, + "theoretical_loss": 3.92071500399561, + "tokens_seen": 493115392 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042957873620862587, + "loss": 3.0087, + "theoretical_loss": 3.92071500399561, + "tokens_seen": 493115392 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004295687061183551, + "loss": 3.1225, + "theoretical_loss": 3.920658750233546, + "tokens_seen": 493180928 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042955867602808423, + "loss": 3.1621, + "theoretical_loss": 3.920602506038977, + "tokens_seen": 493246464 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042954864593781347, + "loss": 3.2194, + "theoretical_loss": 3.9205462714090054, + "tokens_seen": 493312000 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004295386158475426, + "loss": 3.1355, + "theoretical_loss": 3.920490046340735, + "tokens_seen": 493377536 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042952858575727183, + "loss": 3.0024, + "theoretical_loss": 3.9204338308312687, + "tokens_seen": 493443072 + }, + { + "epoch": 6.0, + "learning_rate": 0.000429518555667001, + "loss": 3.1508, + "theoretical_loss": 3.9203776248777134, + "tokens_seen": 493508608 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004295085255767302, + "loss": 3.0057, + "theoretical_loss": 3.9203214284771764, + "tokens_seen": 493574144 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042949849548645937, + "loss": 3.0809, + "theoretical_loss": 3.9202652416267645, + "tokens_seen": 493639680 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042948846539618855, + "loss": 3.117, + "theoretical_loss": 3.920209064323588, + "tokens_seen": 493705216 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042947843530591773, + "loss": 3.1275, + "theoretical_loss": 3.920152896564758, + "tokens_seen": 493770752 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042946840521564697, + "loss": 3.136, + "theoretical_loss": 3.920096738347386, + "tokens_seen": 493836288 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004294583751253761, + "loss": 3.1796, + "theoretical_loss": 3.920040589668585, + "tokens_seen": 493901824 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042944834503510533, + "loss": 3.1274, + "theoretical_loss": 3.9199844505254697, + "tokens_seen": 493967360 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004294383149448345, + "loss": 3.0871, + "theoretical_loss": 3.9199283209151563, + "tokens_seen": 494032896 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004294282848545637, + "loss": 2.9557, + "theoretical_loss": 3.9198722008347615, + "tokens_seen": 494098432 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042941825476429293, + "loss": 3.0971, + "theoretical_loss": 3.9198160902814037, + "tokens_seen": 494163968 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042940822467402206, + "loss": 3.0985, + "theoretical_loss": 3.919759989252202, + "tokens_seen": 494229504 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004293981945837513, + "loss": 3.1796, + "theoretical_loss": 3.9197038977442773, + "tokens_seen": 494295040 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042938816449348047, + "loss": 3.0355, + "theoretical_loss": 3.919647815754752, + "tokens_seen": 494360576 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042937813440320965, + "loss": 3.1675, + "theoretical_loss": 3.9195917432807494, + "tokens_seen": 494426112 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042936810431293883, + "loss": 3.0087, + "theoretical_loss": 3.9195356803193935, + "tokens_seen": 494491648 + }, + { + "epoch": 6.0, + "learning_rate": 0.000429358074222668, + "loss": 3.1207, + "theoretical_loss": 3.91947962686781, + "tokens_seen": 494557184 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004293480441323972, + "loss": 3.1302, + "theoretical_loss": 3.9194235829231268, + "tokens_seen": 494622720 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042933801404212643, + "loss": 3.2467, + "theoretical_loss": 3.9193675484824717, + "tokens_seen": 494688256 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1204696, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9569177627563477, + "objective/train/theoretical_loss": 3.919311523542974, + "objective/train/tokens_used": 515213792, + "theoretical_loss": 3.919311523542974, + "tokens_seen": 494753792 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042932798395185556, + "loss": 3.1317, + "theoretical_loss": 3.919311523542974, + "tokens_seen": 494753792 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004293179538615848, + "loss": 3.104, + "theoretical_loss": 3.919255508101765, + "tokens_seen": 494819328 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004293079237713139, + "loss": 3.1615, + "theoretical_loss": 3.919199502155976, + "tokens_seen": 494884864 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042929789368104316, + "loss": 2.8885, + "theoretical_loss": 3.91914350570274, + "tokens_seen": 494950400 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042928786359077234, + "loss": 3.1724, + "theoretical_loss": 3.919087518739193, + "tokens_seen": 495015936 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004292778335005015, + "loss": 3.1507, + "theoretical_loss": 3.91903154126247, + "tokens_seen": 495081472 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004292678034102307, + "loss": 3.0785, + "theoretical_loss": 3.918975573269708, + "tokens_seen": 495147008 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042925777331995993, + "loss": 3.1179, + "theoretical_loss": 3.9189196147580443, + "tokens_seen": 495212544 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042924774322968906, + "loss": 3.0538, + "theoretical_loss": 3.9188636657246194, + "tokens_seen": 495278080 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004292377131394183, + "loss": 3.0708, + "theoretical_loss": 3.918807726166574, + "tokens_seen": 495343616 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004292276830491474, + "loss": 3.0669, + "theoretical_loss": 3.9187517960810503, + "tokens_seen": 495409152 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042921765295887666, + "loss": 3.0843, + "theoretical_loss": 3.9186958754651906, + "tokens_seen": 495474688 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042920762286860584, + "loss": 3.0554, + "theoretical_loss": 3.9186399643161396, + "tokens_seen": 495540224 + }, + { + "epoch": 6.0, + "learning_rate": 0.000429197592778335, + "loss": 3.2076, + "theoretical_loss": 3.9185840626310435, + "tokens_seen": 495605760 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004291875626880642, + "loss": 3.1683, + "theoretical_loss": 3.9185281704070496, + "tokens_seen": 495671296 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004291775325977934, + "loss": 3.0903, + "theoretical_loss": 3.9184722876413045, + "tokens_seen": 495736832 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042916750250752256, + "loss": 3.0615, + "theoretical_loss": 3.9184164143309586, + "tokens_seen": 495802368 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004291574724172518, + "loss": 3.2247, + "theoretical_loss": 3.918360550473163, + "tokens_seen": 495867904 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004291474423269809, + "loss": 3.0724, + "theoretical_loss": 3.9183046960650683, + "tokens_seen": 495933440 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042913741223671016, + "loss": 3.1678, + "theoretical_loss": 3.9182488511038285, + "tokens_seen": 495998976 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004291273821464393, + "loss": 2.9944, + "theoretical_loss": 3.9181930155865974, + "tokens_seen": 496064512 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004291173520561685, + "loss": 3.1358, + "theoretical_loss": 3.9181371895105306, + "tokens_seen": 496130048 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004291073219658977, + "loss": 3.1358, + "theoretical_loss": 3.918081372872786, + "tokens_seen": 496195584 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004290972918756269, + "loss": 3.0767, + "theoretical_loss": 3.9180255656705203, + "tokens_seen": 496261120 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042908726178535607, + "loss": 3.1303, + "theoretical_loss": 3.9179697679008934, + "tokens_seen": 496326656 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1207533, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.167299270629883, + "objective/train/theoretical_loss": 3.9179139795610656, + "objective/train/tokens_used": 516852192, + "theoretical_loss": 3.9179139795610656, + "tokens_seen": 496392192 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004290772316950853, + "loss": 3.1468, + "theoretical_loss": 3.9179139795610656, + "tokens_seen": 496392192 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042906720160481443, + "loss": 3.128, + "theoretical_loss": 3.9178582006481983, + "tokens_seen": 496457728 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042905717151454367, + "loss": 3.2055, + "theoretical_loss": 3.9178024311594553, + "tokens_seen": 496523264 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004290471414242728, + "loss": 3.0078, + "theoretical_loss": 3.917746671092, + "tokens_seen": 496588800 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042903711133400203, + "loss": 3.0849, + "theoretical_loss": 3.9176909204429977, + "tokens_seen": 496654336 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004290270812437312, + "loss": 3.1698, + "theoretical_loss": 3.9176351792096162, + "tokens_seen": 496719872 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004290170511534604, + "loss": 3.0838, + "theoretical_loss": 3.917579447389022, + "tokens_seen": 496785408 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042900702106318957, + "loss": 3.1685, + "theoretical_loss": 3.917523724978385, + "tokens_seen": 496850944 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042899699097291875, + "loss": 3.1573, + "theoretical_loss": 3.917468011974875, + "tokens_seen": 496916480 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042898696088264793, + "loss": 3.0372, + "theoretical_loss": 3.917412308375664, + "tokens_seen": 496982016 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042897693079237717, + "loss": 3.1782, + "theoretical_loss": 3.9173566141779244, + "tokens_seen": 497047552 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004289669007021063, + "loss": 3.1449, + "theoretical_loss": 3.91730092937883, + "tokens_seen": 497113088 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042895687061183553, + "loss": 3.0795, + "theoretical_loss": 3.9172452539755565, + "tokens_seen": 497178624 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004289468405215647, + "loss": 2.9834, + "theoretical_loss": 3.9171895879652805, + "tokens_seen": 497244160 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004289368104312939, + "loss": 3.0989, + "theoretical_loss": 3.9171339313451785, + "tokens_seen": 497309696 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004289267803410231, + "loss": 2.996, + "theoretical_loss": 3.9170782841124305, + "tokens_seen": 497375232 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042891675025075226, + "loss": 3.1693, + "theoretical_loss": 3.9170226462642157, + "tokens_seen": 497440768 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042890672016048144, + "loss": 3.2053, + "theoretical_loss": 3.9169670177977167, + "tokens_seen": 497506304 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042889669007021067, + "loss": 3.0729, + "theoretical_loss": 3.916911398710115, + "tokens_seen": 497571840 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004288866599799398, + "loss": 3.1259, + "theoretical_loss": 3.916855788998594, + "tokens_seen": 497637376 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042887662988966903, + "loss": 3.0921, + "theoretical_loss": 3.916800188660339, + "tokens_seen": 497702912 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042886659979939816, + "loss": 3.0731, + "theoretical_loss": 3.9167445976925377, + "tokens_seen": 497768448 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004288565697091274, + "loss": 3.1026, + "theoretical_loss": 3.9166890160923744, + "tokens_seen": 497833984 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004288465396188566, + "loss": 3.0503, + "theoretical_loss": 3.9166334438570405, + "tokens_seen": 497899520 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042883650952858576, + "loss": 3.1489, + "theoretical_loss": 3.9165778809837244, + "tokens_seen": 497965056 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1211187, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.966336965560913, + "objective/train/theoretical_loss": 3.9165223274696173, + "objective/train/tokens_used": 518490592, + "theoretical_loss": 3.9165223274696173, + "tokens_seen": 498030592 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042882647943831494, + "loss": 3.1272, + "theoretical_loss": 3.9165223274696173, + "tokens_seen": 498030592 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004288164493480441, + "loss": 3.1477, + "theoretical_loss": 3.916466783311912, + "tokens_seen": 498096128 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004288064192577733, + "loss": 3.1108, + "theoretical_loss": 3.9164112485078006, + "tokens_seen": 498161664 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042879638916750254, + "loss": 3.1635, + "theoretical_loss": 3.9163557230544797, + "tokens_seen": 498227200 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042878635907723166, + "loss": 3.0589, + "theoretical_loss": 3.9163002069491437, + "tokens_seen": 498292736 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004287763289869609, + "loss": 3.0092, + "theoretical_loss": 3.9162447001889897, + "tokens_seen": 498358272 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004287662988966901, + "loss": 3.1915, + "theoretical_loss": 3.9161892027712164, + "tokens_seen": 498423808 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042875626880641926, + "loss": 3.1322, + "theoretical_loss": 3.9161337146930237, + "tokens_seen": 498489344 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042874623871614844, + "loss": 3.0551, + "theoretical_loss": 3.9160782359516118, + "tokens_seen": 498554880 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004287362086258776, + "loss": 3.1706, + "theoretical_loss": 3.9160227665441822, + "tokens_seen": 498620416 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004287261785356068, + "loss": 3.1794, + "theoretical_loss": 3.9159673064679392, + "tokens_seen": 498685952 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042871614844533604, + "loss": 3.1553, + "theoretical_loss": 3.9159118557200854, + "tokens_seen": 498751488 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042870611835506517, + "loss": 3.162, + "theoretical_loss": 3.915856414297828, + "tokens_seen": 498817024 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004286960882647944, + "loss": 3.0809, + "theoretical_loss": 3.9158009821983732, + "tokens_seen": 498882560 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042868605817452353, + "loss": 3.0342, + "theoretical_loss": 3.9157455594189283, + "tokens_seen": 498948096 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042867602808425276, + "loss": 3.1854, + "theoretical_loss": 3.9156901459567033, + "tokens_seen": 499013632 + }, + { + "epoch": 6.0, + "learning_rate": 0.000428665997993982, + "loss": 3.0589, + "theoretical_loss": 3.915634741808908, + "tokens_seen": 499079168 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004286559679037111, + "loss": 3.1392, + "theoretical_loss": 3.915579346972754, + "tokens_seen": 499144704 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042864593781344036, + "loss": 3.0143, + "theoretical_loss": 3.9155239614454542, + "tokens_seen": 499210240 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004286359077231695, + "loss": 3.1011, + "theoretical_loss": 3.915468585224222, + "tokens_seen": 499275776 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004286258776328987, + "loss": 3.2137, + "theoretical_loss": 3.9154132183062735, + "tokens_seen": 499341312 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004286158475426279, + "loss": 3.1727, + "theoretical_loss": 3.9153578606888244, + "tokens_seen": 499406848 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004286058174523571, + "loss": 3.0783, + "theoretical_loss": 3.915302512369092, + "tokens_seen": 499472384 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042859578736208627, + "loss": 3.0997, + "theoretical_loss": 3.915247173344296, + "tokens_seen": 499537920 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004285857572718155, + "loss": 3.2098, + "theoretical_loss": 3.9151918436116553, + "tokens_seen": 499603456 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1214309, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.31550669670105, + "objective/train/theoretical_loss": 3.9151365231683917, + "objective/train/tokens_used": 520128992, + "theoretical_loss": 3.9151365231683917, + "tokens_seen": 499668992 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042857572718154463, + "loss": 3.1651, + "theoretical_loss": 3.9151365231683917, + "tokens_seen": 499668992 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042856569709127387, + "loss": 3.121, + "theoretical_loss": 3.9150812120117267, + "tokens_seen": 499734528 + }, + { + "epoch": 6.0, + "learning_rate": 0.000428555667001003, + "loss": 3.2257, + "theoretical_loss": 3.9150259101388842, + "tokens_seen": 499800064 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042854563691073223, + "loss": 3.0978, + "theoretical_loss": 3.91497061754709, + "tokens_seen": 499865600 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004285356068204614, + "loss": 3.1162, + "theoretical_loss": 3.9149153342335685, + "tokens_seen": 499931136 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004285255767301906, + "loss": 3.0769, + "theoretical_loss": 3.914860060195547, + "tokens_seen": 499996672 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042851554663991977, + "loss": 3.1082, + "theoretical_loss": 3.914804795430255, + "tokens_seen": 500062208 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042850551654964895, + "loss": 3.1617, + "theoretical_loss": 3.9147495399349204, + "tokens_seen": 500127744 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042849548645937813, + "loss": 3.0925, + "theoretical_loss": 3.9146942937067752, + "tokens_seen": 500193280 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042848545636910737, + "loss": 3.1355, + "theoretical_loss": 3.9146390567430505, + "tokens_seen": 500258816 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004284754262788365, + "loss": 3.1202, + "theoretical_loss": 3.9145838290409793, + "tokens_seen": 500324352 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042846539618856573, + "loss": 3.1547, + "theoretical_loss": 3.914528610597796, + "tokens_seen": 500389888 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004284553660982949, + "loss": 3.0181, + "theoretical_loss": 3.914473401410736, + "tokens_seen": 500455424 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004284453360080241, + "loss": 3.1662, + "theoretical_loss": 3.914418201477036, + "tokens_seen": 500520960 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004284353059177533, + "loss": 3.1227, + "theoretical_loss": 3.9143630107939336, + "tokens_seen": 500586496 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042842527582748246, + "loss": 3.0214, + "theoretical_loss": 3.914307829358668, + "tokens_seen": 500652032 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042841524573721164, + "loss": 2.9425, + "theoretical_loss": 3.91425265716848, + "tokens_seen": 500717568 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042840521564694087, + "loss": 3.2217, + "theoretical_loss": 3.9141974942206095, + "tokens_seen": 500783104 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042839518555667, + "loss": 3.0723, + "theoretical_loss": 3.9141423405122993, + "tokens_seen": 500848640 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042838515546639923, + "loss": 3.0381, + "theoretical_loss": 3.9140871960407946, + "tokens_seen": 500914176 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042837512537612836, + "loss": 3.1123, + "theoretical_loss": 3.914032060803339, + "tokens_seen": 500979712 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004283650952858576, + "loss": 3.11, + "theoretical_loss": 3.913976934797179, + "tokens_seen": 501045248 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004283550651955868, + "loss": 3.1485, + "theoretical_loss": 3.9139218180195616, + "tokens_seen": 501110784 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042834503510531596, + "loss": 3.0078, + "theoretical_loss": 3.913866710467736, + "tokens_seen": 501176320 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042833500501504514, + "loss": 3.0516, + "theoretical_loss": 3.91381161213895, + "tokens_seen": 501241856 + }, + { + "epoch": 6.0, + "objective/train/docs_used": 1218950, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2451417446136475, + "objective/train/theoretical_loss": 3.9137565230304565, + "objective/train/tokens_used": 521767392, + "theoretical_loss": 3.9137565230304565, + "tokens_seen": 501307392 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004283249749247743, + "loss": 3.2257, + "theoretical_loss": 3.9137565230304565, + "tokens_seen": 501307392 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004283149448345035, + "loss": 3.1508, + "theoretical_loss": 3.913701443139507, + "tokens_seen": 501372928 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042830491474423274, + "loss": 3.2235, + "theoretical_loss": 3.9136463724633535, + "tokens_seen": 501438464 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042829488465396186, + "loss": 2.9947, + "theoretical_loss": 3.9135913109992515, + "tokens_seen": 501504000 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004282848545636911, + "loss": 3.0694, + "theoretical_loss": 3.9135362587444558, + "tokens_seen": 501569536 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004282748244734203, + "loss": 3.1532, + "theoretical_loss": 3.9134812156962244, + "tokens_seen": 501635072 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042826479438314946, + "loss": 3.0092, + "theoretical_loss": 3.913426181851813, + "tokens_seen": 501700608 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042825476429287864, + "loss": 3.0699, + "theoretical_loss": 3.913371157208483, + "tokens_seen": 501766144 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004282447342026078, + "loss": 3.09, + "theoretical_loss": 3.9133161417634925, + "tokens_seen": 501831680 + }, + { + "epoch": 6.0, + "learning_rate": 0.000428234704112337, + "loss": 3.1325, + "theoretical_loss": 3.9132611355141043, + "tokens_seen": 501897216 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042822467402206624, + "loss": 3.0481, + "theoretical_loss": 3.9132061384575803, + "tokens_seen": 501962752 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042821464393179537, + "loss": 3.1357, + "theoretical_loss": 3.913151150591185, + "tokens_seen": 502028288 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004282046138415246, + "loss": 3.1414, + "theoretical_loss": 3.9130961719121826, + "tokens_seen": 502093824 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042819458375125373, + "loss": 3.1843, + "theoretical_loss": 3.9130412024178396, + "tokens_seen": 502159360 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042818455366098296, + "loss": 3.1301, + "theoretical_loss": 3.9129862421054225, + "tokens_seen": 502224896 + }, + { + "epoch": 6.0, + "learning_rate": 0.00042817452357071215, + "loss": 3.2092, + "theoretical_loss": 3.9129312909722005, + "tokens_seen": 502290432 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004281644934804413, + "loss": 3.1171, + "theoretical_loss": 3.912876349015443, + "tokens_seen": 502355968 + }, + { + "epoch": 6.0, + "learning_rate": 0.0004281544633901705, + "loss": 3.0714, + "theoretical_loss": 3.91282141623242, + "tokens_seen": 502421504 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004281444332998997, + "loss": 3.1682, + "theoretical_loss": 3.9127664926204053, + "tokens_seen": 502487040 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042813440320962887, + "loss": 3.0809, + "theoretical_loss": 3.91271157817667, + "tokens_seen": 502552576 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004281243731193581, + "loss": 3.1313, + "theoretical_loss": 3.912656672898489, + "tokens_seen": 502618112 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042811434302908723, + "loss": 3.1022, + "theoretical_loss": 3.9126017767831383, + "tokens_seen": 502683648 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042810431293881647, + "loss": 3.0259, + "theoretical_loss": 3.912546889827894, + "tokens_seen": 502749184 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042809428284854565, + "loss": 3.1366, + "theoretical_loss": 3.9124920120300337, + "tokens_seen": 502814720 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042808425275827483, + "loss": 3.1046, + "theoretical_loss": 3.9124371433868372, + "tokens_seen": 502880256 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1221936, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0431900024414062, + "objective/train/theoretical_loss": 3.912382283895583, + "objective/train/tokens_used": 523405792, + "theoretical_loss": 3.912382283895583, + "tokens_seen": 502945792 + }, + { + "epoch": 6.01, + "learning_rate": 0.000428074222668004, + "loss": 3.0285, + "theoretical_loss": 3.912382283895583, + "tokens_seen": 502945792 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004280641925777332, + "loss": 3.237, + "theoretical_loss": 3.9123274335535534, + "tokens_seen": 503011328 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004280541624874624, + "loss": 3.1647, + "theoretical_loss": 3.9122725923580313, + "tokens_seen": 503076864 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004280441323971916, + "loss": 3.145, + "theoretical_loss": 3.912217760306299, + "tokens_seen": 503142400 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042803410230692074, + "loss": 3.0102, + "theoretical_loss": 3.9121629373956424, + "tokens_seen": 503207936 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042802407221664997, + "loss": 3.0519, + "theoretical_loss": 3.912108123623346, + "tokens_seen": 503273472 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004280140421263791, + "loss": 2.9509, + "theoretical_loss": 3.9120533189866986, + "tokens_seen": 503339008 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042800401203610833, + "loss": 3.009, + "theoretical_loss": 3.9119985234829864, + "tokens_seen": 503404544 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004279939819458375, + "loss": 3.1765, + "theoretical_loss": 3.9119437371095005, + "tokens_seen": 503470080 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004279839518555667, + "loss": 3.1579, + "theoretical_loss": 3.911888959863531, + "tokens_seen": 503535616 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004279739217652959, + "loss": 3.0521, + "theoretical_loss": 3.911834191742369, + "tokens_seen": 503601152 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004279638916750251, + "loss": 3.0695, + "theoretical_loss": 3.911779432743307, + "tokens_seen": 503666688 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042795386158475424, + "loss": 3.2007, + "theoretical_loss": 3.9117246828636407, + "tokens_seen": 503732224 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004279438314944835, + "loss": 3.0941, + "theoretical_loss": 3.911669942100664, + "tokens_seen": 503797760 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004279338014042126, + "loss": 3.1695, + "theoretical_loss": 3.9116152104516733, + "tokens_seen": 503863296 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042792377131394184, + "loss": 3.1136, + "theoretical_loss": 3.911560487913966, + "tokens_seen": 503928832 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042791374122367107, + "loss": 3.1507, + "theoretical_loss": 3.9115057744848403, + "tokens_seen": 503994368 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004279037111334002, + "loss": 3.0797, + "theoretical_loss": 3.911451070161597, + "tokens_seen": 504059904 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042789368104312943, + "loss": 3.1498, + "theoretical_loss": 3.9113963749415364, + "tokens_seen": 504125440 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042788365095285856, + "loss": 3.1699, + "theoretical_loss": 3.9113416888219605, + "tokens_seen": 504190976 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004278736208625878, + "loss": 3.1285, + "theoretical_loss": 3.911287011800173, + "tokens_seen": 504256512 + }, + { + "epoch": 6.01, + "learning_rate": 0.000427863590772317, + "loss": 3.1728, + "theoretical_loss": 3.9112323438734773, + "tokens_seen": 504322048 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042785356068204616, + "loss": 3.0691, + "theoretical_loss": 3.91117768503918, + "tokens_seen": 504387584 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042784353059177534, + "loss": 3.0219, + "theoretical_loss": 3.9111230352945867, + "tokens_seen": 504453120 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004278335005015045, + "loss": 3.0621, + "theoretical_loss": 3.911068394637006, + "tokens_seen": 504518656 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1226765, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.121598243713379, + "objective/train/theoretical_loss": 3.911013763063747, + "objective/train/tokens_used": 525044192, + "theoretical_loss": 3.911013763063747, + "tokens_seen": 504584192 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004278234704112337, + "loss": 3.068, + "theoretical_loss": 3.911013763063747, + "tokens_seen": 504584192 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042781344032096294, + "loss": 3.1218, + "theoretical_loss": 3.9109591405721185, + "tokens_seen": 504649728 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042780341023069206, + "loss": 3.1075, + "theoretical_loss": 3.9109045271594334, + "tokens_seen": 504715264 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004277933801404213, + "loss": 3.1657, + "theoretical_loss": 3.9108499228230027, + "tokens_seen": 504780800 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004277833500501505, + "loss": 3.2089, + "theoretical_loss": 3.910795327560141, + "tokens_seen": 504846336 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042777331995987966, + "loss": 3.0344, + "theoretical_loss": 3.910740741368162, + "tokens_seen": 504911872 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042776328986960884, + "loss": 3.2232, + "theoretical_loss": 3.9106861642443826, + "tokens_seen": 504977408 + }, + { + "epoch": 6.01, + "learning_rate": 0.000427753259779338, + "loss": 3.063, + "theoretical_loss": 3.9106315961861187, + "tokens_seen": 505042944 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004277432296890672, + "loss": 3.2348, + "theoretical_loss": 3.9105770371906887, + "tokens_seen": 505108480 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042773319959879644, + "loss": 3.1446, + "theoretical_loss": 3.9105224872554123, + "tokens_seen": 505174016 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042772316950852557, + "loss": 3.1994, + "theoretical_loss": 3.9104679463776097, + "tokens_seen": 505239552 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004277131394182548, + "loss": 3.0819, + "theoretical_loss": 3.910413414554602, + "tokens_seen": 505305088 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042770310932798393, + "loss": 3.0645, + "theoretical_loss": 3.910358891783712, + "tokens_seen": 505370624 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042769307923771316, + "loss": 3.0799, + "theoretical_loss": 3.910304378062264, + "tokens_seen": 505436160 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042768304914744235, + "loss": 3.0893, + "theoretical_loss": 3.910249873387582, + "tokens_seen": 505501696 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042767301905717153, + "loss": 3.1932, + "theoretical_loss": 3.9101953777569936, + "tokens_seen": 505567232 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004276629889669007, + "loss": 3.1381, + "theoretical_loss": 3.910140891167824, + "tokens_seen": 505632768 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004276529588766299, + "loss": 3.1574, + "theoretical_loss": 3.9100864136174036, + "tokens_seen": 505698304 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042764292878635907, + "loss": 3.1214, + "theoretical_loss": 3.9100319451030607, + "tokens_seen": 505763840 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004276328986960883, + "loss": 3.0815, + "theoretical_loss": 3.909977485622126, + "tokens_seen": 505829376 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042762286860581743, + "loss": 3.0562, + "theoretical_loss": 3.909923035171931, + "tokens_seen": 505894912 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042761283851554667, + "loss": 3.2201, + "theoretical_loss": 3.909868593749809, + "tokens_seen": 505960448 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042760280842527585, + "loss": 3.1479, + "theoretical_loss": 3.9098141613530943, + "tokens_seen": 506025984 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042759277833500503, + "loss": 3.0979, + "theoretical_loss": 3.9097597379791216, + "tokens_seen": 506091520 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004275827482447342, + "loss": 3.0882, + "theoretical_loss": 3.9097053236252273, + "tokens_seen": 506157056 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1230576, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1757938861846924, + "objective/train/theoretical_loss": 3.9096509182887487, + "objective/train/tokens_used": 526682592, + "theoretical_loss": 3.9096509182887487, + "tokens_seen": 506222592 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004275727181544634, + "loss": 3.101, + "theoretical_loss": 3.9096509182887487, + "tokens_seen": 506222592 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004275626880641926, + "loss": 3.1378, + "theoretical_loss": 3.9095965219670243, + "tokens_seen": 506288128 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004275526579739218, + "loss": 3.1551, + "theoretical_loss": 3.909542134657394, + "tokens_seen": 506353664 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042754262788365094, + "loss": 3.1184, + "theoretical_loss": 3.909487756357199, + "tokens_seen": 506419200 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042753259779338017, + "loss": 3.186, + "theoretical_loss": 3.90943338706378, + "tokens_seen": 506484736 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004275225677031093, + "loss": 3.1233, + "theoretical_loss": 3.9093790267744812, + "tokens_seen": 506550272 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042751253761283853, + "loss": 3.1837, + "theoretical_loss": 3.909324675486647, + "tokens_seen": 506615808 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004275025075225677, + "loss": 3.1237, + "theoretical_loss": 3.9092703331976213, + "tokens_seen": 506681344 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004274924774322969, + "loss": 3.1612, + "theoretical_loss": 3.9092159999047515, + "tokens_seen": 506746880 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004274824473420261, + "loss": 3.169, + "theoretical_loss": 3.909161675605385, + "tokens_seen": 506812416 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004274724172517553, + "loss": 3.0952, + "theoretical_loss": 3.9091073602968707, + "tokens_seen": 506877952 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042746238716148444, + "loss": 2.9903, + "theoretical_loss": 3.909053053976558, + "tokens_seen": 506943488 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004274523570712137, + "loss": 3.1426, + "theoretical_loss": 3.9089987566417985, + "tokens_seen": 507009024 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004274423269809428, + "loss": 3.0482, + "theoretical_loss": 3.9089444682899437, + "tokens_seen": 507074560 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042743229689067204, + "loss": 3.1947, + "theoretical_loss": 3.9088901889183463, + "tokens_seen": 507140096 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004274222668004012, + "loss": 3.1187, + "theoretical_loss": 3.9088359185243613, + "tokens_seen": 507205632 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004274122367101304, + "loss": 3.1256, + "theoretical_loss": 3.9087816571053446, + "tokens_seen": 507271168 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004274022066198596, + "loss": 3.1881, + "theoretical_loss": 3.908727404658652, + "tokens_seen": 507336704 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042739217652958876, + "loss": 3.1046, + "theoretical_loss": 3.908673161181641, + "tokens_seen": 507402240 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042738214643931794, + "loss": 3.0884, + "theoretical_loss": 3.9086189266716715, + "tokens_seen": 507467776 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004273721163490472, + "loss": 3.1468, + "theoretical_loss": 3.908564701126102, + "tokens_seen": 507533312 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004273620862587763, + "loss": 3.0419, + "theoretical_loss": 3.9085104845422944, + "tokens_seen": 507598848 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042735205616850554, + "loss": 3.16, + "theoretical_loss": 3.9084562769176108, + "tokens_seen": 507664384 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042734202607823467, + "loss": 3.024, + "theoretical_loss": 3.9084020782494138, + "tokens_seen": 507729920 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004273319959879639, + "loss": 3.1402, + "theoretical_loss": 3.9083478885350686, + "tokens_seen": 507795456 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1233558, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2856802940368652, + "objective/train/theoretical_loss": 3.90829370777194, + "objective/train/tokens_used": 528320992, + "theoretical_loss": 3.90829370777194, + "tokens_seen": 507860992 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004273219658976931, + "loss": 3.0578, + "theoretical_loss": 3.90829370777194, + "tokens_seen": 507860992 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042731193580742226, + "loss": 3.1163, + "theoretical_loss": 3.9082395359573954, + "tokens_seen": 507926528 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042730190571715145, + "loss": 3.1833, + "theoretical_loss": 3.9081853730888017, + "tokens_seen": 507992064 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004272918756268807, + "loss": 3.0276, + "theoretical_loss": 3.9081312191635282, + "tokens_seen": 508057600 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004272818455366098, + "loss": 2.9781, + "theoretical_loss": 3.908077074178945, + "tokens_seen": 508123136 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042727181544633904, + "loss": 3.2162, + "theoretical_loss": 3.9080229381324223, + "tokens_seen": 508188672 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042726178535606817, + "loss": 3.0799, + "theoretical_loss": 3.907968811021333, + "tokens_seen": 508254208 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004272517552657974, + "loss": 3.1403, + "theoretical_loss": 3.9079146928430504, + "tokens_seen": 508319744 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004272417251755266, + "loss": 3.0837, + "theoretical_loss": 3.9078605835949487, + "tokens_seen": 508385280 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042723169508525577, + "loss": 3.1765, + "theoretical_loss": 3.9078064832744035, + "tokens_seen": 508450816 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042722166499498495, + "loss": 3.1077, + "theoretical_loss": 3.9077523918787915, + "tokens_seen": 508516352 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042721163490471413, + "loss": 3.1276, + "theoretical_loss": 3.9076983094054896, + "tokens_seen": 508581888 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004272016048144433, + "loss": 3.1079, + "theoretical_loss": 3.907644235851878, + "tokens_seen": 508647424 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042719157472417255, + "loss": 3.1283, + "theoretical_loss": 3.907590171215336, + "tokens_seen": 508712960 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004271815446339017, + "loss": 3.1674, + "theoretical_loss": 3.9075361154932438, + "tokens_seen": 508778496 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004271715145436309, + "loss": 3.0896, + "theoretical_loss": 3.907482068682985, + "tokens_seen": 508844032 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004271614844533601, + "loss": 3.0738, + "theoretical_loss": 3.9074280307819422, + "tokens_seen": 508909568 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042715145436308927, + "loss": 2.9385, + "theoretical_loss": 3.9073740017874994, + "tokens_seen": 508975104 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004271414242728185, + "loss": 3.1356, + "theoretical_loss": 3.907319981697042, + "tokens_seen": 509040640 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042713139418254763, + "loss": 3.0791, + "theoretical_loss": 3.9072659705079578, + "tokens_seen": 509106176 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042712136409227687, + "loss": 3.1658, + "theoretical_loss": 3.9072119682176334, + "tokens_seen": 509171712 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042711133400200605, + "loss": 3.1148, + "theoretical_loss": 3.9071579748234573, + "tokens_seen": 509237248 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042710130391173523, + "loss": 3.1344, + "theoretical_loss": 3.907103990322821, + "tokens_seen": 509302784 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004270912738214644, + "loss": 3.105, + "theoretical_loss": 3.9070500147131133, + "tokens_seen": 509368320 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004270812437311936, + "loss": 3.1214, + "theoretical_loss": 3.9069960479917283, + "tokens_seen": 509433856 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1238468, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1576576232910156, + "objective/train/theoretical_loss": 3.9069420901560576, + "objective/train/tokens_used": 529959392, + "theoretical_loss": 3.9069420901560576, + "tokens_seen": 509499392 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004270712136409228, + "loss": 3.0757, + "theoretical_loss": 3.9069420901560576, + "tokens_seen": 509499392 + }, + { + "epoch": 6.01, + "learning_rate": 0.000427061183550652, + "loss": 3.0299, + "theoretical_loss": 3.906888141203496, + "tokens_seen": 509564928 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042705115346038114, + "loss": 3.1251, + "theoretical_loss": 3.906834201131439, + "tokens_seen": 509630464 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042704112337011037, + "loss": 3.1224, + "theoretical_loss": 3.9067802699372836, + "tokens_seen": 509696000 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004270310932798395, + "loss": 3.1672, + "theoretical_loss": 3.9067263476184264, + "tokens_seen": 509761536 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042702106318956873, + "loss": 3.1093, + "theoretical_loss": 3.9066724341722665, + "tokens_seen": 509827072 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004270110330992979, + "loss": 3.2208, + "theoretical_loss": 3.906618529596204, + "tokens_seen": 509892608 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004270010030090271, + "loss": 3.1414, + "theoretical_loss": 3.906564633887639, + "tokens_seen": 509958144 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004269909729187563, + "loss": 3.0756, + "theoretical_loss": 3.9065107470439746, + "tokens_seen": 510023680 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004269809428284855, + "loss": 3.2093, + "theoretical_loss": 3.906456869062613, + "tokens_seen": 510089216 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042697091273821464, + "loss": 3.1762, + "theoretical_loss": 3.906402999940958, + "tokens_seen": 510154752 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004269608826479439, + "loss": 3.1333, + "theoretical_loss": 3.906349139676416, + "tokens_seen": 510220288 + }, + { + "epoch": 6.01, + "learning_rate": 0.000426950852557673, + "loss": 3.2241, + "theoretical_loss": 3.906295288266392, + "tokens_seen": 510285824 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042694082246740224, + "loss": 3.0398, + "theoretical_loss": 3.906241445708295, + "tokens_seen": 510351360 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004269307923771314, + "loss": 3.2117, + "theoretical_loss": 3.906187611999532, + "tokens_seen": 510416896 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004269207622868606, + "loss": 3.1545, + "theoretical_loss": 3.906133787137513, + "tokens_seen": 510482432 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004269107321965898, + "loss": 3.0921, + "theoretical_loss": 3.9060799711196497, + "tokens_seen": 510547968 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042690070210631896, + "loss": 3.1056, + "theoretical_loss": 3.906026163943353, + "tokens_seen": 510613504 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042689067201604814, + "loss": 3.0759, + "theoretical_loss": 3.905972365606036, + "tokens_seen": 510679040 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004268806419257774, + "loss": 3.1408, + "theoretical_loss": 3.9059185761051127, + "tokens_seen": 510744576 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004268706118355065, + "loss": 3.0599, + "theoretical_loss": 3.9058647954379975, + "tokens_seen": 510810112 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042686058174523574, + "loss": 3.1785, + "theoretical_loss": 3.9058110236021077, + "tokens_seen": 510875648 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042685055165496487, + "loss": 3.0607, + "theoretical_loss": 3.90575726059486, + "tokens_seen": 510941184 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004268405215646941, + "loss": 3.1493, + "theoretical_loss": 3.905703506413672, + "tokens_seen": 511006720 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004268304914744233, + "loss": 3.0663, + "theoretical_loss": 3.9056497610559644, + "tokens_seen": 511072256 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1241366, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8943047523498535, + "objective/train/theoretical_loss": 3.905596024519157, + "objective/train/tokens_used": 531597792, + "theoretical_loss": 3.905596024519157, + "tokens_seen": 511137792 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042682046138415246, + "loss": 3.0505, + "theoretical_loss": 3.905596024519157, + "tokens_seen": 511137792 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042681043129388165, + "loss": 3.1244, + "theoretical_loss": 3.9055422968006717, + "tokens_seen": 511203328 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004268004012036109, + "loss": 3.1704, + "theoretical_loss": 3.9054885778979305, + "tokens_seen": 511268864 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042679037111334, + "loss": 3.0136, + "theoretical_loss": 3.9054348678083577, + "tokens_seen": 511334400 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042678034102306924, + "loss": 3.1911, + "theoretical_loss": 3.905381166529378, + "tokens_seen": 511399936 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042677031093279837, + "loss": 3.1838, + "theoretical_loss": 3.905327474058417, + "tokens_seen": 511465472 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004267602808425276, + "loss": 3.1191, + "theoretical_loss": 3.9052737903929025, + "tokens_seen": 511531008 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004267502507522568, + "loss": 3.1549, + "theoretical_loss": 3.905220115530262, + "tokens_seen": 511596544 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042674022066198597, + "loss": 3.1277, + "theoretical_loss": 3.9051664494679246, + "tokens_seen": 511662080 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042673019057171515, + "loss": 3.0836, + "theoretical_loss": 3.9051127922033206, + "tokens_seen": 511727616 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042672016048144433, + "loss": 3.1474, + "theoretical_loss": 3.9050591437338817, + "tokens_seen": 511793152 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004267101303911735, + "loss": 3.1634, + "theoretical_loss": 3.9050055040570397, + "tokens_seen": 511858688 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042670010030090275, + "loss": 3.0422, + "theoretical_loss": 3.9049518731702286, + "tokens_seen": 511924224 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004266900702106319, + "loss": 3.1577, + "theoretical_loss": 3.9048982510708825, + "tokens_seen": 511989760 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004266800401203611, + "loss": 3.1434, + "theoretical_loss": 3.9048446377564376, + "tokens_seen": 512055296 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042667001003009024, + "loss": 3.021, + "theoretical_loss": 3.9047910332243294, + "tokens_seen": 512120832 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042665997993981947, + "loss": 3.1212, + "theoretical_loss": 3.904737437471997, + "tokens_seen": 512186368 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042664994984954865, + "loss": 3.1619, + "theoretical_loss": 3.9046838504968786, + "tokens_seen": 512251904 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042663991975927783, + "loss": 3.101, + "theoretical_loss": 3.904630272296415, + "tokens_seen": 512317440 + }, + { + "epoch": 6.01, + "learning_rate": 0.000426629889669007, + "loss": 2.9943, + "theoretical_loss": 3.904576702868045, + "tokens_seen": 512382976 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042661985957873625, + "loss": 3.0631, + "theoretical_loss": 3.9045231422092135, + "tokens_seen": 512448512 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004266098294884654, + "loss": 3.1408, + "theoretical_loss": 3.904469590317362, + "tokens_seen": 512514048 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004265997993981946, + "loss": 3.0778, + "theoretical_loss": 3.904416047189934, + "tokens_seen": 512579584 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042658976930792374, + "loss": 3.2103, + "theoretical_loss": 3.904362512824377, + "tokens_seen": 512645120 + }, + { + "epoch": 6.01, + "learning_rate": 0.000426579739217653, + "loss": 3.2241, + "theoretical_loss": 3.904308987218136, + "tokens_seen": 512710656 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1246210, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0864126682281494, + "objective/train/theoretical_loss": 3.9042554703686583, + "objective/train/tokens_used": 533236192, + "theoretical_loss": 3.9042554703686583, + "tokens_seen": 512776192 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042656970912738216, + "loss": 3.0519, + "theoretical_loss": 3.9042554703686583, + "tokens_seen": 512776192 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042655967903711134, + "loss": 3.1617, + "theoretical_loss": 3.904201962273393, + "tokens_seen": 512841728 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004265496489468405, + "loss": 3.2868, + "theoretical_loss": 3.904148462929789, + "tokens_seen": 512907264 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004265396188565697, + "loss": 3.1026, + "theoretical_loss": 3.9040949723352973, + "tokens_seen": 512972800 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004265295887662989, + "loss": 3.1316, + "theoretical_loss": 3.9040414904873697, + "tokens_seen": 513038336 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004265195586760281, + "loss": 3.1831, + "theoretical_loss": 3.903988017383459, + "tokens_seen": 513103872 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042650952858575724, + "loss": 3.17, + "theoretical_loss": 3.9039345530210188, + "tokens_seen": 513169408 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004264994984954865, + "loss": 3.1451, + "theoretical_loss": 3.9038810973975044, + "tokens_seen": 513234944 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004264894684052156, + "loss": 3.0421, + "theoretical_loss": 3.9038276505103715, + "tokens_seen": 513300480 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042647943831494484, + "loss": 3.0397, + "theoretical_loss": 3.903774212357077, + "tokens_seen": 513366016 + }, + { + "epoch": 6.01, + "learning_rate": 0.000426469408224674, + "loss": 3.1181, + "theoretical_loss": 3.9037207829350793, + "tokens_seen": 513431552 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004264593781344032, + "loss": 3.076, + "theoretical_loss": 3.903667362241838, + "tokens_seen": 513497088 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004264493480441324, + "loss": 3.0725, + "theoretical_loss": 3.9036139502748126, + "tokens_seen": 513562624 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004264393179538616, + "loss": 3.0569, + "theoretical_loss": 3.9035605470314643, + "tokens_seen": 513628160 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042642928786359075, + "loss": 3.2146, + "theoretical_loss": 3.9035071525092553, + "tokens_seen": 513693696 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042641925777332, + "loss": 3.1849, + "theoretical_loss": 3.90345376670565, + "tokens_seen": 513759232 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042640922768304916, + "loss": 3.2488, + "theoretical_loss": 3.9034003896181124, + "tokens_seen": 513824768 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042639919759277834, + "loss": 3.1493, + "theoretical_loss": 3.903347021244108, + "tokens_seen": 513890304 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004263891675025076, + "loss": 3.1206, + "theoretical_loss": 3.9032936615811034, + "tokens_seen": 513955840 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004263791374122367, + "loss": 3.1761, + "theoretical_loss": 3.9032403106265656, + "tokens_seen": 514021376 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042636910732196594, + "loss": 3.0485, + "theoretical_loss": 3.9031869683779643, + "tokens_seen": 514086912 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042635907723169507, + "loss": 3.1178, + "theoretical_loss": 3.90313363483277, + "tokens_seen": 514152448 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004263490471414243, + "loss": 3.1922, + "theoretical_loss": 3.903080309988451, + "tokens_seen": 514217984 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004263390170511535, + "loss": 3.0431, + "theoretical_loss": 3.9030269938424818, + "tokens_seen": 514283520 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042632898696088266, + "loss": 3.1774, + "theoretical_loss": 3.9029736863923334, + "tokens_seen": 514349056 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1249958, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1321420669555664, + "objective/train/theoretical_loss": 3.902920387635481, + "objective/train/tokens_used": 534874592, + "theoretical_loss": 3.902920387635481, + "tokens_seen": 514414592 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042631895687061185, + "loss": 3.1574, + "theoretical_loss": 3.902920387635481, + "tokens_seen": 514414592 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004263089267803411, + "loss": 3.1453, + "theoretical_loss": 3.9028670975693998, + "tokens_seen": 514480128 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004262988966900702, + "loss": 3.1522, + "theoretical_loss": 3.902813816191565, + "tokens_seen": 514545664 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042628886659979944, + "loss": 3.1423, + "theoretical_loss": 3.902760543499454, + "tokens_seen": 514611200 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042627883650952857, + "loss": 3.1232, + "theoretical_loss": 3.9027072794905457, + "tokens_seen": 514676736 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004262688064192578, + "loss": 3.0601, + "theoretical_loss": 3.9026540241623184, + "tokens_seen": 514742272 + }, + { + "epoch": 6.01, + "learning_rate": 0.000426258776328987, + "loss": 3.0085, + "theoretical_loss": 3.9026007775122533, + "tokens_seen": 514807808 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042624874623871617, + "loss": 3.0902, + "theoretical_loss": 3.9025475395378315, + "tokens_seen": 514873344 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042623871614844535, + "loss": 3.1502, + "theoretical_loss": 3.902494310236535, + "tokens_seen": 514938880 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042622868605817453, + "loss": 3.1595, + "theoretical_loss": 3.902441089605848, + "tokens_seen": 515004416 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004262186559679037, + "loss": 3.1786, + "theoretical_loss": 3.902387877643255, + "tokens_seen": 515069952 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042620862587763295, + "loss": 3.1873, + "theoretical_loss": 3.902334674346241, + "tokens_seen": 515135488 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004261985957873621, + "loss": 3.15, + "theoretical_loss": 3.9022814797122924, + "tokens_seen": 515201024 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004261885656970913, + "loss": 3.2235, + "theoretical_loss": 3.9022282937388977, + "tokens_seen": 515266560 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042617853560682044, + "loss": 2.9974, + "theoretical_loss": 3.902175116423545, + "tokens_seen": 515332096 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042616850551654967, + "loss": 3.1576, + "theoretical_loss": 3.9021219477637255, + "tokens_seen": 515397632 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042615847542627885, + "loss": 3.1362, + "theoretical_loss": 3.902068787756928, + "tokens_seen": 515463168 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042614844533600803, + "loss": 3.183, + "theoretical_loss": 3.9020156364006455, + "tokens_seen": 515528704 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004261384152457372, + "loss": 3.0821, + "theoretical_loss": 3.9019624936923707, + "tokens_seen": 515594240 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042612838515546645, + "loss": 3.2053, + "theoretical_loss": 3.9019093596295975, + "tokens_seen": 515659776 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004261183550651956, + "loss": 3.1704, + "theoretical_loss": 3.901856234209821, + "tokens_seen": 515725312 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004261083249749248, + "loss": 3.0419, + "theoretical_loss": 3.9018031174305374, + "tokens_seen": 515790848 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042609829488465394, + "loss": 3.1151, + "theoretical_loss": 3.901750009289244, + "tokens_seen": 515856384 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004260882647943832, + "loss": 3.1409, + "theoretical_loss": 3.9016969097834378, + "tokens_seen": 515921920 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042607823470411236, + "loss": 3.2291, + "theoretical_loss": 3.9016438189106193, + "tokens_seen": 515987456 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1252926, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.270508050918579, + "objective/train/theoretical_loss": 3.901590736668288, + "objective/train/tokens_used": 536512992, + "theoretical_loss": 3.901590736668288, + "tokens_seen": 516052992 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042606820461384154, + "loss": 3.1262, + "theoretical_loss": 3.901590736668288, + "tokens_seen": 516052992 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004260581745235707, + "loss": 3.1199, + "theoretical_loss": 3.901537663053945, + "tokens_seen": 516118528 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004260481444332999, + "loss": 3.121, + "theoretical_loss": 3.9014845980650934, + "tokens_seen": 516184064 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004260381143430291, + "loss": 3.1675, + "theoretical_loss": 3.9014315416992362, + "tokens_seen": 516249600 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004260280842527583, + "loss": 3.164, + "theoretical_loss": 3.901378493953877, + "tokens_seen": 516315136 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042601805416248744, + "loss": 3.1217, + "theoretical_loss": 3.901325454826522, + "tokens_seen": 516380672 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004260080240722167, + "loss": 3.1322, + "theoretical_loss": 3.901272424314678, + "tokens_seen": 516446208 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004259979939819458, + "loss": 3.0834, + "theoretical_loss": 3.901219402415852, + "tokens_seen": 516511744 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042598796389167504, + "loss": 3.0905, + "theoretical_loss": 3.9011663891275528, + "tokens_seen": 516577280 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004259779338014042, + "loss": 3.1289, + "theoretical_loss": 3.9011133844472896, + "tokens_seen": 516642816 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004259679037111334, + "loss": 3.1195, + "theoretical_loss": 3.9010603883725725, + "tokens_seen": 516708352 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004259578736208626, + "loss": 3.1916, + "theoretical_loss": 3.9010074009009146, + "tokens_seen": 516773888 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004259478435305918, + "loss": 3.1581, + "theoretical_loss": 3.9009544220298276, + "tokens_seen": 516839424 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042593781344032095, + "loss": 3.1632, + "theoretical_loss": 3.900901451756825, + "tokens_seen": 516904960 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004259277833500502, + "loss": 3.1631, + "theoretical_loss": 3.9008484900794222, + "tokens_seen": 516970496 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004259177532597793, + "loss": 3.1488, + "theoretical_loss": 3.900795536995135, + "tokens_seen": 517036032 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042590772316950854, + "loss": 3.1535, + "theoretical_loss": 3.90074259250148, + "tokens_seen": 517101568 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004258976930792377, + "loss": 3.1631, + "theoretical_loss": 3.9006896565959748, + "tokens_seen": 517167104 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004258876629889669, + "loss": 3.21, + "theoretical_loss": 3.9006367292761386, + "tokens_seen": 517232640 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004258776328986961, + "loss": 3.2268, + "theoretical_loss": 3.9005838105394908, + "tokens_seen": 517298176 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042586760280842527, + "loss": 3.1134, + "theoretical_loss": 3.9005309003835533, + "tokens_seen": 517363712 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042585757271815445, + "loss": 3.1572, + "theoretical_loss": 3.900477998805847, + "tokens_seen": 517429248 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004258475426278837, + "loss": 3.1878, + "theoretical_loss": 3.9004251058038952, + "tokens_seen": 517494784 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004258375125376128, + "loss": 3.1681, + "theoretical_loss": 3.900372221375222, + "tokens_seen": 517560320 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042582748244734205, + "loss": 3.0593, + "theoretical_loss": 3.900319345517353, + "tokens_seen": 517625856 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1257972, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.120173692703247, + "objective/train/theoretical_loss": 3.900266478227814, + "objective/train/tokens_used": 538151392, + "theoretical_loss": 3.900266478227814, + "tokens_seen": 517691392 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004258174523570712, + "loss": 3.2079, + "theoretical_loss": 3.900266478227814, + "tokens_seen": 517691392 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004258074222668004, + "loss": 3.079, + "theoretical_loss": 3.900213619504132, + "tokens_seen": 517756928 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004257973921765296, + "loss": 2.9966, + "theoretical_loss": 3.900160769343835, + "tokens_seen": 517822464 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042578736208625877, + "loss": 3.0891, + "theoretical_loss": 3.900107927744452, + "tokens_seen": 517888000 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042577733199598795, + "loss": 3.0952, + "theoretical_loss": 3.9000550947035135, + "tokens_seen": 517953536 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004257673019057172, + "loss": 3.1402, + "theoretical_loss": 3.900002270218551, + "tokens_seen": 518019072 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004257572718154463, + "loss": 3.1012, + "theoretical_loss": 3.899949454287096, + "tokens_seen": 518084608 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042574724172517555, + "loss": 3.1201, + "theoretical_loss": 3.899896646906682, + "tokens_seen": 518150144 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004257372116349047, + "loss": 3.1375, + "theoretical_loss": 3.899843848074844, + "tokens_seen": 518215680 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004257271815446339, + "loss": 3.1125, + "theoretical_loss": 3.899791057789116, + "tokens_seen": 518281216 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004257171514543631, + "loss": 3.1193, + "theoretical_loss": 3.899738276047036, + "tokens_seen": 518346752 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004257071213640923, + "loss": 3.2046, + "theoretical_loss": 3.8996855028461397, + "tokens_seen": 518412288 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042569709127382145, + "loss": 3.1157, + "theoretical_loss": 3.899632738183967, + "tokens_seen": 518477824 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042568706118355064, + "loss": 3.1362, + "theoretical_loss": 3.8995799820580554, + "tokens_seen": 518543360 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004256770310932798, + "loss": 3.0362, + "theoretical_loss": 3.899527234465947, + "tokens_seen": 518608896 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042566700100300905, + "loss": 3.1708, + "theoretical_loss": 3.8994744954051828, + "tokens_seen": 518674432 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042565697091273823, + "loss": 3.1155, + "theoretical_loss": 3.8994217648733045, + "tokens_seen": 518739968 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004256469408224674, + "loss": 3.049, + "theoretical_loss": 3.899369042867857, + "tokens_seen": 518805504 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042563691073219665, + "loss": 3.068, + "theoretical_loss": 3.899316329386383, + "tokens_seen": 518871040 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004256268806419258, + "loss": 3.1297, + "theoretical_loss": 3.8992636244264296, + "tokens_seen": 518936576 + }, + { + "epoch": 6.01, + "learning_rate": 0.000425616850551655, + "loss": 3.1572, + "theoretical_loss": 3.8992109279855427, + "tokens_seen": 519002112 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042560682046138414, + "loss": 3.0812, + "theoretical_loss": 3.8991582400612694, + "tokens_seen": 519067648 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004255967903711134, + "loss": 3.2118, + "theoretical_loss": 3.8991055606511584, + "tokens_seen": 519133184 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042558676028084256, + "loss": 3.0779, + "theoretical_loss": 3.89905288975276, + "tokens_seen": 519198720 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042557673019057174, + "loss": 3.1067, + "theoretical_loss": 3.899000227363625, + "tokens_seen": 519264256 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1260902, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2184648513793945, + "objective/train/theoretical_loss": 3.898947573481303, + "objective/train/tokens_used": 539789792, + "theoretical_loss": 3.898947573481303, + "tokens_seen": 519329792 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004255667001003009, + "loss": 3.1318, + "theoretical_loss": 3.898947573481303, + "tokens_seen": 519329792 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004255566700100301, + "loss": 3.1294, + "theoretical_loss": 3.8988949281033483, + "tokens_seen": 519395328 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004255466399197593, + "loss": 3.1264, + "theoretical_loss": 3.8988422912273144, + "tokens_seen": 519460864 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004255366098294885, + "loss": 3.118, + "theoretical_loss": 3.8987896628507555, + "tokens_seen": 519526400 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042552657973921764, + "loss": 3.0671, + "theoretical_loss": 3.898737042971227, + "tokens_seen": 519591936 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004255165496489469, + "loss": 3.1081, + "theoretical_loss": 3.8986844315862865, + "tokens_seen": 519657472 + }, + { + "epoch": 6.01, + "learning_rate": 0.000425506519558676, + "loss": 3.1238, + "theoretical_loss": 3.898631828693491, + "tokens_seen": 519723008 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042549648946840524, + "loss": 3.1021, + "theoretical_loss": 3.8985792342903993, + "tokens_seen": 519788544 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004254864593781344, + "loss": 3.1103, + "theoretical_loss": 3.8985266483745713, + "tokens_seen": 519854080 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004254764292878636, + "loss": 3.1794, + "theoretical_loss": 3.8984740709435672, + "tokens_seen": 519919616 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004254663991975928, + "loss": 3.082, + "theoretical_loss": 3.898421501994949, + "tokens_seen": 519985152 + }, + { + "epoch": 6.01, + "learning_rate": 0.000425456369107322, + "loss": 3.0978, + "theoretical_loss": 3.8983689415262797, + "tokens_seen": 520050688 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042544633901705115, + "loss": 3.1857, + "theoretical_loss": 3.8983163895351227, + "tokens_seen": 520116224 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004254363089267804, + "loss": 3.1026, + "theoretical_loss": 3.8982638460190424, + "tokens_seen": 520181760 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004254262788365095, + "loss": 3.1265, + "theoretical_loss": 3.898211310975605, + "tokens_seen": 520247296 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042541624874623874, + "loss": 3.1696, + "theoretical_loss": 3.8981587844023773, + "tokens_seen": 520312832 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004254062186559679, + "loss": 3.1587, + "theoretical_loss": 3.898106266296927, + "tokens_seen": 520378368 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004253961885656971, + "loss": 3.018, + "theoretical_loss": 3.898053756656822, + "tokens_seen": 520443904 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004253861584754263, + "loss": 3.1696, + "theoretical_loss": 3.8980012554796333, + "tokens_seen": 520509440 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042537612838515547, + "loss": 3.2178, + "theoretical_loss": 3.8979487627629306, + "tokens_seen": 520574976 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042536609829488465, + "loss": 3.0847, + "theoretical_loss": 3.897896278504287, + "tokens_seen": 520640512 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004253560682046139, + "loss": 3.0712, + "theoretical_loss": 3.8978438027012743, + "tokens_seen": 520706048 + }, + { + "epoch": 6.01, + "learning_rate": 0.000425346038114343, + "loss": 3.1894, + "theoretical_loss": 3.897791335351466, + "tokens_seen": 520771584 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042533600802407225, + "loss": 2.9512, + "theoretical_loss": 3.897738876452437, + "tokens_seen": 520837120 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004253259779338014, + "loss": 3.0697, + "theoretical_loss": 3.897686426001764, + "tokens_seen": 520902656 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1264399, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9338362216949463, + "objective/train/theoretical_loss": 3.897633983997023, + "objective/train/tokens_used": 541428192, + "theoretical_loss": 3.897633983997023, + "tokens_seen": 520968192 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004253159478435306, + "loss": 3.1088, + "theoretical_loss": 3.897633983997023, + "tokens_seen": 520968192 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004253059177532598, + "loss": 3.2218, + "theoretical_loss": 3.897581550435792, + "tokens_seen": 521033728 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042529588766298897, + "loss": 3.1453, + "theoretical_loss": 3.897529125315649, + "tokens_seen": 521099264 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042528585757271815, + "loss": 3.1155, + "theoretical_loss": 3.8974767086341746, + "tokens_seen": 521164800 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004252758274824474, + "loss": 3.0929, + "theoretical_loss": 3.8974243003889493, + "tokens_seen": 521230336 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004252657973921765, + "loss": 3.0092, + "theoretical_loss": 3.8973719005775553, + "tokens_seen": 521295872 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042525576730190575, + "loss": 3.1186, + "theoretical_loss": 3.8973195091975747, + "tokens_seen": 521361408 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004252457372116349, + "loss": 3.1163, + "theoretical_loss": 3.8972671262465917, + "tokens_seen": 521426944 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004252357071213641, + "loss": 3.0666, + "theoretical_loss": 3.8972147517221907, + "tokens_seen": 521492480 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004252256770310933, + "loss": 3.1244, + "theoretical_loss": 3.8971623856219577, + "tokens_seen": 521558016 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004252156469408225, + "loss": 3.1486, + "theoretical_loss": 3.897110027943479, + "tokens_seen": 521623552 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042520561685055165, + "loss": 3.2303, + "theoretical_loss": 3.8970576786843436, + "tokens_seen": 521689088 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042519558676028084, + "loss": 3.1966, + "theoretical_loss": 3.8970053378421383, + "tokens_seen": 521754624 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042518555667001, + "loss": 3.0596, + "theoretical_loss": 3.8969530054144546, + "tokens_seen": 521820160 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042517552657973925, + "loss": 3.2003, + "theoretical_loss": 3.896900681398882, + "tokens_seen": 521885696 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004251654964894684, + "loss": 3.0368, + "theoretical_loss": 3.896848365793013, + "tokens_seen": 521951232 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004251554663991976, + "loss": 3.0689, + "theoretical_loss": 3.8967960585944397, + "tokens_seen": 522016768 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004251454363089268, + "loss": 3.1088, + "theoretical_loss": 3.896743759800756, + "tokens_seen": 522082304 + }, + { + "epoch": 6.01, + "learning_rate": 0.000425135406218656, + "loss": 3.0537, + "theoretical_loss": 3.8966914694095567, + "tokens_seen": 522147840 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042512537612838516, + "loss": 3.1774, + "theoretical_loss": 3.8966391874184376, + "tokens_seen": 522213376 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042511534603811434, + "loss": 3.1409, + "theoretical_loss": 3.8965869138249944, + "tokens_seen": 522278912 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004251053159478435, + "loss": 3.1669, + "theoretical_loss": 3.8965346486268264, + "tokens_seen": 522344448 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042509528585757276, + "loss": 3.1525, + "theoretical_loss": 3.896482391821531, + "tokens_seen": 522409984 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004250852557673019, + "loss": 3.2657, + "theoretical_loss": 3.896430143406708, + "tokens_seen": 522475520 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004250752256770311, + "loss": 3.0944, + "theoretical_loss": 3.8963779033799586, + "tokens_seen": 522541056 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1269654, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0146005153656006, + "objective/train/theoretical_loss": 3.8963256717388832, + "objective/train/tokens_used": 543066592, + "theoretical_loss": 3.8963256717388832, + "tokens_seen": 522606592 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042506519558676024, + "loss": 3.1218, + "theoretical_loss": 3.8963256717388832, + "tokens_seen": 522606592 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004250551654964895, + "loss": 3.2143, + "theoretical_loss": 3.896273448481086, + "tokens_seen": 522672128 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042504513540621866, + "loss": 3.1574, + "theoretical_loss": 3.8962212336041695, + "tokens_seen": 522737664 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042503510531594784, + "loss": 3.031, + "theoretical_loss": 3.896169027105738, + "tokens_seen": 522803200 + }, + { + "epoch": 6.01, + "learning_rate": 0.000425025075225677, + "loss": 3.1897, + "theoretical_loss": 3.8961168289833976, + "tokens_seen": 522868736 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004250150451354062, + "loss": 3.1402, + "theoretical_loss": 3.8960646392347558, + "tokens_seen": 522934272 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004250050150451354, + "loss": 3.1382, + "theoretical_loss": 3.896012457857418, + "tokens_seen": 522999808 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004249949849548646, + "loss": 3.0977, + "theoretical_loss": 3.895960284848994, + "tokens_seen": 523065344 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042498495486459375, + "loss": 3.1746, + "theoretical_loss": 3.8959081202070935, + "tokens_seen": 523130880 + }, + { + "epoch": 6.01, + "learning_rate": 0.000424974924774323, + "loss": 3.0591, + "theoretical_loss": 3.8958559639293258, + "tokens_seen": 523196416 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042496489468405216, + "loss": 3.1702, + "theoretical_loss": 3.8958038160133035, + "tokens_seen": 523261952 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042495486459378135, + "loss": 3.1325, + "theoretical_loss": 3.8957516764566384, + "tokens_seen": 523327488 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004249448345035105, + "loss": 3.1257, + "theoretical_loss": 3.895699545256944, + "tokens_seen": 523393024 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004249348044132397, + "loss": 3.1942, + "theoretical_loss": 3.8956474224118347, + "tokens_seen": 523458560 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042492477432296894, + "loss": 3.1769, + "theoretical_loss": 3.895595307918926, + "tokens_seen": 523524096 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004249147442326981, + "loss": 3.133, + "theoretical_loss": 3.8955432017758342, + "tokens_seen": 523589632 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004249047141424273, + "loss": 3.0443, + "theoretical_loss": 3.8954911039801763, + "tokens_seen": 523655168 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004248946840521565, + "loss": 3.132, + "theoretical_loss": 3.8954390145295714, + "tokens_seen": 523720704 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042488465396188567, + "loss": 3.1293, + "theoretical_loss": 3.895386933421638, + "tokens_seen": 523786240 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042487462387161485, + "loss": 3.1113, + "theoretical_loss": 3.895334860653996, + "tokens_seen": 523851776 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004248645937813441, + "loss": 3.0944, + "theoretical_loss": 3.895282796224268, + "tokens_seen": 523917312 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004248545636910732, + "loss": 3.1261, + "theoretical_loss": 3.895230740130075, + "tokens_seen": 523982848 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042484453360080245, + "loss": 3.0873, + "theoretical_loss": 3.895178692369041, + "tokens_seen": 524048384 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004248345035105316, + "loss": 3.0585, + "theoretical_loss": 3.8951266529387896, + "tokens_seen": 524113920 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004248244734202608, + "loss": 3.1629, + "theoretical_loss": 3.8950746218369456, + "tokens_seen": 524179456 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1272517, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.089585065841675, + "objective/train/theoretical_loss": 3.8950225990611362, + "objective/train/tokens_used": 544704992, + "theoretical_loss": 3.8950225990611362, + "tokens_seen": 524244992 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042481444332999, + "loss": 3.1934, + "theoretical_loss": 3.8950225990611362, + "tokens_seen": 524244992 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042480441323971917, + "loss": 3.0969, + "theoretical_loss": 3.8949705846089877, + "tokens_seen": 524310528 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042479438314944835, + "loss": 3.1323, + "theoretical_loss": 3.894918578478128, + "tokens_seen": 524376064 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004247843530591776, + "loss": 3.0214, + "theoretical_loss": 3.8948665806661866, + "tokens_seen": 524441600 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004247743229689067, + "loss": 3.2123, + "theoretical_loss": 3.8948145911707934, + "tokens_seen": 524507136 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042476429287863595, + "loss": 3.1697, + "theoretical_loss": 3.894762609989579, + "tokens_seen": 524572672 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004247542627883651, + "loss": 3.0929, + "theoretical_loss": 3.894710637120176, + "tokens_seen": 524638208 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004247442326980943, + "loss": 3.0736, + "theoretical_loss": 3.8946586725602166, + "tokens_seen": 524703744 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004247342026078235, + "loss": 3.1441, + "theoretical_loss": 3.894606716307335, + "tokens_seen": 524769280 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004247241725175527, + "loss": 3.126, + "theoretical_loss": 3.8945547683591664, + "tokens_seen": 524834816 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042471414242728186, + "loss": 3.0462, + "theoretical_loss": 3.894502828713346, + "tokens_seen": 524900352 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042470411233701104, + "loss": 3.138, + "theoretical_loss": 3.8944508973675105, + "tokens_seen": 524965888 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004246940822467402, + "loss": 3.1159, + "theoretical_loss": 3.894398974319299, + "tokens_seen": 525031424 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042468405215646945, + "loss": 3.126, + "theoretical_loss": 3.894347059566348, + "tokens_seen": 525096960 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004246740220661986, + "loss": 3.1164, + "theoretical_loss": 3.894295153106299, + "tokens_seen": 525162496 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004246639919759278, + "loss": 3.0412, + "theoretical_loss": 3.894243254936792, + "tokens_seen": 525228032 + }, + { + "epoch": 6.01, + "learning_rate": 0.000424653961885657, + "loss": 3.0783, + "theoretical_loss": 3.8941913650554683, + "tokens_seen": 525293568 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004246439317953862, + "loss": 3.0903, + "theoretical_loss": 3.8941394834599707, + "tokens_seen": 525359104 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042463390170511536, + "loss": 3.1463, + "theoretical_loss": 3.8940876101479436, + "tokens_seen": 525424640 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042462387161484454, + "loss": 3.216, + "theoretical_loss": 3.89403574511703, + "tokens_seen": 525490176 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004246138415245737, + "loss": 3.1021, + "theoretical_loss": 3.8939838883648767, + "tokens_seen": 525555712 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042460381143430296, + "loss": 3.2734, + "theoretical_loss": 3.893932039889129, + "tokens_seen": 525621248 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004245937813440321, + "loss": 3.1147, + "theoretical_loss": 3.8938801996874357, + "tokens_seen": 525686784 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004245837512537613, + "loss": 3.0803, + "theoretical_loss": 3.8938283677574432, + "tokens_seen": 525752320 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042457372116349044, + "loss": 3.0192, + "theoretical_loss": 3.8937765440968026, + "tokens_seen": 525817856 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1277292, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.241668224334717, + "objective/train/theoretical_loss": 3.8937247287031633, + "objective/train/tokens_used": 546343392, + "theoretical_loss": 3.8937247287031633, + "tokens_seen": 525883392 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004245636910732197, + "loss": 3.1633, + "theoretical_loss": 3.8937247287031633, + "tokens_seen": 525883392 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042455366098294886, + "loss": 3.1359, + "theoretical_loss": 3.893672921574177, + "tokens_seen": 525948928 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042454363089267804, + "loss": 2.9919, + "theoretical_loss": 3.8936211227074957, + "tokens_seen": 526014464 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004245336008024072, + "loss": 3.1839, + "theoretical_loss": 3.893569332100772, + "tokens_seen": 526080000 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004245235707121364, + "loss": 3.166, + "theoretical_loss": 3.893517549751661, + "tokens_seen": 526145536 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004245135406218656, + "loss": 3.1277, + "theoretical_loss": 3.893465775657817, + "tokens_seen": 526211072 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004245035105315948, + "loss": 3.2288, + "theoretical_loss": 3.8934140098168966, + "tokens_seen": 526276608 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042449348044132395, + "loss": 3.0705, + "theoretical_loss": 3.8933622522265567, + "tokens_seen": 526342144 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004244834503510532, + "loss": 3.2373, + "theoretical_loss": 3.8933105028844546, + "tokens_seen": 526407680 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042447342026078236, + "loss": 3.1022, + "theoretical_loss": 3.8932587617882497, + "tokens_seen": 526473216 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042446339017051155, + "loss": 3.1933, + "theoretical_loss": 3.8932070289356018, + "tokens_seen": 526538752 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004244533600802407, + "loss": 3.1205, + "theoretical_loss": 3.8931553043241722, + "tokens_seen": 526604288 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004244433299899699, + "loss": 3.1373, + "theoretical_loss": 3.893103587951622, + "tokens_seen": 526669824 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004244332998996991, + "loss": 3.0873, + "theoretical_loss": 3.893051879815614, + "tokens_seen": 526735360 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004244232698094283, + "loss": 3.0885, + "theoretical_loss": 3.8930001799138125, + "tokens_seen": 526800896 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042441323971915745, + "loss": 3.0581, + "theoretical_loss": 3.892948488243881, + "tokens_seen": 526866432 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004244032096288867, + "loss": 3.1733, + "theoretical_loss": 3.892896804803486, + "tokens_seen": 526931968 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004243931795386158, + "loss": 3.1251, + "theoretical_loss": 3.892845129590294, + "tokens_seen": 526997504 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042438314944834505, + "loss": 3.1776, + "theoretical_loss": 3.892793462601972, + "tokens_seen": 527063040 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042437311935807423, + "loss": 3.1564, + "theoretical_loss": 3.8927418038361887, + "tokens_seen": 527128576 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004243630892678034, + "loss": 3.0932, + "theoretical_loss": 3.8926901532906135, + "tokens_seen": 527194112 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004243530591775326, + "loss": 3.101, + "theoretical_loss": 3.892638510962917, + "tokens_seen": 527259648 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004243430290872618, + "loss": 3.092, + "theoretical_loss": 3.89258687685077, + "tokens_seen": 527325184 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042433299899699095, + "loss": 3.1759, + "theoretical_loss": 3.8925352509518447, + "tokens_seen": 527390720 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004243229689067202, + "loss": 2.9637, + "theoretical_loss": 3.892483633263815, + "tokens_seen": 527456256 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1280299, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.080016851425171, + "objective/train/theoretical_loss": 3.8924320237843544, + "objective/train/tokens_used": 547981792, + "theoretical_loss": 3.8924320237843544, + "tokens_seen": 527521792 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004243129388164493, + "loss": 3.0517, + "theoretical_loss": 3.8924320237843544, + "tokens_seen": 527521792 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042430290872617855, + "loss": 3.2568, + "theoretical_loss": 3.8923804225111382, + "tokens_seen": 527587328 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042429287863590773, + "loss": 3.1202, + "theoretical_loss": 3.8923288294418423, + "tokens_seen": 527652864 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004242828485456369, + "loss": 3.1075, + "theoretical_loss": 3.892277244574144, + "tokens_seen": 527718400 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004242728184553661, + "loss": 3.1187, + "theoretical_loss": 3.8922256679057208, + "tokens_seen": 527783936 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004242627883650953, + "loss": 3.1048, + "theoretical_loss": 3.8921740994342517, + "tokens_seen": 527849472 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042425275827482446, + "loss": 3.1113, + "theoretical_loss": 3.892122539157417, + "tokens_seen": 527915008 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004242427281845537, + "loss": 3.0818, + "theoretical_loss": 3.892070987072897, + "tokens_seen": 527980544 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004242326980942828, + "loss": 3.2459, + "theoretical_loss": 3.8920194431783735, + "tokens_seen": 528046080 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042422266800401206, + "loss": 3.0708, + "theoretical_loss": 3.8919679074715288, + "tokens_seen": 528111616 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004242126379137412, + "loss": 3.1049, + "theoretical_loss": 3.8919163799500467, + "tokens_seen": 528177152 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004242026078234704, + "loss": 3.138, + "theoretical_loss": 3.891864860611612, + "tokens_seen": 528242688 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004241925777331996, + "loss": 3.223, + "theoretical_loss": 3.89181334945391, + "tokens_seen": 528308224 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004241825476429288, + "loss": 3.0864, + "theoretical_loss": 3.891761846474627, + "tokens_seen": 528373760 + }, + { + "epoch": 6.01, + "learning_rate": 0.000424172517552658, + "loss": 3.053, + "theoretical_loss": 3.8917103516714504, + "tokens_seen": 528439296 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004241624874623872, + "loss": 3.2222, + "theoretical_loss": 3.8916588650420687, + "tokens_seen": 528504832 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004241524573721164, + "loss": 3.1904, + "theoretical_loss": 3.8916073865841714, + "tokens_seen": 528570368 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042414242728184556, + "loss": 3.0854, + "theoretical_loss": 3.891555916295448, + "tokens_seen": 528635904 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042413239719157474, + "loss": 3.1093, + "theoretical_loss": 3.8915044541735897, + "tokens_seen": 528701440 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004241223671013039, + "loss": 3.1235, + "theoretical_loss": 3.8914530002162886, + "tokens_seen": 528766976 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042411233701103316, + "loss": 3.1818, + "theoretical_loss": 3.891401554421238, + "tokens_seen": 528832512 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004241023069207623, + "loss": 3.1267, + "theoretical_loss": 3.891350116786132, + "tokens_seen": 528898048 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004240922768304915, + "loss": 3.2067, + "theoretical_loss": 3.891298687308665, + "tokens_seen": 528963584 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042408224674022065, + "loss": 3.1841, + "theoretical_loss": 3.891247265986533, + "tokens_seen": 529029120 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004240722166499499, + "loss": 3.1982, + "theoretical_loss": 3.891195852817433, + "tokens_seen": 529094656 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1284030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.20843505859375, + "objective/train/theoretical_loss": 3.8911444477990615, + "objective/train/tokens_used": 549620192, + "theoretical_loss": 3.8911444477990615, + "tokens_seen": 529160192 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042406218655967906, + "loss": 3.1594, + "theoretical_loss": 3.8911444477990615, + "tokens_seen": 529160192 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042405215646940824, + "loss": 3.1624, + "theoretical_loss": 3.8910930509291193, + "tokens_seen": 529225728 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004240421263791374, + "loss": 2.9882, + "theoretical_loss": 3.8910416622053035, + "tokens_seen": 529291264 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004240320962888666, + "loss": 3.1385, + "theoretical_loss": 3.8909902816253163, + "tokens_seen": 529356800 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004240220661985958, + "loss": 3.1942, + "theoretical_loss": 3.890938909186858, + "tokens_seen": 529422336 + }, + { + "epoch": 6.01, + "learning_rate": 0.000424012036108325, + "loss": 3.1019, + "theoretical_loss": 3.890887544887632, + "tokens_seen": 529487872 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042400200601805415, + "loss": 3.016, + "theoretical_loss": 3.8908361887253404, + "tokens_seen": 529553408 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004239919759277834, + "loss": 2.9712, + "theoretical_loss": 3.8907848406976893, + "tokens_seen": 529618944 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042398194583751256, + "loss": 3.1129, + "theoretical_loss": 3.890733500802382, + "tokens_seen": 529684480 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042397191574724175, + "loss": 3.0563, + "theoretical_loss": 3.8906821690371247, + "tokens_seen": 529750016 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004239618856569709, + "loss": 3.149, + "theoretical_loss": 3.8906308453996257, + "tokens_seen": 529815552 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004239518555667001, + "loss": 3.1943, + "theoretical_loss": 3.8905795298875914, + "tokens_seen": 529881088 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004239418254764293, + "loss": 3.1629, + "theoretical_loss": 3.890528222498732, + "tokens_seen": 529946624 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004239317953861585, + "loss": 3.1113, + "theoretical_loss": 3.8904769232307563, + "tokens_seen": 530012160 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042392176529588765, + "loss": 3.1586, + "theoretical_loss": 3.890425632081376, + "tokens_seen": 530077696 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004239117352056169, + "loss": 3.2208, + "theoretical_loss": 3.890374349048302, + "tokens_seen": 530143232 + }, + { + "epoch": 6.01, + "learning_rate": 0.000423901705115346, + "loss": 3.1951, + "theoretical_loss": 3.8903230741292467, + "tokens_seen": 530208768 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042389167502507525, + "loss": 3.1498, + "theoretical_loss": 3.8902718073219242, + "tokens_seen": 530274304 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042388164493480443, + "loss": 3.1188, + "theoretical_loss": 3.8902205486240486, + "tokens_seen": 530339840 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004238716148445336, + "loss": 3.0423, + "theoretical_loss": 3.890169298033336, + "tokens_seen": 530405376 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004238615847542628, + "loss": 3.156, + "theoretical_loss": 3.890118055547501, + "tokens_seen": 530470912 + }, + { + "epoch": 6.01, + "learning_rate": 0.000423851554663992, + "loss": 3.0888, + "theoretical_loss": 3.890066821164263, + "tokens_seen": 530536448 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042384152457372115, + "loss": 3.1533, + "theoretical_loss": 3.890015594881338, + "tokens_seen": 530601984 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004238314944834504, + "loss": 3.1346, + "theoretical_loss": 3.889964376696447, + "tokens_seen": 530667520 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004238214643931795, + "loss": 3.2521, + "theoretical_loss": 3.889913166607309, + "tokens_seen": 530733056 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1287084, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0766446590423584, + "objective/train/theoretical_loss": 3.889861964611644, + "objective/train/tokens_used": 551258592, + "theoretical_loss": 3.889861964611644, + "tokens_seen": 530798592 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042381143430290875, + "loss": 3.1444, + "theoretical_loss": 3.889861964611644, + "tokens_seen": 530798592 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042380140421263793, + "loss": 3.1926, + "theoretical_loss": 3.8898107707071756, + "tokens_seen": 530864128 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004237913741223671, + "loss": 3.1638, + "theoretical_loss": 3.8897595848916255, + "tokens_seen": 530929664 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004237813440320963, + "loss": 3.2157, + "theoretical_loss": 3.889708407162718, + "tokens_seen": 530995200 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004237713139418255, + "loss": 3.0381, + "theoretical_loss": 3.8896572375181773, + "tokens_seen": 531060736 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042376128385155466, + "loss": 3.2111, + "theoretical_loss": 3.8896060759557285, + "tokens_seen": 531126272 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004237512537612839, + "loss": 3.1555, + "theoretical_loss": 3.8895549224730988, + "tokens_seen": 531191808 + }, + { + "epoch": 6.01, + "learning_rate": 0.000423741223671013, + "loss": 3.2133, + "theoretical_loss": 3.889503777068015, + "tokens_seen": 531257344 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042373119358074226, + "loss": 3.0921, + "theoretical_loss": 3.889452639738206, + "tokens_seen": 531322880 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004237211634904714, + "loss": 3.1211, + "theoretical_loss": 3.8894015104814006, + "tokens_seen": 531388416 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004237111334002006, + "loss": 3.1355, + "theoretical_loss": 3.8893503892953287, + "tokens_seen": 531453952 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004237011033099298, + "loss": 3.0667, + "theoretical_loss": 3.889299276177722, + "tokens_seen": 531519488 + }, + { + "epoch": 6.01, + "learning_rate": 0.000423691073219659, + "loss": 3.0132, + "theoretical_loss": 3.8892481711263116, + "tokens_seen": 531585024 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042368104312938816, + "loss": 3.2357, + "theoretical_loss": 3.889197074138831, + "tokens_seen": 531650560 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004236710130391174, + "loss": 3.185, + "theoretical_loss": 3.8891459852130135, + "tokens_seen": 531716096 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004236609829488465, + "loss": 3.1489, + "theoretical_loss": 3.889094904346594, + "tokens_seen": 531781632 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042365095285857576, + "loss": 2.9882, + "theoretical_loss": 3.8890438315373093, + "tokens_seen": 531847168 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004236409227683049, + "loss": 3.1688, + "theoretical_loss": 3.888992766782894, + "tokens_seen": 531912704 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004236308926780341, + "loss": 3.0622, + "theoretical_loss": 3.8889417100810864, + "tokens_seen": 531978240 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004236208625877633, + "loss": 3.2218, + "theoretical_loss": 3.888890661429625, + "tokens_seen": 532043776 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004236108324974925, + "loss": 3.0389, + "theoretical_loss": 3.888839620826249, + "tokens_seen": 532109312 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042360080240722166, + "loss": 3.1124, + "theoretical_loss": 3.8887885882686986, + "tokens_seen": 532174848 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042359077231695085, + "loss": 3.1659, + "theoretical_loss": 3.8887375637547144, + "tokens_seen": 532240384 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042358074222668, + "loss": 3.06, + "theoretical_loss": 3.888686547282039, + "tokens_seen": 532305920 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042357071213640926, + "loss": 3.1355, + "theoretical_loss": 3.888635538848415, + "tokens_seen": 532371456 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1291852, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.178598403930664, + "objective/train/theoretical_loss": 3.888584538451587, + "objective/train/tokens_used": 552896992, + "theoretical_loss": 3.888584538451587, + "tokens_seen": 532436992 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004235606820461384, + "loss": 3.0872, + "theoretical_loss": 3.888584538451587, + "tokens_seen": 532436992 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004235506519558676, + "loss": 3.0773, + "theoretical_loss": 3.888533546089299, + "tokens_seen": 532502528 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042354062186559675, + "loss": 3.0946, + "theoretical_loss": 3.8884825617592957, + "tokens_seen": 532568064 + }, + { + "epoch": 6.01, + "learning_rate": 0.000423530591775326, + "loss": 3.1444, + "theoretical_loss": 3.888431585459326, + "tokens_seen": 532633600 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042352056168505517, + "loss": 3.0911, + "theoretical_loss": 3.888380617187136, + "tokens_seen": 532699136 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042351053159478435, + "loss": 3.1461, + "theoretical_loss": 3.8883296569404737, + "tokens_seen": 532764672 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042350050150451353, + "loss": 3.02, + "theoretical_loss": 3.8882787047170893, + "tokens_seen": 532830208 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042349047141424276, + "loss": 3.0641, + "theoretical_loss": 3.8882277605147326, + "tokens_seen": 532895744 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004234804413239719, + "loss": 3.1715, + "theoretical_loss": 3.8881768243311545, + "tokens_seen": 532961280 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042347041123370113, + "loss": 3.1204, + "theoretical_loss": 3.8881258961641074, + "tokens_seen": 533026816 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042346038114343025, + "loss": 3.2326, + "theoretical_loss": 3.888074976011344, + "tokens_seen": 533092352 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004234503510531595, + "loss": 3.1337, + "theoretical_loss": 3.8880240638706183, + "tokens_seen": 533157888 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042344032096288867, + "loss": 3.0208, + "theoretical_loss": 3.887973159739685, + "tokens_seen": 533223424 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042343029087261785, + "loss": 3.1107, + "theoretical_loss": 3.8879222636162996, + "tokens_seen": 533288960 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004234202607823471, + "loss": 3.0161, + "theoretical_loss": 3.887871375498219, + "tokens_seen": 533354496 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004234102306920762, + "loss": 3.0618, + "theoretical_loss": 3.8878204953832, + "tokens_seen": 533420032 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042340020060180545, + "loss": 3.0777, + "theoretical_loss": 3.8877696232690013, + "tokens_seen": 533485568 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042339017051153463, + "loss": 3.1283, + "theoretical_loss": 3.8877187591533824, + "tokens_seen": 533551104 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004233801404212638, + "loss": 3.25, + "theoretical_loss": 3.887667903034103, + "tokens_seen": 533616640 + }, + { + "epoch": 6.01, + "learning_rate": 0.000423370110330993, + "loss": 3.1938, + "theoretical_loss": 3.887617054908924, + "tokens_seen": 533682176 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004233600802407222, + "loss": 3.0648, + "theoretical_loss": 3.8875662147756085, + "tokens_seen": 533747712 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042335005015045135, + "loss": 3.1342, + "theoretical_loss": 3.887515382631918, + "tokens_seen": 533813248 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004233400200601806, + "loss": 3.2055, + "theoretical_loss": 3.8874645584756173, + "tokens_seen": 533878784 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004233299899699097, + "loss": 3.0756, + "theoretical_loss": 3.887413742304471, + "tokens_seen": 533944320 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042331995987963895, + "loss": 3.1351, + "theoretical_loss": 3.8873629341162435, + "tokens_seen": 534009856 + }, + { + "epoch": 6.01, + "objective/train/docs_used": 1294798, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.14487361907959, + "objective/train/theoretical_loss": 3.8873121339087024, + "objective/train/tokens_used": 554535392, + "theoretical_loss": 3.8873121339087024, + "tokens_seen": 534075392 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042330992978936813, + "loss": 3.1592, + "theoretical_loss": 3.8873121339087024, + "tokens_seen": 534075392 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004232998996990973, + "loss": 3.0917, + "theoretical_loss": 3.887261341679615, + "tokens_seen": 534140928 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004232898696088265, + "loss": 3.1783, + "theoretical_loss": 3.887210557426749, + "tokens_seen": 534206464 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004232798395185557, + "loss": 3.1698, + "theoretical_loss": 3.8871597811478735, + "tokens_seen": 534272000 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042326980942828486, + "loss": 3.1342, + "theoretical_loss": 3.8871090128407593, + "tokens_seen": 534337536 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004232597793380141, + "loss": 3.2521, + "theoretical_loss": 3.887058252503177, + "tokens_seen": 534403072 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004232497492477432, + "loss": 3.0893, + "theoretical_loss": 3.8870075001328983, + "tokens_seen": 534468608 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042323971915747246, + "loss": 3.0855, + "theoretical_loss": 3.886956755727696, + "tokens_seen": 534534144 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004232296890672016, + "loss": 3.1093, + "theoretical_loss": 3.8869060192853437, + "tokens_seen": 534599680 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004232196589769308, + "loss": 2.9967, + "theoretical_loss": 3.8868552908036165, + "tokens_seen": 534665216 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042320962888666, + "loss": 3.0925, + "theoretical_loss": 3.8868045702802894, + "tokens_seen": 534730752 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004231995987963892, + "loss": 2.9698, + "theoretical_loss": 3.886753857713138, + "tokens_seen": 534796288 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042318956870611836, + "loss": 3.1443, + "theoretical_loss": 3.8867031530999405, + "tokens_seen": 534861824 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004231795386158476, + "loss": 3.1087, + "theoretical_loss": 3.886652456438475, + "tokens_seen": 534927360 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004231695085255767, + "loss": 3.1096, + "theoretical_loss": 3.8866017677265194, + "tokens_seen": 534992896 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042315947843530596, + "loss": 3.0328, + "theoretical_loss": 3.8865510869618554, + "tokens_seen": 535058432 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004231494483450351, + "loss": 3.1092, + "theoretical_loss": 3.8865004141422625, + "tokens_seen": 535123968 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004231394182547643, + "loss": 3.0653, + "theoretical_loss": 3.8864497492655223, + "tokens_seen": 535189504 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004231293881644935, + "loss": 3.1805, + "theoretical_loss": 3.886399092329418, + "tokens_seen": 535255040 + }, + { + "epoch": 6.01, + "learning_rate": 0.0004231193580742227, + "loss": 3.1866, + "theoretical_loss": 3.886348443331733, + "tokens_seen": 535320576 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042310932798395186, + "loss": 3.1825, + "theoretical_loss": 3.886297802270251, + "tokens_seen": 535386112 + }, + { + "epoch": 6.01, + "learning_rate": 0.00042309929789368105, + "loss": 3.0991, + "theoretical_loss": 3.8862471691427585, + "tokens_seen": 535451648 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004230892678034102, + "loss": 3.1227, + "theoretical_loss": 3.8861965439470403, + "tokens_seen": 535517184 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042307923771313946, + "loss": 3.1391, + "theoretical_loss": 3.886145926680884, + "tokens_seen": 535582720 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004230692076228686, + "loss": 3.1369, + "theoretical_loss": 3.886095317342077, + "tokens_seen": 535648256 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1298710, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0240514278411865, + "objective/train/theoretical_loss": 3.886044715928409, + "objective/train/tokens_used": 556173792, + "theoretical_loss": 3.886044715928409, + "tokens_seen": 535713792 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004230591775325978, + "loss": 3.1294, + "theoretical_loss": 3.886044715928409, + "tokens_seen": 535713792 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042304914744232695, + "loss": 3.1001, + "theoretical_loss": 3.88599412243767, + "tokens_seen": 535779328 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004230391173520562, + "loss": 3.0575, + "theoretical_loss": 3.885943536867649, + "tokens_seen": 535844864 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042302908726178537, + "loss": 3.1658, + "theoretical_loss": 3.885892959216138, + "tokens_seen": 535910400 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042301905717151455, + "loss": 3.2089, + "theoretical_loss": 3.88584238948093, + "tokens_seen": 535975936 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042300902708124373, + "loss": 3.1108, + "theoretical_loss": 3.885791827659818, + "tokens_seen": 536041472 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042299899699097297, + "loss": 3.1262, + "theoretical_loss": 3.885741273750596, + "tokens_seen": 536107008 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004229889669007021, + "loss": 3.0501, + "theoretical_loss": 3.885690727751058, + "tokens_seen": 536172544 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042297893681043133, + "loss": 3.1901, + "theoretical_loss": 3.885640189659002, + "tokens_seen": 536238080 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042296890672016045, + "loss": 3.1414, + "theoretical_loss": 3.885589659472223, + "tokens_seen": 536303616 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004229588766298897, + "loss": 3.153, + "theoretical_loss": 3.8855391371885193, + "tokens_seen": 536369152 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042294884653961887, + "loss": 3.1338, + "theoretical_loss": 3.8854886228056893, + "tokens_seen": 536434688 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042293881644934805, + "loss": 3.0859, + "theoretical_loss": 3.8854381163215326, + "tokens_seen": 536500224 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042292878635907723, + "loss": 3.0404, + "theoretical_loss": 3.885387617733849, + "tokens_seen": 536565760 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004229187562688064, + "loss": 3.2103, + "theoretical_loss": 3.8853371270404407, + "tokens_seen": 536631296 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004229087261785356, + "loss": 3.0831, + "theoretical_loss": 3.8852866442391085, + "tokens_seen": 536696832 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042289869608826483, + "loss": 3.0799, + "theoretical_loss": 3.885236169327656, + "tokens_seen": 536762368 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042288866599799396, + "loss": 3.2378, + "theoretical_loss": 3.885185702303887, + "tokens_seen": 536827904 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004228786359077232, + "loss": 3.1273, + "theoretical_loss": 3.885135243165606, + "tokens_seen": 536893440 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004228686058174523, + "loss": 3.0832, + "theoretical_loss": 3.885084791910619, + "tokens_seen": 536958976 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042285857572718155, + "loss": 3.1008, + "theoretical_loss": 3.885034348536732, + "tokens_seen": 537024512 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042284854563691074, + "loss": 3.1795, + "theoretical_loss": 3.8849839130417525, + "tokens_seen": 537090048 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004228385155466399, + "loss": 3.0197, + "theoretical_loss": 3.8849334854234883, + "tokens_seen": 537155584 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004228284854563691, + "loss": 3.122, + "theoretical_loss": 3.8848830656797495, + "tokens_seen": 537221120 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042281845536609833, + "loss": 3.0989, + "theoretical_loss": 3.884832653808345, + "tokens_seen": 537286656 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1303219, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1692276000976562, + "objective/train/theoretical_loss": 3.884782249807086, + "objective/train/tokens_used": 557812192, + "theoretical_loss": 3.884782249807086, + "tokens_seen": 537352192 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042280842527582746, + "loss": 3.1307, + "theoretical_loss": 3.884782249807086, + "tokens_seen": 537352192 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004227983951855567, + "loss": 3.1714, + "theoretical_loss": 3.8847318536737845, + "tokens_seen": 537417728 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004227883650952858, + "loss": 3.1397, + "theoretical_loss": 3.8846814654062527, + "tokens_seen": 537483264 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042277833500501506, + "loss": 3.049, + "theoretical_loss": 3.8846310850023045, + "tokens_seen": 537548800 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042276830491474424, + "loss": 3.1661, + "theoretical_loss": 3.884580712459754, + "tokens_seen": 537614336 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004227582748244734, + "loss": 3.0991, + "theoretical_loss": 3.884530347776416, + "tokens_seen": 537679872 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004227482447342026, + "loss": 3.2543, + "theoretical_loss": 3.8844799909501067, + "tokens_seen": 537745408 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004227382146439318, + "loss": 3.1226, + "theoretical_loss": 3.884429641978644, + "tokens_seen": 537810944 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042272818455366096, + "loss": 3.0837, + "theoretical_loss": 3.8843793008598446, + "tokens_seen": 537876480 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004227181544633902, + "loss": 3.0967, + "theoretical_loss": 3.884328967591528, + "tokens_seen": 537942016 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004227081243731193, + "loss": 3.0964, + "theoretical_loss": 3.884278642171513, + "tokens_seen": 538007552 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042269809428284856, + "loss": 3.1333, + "theoretical_loss": 3.88422832459762, + "tokens_seen": 538073088 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004226880641925777, + "loss": 2.9764, + "theoretical_loss": 3.884178014867672, + "tokens_seen": 538138624 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004226780341023069, + "loss": 3.1649, + "theoretical_loss": 3.884127712979489, + "tokens_seen": 538204160 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042266800401203616, + "loss": 3.0105, + "theoretical_loss": 3.884077418930895, + "tokens_seen": 538269696 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004226579739217653, + "loss": 3.099, + "theoretical_loss": 3.884027132719714, + "tokens_seen": 538335232 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004226479438314945, + "loss": 3.1654, + "theoretical_loss": 3.8839768543437714, + "tokens_seen": 538400768 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004226379137412237, + "loss": 3.1969, + "theoretical_loss": 3.883926583800892, + "tokens_seen": 538466304 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004226278836509529, + "loss": 3.1132, + "theoretical_loss": 3.8838763210889016, + "tokens_seen": 538531840 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042261785356068206, + "loss": 3.1134, + "theoretical_loss": 3.883826066205629, + "tokens_seen": 538597376 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042260782347041125, + "loss": 3.1306, + "theoretical_loss": 3.8837758191489016, + "tokens_seen": 538662912 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004225977933801404, + "loss": 3.1025, + "theoretical_loss": 3.8837255799165495, + "tokens_seen": 538728448 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042258776328986966, + "loss": 3.2558, + "theoretical_loss": 3.8836753485064017, + "tokens_seen": 538793984 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004225777331995988, + "loss": 2.9995, + "theoretical_loss": 3.8836251249162896, + "tokens_seen": 538859520 + }, + { + "epoch": 6.02, + "learning_rate": 0.000422567703109328, + "loss": 3.1367, + "theoretical_loss": 3.883574909144045, + "tokens_seen": 538925056 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1306487, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9207870960235596, + "objective/train/theoretical_loss": 3.8835247011875, + "objective/train/tokens_used": 559450592, + "theoretical_loss": 3.8835247011875, + "tokens_seen": 538990592 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042255767301905715, + "loss": 3.0266, + "theoretical_loss": 3.8835247011875, + "tokens_seen": 538990592 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004225476429287864, + "loss": 3.1266, + "theoretical_loss": 3.883474501044488, + "tokens_seen": 539056128 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042253761283851557, + "loss": 3.1151, + "theoretical_loss": 3.883424308712844, + "tokens_seen": 539121664 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042252758274824475, + "loss": 3.1545, + "theoretical_loss": 3.8833741241904027, + "tokens_seen": 539187200 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042251755265797393, + "loss": 3.2354, + "theoretical_loss": 3.8833239474750005, + "tokens_seen": 539252736 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042250752256770317, + "loss": 3.1262, + "theoretical_loss": 3.8832737785644738, + "tokens_seen": 539318272 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004224974924774323, + "loss": 3.1097, + "theoretical_loss": 3.883223617456661, + "tokens_seen": 539383808 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042248746238716153, + "loss": 3.1397, + "theoretical_loss": 3.8831734641494, + "tokens_seen": 539449344 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042247743229689065, + "loss": 3.0983, + "theoretical_loss": 3.8831233186405303, + "tokens_seen": 539514880 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004224674022066199, + "loss": 3.0894, + "theoretical_loss": 3.883073180927893, + "tokens_seen": 539580416 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042245737211634907, + "loss": 3.1216, + "theoretical_loss": 3.8830230510093293, + "tokens_seen": 539645952 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042244734202607825, + "loss": 3.1046, + "theoretical_loss": 3.8829729288826806, + "tokens_seen": 539711488 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042243731193580743, + "loss": 3.2009, + "theoretical_loss": 3.8829228145457897, + "tokens_seen": 539777024 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004224272818455366, + "loss": 3.2171, + "theoretical_loss": 3.8828727079965017, + "tokens_seen": 539842560 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004224172517552658, + "loss": 3.1144, + "theoretical_loss": 3.88282260923266, + "tokens_seen": 539908096 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042240722166499503, + "loss": 3.1989, + "theoretical_loss": 3.88277251825211, + "tokens_seen": 539973632 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042239719157472416, + "loss": 3.0895, + "theoretical_loss": 3.882722435052699, + "tokens_seen": 540039168 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004223871614844534, + "loss": 3.1177, + "theoretical_loss": 3.8826723596322736, + "tokens_seen": 540104704 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004223771313941825, + "loss": 3.146, + "theoretical_loss": 3.8826222919886826, + "tokens_seen": 540170240 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042236710130391176, + "loss": 3.0849, + "theoretical_loss": 3.8825722321197738, + "tokens_seen": 540235776 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042235707121364094, + "loss": 3.0609, + "theoretical_loss": 3.8825221800233978, + "tokens_seen": 540301312 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004223470411233701, + "loss": 2.9971, + "theoretical_loss": 3.882472135697405, + "tokens_seen": 540366848 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004223370110330993, + "loss": 3.0841, + "theoretical_loss": 3.8824220991396468, + "tokens_seen": 540432384 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042232698094282853, + "loss": 3.1167, + "theoretical_loss": 3.8823720703479756, + "tokens_seen": 540497920 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042231695085255766, + "loss": 3.0627, + "theoretical_loss": 3.882322049320245, + "tokens_seen": 540563456 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1311273, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9174752235412598, + "objective/train/theoretical_loss": 3.882272036054309, + "objective/train/tokens_used": 561088992, + "theoretical_loss": 3.882272036054309, + "tokens_seen": 540628992 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004223069207622869, + "loss": 3.0691, + "theoretical_loss": 3.882272036054309, + "tokens_seen": 540628992 + }, + { + "epoch": 6.02, + "learning_rate": 0.000422296890672016, + "loss": 3.1874, + "theoretical_loss": 3.882222030548022, + "tokens_seen": 540694528 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042228686058174526, + "loss": 3.171, + "theoretical_loss": 3.88217203279924, + "tokens_seen": 540760064 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042227683049147444, + "loss": 3.1185, + "theoretical_loss": 3.8821220428058196, + "tokens_seen": 540825600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004222668004012036, + "loss": 3.0965, + "theoretical_loss": 3.8820720605656187, + "tokens_seen": 540891136 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004222567703109328, + "loss": 3.176, + "theoretical_loss": 3.882022086076495, + "tokens_seen": 540956672 + }, + { + "epoch": 6.02, + "learning_rate": 0.000422246740220662, + "loss": 3.2174, + "theoretical_loss": 3.8819721193363077, + "tokens_seen": 541022208 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042223671013039116, + "loss": 3.1324, + "theoretical_loss": 3.8819221603429175, + "tokens_seen": 541087744 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004222266800401204, + "loss": 3.2125, + "theoretical_loss": 3.881872209094185, + "tokens_seen": 541153280 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004222166499498495, + "loss": 3.1593, + "theoretical_loss": 3.8818222655879717, + "tokens_seen": 541218816 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042220661985957876, + "loss": 3.067, + "theoretical_loss": 3.88177232982214, + "tokens_seen": 541284352 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004221965897693079, + "loss": 3.123, + "theoretical_loss": 3.881722401794554, + "tokens_seen": 541349888 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004221865596790371, + "loss": 3.087, + "theoretical_loss": 3.881672481503077, + "tokens_seen": 541415424 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004221765295887663, + "loss": 3.1153, + "theoretical_loss": 3.8816225689455752, + "tokens_seen": 541480960 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004221664994984955, + "loss": 3.1938, + "theoretical_loss": 3.8815726641199135, + "tokens_seen": 541546496 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042215646940822467, + "loss": 3.059, + "theoretical_loss": 3.88152276702396, + "tokens_seen": 541612032 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004221464393179539, + "loss": 3.1737, + "theoretical_loss": 3.8814728776555807, + "tokens_seen": 541677568 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042213640922768303, + "loss": 3.1603, + "theoretical_loss": 3.8814229960126463, + "tokens_seen": 541743104 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042212637913741226, + "loss": 3.1793, + "theoretical_loss": 3.8813731220930237, + "tokens_seen": 541808640 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004221163490471414, + "loss": 3.1429, + "theoretical_loss": 3.8813232558945847, + "tokens_seen": 541874176 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004221063189568706, + "loss": 3.1328, + "theoretical_loss": 3.8812733974152005, + "tokens_seen": 541939712 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004220962888665998, + "loss": 3.152, + "theoretical_loss": 3.881223546652742, + "tokens_seen": 542005248 + }, + { + "epoch": 6.02, + "learning_rate": 0.000422086258776329, + "loss": 3.1361, + "theoretical_loss": 3.881173703605082, + "tokens_seen": 542070784 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042207622868605817, + "loss": 3.0504, + "theoretical_loss": 3.8811238682700946, + "tokens_seen": 542136320 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042206619859578735, + "loss": 3.1663, + "theoretical_loss": 3.8810740406456548, + "tokens_seen": 542201856 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1314230, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1640379428863525, + "objective/train/theoretical_loss": 3.8810242207296364, + "objective/train/tokens_used": 562727392, + "theoretical_loss": 3.8810242207296364, + "tokens_seen": 542267392 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042205616850551653, + "loss": 3.0759, + "theoretical_loss": 3.8810242207296364, + "tokens_seen": 542267392 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042204613841524577, + "loss": 3.2124, + "theoretical_loss": 3.8809744085199167, + "tokens_seen": 542332928 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004220361083249749, + "loss": 3.0918, + "theoretical_loss": 3.880924604014372, + "tokens_seen": 542398464 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042202607823470413, + "loss": 3.1625, + "theoretical_loss": 3.88087480721088, + "tokens_seen": 542464000 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004220160481444333, + "loss": 3.2067, + "theoretical_loss": 3.88082501810732, + "tokens_seen": 542529536 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004220060180541625, + "loss": 3.2237, + "theoretical_loss": 3.88077523670157, + "tokens_seen": 542595072 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004219959879638917, + "loss": 3.0552, + "theoretical_loss": 3.8807254629915127, + "tokens_seen": 542660608 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042198595787362085, + "loss": 3.1884, + "theoretical_loss": 3.880675696975027, + "tokens_seen": 542726144 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042197592778335004, + "loss": 3.1908, + "theoretical_loss": 3.880625938649997, + "tokens_seen": 542791680 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042196589769307927, + "loss": 3.187, + "theoretical_loss": 3.8805761880143033, + "tokens_seen": 542857216 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004219558676028084, + "loss": 3.1182, + "theoretical_loss": 3.880526445065831, + "tokens_seen": 542922752 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042194583751253763, + "loss": 3.2308, + "theoretical_loss": 3.8804767098024637, + "tokens_seen": 542988288 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042193580742226676, + "loss": 3.1106, + "theoretical_loss": 3.8804269822220876, + "tokens_seen": 543053824 + }, + { + "epoch": 6.02, + "learning_rate": 0.000421925777331996, + "loss": 3.0622, + "theoretical_loss": 3.8803772623225887, + "tokens_seen": 543119360 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042191574724172523, + "loss": 3.0865, + "theoretical_loss": 3.8803275501018533, + "tokens_seen": 543184896 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042190571715145436, + "loss": 3.0654, + "theoretical_loss": 3.88027784555777, + "tokens_seen": 543250432 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004218956870611836, + "loss": 3.2049, + "theoretical_loss": 3.8802281486882277, + "tokens_seen": 543315968 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004218856569709127, + "loss": 3.0252, + "theoretical_loss": 3.8801784594911153, + "tokens_seen": 543381504 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042187562688064196, + "loss": 3.14, + "theoretical_loss": 3.880128777964323, + "tokens_seen": 543447040 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042186559679037114, + "loss": 3.1791, + "theoretical_loss": 3.8800791041057425, + "tokens_seen": 543512576 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004218555667001003, + "loss": 3.1582, + "theoretical_loss": 3.880029437913266, + "tokens_seen": 543578112 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004218455366098295, + "loss": 3.1371, + "theoretical_loss": 3.8799797793847857, + "tokens_seen": 543643648 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042183550651955873, + "loss": 3.1056, + "theoretical_loss": 3.879930128518195, + "tokens_seen": 543709184 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042182547642928786, + "loss": 3.1822, + "theoretical_loss": 3.87988048531139, + "tokens_seen": 543774720 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004218154463390171, + "loss": 3.1553, + "theoretical_loss": 3.8798308497622647, + "tokens_seen": 543840256 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1317925, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2977333068847656, + "objective/train/theoretical_loss": 3.8797812218687158, + "objective/train/tokens_used": 564365792, + "theoretical_loss": 3.8797812218687158, + "tokens_seen": 543905792 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004218054162487462, + "loss": 3.103, + "theoretical_loss": 3.8797812218687158, + "tokens_seen": 543905792 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042179538615847546, + "loss": 3.0133, + "theoretical_loss": 3.87973160162864, + "tokens_seen": 543971328 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042178535606820464, + "loss": 3.1834, + "theoretical_loss": 3.879681989039935, + "tokens_seen": 544036864 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004217753259779338, + "loss": 3.1667, + "theoretical_loss": 3.8796323841005003, + "tokens_seen": 544102400 + }, + { + "epoch": 6.02, + "learning_rate": 0.000421765295887663, + "loss": 3.103, + "theoretical_loss": 3.8795827868082347, + "tokens_seen": 544167936 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004217552657973922, + "loss": 3.1282, + "theoretical_loss": 3.879533197161039, + "tokens_seen": 544233472 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042174523570712136, + "loss": 3.1418, + "theoretical_loss": 3.879483615156814, + "tokens_seen": 544299008 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004217352056168506, + "loss": 3.1023, + "theoretical_loss": 3.8794340407934618, + "tokens_seen": 544364544 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004217251755265797, + "loss": 3.0847, + "theoretical_loss": 3.8793844740688854, + "tokens_seen": 544430080 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042171514543630896, + "loss": 3.0743, + "theoretical_loss": 3.879334914980988, + "tokens_seen": 544495616 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004217051153460381, + "loss": 3.108, + "theoretical_loss": 3.8792853635276745, + "tokens_seen": 544561152 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004216950852557673, + "loss": 3.1816, + "theoretical_loss": 3.87923581970685, + "tokens_seen": 544626688 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004216850551654965, + "loss": 2.9895, + "theoretical_loss": 3.879186283516421, + "tokens_seen": 544692224 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004216750250752257, + "loss": 3.1045, + "theoretical_loss": 3.879136754954294, + "tokens_seen": 544757760 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042166499498495487, + "loss": 3.1325, + "theoretical_loss": 3.879087234018377, + "tokens_seen": 544823296 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004216549648946841, + "loss": 3.0834, + "theoretical_loss": 3.8790377207065787, + "tokens_seen": 544888832 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042164493480441323, + "loss": 3.1296, + "theoretical_loss": 3.8789882150168085, + "tokens_seen": 544954368 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042163490471414246, + "loss": 3.1783, + "theoretical_loss": 3.8789387169469762, + "tokens_seen": 545019904 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004216248746238716, + "loss": 3.1837, + "theoretical_loss": 3.878889226494994, + "tokens_seen": 545085440 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004216148445336008, + "loss": 3.0399, + "theoretical_loss": 3.878839743658772, + "tokens_seen": 545150976 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042160481444333, + "loss": 3.1109, + "theoretical_loss": 3.8787902684362248, + "tokens_seen": 545216512 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004215947843530592, + "loss": 3.1573, + "theoretical_loss": 3.8787408008252653, + "tokens_seen": 545282048 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042158475426278837, + "loss": 2.976, + "theoretical_loss": 3.8786913408238073, + "tokens_seen": 545347584 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042157472417251755, + "loss": 3.2305, + "theoretical_loss": 3.878641888429767, + "tokens_seen": 545413120 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042156469408224673, + "loss": 3.2345, + "theoretical_loss": 3.8785924436410593, + "tokens_seen": 545478656 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1322661, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2231879234313965, + "objective/train/theoretical_loss": 3.878543006455602, + "objective/train/tokens_used": 566004192, + "theoretical_loss": 3.878543006455602, + "tokens_seen": 545544192 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042155466399197597, + "loss": 3.0729, + "theoretical_loss": 3.878543006455602, + "tokens_seen": 545544192 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004215446339017051, + "loss": 3.1511, + "theoretical_loss": 3.8784935768713122, + "tokens_seen": 545609728 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042153460381143433, + "loss": 3.2535, + "theoretical_loss": 3.8784441548861084, + "tokens_seen": 545675264 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004215245737211635, + "loss": 3.1169, + "theoretical_loss": 3.8783947404979107, + "tokens_seen": 545740800 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004215145436308927, + "loss": 3.1151, + "theoretical_loss": 3.878345333704638, + "tokens_seen": 545806336 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004215045135406219, + "loss": 3.1609, + "theoretical_loss": 3.8782959345042123, + "tokens_seen": 545871872 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042149448345035105, + "loss": 3.1004, + "theoretical_loss": 3.878246542894555, + "tokens_seen": 545937408 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042148445336008024, + "loss": 3.1198, + "theoretical_loss": 3.8781971588735877, + "tokens_seen": 546002944 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042147442326980947, + "loss": 2.9268, + "theoretical_loss": 3.8781477824392354, + "tokens_seen": 546068480 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004214643931795386, + "loss": 3.1166, + "theoretical_loss": 3.8780984135894214, + "tokens_seen": 546134016 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042145436308926783, + "loss": 3.1295, + "theoretical_loss": 3.878049052322071, + "tokens_seen": 546199552 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042144433299899696, + "loss": 3.0954, + "theoretical_loss": 3.87799969863511, + "tokens_seen": 546265088 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004214343029087262, + "loss": 3.0623, + "theoretical_loss": 3.877950352526465, + "tokens_seen": 546330624 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004214242728184554, + "loss": 3.1427, + "theoretical_loss": 3.877901013994064, + "tokens_seen": 546396160 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042141424272818456, + "loss": 3.0935, + "theoretical_loss": 3.877851683035834, + "tokens_seen": 546461696 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042140421263791374, + "loss": 3.1377, + "theoretical_loss": 3.8778023596497055, + "tokens_seen": 546527232 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004213941825476429, + "loss": 3.0564, + "theoretical_loss": 3.8777530438336076, + "tokens_seen": 546592768 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004213841524573721, + "loss": 3.1296, + "theoretical_loss": 3.8777037355854715, + "tokens_seen": 546658304 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042137412236710134, + "loss": 3.1169, + "theoretical_loss": 3.8776544349032287, + "tokens_seen": 546723840 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042136409227683046, + "loss": 3.0294, + "theoretical_loss": 3.877605141784811, + "tokens_seen": 546789376 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004213540621865597, + "loss": 3.0639, + "theoretical_loss": 3.877555856228152, + "tokens_seen": 546854912 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004213440320962889, + "loss": 3.1684, + "theoretical_loss": 3.877506578231186, + "tokens_seen": 546920448 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042133400200601806, + "loss": 3.049, + "theoretical_loss": 3.8774573077918473, + "tokens_seen": 546985984 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042132397191574724, + "loss": 3.1133, + "theoretical_loss": 3.8774080449080714, + "tokens_seen": 547051520 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004213139418254764, + "loss": 3.1309, + "theoretical_loss": 3.8773587895777957, + "tokens_seen": 547117056 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2373247146606445, + "objective/train/theoretical_loss": 3.8773095417989563, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.8773095417989563, + "tokens_seen": 547182592 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004213039117352056, + "loss": 3.1826, + "theoretical_loss": 3.8773095417989563, + "tokens_seen": 547182592 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042129388164493484, + "loss": 3.2196, + "theoretical_loss": 3.8772603015694918, + "tokens_seen": 547248128 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042128385155466397, + "loss": 3.2249, + "theoretical_loss": 3.877211068887341, + "tokens_seen": 547313664 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004212738214643932, + "loss": 3.0774, + "theoretical_loss": 3.877161843750444, + "tokens_seen": 547379200 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042126379137412233, + "loss": 3.1111, + "theoretical_loss": 3.87711262615674, + "tokens_seen": 547444736 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042125376128385156, + "loss": 3.1821, + "theoretical_loss": 3.8770634161041717, + "tokens_seen": 547510272 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042124373119358075, + "loss": 3.1677, + "theoretical_loss": 3.87701421359068, + "tokens_seen": 547575808 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004212337011033099, + "loss": 3.1872, + "theoretical_loss": 3.876965018614209, + "tokens_seen": 547641344 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004212236710130391, + "loss": 3.1612, + "theoretical_loss": 3.8769158311727017, + "tokens_seen": 547706880 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004212136409227683, + "loss": 3.0724, + "theoretical_loss": 3.876866651264103, + "tokens_seen": 547772416 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042120361083249747, + "loss": 3.1165, + "theoretical_loss": 3.876817478886357, + "tokens_seen": 547837952 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004211935807422267, + "loss": 3.0854, + "theoretical_loss": 3.8767683140374114, + "tokens_seen": 547903488 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042118355065195583, + "loss": 3.2556, + "theoretical_loss": 3.8767191567152124, + "tokens_seen": 547969024 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042117352056168507, + "loss": 3.1516, + "theoretical_loss": 3.8766700069177076, + "tokens_seen": 548034560 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004211634904714143, + "loss": 3.2116, + "theoretical_loss": 3.876620864642846, + "tokens_seen": 548100096 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042115346038114343, + "loss": 3.1618, + "theoretical_loss": 3.8765717298885765, + "tokens_seen": 548165632 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042114343029087266, + "loss": 3.1906, + "theoretical_loss": 3.87652260265285, + "tokens_seen": 548231168 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004211334002006018, + "loss": 3.0206, + "theoretical_loss": 3.8764734829336165, + "tokens_seen": 548296704 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042112337011033103, + "loss": 3.2087, + "theoretical_loss": 3.876424370728828, + "tokens_seen": 548362240 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004211133400200602, + "loss": 3.195, + "theoretical_loss": 3.8763752660364377, + "tokens_seen": 548427776 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004211033099297894, + "loss": 3.142, + "theoretical_loss": 3.8763261688543986, + "tokens_seen": 548493312 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042109327983951857, + "loss": 3.1002, + "theoretical_loss": 3.876277079180664, + "tokens_seen": 548558848 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042108324974924775, + "loss": 3.0843, + "theoretical_loss": 3.8762279970131903, + "tokens_seen": 548624384 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042107321965897693, + "loss": 3.1853, + "theoretical_loss": 3.8761789223499328, + "tokens_seen": 548689920 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042106318956870617, + "loss": 2.9051, + "theoretical_loss": 3.8761298551888475, + "tokens_seen": 548755456 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.169639825820923, + "objective/train/theoretical_loss": 3.876080795527892, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.876080795527892, + "tokens_seen": 548820992 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004210531594784353, + "loss": 2.9923, + "theoretical_loss": 3.876080795527892, + "tokens_seen": 548820992 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042104312938816453, + "loss": 3.066, + "theoretical_loss": 3.876031743365025, + "tokens_seen": 548886528 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004210330992978937, + "loss": 3.042, + "theoretical_loss": 3.8759826986982047, + "tokens_seen": 548952064 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004210230692076229, + "loss": 3.1714, + "theoretical_loss": 3.8759336615253908, + "tokens_seen": 549017600 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004210130391173521, + "loss": 3.1356, + "theoretical_loss": 3.8758846318445452, + "tokens_seen": 549083136 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042100300902708125, + "loss": 3.2001, + "theoretical_loss": 3.8758356096536284, + "tokens_seen": 549148672 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042099297893681044, + "loss": 3.1636, + "theoretical_loss": 3.8757865949506023, + "tokens_seen": 549214208 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042098294884653967, + "loss": 3.2036, + "theoretical_loss": 3.87573758773343, + "tokens_seen": 549279744 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004209729187562688, + "loss": 3.1046, + "theoretical_loss": 3.8756885880000755, + "tokens_seen": 549345280 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042096288866599803, + "loss": 3.125, + "theoretical_loss": 3.8756395957485035, + "tokens_seen": 549410816 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042095285857572716, + "loss": 3.1065, + "theoretical_loss": 3.8755906109766785, + "tokens_seen": 549476352 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004209428284854564, + "loss": 3.0761, + "theoretical_loss": 3.875541633682568, + "tokens_seen": 549541888 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004209327983951856, + "loss": 3.062, + "theoretical_loss": 3.8754926638641374, + "tokens_seen": 549607424 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042092276830491476, + "loss": 3.126, + "theoretical_loss": 3.8754437015193552, + "tokens_seen": 549672960 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042091273821464394, + "loss": 3.1168, + "theoretical_loss": 3.87539474664619, + "tokens_seen": 549738496 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004209027081243731, + "loss": 3.13, + "theoretical_loss": 3.8753457992426115, + "tokens_seen": 549804032 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004208926780341023, + "loss": 3.0861, + "theoretical_loss": 3.875296859306589, + "tokens_seen": 549869568 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042088264794383154, + "loss": 3.1625, + "theoretical_loss": 3.875247926836094, + "tokens_seen": 549935104 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042087261785356066, + "loss": 3.0701, + "theoretical_loss": 3.875199001829098, + "tokens_seen": 550000640 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004208625877632899, + "loss": 3.0383, + "theoretical_loss": 3.8751500842835735, + "tokens_seen": 550066176 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004208525576730191, + "loss": 3.1312, + "theoretical_loss": 3.8751011741974937, + "tokens_seen": 550131712 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042084252758274826, + "loss": 3.2117, + "theoretical_loss": 3.8750522715688325, + "tokens_seen": 550197248 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042083249749247744, + "loss": 3.1298, + "theoretical_loss": 3.875003376395566, + "tokens_seen": 550262784 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004208224674022066, + "loss": 3.1241, + "theoretical_loss": 3.8749544886756677, + "tokens_seen": 550328320 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004208124373119358, + "loss": 3.0487, + "theoretical_loss": 3.874905608407116, + "tokens_seen": 550393856 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.025250196456909, + "objective/train/theoretical_loss": 3.8748567355878873, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.8748567355878873, + "tokens_seen": 550459392 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042080240722166504, + "loss": 3.0083, + "theoretical_loss": 3.8748567355878873, + "tokens_seen": 550459392 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042079237713139417, + "loss": 3.1884, + "theoretical_loss": 3.87480787021596, + "tokens_seen": 550524928 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004207823470411234, + "loss": 3.0948, + "theoretical_loss": 3.8747590122893123, + "tokens_seen": 550590464 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042077231695085253, + "loss": 3.0356, + "theoretical_loss": 3.8747101618059245, + "tokens_seen": 550656000 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042076228686058176, + "loss": 3.0493, + "theoretical_loss": 3.874661318763777, + "tokens_seen": 550721536 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042075225677031095, + "loss": 3.1899, + "theoretical_loss": 3.87461248316085, + "tokens_seen": 550787072 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004207422266800401, + "loss": 3.0726, + "theoretical_loss": 3.874563654995126, + "tokens_seen": 550852608 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004207321965897693, + "loss": 3.1533, + "theoretical_loss": 3.8745148342645885, + "tokens_seen": 550918144 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004207221664994985, + "loss": 3.125, + "theoretical_loss": 3.8744660209672204, + "tokens_seen": 550983680 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042071213640922767, + "loss": 3.0524, + "theoretical_loss": 3.874417215101006, + "tokens_seen": 551049216 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004207021063189569, + "loss": 3.1153, + "theoretical_loss": 3.874368416663931, + "tokens_seen": 551114752 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042069207622868603, + "loss": 3.0909, + "theoretical_loss": 3.8743196256539805, + "tokens_seen": 551180288 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042068204613841527, + "loss": 3.0401, + "theoretical_loss": 3.8742708420691416, + "tokens_seen": 551245824 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042067201604814445, + "loss": 3.1062, + "theoretical_loss": 3.874222065907402, + "tokens_seen": 551311360 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042066198595787363, + "loss": 3.0249, + "theoretical_loss": 3.8741732971667493, + "tokens_seen": 551376896 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004206519558676028, + "loss": 3.106, + "theoretical_loss": 3.8741245358451732, + "tokens_seen": 551442432 + }, + { + "epoch": 6.02, + "learning_rate": 0.000420641925777332, + "loss": 3.1912, + "theoretical_loss": 3.8740757819406633, + "tokens_seen": 551507968 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004206318956870612, + "loss": 3.1055, + "theoretical_loss": 3.8740270354512107, + "tokens_seen": 551573504 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004206218655967904, + "loss": 3.1825, + "theoretical_loss": 3.8739782963748057, + "tokens_seen": 551639040 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042061183550651954, + "loss": 3.1543, + "theoretical_loss": 3.8739295647094414, + "tokens_seen": 551704576 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042060180541624877, + "loss": 3.1514, + "theoretical_loss": 3.8738808404531104, + "tokens_seen": 551770112 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004205917753259779, + "loss": 3.091, + "theoretical_loss": 3.8738321236038065, + "tokens_seen": 551835648 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042058174523570713, + "loss": 3.1672, + "theoretical_loss": 3.873783414159525, + "tokens_seen": 551901184 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004205717151454363, + "loss": 3.1141, + "theoretical_loss": 3.8737347121182597, + "tokens_seen": 551966720 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004205616850551655, + "loss": 3.0549, + "theoretical_loss": 3.8736860174780077, + "tokens_seen": 552032256 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0409982204437256, + "objective/train/theoretical_loss": 3.8736373302367655, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.8736373302367655, + "tokens_seen": 552097792 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004205516549648947, + "loss": 3.1536, + "theoretical_loss": 3.8736373302367655, + "tokens_seen": 552097792 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004205416248746239, + "loss": 3.1337, + "theoretical_loss": 3.8735886503925308, + "tokens_seen": 552163328 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042053159478435304, + "loss": 3.1807, + "theoretical_loss": 3.8735399779433024, + "tokens_seen": 552228864 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004205215646940823, + "loss": 3.1009, + "theoretical_loss": 3.8734913128870794, + "tokens_seen": 552294400 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004205115346038114, + "loss": 3.0154, + "theoretical_loss": 3.8734426552218615, + "tokens_seen": 552359936 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042050150451354064, + "loss": 3.0698, + "theoretical_loss": 3.8733940049456494, + "tokens_seen": 552425472 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004204914744232698, + "loss": 3.1197, + "theoretical_loss": 3.8733453620564453, + "tokens_seen": 552491008 + }, + { + "epoch": 6.02, + "learning_rate": 0.000420481444332999, + "loss": 3.233, + "theoretical_loss": 3.873296726552251, + "tokens_seen": 552556544 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004204714142427282, + "loss": 3.0288, + "theoretical_loss": 3.8732480984310693, + "tokens_seen": 552622080 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042046138415245736, + "loss": 3.2502, + "theoretical_loss": 3.873199477690905, + "tokens_seen": 552687616 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042045135406218654, + "loss": 3.0821, + "theoretical_loss": 3.8731508643297614, + "tokens_seen": 552753152 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004204413239719158, + "loss": 3.1463, + "theoretical_loss": 3.8731022583456456, + "tokens_seen": 552818688 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004204312938816449, + "loss": 3.1429, + "theoretical_loss": 3.873053659736563, + "tokens_seen": 552884224 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042042126379137414, + "loss": 3.1207, + "theoretical_loss": 3.8730050685005195, + "tokens_seen": 552949760 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004204112337011033, + "loss": 3.156, + "theoretical_loss": 3.872956484635525, + "tokens_seen": 553015296 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004204012036108325, + "loss": 3.0368, + "theoretical_loss": 3.872907908139586, + "tokens_seen": 553080832 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042039117352056174, + "loss": 3.1248, + "theoretical_loss": 3.872859339010713, + "tokens_seen": 553146368 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042038114343029086, + "loss": 3.092, + "theoretical_loss": 3.872810777246916, + "tokens_seen": 553211904 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004203711133400201, + "loss": 3.1868, + "theoretical_loss": 3.8727622228462053, + "tokens_seen": 553277440 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004203610832497493, + "loss": 3.1259, + "theoretical_loss": 3.8727136758065934, + "tokens_seen": 553342976 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042035105315947846, + "loss": 3.1269, + "theoretical_loss": 3.8726651361260918, + "tokens_seen": 553408512 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042034102306920764, + "loss": 3.1332, + "theoretical_loss": 3.8726166038027143, + "tokens_seen": 553474048 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004203309929789368, + "loss": 3.136, + "theoretical_loss": 3.8725680788344743, + "tokens_seen": 553539584 + }, + { + "epoch": 6.02, + "learning_rate": 0.000420320962888666, + "loss": 3.0903, + "theoretical_loss": 3.872519561219387, + "tokens_seen": 553605120 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042031093279839524, + "loss": 3.1199, + "theoretical_loss": 3.8724710509554674, + "tokens_seen": 553670656 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.980152130126953, + "objective/train/theoretical_loss": 3.872422548040732, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.872422548040732, + "tokens_seen": 553736192 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042030090270812437, + "loss": 3.1052, + "theoretical_loss": 3.872422548040732, + "tokens_seen": 553736192 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004202908726178536, + "loss": 3.1081, + "theoretical_loss": 3.872374052473198, + "tokens_seen": 553801728 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042028084252758273, + "loss": 3.16, + "theoretical_loss": 3.8723255642508834, + "tokens_seen": 553867264 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042027081243731196, + "loss": 3.0764, + "theoretical_loss": 3.8722770833718054, + "tokens_seen": 553932800 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042026078234704115, + "loss": 3.1233, + "theoretical_loss": 3.872228609833985, + "tokens_seen": 553998336 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004202507522567703, + "loss": 3.1241, + "theoretical_loss": 3.8721801436354415, + "tokens_seen": 554063872 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004202407221664995, + "loss": 3.1127, + "theoretical_loss": 3.8721316847741956, + "tokens_seen": 554129408 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004202306920762287, + "loss": 3.1016, + "theoretical_loss": 3.8720832332482695, + "tokens_seen": 554194944 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042022066198595787, + "loss": 3.2272, + "theoretical_loss": 3.872034789055685, + "tokens_seen": 554260480 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004202106318956871, + "loss": 3.1623, + "theoretical_loss": 3.871986352194466, + "tokens_seen": 554326016 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042020060180541623, + "loss": 3.1883, + "theoretical_loss": 3.871937922662636, + "tokens_seen": 554391552 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042019057171514547, + "loss": 3.0978, + "theoretical_loss": 3.8718895004582192, + "tokens_seen": 554457088 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042018054162487465, + "loss": 3.0904, + "theoretical_loss": 3.8718410855792422, + "tokens_seen": 554522624 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042017051153460383, + "loss": 3.1807, + "theoretical_loss": 3.8717926780237297, + "tokens_seen": 554588160 + }, + { + "epoch": 6.02, + "learning_rate": 0.000420160481444333, + "loss": 3.1582, + "theoretical_loss": 3.871744277789711, + "tokens_seen": 554653696 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004201504513540622, + "loss": 3.1386, + "theoretical_loss": 3.871695884875211, + "tokens_seen": 554719232 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004201404212637914, + "loss": 3.1143, + "theoretical_loss": 3.8716474992782604, + "tokens_seen": 554784768 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004201303911735206, + "loss": 3.1678, + "theoretical_loss": 3.871599120996888, + "tokens_seen": 554850304 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042012036108324974, + "loss": 3.0741, + "theoretical_loss": 3.8715507500291233, + "tokens_seen": 554915840 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042011033099297897, + "loss": 3.1215, + "theoretical_loss": 3.871502386372997, + "tokens_seen": 554981376 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004201003009027081, + "loss": 3.183, + "theoretical_loss": 3.871454030026542, + "tokens_seen": 555046912 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042009027081243733, + "loss": 3.0949, + "theoretical_loss": 3.871405680987789, + "tokens_seen": 555112448 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004200802407221665, + "loss": 3.1364, + "theoretical_loss": 3.8713573392547724, + "tokens_seen": 555177984 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004200702106318957, + "loss": 3.2009, + "theoretical_loss": 3.871309004825525, + "tokens_seen": 555243520 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004200601805416249, + "loss": 3.2329, + "theoretical_loss": 3.871260677698082, + "tokens_seen": 555309056 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0800435543060303, + "objective/train/theoretical_loss": 3.871212357870479, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.871212357870479, + "tokens_seen": 555374592 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004200501504513541, + "loss": 3.1846, + "theoretical_loss": 3.871212357870479, + "tokens_seen": 555374592 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042004012036108324, + "loss": 3.1753, + "theoretical_loss": 3.871164045340752, + "tokens_seen": 555440128 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004200300902708125, + "loss": 3.0987, + "theoretical_loss": 3.871115740106938, + "tokens_seen": 555505664 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004200200601805416, + "loss": 3.178, + "theoretical_loss": 3.8710674421670737, + "tokens_seen": 555571200 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042001003009027084, + "loss": 3.1146, + "theoretical_loss": 3.8710191515191985, + "tokens_seen": 555636736 + }, + { + "epoch": 6.02, + "learning_rate": 0.00042, + "loss": 3.097, + "theoretical_loss": 3.8709708681613515, + "tokens_seen": 555702272 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004199899699097292, + "loss": 3.1764, + "theoretical_loss": 3.8709225920915724, + "tokens_seen": 555767808 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004199799398194584, + "loss": 3.1289, + "theoretical_loss": 3.8708743233079024, + "tokens_seen": 555833344 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041996990972918756, + "loss": 3.1546, + "theoretical_loss": 3.8708260618083816, + "tokens_seen": 555898880 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041995987963891674, + "loss": 3.1321, + "theoretical_loss": 3.870777807591054, + "tokens_seen": 555964416 + }, + { + "epoch": 6.02, + "learning_rate": 0.000419949849548646, + "loss": 3.1205, + "theoretical_loss": 3.8707295606539613, + "tokens_seen": 556029952 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004199398194583751, + "loss": 3.0906, + "theoretical_loss": 3.870681320995148, + "tokens_seen": 556095488 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041992978936810434, + "loss": 3.1483, + "theoretical_loss": 3.8706330886126583, + "tokens_seen": 556161024 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041991975927783347, + "loss": 3.0283, + "theoretical_loss": 3.8705848635045372, + "tokens_seen": 556226560 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004199097291875627, + "loss": 3.1545, + "theoretical_loss": 3.870536645668831, + "tokens_seen": 556292096 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004198996990972919, + "loss": 3.1209, + "theoretical_loss": 3.870488435103586, + "tokens_seen": 556357632 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041988966900702106, + "loss": 3.1461, + "theoretical_loss": 3.8704402318068505, + "tokens_seen": 556423168 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041987963891675025, + "loss": 3.0688, + "theoretical_loss": 3.870392035776672, + "tokens_seen": 556488704 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004198696088264795, + "loss": 3.1956, + "theoretical_loss": 3.8703438470111, + "tokens_seen": 556554240 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004198595787362086, + "loss": 3.0728, + "theoretical_loss": 3.870295665508184, + "tokens_seen": 556619776 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041984954864593784, + "loss": 3.0472, + "theoretical_loss": 3.8702474912659746, + "tokens_seen": 556685312 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041983951855566697, + "loss": 3.1333, + "theoretical_loss": 3.870199324282523, + "tokens_seen": 556750848 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004198294884653962, + "loss": 3.1897, + "theoretical_loss": 3.870151164555881, + "tokens_seen": 556816384 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004198194583751254, + "loss": 3.1579, + "theoretical_loss": 3.870103012084102, + "tokens_seen": 556881920 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041980942828485457, + "loss": 3.1607, + "theoretical_loss": 3.8700548668652397, + "tokens_seen": 556947456 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.82358455657959, + "objective/train/theoretical_loss": 3.870006728897347, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.870006728897347, + "tokens_seen": 557012992 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041979939819458375, + "loss": 3.0536, + "theoretical_loss": 3.870006728897347, + "tokens_seen": 557012992 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041978936810431293, + "loss": 3.1929, + "theoretical_loss": 3.86995859817848, + "tokens_seen": 557078528 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004197793380140421, + "loss": 3.1571, + "theoretical_loss": 3.8699104747066944, + "tokens_seen": 557144064 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041976930792377135, + "loss": 3.1429, + "theoretical_loss": 3.869862358480047, + "tokens_seen": 557209600 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041975927783350047, + "loss": 3.0817, + "theoretical_loss": 3.8698142494965944, + "tokens_seen": 557275136 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004197492477432297, + "loss": 3.094, + "theoretical_loss": 3.8697661477543948, + "tokens_seen": 557340672 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041973921765295883, + "loss": 3.1139, + "theoretical_loss": 3.869718053251507, + "tokens_seen": 557406208 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041972918756268807, + "loss": 3.2229, + "theoretical_loss": 3.8696699659859912, + "tokens_seen": 557471744 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041971915747241725, + "loss": 3.0558, + "theoretical_loss": 3.8696218859559064, + "tokens_seen": 557537280 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041970912738214643, + "loss": 3.1842, + "theoretical_loss": 3.869573813159315, + "tokens_seen": 557602816 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004196990972918756, + "loss": 3.0467, + "theoretical_loss": 3.869525747594278, + "tokens_seen": 557668352 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041968906720160485, + "loss": 3.05, + "theoretical_loss": 3.869477689258858, + "tokens_seen": 557733888 + }, + { + "epoch": 6.02, + "learning_rate": 0.000419679037111334, + "loss": 3.0924, + "theoretical_loss": 3.869429638151118, + "tokens_seen": 557799424 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004196690070210632, + "loss": 3.1143, + "theoretical_loss": 3.869381594269122, + "tokens_seen": 557864960 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004196589769307924, + "loss": 3.0497, + "theoretical_loss": 3.8693335576109353, + "tokens_seen": 557930496 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004196489468405216, + "loss": 3.1903, + "theoretical_loss": 3.8692855281746237, + "tokens_seen": 557996032 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004196389167502508, + "loss": 3.0918, + "theoretical_loss": 3.869237505958252, + "tokens_seen": 558061568 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041962888665997994, + "loss": 3.2415, + "theoretical_loss": 3.869189490959889, + "tokens_seen": 558127104 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041961885656970917, + "loss": 3.0408, + "theoretical_loss": 3.869141483177601, + "tokens_seen": 558192640 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004196088264794383, + "loss": 2.9361, + "theoretical_loss": 3.869093482609457, + "tokens_seen": 558258176 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041959879638916753, + "loss": 3.0761, + "theoretical_loss": 3.8690454892535264, + "tokens_seen": 558323712 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004195887662988967, + "loss": 3.1023, + "theoretical_loss": 3.868997503107879, + "tokens_seen": 558389248 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004195787362086259, + "loss": 3.1579, + "theoretical_loss": 3.868949524170586, + "tokens_seen": 558454784 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004195687061183551, + "loss": 3.1791, + "theoretical_loss": 3.8689015524397172, + "tokens_seen": 558520320 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004195586760280843, + "loss": 3.0661, + "theoretical_loss": 3.868853587913347, + "tokens_seen": 558585856 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.173600435256958, + "objective/train/theoretical_loss": 3.868805630589547, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.868805630589547, + "tokens_seen": 558651392 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041954864593781344, + "loss": 3.1663, + "theoretical_loss": 3.868805630589547, + "tokens_seen": 558651392 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004195386158475427, + "loss": 3.0201, + "theoretical_loss": 3.868757680466391, + "tokens_seen": 558716928 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004195285857572718, + "loss": 3.1162, + "theoretical_loss": 3.8687097375419537, + "tokens_seen": 558782464 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041951855566700104, + "loss": 3.1173, + "theoretical_loss": 3.8686618018143104, + "tokens_seen": 558848000 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004195085255767302, + "loss": 3.0908, + "theoretical_loss": 3.868613873281536, + "tokens_seen": 558913536 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004194984954864594, + "loss": 3.2598, + "theoretical_loss": 3.8685659519417084, + "tokens_seen": 558979072 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004194884653961886, + "loss": 3.1307, + "theoretical_loss": 3.8685180377929047, + "tokens_seen": 559044608 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041947843530591776, + "loss": 3.2628, + "theoretical_loss": 3.8684701308332023, + "tokens_seen": 559110144 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041946840521564694, + "loss": 3.1856, + "theoretical_loss": 3.8684222310606806, + "tokens_seen": 559175680 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004194583751253762, + "loss": 3.1596, + "theoretical_loss": 3.8683743384734193, + "tokens_seen": 559241216 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004194483450351053, + "loss": 3.1443, + "theoretical_loss": 3.868326453069499, + "tokens_seen": 559306752 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041943831494483454, + "loss": 3.1159, + "theoretical_loss": 3.868278574847, + "tokens_seen": 559372288 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041942828485456367, + "loss": 3.1342, + "theoretical_loss": 3.8682307038040045, + "tokens_seen": 559437824 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004194182547642929, + "loss": 3.2319, + "theoretical_loss": 3.868182839938595, + "tokens_seen": 559503360 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004194082246740221, + "loss": 3.0748, + "theoretical_loss": 3.8681349832488547, + "tokens_seen": 559568896 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041939819458375126, + "loss": 3.0786, + "theoretical_loss": 3.8680871337328675, + "tokens_seen": 559634432 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041938816449348045, + "loss": 3.163, + "theoretical_loss": 3.868039291388719, + "tokens_seen": 559699968 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004193781344032097, + "loss": 3.0837, + "theoretical_loss": 3.8679914562144937, + "tokens_seen": 559765504 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004193681043129388, + "loss": 3.1417, + "theoretical_loss": 3.8679436282082778, + "tokens_seen": 559831040 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041935807422266804, + "loss": 3.0483, + "theoretical_loss": 3.867895807368159, + "tokens_seen": 559896576 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041934804413239717, + "loss": 3.0999, + "theoretical_loss": 3.8678479936922248, + "tokens_seen": 559962112 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004193380140421264, + "loss": 3.1053, + "theoretical_loss": 3.8678001871785637, + "tokens_seen": 560027648 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004193279839518556, + "loss": 3.125, + "theoretical_loss": 3.867752387825264, + "tokens_seen": 560093184 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041931795386158477, + "loss": 3.1402, + "theoretical_loss": 3.867704595630417, + "tokens_seen": 560158720 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041930792377131395, + "loss": 3.0846, + "theoretical_loss": 3.867656810592112, + "tokens_seen": 560224256 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.19158935546875, + "objective/train/theoretical_loss": 3.8676090327084407, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.8676090327084407, + "tokens_seen": 560289792 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041929789368104313, + "loss": 3.1634, + "theoretical_loss": 3.8676090327084407, + "tokens_seen": 560289792 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004192878635907723, + "loss": 3.0917, + "theoretical_loss": 3.867561261977496, + "tokens_seen": 560355328 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041927783350050155, + "loss": 3.135, + "theoretical_loss": 3.8675134983973694, + "tokens_seen": 560420864 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041926780341023067, + "loss": 3.2168, + "theoretical_loss": 3.8674657419661562, + "tokens_seen": 560486400 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004192577733199599, + "loss": 3.152, + "theoretical_loss": 3.8674179926819487, + "tokens_seen": 560551936 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041924774322968904, + "loss": 3.1122, + "theoretical_loss": 3.8673702505428436, + "tokens_seen": 560617472 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041923771313941827, + "loss": 3.1828, + "theoretical_loss": 3.8673225155469355, + "tokens_seen": 560683008 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041922768304914745, + "loss": 3.1774, + "theoretical_loss": 3.8672747876923217, + "tokens_seen": 560748544 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041921765295887663, + "loss": 3.085, + "theoretical_loss": 3.867227066977099, + "tokens_seen": 560814080 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004192076228686058, + "loss": 3.1026, + "theoretical_loss": 3.867179353399365, + "tokens_seen": 560879616 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041919759277833505, + "loss": 3.2013, + "theoretical_loss": 3.867131646957219, + "tokens_seen": 560945152 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004191875626880642, + "loss": 3.2076, + "theoretical_loss": 3.8670839476487604, + "tokens_seen": 561010688 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004191775325977934, + "loss": 3.0091, + "theoretical_loss": 3.867036255472089, + "tokens_seen": 561076224 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041916750250752254, + "loss": 3.1012, + "theoretical_loss": 3.8669885704253053, + "tokens_seen": 561141760 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004191574724172518, + "loss": 3.1457, + "theoretical_loss": 3.866940892506512, + "tokens_seen": 561207296 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041914744232698095, + "loss": 3.1606, + "theoretical_loss": 3.86689322171381, + "tokens_seen": 561272832 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041913741223671014, + "loss": 3.139, + "theoretical_loss": 3.866845558045304, + "tokens_seen": 561338368 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004191273821464393, + "loss": 3.2413, + "theoretical_loss": 3.866797901499096, + "tokens_seen": 561403904 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004191173520561685, + "loss": 3.1396, + "theoretical_loss": 3.866750252073291, + "tokens_seen": 561469440 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004191073219658977, + "loss": 3.0749, + "theoretical_loss": 3.866702609765995, + "tokens_seen": 561534976 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004190972918756269, + "loss": 3.0446, + "theoretical_loss": 3.8666549745753134, + "tokens_seen": 561600512 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041908726178535604, + "loss": 3.0409, + "theoretical_loss": 3.866607346499353, + "tokens_seen": 561666048 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004190772316950853, + "loss": 3.1273, + "theoretical_loss": 3.866559725536221, + "tokens_seen": 561731584 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004190672016048144, + "loss": 3.184, + "theoretical_loss": 3.8665121116840253, + "tokens_seen": 561797120 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041905717151454364, + "loss": 3.1754, + "theoretical_loss": 3.8664645049408755, + "tokens_seen": 561862656 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2101447582244873, + "objective/train/theoretical_loss": 3.86641690530488, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.86641690530488, + "tokens_seen": 561928192 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004190471414242728, + "loss": 3.1784, + "theoretical_loss": 3.86641690530488, + "tokens_seen": 561928192 + }, + { + "epoch": 6.02, + "learning_rate": 0.000419037111334002, + "loss": 3.1412, + "theoretical_loss": 3.8663693127741503, + "tokens_seen": 561993728 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004190270812437312, + "loss": 3.1964, + "theoretical_loss": 3.866321727346797, + "tokens_seen": 562059264 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004190170511534604, + "loss": 3.0858, + "theoretical_loss": 3.8662741490209314, + "tokens_seen": 562124800 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041900702106318954, + "loss": 3.0626, + "theoretical_loss": 3.8662265777946665, + "tokens_seen": 562190336 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004189969909729188, + "loss": 3.121, + "theoretical_loss": 3.866179013666115, + "tokens_seen": 562255872 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004189869608826479, + "loss": 3.0351, + "theoretical_loss": 3.866131456633391, + "tokens_seen": 562321408 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041897693079237714, + "loss": 3.119, + "theoretical_loss": 3.866083906694609, + "tokens_seen": 562386944 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004189669007021063, + "loss": 3.0664, + "theoretical_loss": 3.8660363638478845, + "tokens_seen": 562452480 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004189568706118355, + "loss": 3.0302, + "theoretical_loss": 3.865988828091334, + "tokens_seen": 562518016 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004189468405215647, + "loss": 2.9545, + "theoretical_loss": 3.8659412994230733, + "tokens_seen": 562583552 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041893681043129387, + "loss": 3.0562, + "theoretical_loss": 3.8658937778412197, + "tokens_seen": 562649088 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041892678034102305, + "loss": 3.076, + "theoretical_loss": 3.865846263343893, + "tokens_seen": 562714624 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004189167502507523, + "loss": 3.1348, + "theoretical_loss": 3.865798755929211, + "tokens_seen": 562780160 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041890672016048146, + "loss": 3.1208, + "theoretical_loss": 3.8657512555952938, + "tokens_seen": 562845696 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041889669007021065, + "loss": 3.2192, + "theoretical_loss": 3.865703762340261, + "tokens_seen": 562911232 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004188866599799399, + "loss": 3.1835, + "theoretical_loss": 3.865656276162235, + "tokens_seen": 562976768 + }, + { + "epoch": 6.02, + "learning_rate": 0.000418876629889669, + "loss": 2.9748, + "theoretical_loss": 3.865608797059336, + "tokens_seen": 563042304 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041886659979939824, + "loss": 3.0788, + "theoretical_loss": 3.865561325029687, + "tokens_seen": 563107840 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041885656970912737, + "loss": 3.1614, + "theoretical_loss": 3.8655138600714123, + "tokens_seen": 563173376 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004188465396188566, + "loss": 3.1257, + "theoretical_loss": 3.8654664021826353, + "tokens_seen": 563238912 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004188365095285858, + "loss": 3.2114, + "theoretical_loss": 3.8654189513614803, + "tokens_seen": 563304448 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041882647943831497, + "loss": 3.1237, + "theoretical_loss": 3.8653715076060724, + "tokens_seen": 563369984 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041881644934804415, + "loss": 3.1624, + "theoretical_loss": 3.8653240709145384, + "tokens_seen": 563435520 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041880641925777333, + "loss": 3.0611, + "theoretical_loss": 3.8652766412850053, + "tokens_seen": 563501056 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3311705589294434, + "objective/train/theoretical_loss": 3.8652292187155997, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.8652292187155997, + "tokens_seen": 563566592 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004187963891675025, + "loss": 3.1698, + "theoretical_loss": 3.8652292187155997, + "tokens_seen": 563566592 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041878635907723175, + "loss": 3.0757, + "theoretical_loss": 3.8651818032044503, + "tokens_seen": 563632128 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004187763289869609, + "loss": 3.1597, + "theoretical_loss": 3.8651343947496866, + "tokens_seen": 563697664 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004187662988966901, + "loss": 3.0016, + "theoretical_loss": 3.865086993349437, + "tokens_seen": 563763200 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041875626880641924, + "loss": 3.2582, + "theoretical_loss": 3.8650395990018334, + "tokens_seen": 563828736 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041874623871614847, + "loss": 3.1245, + "theoretical_loss": 3.8649922117050055, + "tokens_seen": 563894272 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041873620862587765, + "loss": 3.1343, + "theoretical_loss": 3.864944831457086, + "tokens_seen": 563959808 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041872617853560683, + "loss": 3.2956, + "theoretical_loss": 3.8648974582562072, + "tokens_seen": 564025344 + }, + { + "epoch": 6.02, + "learning_rate": 0.000418716148445336, + "loss": 3.0788, + "theoretical_loss": 3.8648500921005025, + "tokens_seen": 564090880 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041870611835506525, + "loss": 3.2115, + "theoretical_loss": 3.864802732988105, + "tokens_seen": 564156416 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004186960882647944, + "loss": 3.0477, + "theoretical_loss": 3.864755380917151, + "tokens_seen": 564221952 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004186860581745236, + "loss": 3.0759, + "theoretical_loss": 3.8647080358857737, + "tokens_seen": 564287488 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041867602808425274, + "loss": 3.1785, + "theoretical_loss": 3.8646606978921114, + "tokens_seen": 564353024 + }, + { + "epoch": 6.02, + "learning_rate": 0.000418665997993982, + "loss": 3.089, + "theoretical_loss": 3.8646133669342992, + "tokens_seen": 564418560 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041865596790371115, + "loss": 3.16, + "theoretical_loss": 3.864566043010475, + "tokens_seen": 564484096 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041864593781344034, + "loss": 3.1792, + "theoretical_loss": 3.864518726118778, + "tokens_seen": 564549632 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004186359077231695, + "loss": 3.1207, + "theoretical_loss": 3.864471416257346, + "tokens_seen": 564615168 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004186258776328987, + "loss": 3.1824, + "theoretical_loss": 3.8644241134243185, + "tokens_seen": 564680704 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004186158475426279, + "loss": 3.1849, + "theoretical_loss": 3.8643768176178366, + "tokens_seen": 564746240 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004186058174523571, + "loss": 3.1596, + "theoretical_loss": 3.864329528836041, + "tokens_seen": 564811776 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041859578736208624, + "loss": 3.0577, + "theoretical_loss": 3.8642822470770732, + "tokens_seen": 564877312 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004185857572718155, + "loss": 3.222, + "theoretical_loss": 3.8642349723390765, + "tokens_seen": 564942848 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004185757271815446, + "loss": 3.1673, + "theoretical_loss": 3.8641877046201927, + "tokens_seen": 565008384 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041856569709127384, + "loss": 3.1212, + "theoretical_loss": 3.8641404439185667, + "tokens_seen": 565073920 + }, + { + "epoch": 6.02, + "learning_rate": 0.000418555667001003, + "loss": 3.1477, + "theoretical_loss": 3.864093190232343, + "tokens_seen": 565139456 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0723984241485596, + "objective/train/theoretical_loss": 3.864045943559667, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.864045943559667, + "tokens_seen": 565204992 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004185456369107322, + "loss": 3.0869, + "theoretical_loss": 3.864045943559667, + "tokens_seen": 565204992 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004185356068204614, + "loss": 3.1209, + "theoretical_loss": 3.8639987038986834, + "tokens_seen": 565270528 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004185255767301906, + "loss": 3.1565, + "theoretical_loss": 3.8639514712475402, + "tokens_seen": 565336064 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041851554663991974, + "loss": 3.1861, + "theoretical_loss": 3.8639042456043846, + "tokens_seen": 565401600 + }, + { + "epoch": 6.02, + "learning_rate": 0.000418505516549649, + "loss": 3.0749, + "theoretical_loss": 3.8638570269673647, + "tokens_seen": 565467136 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004184954864593781, + "loss": 3.1541, + "theoretical_loss": 3.8638098153346285, + "tokens_seen": 565532672 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041848545636910734, + "loss": 3.0388, + "theoretical_loss": 3.8637626107043266, + "tokens_seen": 565598208 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004184754262788365, + "loss": 3.2081, + "theoretical_loss": 3.8637154130746083, + "tokens_seen": 565663744 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004184653961885657, + "loss": 3.2369, + "theoretical_loss": 3.8636682224436254, + "tokens_seen": 565729280 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004184553660982949, + "loss": 3.0741, + "theoretical_loss": 3.8636210388095287, + "tokens_seen": 565794816 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041844533600802407, + "loss": 3.2237, + "theoretical_loss": 3.8635738621704707, + "tokens_seen": 565860352 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041843530591775325, + "loss": 3.1224, + "theoretical_loss": 3.863526692524605, + "tokens_seen": 565925888 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004184252758274825, + "loss": 3.0914, + "theoretical_loss": 3.8634795298700846, + "tokens_seen": 565991424 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004184152457372116, + "loss": 3.1426, + "theoretical_loss": 3.8634323742050647, + "tokens_seen": 566056960 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041840521564694085, + "loss": 3.0521, + "theoretical_loss": 3.8633852255276997, + "tokens_seen": 566122496 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041839518555666997, + "loss": 3.1062, + "theoretical_loss": 3.8633380838361453, + "tokens_seen": 566188032 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004183851554663992, + "loss": 3.235, + "theoretical_loss": 3.863290949128558, + "tokens_seen": 566253568 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004183751253761284, + "loss": 3.0264, + "theoretical_loss": 3.8632438214030964, + "tokens_seen": 566319104 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041836509528585757, + "loss": 3.0817, + "theoretical_loss": 3.863196700657917, + "tokens_seen": 566384640 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041835506519558675, + "loss": 3.1283, + "theoretical_loss": 3.8631495868911783, + "tokens_seen": 566450176 + }, + { + "epoch": 6.02, + "learning_rate": 0.000418345035105316, + "loss": 3.0479, + "theoretical_loss": 3.8631024801010407, + "tokens_seen": 566515712 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004183350050150451, + "loss": 3.1392, + "theoretical_loss": 3.8630553802856635, + "tokens_seen": 566581248 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041832497492477435, + "loss": 3.085, + "theoretical_loss": 3.8630082874432077, + "tokens_seen": 566646784 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004183149448345035, + "loss": 3.1231, + "theoretical_loss": 3.862961201571834, + "tokens_seen": 566712320 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004183049147442327, + "loss": 3.1085, + "theoretical_loss": 3.8629141226697055, + "tokens_seen": 566777856 + }, + { + "epoch": 6.02, + "objective/train/docs_used": 1323616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.105577230453491, + "objective/train/theoretical_loss": 3.8628670507349847, + "objective/train/tokens_used": 566947296, + "theoretical_loss": 3.8628670507349847, + "tokens_seen": 566843392 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004182948846539619, + "loss": 3.1686, + "theoretical_loss": 3.8628670507349847, + "tokens_seen": 566843392 + }, + { + "epoch": 6.02, + "learning_rate": 0.0004182848545636911, + "loss": 3.1126, + "theoretical_loss": 3.8628199857658343, + "tokens_seen": 566908928 + }, + { + "epoch": 6.02, + "learning_rate": 0.00041827482447342025, + "loss": 3.0498, + "theoretical_loss": 3.8627780743906426, + "tokens_seen": 566967296 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041826479438314944, + "loss": 3.0778, + "theoretical_loss": 3.862731022585759, + "tokens_seen": 567032832 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004182547642928786, + "loss": 3.021, + "theoretical_loss": 3.862683977741143, + "tokens_seen": 567098368 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041824473420260785, + "loss": 2.9695, + "theoretical_loss": 3.8626369398549585, + "tokens_seen": 567163904 + }, + { + "epoch": 7.0, + "learning_rate": 0.000418234704112337, + "loss": 3.1646, + "theoretical_loss": 3.862589908925374, + "tokens_seen": 567229440 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004182246740220662, + "loss": 3.0938, + "theoretical_loss": 3.8625428849505563, + "tokens_seen": 567294976 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004182146439317954, + "loss": 3.101, + "theoretical_loss": 3.862495867928674, + "tokens_seen": 567360512 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004182046138415246, + "loss": 3.0753, + "theoretical_loss": 3.8624488578578964, + "tokens_seen": 567426048 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041819458375125376, + "loss": 3.0755, + "theoretical_loss": 3.8624018547363925, + "tokens_seen": 567491584 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041818455366098294, + "loss": 2.9578, + "theoretical_loss": 3.862354858562333, + "tokens_seen": 567557120 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004181745235707121, + "loss": 3.1646, + "theoretical_loss": 3.862307869333889, + "tokens_seen": 567622656 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041816449348044136, + "loss": 3.0343, + "theoretical_loss": 3.8622608870492323, + "tokens_seen": 567688192 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041815446339017054, + "loss": 2.9686, + "theoretical_loss": 3.862213911706535, + "tokens_seen": 567753728 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004181444332998997, + "loss": 2.943, + "theoretical_loss": 3.862166943303971, + "tokens_seen": 567819264 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004181344032096289, + "loss": 2.9705, + "theoretical_loss": 3.862119981839713, + "tokens_seen": 567884800 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004181243731193581, + "loss": 3.0725, + "theoretical_loss": 3.8620730273119364, + "tokens_seen": 567950336 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004181143430290873, + "loss": 3.022, + "theoretical_loss": 3.862026079718816, + "tokens_seen": 568015872 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041810431293881644, + "loss": 3.1179, + "theoretical_loss": 3.8619791390585285, + "tokens_seen": 568081408 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004180942828485457, + "loss": 3.0232, + "theoretical_loss": 3.8619322053292495, + "tokens_seen": 568146944 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004180842527582748, + "loss": 3.0442, + "theoretical_loss": 3.8618852785291566, + "tokens_seen": 568212480 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041807422266800404, + "loss": 3.0716, + "theoretical_loss": 3.8618383586564278, + "tokens_seen": 568278016 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004180641925777332, + "loss": 3.0368, + "theoretical_loss": 3.861791445709242, + "tokens_seen": 568343552 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004180541624874624, + "loss": 3.037, + "theoretical_loss": 3.8617445396857786, + "tokens_seen": 568409088 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1375946, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.211026906967163, + "objective/train/theoretical_loss": 3.861697640584217, + "objective/train/tokens_used": 588934624, + "theoretical_loss": 3.861697640584217, + "tokens_seen": 568474624 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004180441323971916, + "loss": 3.1643, + "theoretical_loss": 3.861697640584217, + "tokens_seen": 568474624 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004180341023069208, + "loss": 3.0419, + "theoretical_loss": 3.861650748402738, + "tokens_seen": 568540160 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041802407221664994, + "loss": 2.9925, + "theoretical_loss": 3.861603863139524, + "tokens_seen": 568605696 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004180140421263792, + "loss": 3.1468, + "theoretical_loss": 3.861556984792756, + "tokens_seen": 568671232 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004180040120361083, + "loss": 3.099, + "theoretical_loss": 3.861510113360618, + "tokens_seen": 568736768 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041799398194583754, + "loss": 2.9971, + "theoretical_loss": 3.861463248841292, + "tokens_seen": 568802304 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004179839518555667, + "loss": 3.1083, + "theoretical_loss": 3.861416391232963, + "tokens_seen": 568867840 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004179739217652959, + "loss": 2.9688, + "theoretical_loss": 3.8613695405338158, + "tokens_seen": 568933376 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004179638916750251, + "loss": 3.1515, + "theoretical_loss": 3.861322696742036, + "tokens_seen": 568998912 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041795386158475427, + "loss": 3.0324, + "theoretical_loss": 3.8612758598558097, + "tokens_seen": 569064448 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041794383149448345, + "loss": 3.1009, + "theoretical_loss": 3.8612290298733236, + "tokens_seen": 569129984 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004179338014042127, + "loss": 3.0391, + "theoretical_loss": 3.8611822067927655, + "tokens_seen": 569195520 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004179237713139418, + "loss": 2.8401, + "theoretical_loss": 3.861135390612324, + "tokens_seen": 569261056 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041791374122367105, + "loss": 3.0155, + "theoretical_loss": 3.8610885813301876, + "tokens_seen": 569326592 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041790371113340017, + "loss": 3.0407, + "theoretical_loss": 3.8610417789445464, + "tokens_seen": 569392128 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004178936810431294, + "loss": 3.0562, + "theoretical_loss": 3.86099498345359, + "tokens_seen": 569457664 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004178836509528586, + "loss": 3.1143, + "theoretical_loss": 3.86094819485551, + "tokens_seen": 569523200 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041787362086258777, + "loss": 3.1014, + "theoretical_loss": 3.8609014131484978, + "tokens_seen": 569588736 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041786359077231695, + "loss": 3.1233, + "theoretical_loss": 3.860854638330746, + "tokens_seen": 569654272 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004178535606820462, + "loss": 3.0729, + "theoretical_loss": 3.860807870400447, + "tokens_seen": 569719808 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004178435305917753, + "loss": 3.1039, + "theoretical_loss": 3.8607611093557956, + "tokens_seen": 569785344 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041783350050150455, + "loss": 3.0556, + "theoretical_loss": 3.860714355194986, + "tokens_seen": 569850880 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004178234704112337, + "loss": 2.9703, + "theoretical_loss": 3.8606676079162128, + "tokens_seen": 569916416 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004178134403209629, + "loss": 2.989, + "theoretical_loss": 3.860620867517672, + "tokens_seen": 569981952 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004178034102306921, + "loss": 3.0503, + "theoretical_loss": 3.86057413399756, + "tokens_seen": 570047488 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1379025, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.879188299179077, + "objective/train/theoretical_loss": 3.8605274073540743, + "objective/train/tokens_used": 590573024, + "theoretical_loss": 3.8605274073540743, + "tokens_seen": 570113024 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004177933801404213, + "loss": 3.0331, + "theoretical_loss": 3.8605274073540743, + "tokens_seen": 570113024 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041778335005015045, + "loss": 2.9959, + "theoretical_loss": 3.8604806875854116, + "tokens_seen": 570178560 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041777331995987964, + "loss": 3.0952, + "theoretical_loss": 3.8604339746897725, + "tokens_seen": 570244096 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004177632898696088, + "loss": 2.9557, + "theoretical_loss": 3.860387268665354, + "tokens_seen": 570309632 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041775325977933805, + "loss": 3.0263, + "theoretical_loss": 3.860340569510357, + "tokens_seen": 570375168 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004177432296890672, + "loss": 2.9916, + "theoretical_loss": 3.8602938772229827, + "tokens_seen": 570440704 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004177331995987964, + "loss": 3.143, + "theoretical_loss": 3.8602471918014305, + "tokens_seen": 570506240 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004177231695085256, + "loss": 2.9463, + "theoretical_loss": 3.8602005132439037, + "tokens_seen": 570571776 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004177131394182548, + "loss": 3.0905, + "theoretical_loss": 3.8601538415486045, + "tokens_seen": 570637312 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041770310932798396, + "loss": 3.1185, + "theoretical_loss": 3.8601071767137363, + "tokens_seen": 570702848 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041769307923771314, + "loss": 2.9409, + "theoretical_loss": 3.8600605187375026, + "tokens_seen": 570768384 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004176830491474423, + "loss": 3.0548, + "theoretical_loss": 3.860013867618109, + "tokens_seen": 570833920 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041767301905717156, + "loss": 3.0874, + "theoretical_loss": 3.8599672233537596, + "tokens_seen": 570899456 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004176629889669007, + "loss": 2.9071, + "theoretical_loss": 3.8599205859426604, + "tokens_seen": 570964992 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004176529588766299, + "loss": 3.09, + "theoretical_loss": 3.8598739553830193, + "tokens_seen": 571030528 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041764292878635904, + "loss": 2.9402, + "theoretical_loss": 3.859827331673042, + "tokens_seen": 571096064 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004176328986960883, + "loss": 3.0169, + "theoretical_loss": 3.8597807148109378, + "tokens_seen": 571161600 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041762286860581746, + "loss": 3.1951, + "theoretical_loss": 3.8597341047949145, + "tokens_seen": 571227136 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041761283851554664, + "loss": 3.1117, + "theoretical_loss": 3.8596875016231817, + "tokens_seen": 571292672 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004176028084252758, + "loss": 3.18, + "theoretical_loss": 3.8596409052939498, + "tokens_seen": 571358208 + }, + { + "epoch": 7.0, + "learning_rate": 0.000417592778335005, + "loss": 3.082, + "theoretical_loss": 3.8595943158054284, + "tokens_seen": 571423744 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004175827482447342, + "loss": 3.1198, + "theoretical_loss": 3.85954773315583, + "tokens_seen": 571489280 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004175727181544634, + "loss": 2.9633, + "theoretical_loss": 3.8595011573433657, + "tokens_seen": 571554816 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041756268806419255, + "loss": 3.0673, + "theoretical_loss": 3.8594545883662494, + "tokens_seen": 571620352 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004175526579739218, + "loss": 3.0741, + "theoretical_loss": 3.8594080262226935, + "tokens_seen": 571685888 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1384063, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2002673149108887, + "objective/train/theoretical_loss": 3.859361470910912, + "objective/train/tokens_used": 592211424, + "theoretical_loss": 3.859361470910912, + "tokens_seen": 571751424 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041754262788365096, + "loss": 3.0177, + "theoretical_loss": 3.859361470910912, + "tokens_seen": 571751424 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041753259779338015, + "loss": 2.9754, + "theoretical_loss": 3.8593149224291197, + "tokens_seen": 571816960 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004175225677031093, + "loss": 3.0836, + "theoretical_loss": 3.8592683807755326, + "tokens_seen": 571882496 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004175125376128385, + "loss": 3.0658, + "theoretical_loss": 3.8592218459483663, + "tokens_seen": 571948032 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004175025075225677, + "loss": 3.0526, + "theoretical_loss": 3.8591753179458372, + "tokens_seen": 572013568 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004174924774322969, + "loss": 3.1044, + "theoretical_loss": 3.859128796766163, + "tokens_seen": 572079104 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041748244734202605, + "loss": 3.102, + "theoretical_loss": 3.859082282407562, + "tokens_seen": 572144640 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004174724172517553, + "loss": 3.0113, + "theoretical_loss": 3.8590357748682527, + "tokens_seen": 572210176 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004174623871614844, + "loss": 3.0537, + "theoretical_loss": 3.858989274146454, + "tokens_seen": 572275712 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041745235707121365, + "loss": 3.1288, + "theoretical_loss": 3.858942780240387, + "tokens_seen": 572341248 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041744232698094283, + "loss": 3.0895, + "theoretical_loss": 3.858896293148272, + "tokens_seen": 572406784 + }, + { + "epoch": 7.0, + "learning_rate": 0.000417432296890672, + "loss": 3.0875, + "theoretical_loss": 3.85884981286833, + "tokens_seen": 572472320 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004174222668004012, + "loss": 3.0826, + "theoretical_loss": 3.858803339398783, + "tokens_seen": 572537856 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041741223671013037, + "loss": 2.9577, + "theoretical_loss": 3.858756872737855, + "tokens_seen": 572603392 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004174022066198596, + "loss": 3.0848, + "theoretical_loss": 3.8587104128837675, + "tokens_seen": 572668928 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004173921765295888, + "loss": 3.0814, + "theoretical_loss": 3.8586639598347463, + "tokens_seen": 572734464 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041738214643931797, + "loss": 3.0044, + "theoretical_loss": 3.8586175135890155, + "tokens_seen": 572800000 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041737211634904715, + "loss": 3.192, + "theoretical_loss": 3.8585710741448, + "tokens_seen": 572865536 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004173620862587764, + "loss": 3.0171, + "theoretical_loss": 3.8585246415003267, + "tokens_seen": 572931072 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004173520561685055, + "loss": 3.1517, + "theoretical_loss": 3.8584782156538218, + "tokens_seen": 572996608 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041734202607823475, + "loss": 2.9746, + "theoretical_loss": 3.858431796603513, + "tokens_seen": 573062144 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004173319959879639, + "loss": 3.0118, + "theoretical_loss": 3.8583853843476277, + "tokens_seen": 573127680 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004173219658976931, + "loss": 3.1725, + "theoretical_loss": 3.8583389788843956, + "tokens_seen": 573193216 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004173119358074223, + "loss": 3.0458, + "theoretical_loss": 3.858292580212045, + "tokens_seen": 573258752 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004173019057171515, + "loss": 2.9615, + "theoretical_loss": 3.8582461883288075, + "tokens_seen": 573324288 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1386931, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2117390632629395, + "objective/train/theoretical_loss": 3.858199803232913, + "objective/train/tokens_used": 593849824, + "theoretical_loss": 3.858199803232913, + "tokens_seen": 573389824 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041729187562688065, + "loss": 3.1822, + "theoretical_loss": 3.858199803232913, + "tokens_seen": 573389824 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041728184553660984, + "loss": 2.9658, + "theoretical_loss": 3.858153424922592, + "tokens_seen": 573455360 + }, + { + "epoch": 7.0, + "learning_rate": 0.000417271815446339, + "loss": 3.0249, + "theoretical_loss": 3.858107053396078, + "tokens_seen": 573520896 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041726178535606825, + "loss": 3.0973, + "theoretical_loss": 3.858060688651603, + "tokens_seen": 573586432 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004172517552657974, + "loss": 3.005, + "theoretical_loss": 3.8580143306874, + "tokens_seen": 573651968 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004172417251755266, + "loss": 3.0501, + "theoretical_loss": 3.857967979501704, + "tokens_seen": 573717504 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004172316950852558, + "loss": 3.0602, + "theoretical_loss": 3.857921635092749, + "tokens_seen": 573783040 + }, + { + "epoch": 7.0, + "learning_rate": 0.000417221664994985, + "loss": 3.1045, + "theoretical_loss": 3.8578752974587704, + "tokens_seen": 573848576 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041721163490471416, + "loss": 3.2031, + "theoretical_loss": 3.857828966598005, + "tokens_seen": 573914112 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041720160481444334, + "loss": 3.102, + "theoretical_loss": 3.857782642508688, + "tokens_seen": 573979648 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004171915747241725, + "loss": 3.0633, + "theoretical_loss": 3.857736325189058, + "tokens_seen": 574045184 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041718154463390176, + "loss": 3.1821, + "theoretical_loss": 3.8576900146373525, + "tokens_seen": 574110720 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004171715145436309, + "loss": 3.0299, + "theoretical_loss": 3.8576437108518102, + "tokens_seen": 574176256 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004171614844533601, + "loss": 3.1418, + "theoretical_loss": 3.8575974138306703, + "tokens_seen": 574241792 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041715145436308924, + "loss": 3.0662, + "theoretical_loss": 3.857551123572174, + "tokens_seen": 574307328 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004171414242728185, + "loss": 3.0679, + "theoretical_loss": 3.8575048400745597, + "tokens_seen": 574372864 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041713139418254766, + "loss": 3.1262, + "theoretical_loss": 3.8574585633360705, + "tokens_seen": 574438400 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041712136409227684, + "loss": 3.1422, + "theoretical_loss": 3.8574122933549475, + "tokens_seen": 574503936 + }, + { + "epoch": 7.0, + "learning_rate": 0.000417111334002006, + "loss": 3.1486, + "theoretical_loss": 3.857366030129434, + "tokens_seen": 574569472 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004171013039117352, + "loss": 3.0373, + "theoretical_loss": 3.857319773657772, + "tokens_seen": 574635008 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004170912738214644, + "loss": 3.118, + "theoretical_loss": 3.8572735239382068, + "tokens_seen": 574700544 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004170812437311936, + "loss": 3.0753, + "theoretical_loss": 3.8572272809689823, + "tokens_seen": 574766080 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041707121364092275, + "loss": 2.9903, + "theoretical_loss": 3.857181044748344, + "tokens_seen": 574831616 + }, + { + "epoch": 7.0, + "learning_rate": 0.000417061183550652, + "loss": 3.0968, + "theoretical_loss": 3.857134815274538, + "tokens_seen": 574897152 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041705115346038116, + "loss": 3.0618, + "theoretical_loss": 3.85708859254581, + "tokens_seen": 574962688 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1390599, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0328938961029053, + "objective/train/theoretical_loss": 3.8570423765604076, + "objective/train/tokens_used": 595488224, + "theoretical_loss": 3.8570423765604076, + "tokens_seen": 575028224 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041704112337011035, + "loss": 3.0855, + "theoretical_loss": 3.8570423765604076, + "tokens_seen": 575028224 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004170310932798395, + "loss": 2.9723, + "theoretical_loss": 3.856996167316579, + "tokens_seen": 575093760 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004170210631895687, + "loss": 3.0655, + "theoretical_loss": 3.8569499648125727, + "tokens_seen": 575159296 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004170110330992979, + "loss": 3.1011, + "theoretical_loss": 3.8569037690466375, + "tokens_seen": 575224832 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004170010030090271, + "loss": 3.1076, + "theoretical_loss": 3.8568575800170235, + "tokens_seen": 575290368 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041699097291875625, + "loss": 3.0479, + "theoretical_loss": 3.856811397721981, + "tokens_seen": 575355904 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004169809428284855, + "loss": 2.9629, + "theoretical_loss": 3.856765222159762, + "tokens_seen": 575421440 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004169709127382146, + "loss": 3.1, + "theoretical_loss": 3.856719053328616, + "tokens_seen": 575486976 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041696088264794385, + "loss": 3.0465, + "theoretical_loss": 3.8566728912267982, + "tokens_seen": 575552512 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041695085255767303, + "loss": 3.0029, + "theoretical_loss": 3.85662673585256, + "tokens_seen": 575618048 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004169408224674022, + "loss": 2.9555, + "theoretical_loss": 3.8565805872041556, + "tokens_seen": 575683584 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004169307923771314, + "loss": 3.0847, + "theoretical_loss": 3.8565344452798396, + "tokens_seen": 575749120 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004169207622868606, + "loss": 3.0313, + "theoretical_loss": 3.856488310077866, + "tokens_seen": 575814656 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041691073219658975, + "loss": 3.1084, + "theoretical_loss": 3.8564421815964924, + "tokens_seen": 575880192 + }, + { + "epoch": 7.0, + "learning_rate": 0.000416900702106319, + "loss": 2.9776, + "theoretical_loss": 3.856396059833974, + "tokens_seen": 575945728 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004168906720160481, + "loss": 2.9757, + "theoretical_loss": 3.856349944788567, + "tokens_seen": 576011264 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041688064192577735, + "loss": 3.108, + "theoretical_loss": 3.8563038364585314, + "tokens_seen": 576076800 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041687061183550653, + "loss": 3.0817, + "theoretical_loss": 3.856257734842123, + "tokens_seen": 576142336 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004168605817452357, + "loss": 3.0071, + "theoretical_loss": 3.856211639937602, + "tokens_seen": 576207872 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004168505516549649, + "loss": 3.0354, + "theoretical_loss": 3.856165551743228, + "tokens_seen": 576273408 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004168405215646941, + "loss": 3.1049, + "theoretical_loss": 3.8561194702572603, + "tokens_seen": 576338944 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041683049147442326, + "loss": 3.0954, + "theoretical_loss": 3.856073395477962, + "tokens_seen": 576404480 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004168204613841525, + "loss": 3.0823, + "theoretical_loss": 3.856027327403592, + "tokens_seen": 576470016 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004168104312938816, + "loss": 3.1578, + "theoretical_loss": 3.8559812660324138, + "tokens_seen": 576535552 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041680040120361085, + "loss": 3.1341, + "theoretical_loss": 3.8559352113626906, + "tokens_seen": 576601088 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1395675, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.869959831237793, + "objective/train/theoretical_loss": 3.855889163392685, + "objective/train/tokens_used": 597126624, + "theoretical_loss": 3.855889163392685, + "tokens_seen": 576666624 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041679037111334, + "loss": 2.9649, + "theoretical_loss": 3.855889163392685, + "tokens_seen": 576666624 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004167803410230692, + "loss": 2.9651, + "theoretical_loss": 3.855843122120662, + "tokens_seen": 576732160 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004167703109327984, + "loss": 3.0664, + "theoretical_loss": 3.8557970875448855, + "tokens_seen": 576797696 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004167602808425276, + "loss": 3.1345, + "theoretical_loss": 3.8557510596636217, + "tokens_seen": 576863232 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041675025075225676, + "loss": 3.1101, + "theoretical_loss": 3.8557050384751363, + "tokens_seen": 576928768 + }, + { + "epoch": 7.0, + "learning_rate": 0.000416740220661986, + "loss": 3.0495, + "theoretical_loss": 3.855659023977696, + "tokens_seen": 576994304 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004167301905717151, + "loss": 2.9821, + "theoretical_loss": 3.8556130161695688, + "tokens_seen": 577059840 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041672016048144436, + "loss": 3.0124, + "theoretical_loss": 3.8555670150490213, + "tokens_seen": 577125376 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004167101303911735, + "loss": 3.0494, + "theoretical_loss": 3.855521020614324, + "tokens_seen": 577190912 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004167001003009027, + "loss": 3.1344, + "theoretical_loss": 3.8554750328637444, + "tokens_seen": 577256448 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004166900702106319, + "loss": 2.9382, + "theoretical_loss": 3.8554290517955536, + "tokens_seen": 577321984 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004166800401203611, + "loss": 3.0246, + "theoretical_loss": 3.855383077408022, + "tokens_seen": 577387520 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041667001003009026, + "loss": 2.9717, + "theoretical_loss": 3.8553371096994207, + "tokens_seen": 577453056 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041665997993981944, + "loss": 3.1213, + "theoretical_loss": 3.8552911486680217, + "tokens_seen": 577518592 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004166499498495487, + "loss": 3.0611, + "theoretical_loss": 3.855245194312097, + "tokens_seen": 577584128 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041663991975927786, + "loss": 3.0048, + "theoretical_loss": 3.8551992466299208, + "tokens_seen": 577649664 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041662988966900704, + "loss": 3.0087, + "theoretical_loss": 3.8551533056197664, + "tokens_seen": 577715200 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004166198595787362, + "loss": 3.0719, + "theoretical_loss": 3.8551073712799075, + "tokens_seen": 577780736 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004166098294884654, + "loss": 3.1546, + "theoretical_loss": 3.85506144360862, + "tokens_seen": 577846272 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004165997993981946, + "loss": 3.0644, + "theoretical_loss": 3.8550155226041802, + "tokens_seen": 577911808 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004165897693079238, + "loss": 3.1673, + "theoretical_loss": 3.854969608264863, + "tokens_seen": 577977344 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041657973921765295, + "loss": 3.0866, + "theoretical_loss": 3.854923700588947, + "tokens_seen": 578042880 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004165697091273822, + "loss": 3.0348, + "theoretical_loss": 3.8548777995747088, + "tokens_seen": 578108416 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041655967903711136, + "loss": 3.0292, + "theoretical_loss": 3.8548319052204265, + "tokens_seen": 578173952 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041654964894684055, + "loss": 3.0386, + "theoretical_loss": 3.8547860175243795, + "tokens_seen": 578239488 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1398475, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0082831382751465, + "objective/train/theoretical_loss": 3.854740136484848, + "objective/train/tokens_used": 598765024, + "theoretical_loss": 3.854740136484848, + "tokens_seen": 578305024 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004165396188565697, + "loss": 3.012, + "theoretical_loss": 3.854740136484848, + "tokens_seen": 578305024 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004165295887662989, + "loss": 3.1217, + "theoretical_loss": 3.854694262100111, + "tokens_seen": 578370560 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004165195586760281, + "loss": 3.0607, + "theoretical_loss": 3.85464839436845, + "tokens_seen": 578436096 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004165095285857573, + "loss": 3.0229, + "theoretical_loss": 3.854602533288147, + "tokens_seen": 578501632 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041649949849548645, + "loss": 3.0102, + "theoretical_loss": 3.8545566788574828, + "tokens_seen": 578567168 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004164894684052157, + "loss": 3.0885, + "theoretical_loss": 3.854510831074742, + "tokens_seen": 578632704 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004164794383149448, + "loss": 3.0812, + "theoretical_loss": 3.8544649899382053, + "tokens_seen": 578698240 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041646940822467405, + "loss": 2.9332, + "theoretical_loss": 3.85441915544616, + "tokens_seen": 578763776 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041645937813440323, + "loss": 3.1652, + "theoretical_loss": 3.854373327596888, + "tokens_seen": 578829312 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004164493480441324, + "loss": 2.9007, + "theoretical_loss": 3.854327506388677, + "tokens_seen": 578894848 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004164393179538616, + "loss": 3.0753, + "theoretical_loss": 3.854281691819811, + "tokens_seen": 578960384 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004164292878635908, + "loss": 3.0938, + "theoretical_loss": 3.8542358838885775, + "tokens_seen": 579025920 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041641925777331995, + "loss": 3.0768, + "theoretical_loss": 3.854190082593264, + "tokens_seen": 579091456 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004164092276830492, + "loss": 2.9639, + "theoretical_loss": 3.854144287932158, + "tokens_seen": 579156992 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004163991975927783, + "loss": 3.1498, + "theoretical_loss": 3.8540984999035475, + "tokens_seen": 579222528 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041638916750250755, + "loss": 3.0404, + "theoretical_loss": 3.8540527185057223, + "tokens_seen": 579288064 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041637913741223673, + "loss": 3.0797, + "theoretical_loss": 3.854006943736972, + "tokens_seen": 579353600 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004163691073219659, + "loss": 3.0357, + "theoretical_loss": 3.853961175595587, + "tokens_seen": 579419136 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004163590772316951, + "loss": 3.1623, + "theoretical_loss": 3.8539154140798586, + "tokens_seen": 579484672 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004163490471414243, + "loss": 2.9936, + "theoretical_loss": 3.853869659188078, + "tokens_seen": 579550208 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041633901705115346, + "loss": 3.156, + "theoretical_loss": 3.8538239109185377, + "tokens_seen": 579615744 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004163289869608827, + "loss": 3.0191, + "theoretical_loss": 3.8537781692695305, + "tokens_seen": 579681280 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004163189568706118, + "loss": 3.037, + "theoretical_loss": 3.8537324342393506, + "tokens_seen": 579746816 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041630892678034105, + "loss": 2.9686, + "theoretical_loss": 3.8536867058262914, + "tokens_seen": 579812352 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004162988966900702, + "loss": 3.1187, + "theoretical_loss": 3.8536409840286483, + "tokens_seen": 579877888 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1403397, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0384268760681152, + "objective/train/theoretical_loss": 3.8535952688447166, + "objective/train/tokens_used": 600403424, + "theoretical_loss": 3.8535952688447166, + "tokens_seen": 579943424 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004162888665997994, + "loss": 3.022, + "theoretical_loss": 3.8535952688447166, + "tokens_seen": 579943424 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004162788365095286, + "loss": 2.8953, + "theoretical_loss": 3.853549560272792, + "tokens_seen": 580008960 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004162688064192578, + "loss": 3.0386, + "theoretical_loss": 3.8535038583111723, + "tokens_seen": 580074496 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041625877632898696, + "loss": 3.0335, + "theoretical_loss": 3.8534581629581535, + "tokens_seen": 580140032 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004162487462387162, + "loss": 2.9774, + "theoretical_loss": 3.8534124742120346, + "tokens_seen": 580205568 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004162387161484453, + "loss": 3.0648, + "theoretical_loss": 3.853366792071114, + "tokens_seen": 580271104 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041622868605817456, + "loss": 3.0852, + "theoretical_loss": 3.8533211165336905, + "tokens_seen": 580336640 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004162186559679037, + "loss": 3.2013, + "theoretical_loss": 3.8532754475980644, + "tokens_seen": 580402176 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004162086258776329, + "loss": 3.0629, + "theoretical_loss": 3.8532297852625366, + "tokens_seen": 580467712 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004161985957873621, + "loss": 3.0431, + "theoretical_loss": 3.8531841295254075, + "tokens_seen": 580533248 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004161885656970913, + "loss": 3.0713, + "theoretical_loss": 3.8531384803849793, + "tokens_seen": 580598784 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041617853560682046, + "loss": 3.181, + "theoretical_loss": 3.8530928378395544, + "tokens_seen": 580664320 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041616850551654964, + "loss": 3.0225, + "theoretical_loss": 3.8530472018874358, + "tokens_seen": 580729856 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004161584754262788, + "loss": 3.0192, + "theoretical_loss": 3.8530015725269267, + "tokens_seen": 580795392 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041614844533600806, + "loss": 3.1554, + "theoretical_loss": 3.852955949756332, + "tokens_seen": 580860928 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004161384152457372, + "loss": 2.9902, + "theoretical_loss": 3.8529103335739565, + "tokens_seen": 580926464 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004161283851554664, + "loss": 3.0759, + "theoretical_loss": 3.8528647239781053, + "tokens_seen": 580992000 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041611835506519555, + "loss": 3.0271, + "theoretical_loss": 3.852819120967085, + "tokens_seen": 581057536 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004161083249749248, + "loss": 3.1597, + "theoretical_loss": 3.8527735245392023, + "tokens_seen": 581123072 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041609829488465397, + "loss": 3.0212, + "theoretical_loss": 3.852727934692765, + "tokens_seen": 581188608 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041608826479438315, + "loss": 3.1021, + "theoretical_loss": 3.852682351426081, + "tokens_seen": 581254144 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041607823470411233, + "loss": 2.9872, + "theoretical_loss": 3.8526367747374577, + "tokens_seen": 581319680 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041606820461384156, + "loss": 3.0305, + "theoretical_loss": 3.852591204625206, + "tokens_seen": 581385216 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004160581745235707, + "loss": 3.1626, + "theoretical_loss": 3.8525456410876355, + "tokens_seen": 581450752 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004160481444332999, + "loss": 3.0579, + "theoretical_loss": 3.8525000841230566, + "tokens_seen": 581516288 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1406215, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1401724815368652, + "objective/train/theoretical_loss": 3.85245453372978, + "objective/train/tokens_used": 602041824, + "theoretical_loss": 3.85245453372978, + "tokens_seen": 581581824 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041603811434302905, + "loss": 3.1227, + "theoretical_loss": 3.85245453372978, + "tokens_seen": 581581824 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004160280842527583, + "loss": 3.1408, + "theoretical_loss": 3.8524089899061185, + "tokens_seen": 581647360 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041601805416248747, + "loss": 3.0133, + "theoretical_loss": 3.8523634526503834, + "tokens_seen": 581712896 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041600802407221665, + "loss": 3.0438, + "theoretical_loss": 3.8523179219608883, + "tokens_seen": 581778432 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041599799398194583, + "loss": 3.1137, + "theoretical_loss": 3.8522723978359474, + "tokens_seen": 581843968 + }, + { + "epoch": 7.0, + "learning_rate": 0.000415987963891675, + "loss": 3.0813, + "theoretical_loss": 3.852226880273874, + "tokens_seen": 581909504 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004159779338014042, + "loss": 3.0733, + "theoretical_loss": 3.8521813692729836, + "tokens_seen": 581975040 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041596790371113343, + "loss": 2.9257, + "theoretical_loss": 3.852135864831591, + "tokens_seen": 582040576 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041595787362086256, + "loss": 3.0505, + "theoretical_loss": 3.8520903669480138, + "tokens_seen": 582106112 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004159478435305918, + "loss": 2.9681, + "theoretical_loss": 3.8520448756205674, + "tokens_seen": 582171648 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004159378134403209, + "loss": 2.9848, + "theoretical_loss": 3.85199939084757, + "tokens_seen": 582237184 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041592778335005015, + "loss": 3.1309, + "theoretical_loss": 3.8519539126273394, + "tokens_seen": 582302720 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041591775325977934, + "loss": 3.1255, + "theoretical_loss": 3.8519084409581943, + "tokens_seen": 582368256 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004159077231695085, + "loss": 3.032, + "theoretical_loss": 3.8518629758384537, + "tokens_seen": 582433792 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041589769307923775, + "loss": 3.0551, + "theoretical_loss": 3.8518175172664377, + "tokens_seen": 582499328 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041588766298896693, + "loss": 3.0914, + "theoretical_loss": 3.851772065240467, + "tokens_seen": 582564864 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004158776328986961, + "loss": 3.0998, + "theoretical_loss": 3.851726619758862, + "tokens_seen": 582630400 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004158676028084253, + "loss": 3.1497, + "theoretical_loss": 3.851681180819945, + "tokens_seen": 582695936 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004158575727181545, + "loss": 3.0351, + "theoretical_loss": 3.851635748422039, + "tokens_seen": 582761472 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041584754262788366, + "loss": 3.0327, + "theoretical_loss": 3.8515903225634656, + "tokens_seen": 582827008 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004158375125376129, + "loss": 3.1792, + "theoretical_loss": 3.851544903242549, + "tokens_seen": 582892544 + }, + { + "epoch": 7.0, + "learning_rate": 0.000415827482447342, + "loss": 3.0996, + "theoretical_loss": 3.8514994904576136, + "tokens_seen": 582958080 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041581745235707126, + "loss": 3.106, + "theoretical_loss": 3.851454084206985, + "tokens_seen": 583023616 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004158074222668004, + "loss": 3.1274, + "theoretical_loss": 3.8514086844889865, + "tokens_seen": 583089152 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004157973921765296, + "loss": 3.0183, + "theoretical_loss": 3.851363291301946, + "tokens_seen": 583154688 + }, + { + "epoch": 7.0, + "objective/train/docs_used": 1409994, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0615293979644775, + "objective/train/theoretical_loss": 3.85131790464419, + "objective/train/tokens_used": 603680224, + "theoretical_loss": 3.85131790464419, + "tokens_seen": 583220224 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004157873620862588, + "loss": 2.9339, + "theoretical_loss": 3.85131790464419, + "tokens_seen": 583220224 + }, + { + "epoch": 7.0, + "learning_rate": 0.000415777331995988, + "loss": 2.9519, + "theoretical_loss": 3.8512725245140453, + "tokens_seen": 583285760 + }, + { + "epoch": 7.0, + "learning_rate": 0.00041576730190571716, + "loss": 3.0451, + "theoretical_loss": 3.85122715090984, + "tokens_seen": 583351296 + }, + { + "epoch": 7.0, + "learning_rate": 0.0004157572718154464, + "loss": 3.0661, + "theoretical_loss": 3.8511817838299023, + "tokens_seen": 583416832 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004157472417251755, + "loss": 3.1025, + "theoretical_loss": 3.8511364232725622, + "tokens_seen": 583482368 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041573721163490476, + "loss": 3.0924, + "theoretical_loss": 3.8510910692361486, + "tokens_seen": 583547904 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004157271815446339, + "loss": 3.095, + "theoretical_loss": 3.851045721718992, + "tokens_seen": 583613440 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004157171514543631, + "loss": 3.0577, + "theoretical_loss": 3.851000380719424, + "tokens_seen": 583678976 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004157071213640923, + "loss": 3.1211, + "theoretical_loss": 3.850955046235776, + "tokens_seen": 583744512 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004156970912738215, + "loss": 2.9523, + "theoretical_loss": 3.85090971826638, + "tokens_seen": 583810048 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041568706118355066, + "loss": 3.0088, + "theoretical_loss": 3.850864396809569, + "tokens_seen": 583875584 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041567703109327984, + "loss": 2.9907, + "theoretical_loss": 3.8508190818636763, + "tokens_seen": 583941120 + }, + { + "epoch": 7.01, + "learning_rate": 0.000415667001003009, + "loss": 3.0846, + "theoretical_loss": 3.8507737734270355, + "tokens_seen": 584006656 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041565697091273826, + "loss": 3.1503, + "theoretical_loss": 3.850728471497982, + "tokens_seen": 584072192 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004156469408224674, + "loss": 3.1423, + "theoretical_loss": 3.8506831760748517, + "tokens_seen": 584137728 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004156369107321966, + "loss": 3.0894, + "theoretical_loss": 3.8506378871559788, + "tokens_seen": 584203264 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041562688064192575, + "loss": 3.0951, + "theoretical_loss": 3.850592604739701, + "tokens_seen": 584268800 + }, + { + "epoch": 7.01, + "learning_rate": 0.000415616850551655, + "loss": 3.0438, + "theoretical_loss": 3.850547328824356, + "tokens_seen": 584334336 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041560682046138417, + "loss": 3.0929, + "theoretical_loss": 3.8505020594082797, + "tokens_seen": 584399872 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041559679037111335, + "loss": 2.9893, + "theoretical_loss": 3.850456796489812, + "tokens_seen": 584465408 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041558676028084253, + "loss": 3.1425, + "theoretical_loss": 3.8504115400672916, + "tokens_seen": 584530944 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041557673019057176, + "loss": 3.0394, + "theoretical_loss": 3.850366290139057, + "tokens_seen": 584596480 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004155667001003009, + "loss": 2.983, + "theoretical_loss": 3.85032104670345, + "tokens_seen": 584662016 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004155566700100301, + "loss": 3.0825, + "theoretical_loss": 3.85027580975881, + "tokens_seen": 584727552 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041554663991975925, + "loss": 2.9569, + "theoretical_loss": 3.8502305793034797, + "tokens_seen": 584793088 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1414754, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.214869976043701, + "objective/train/theoretical_loss": 3.8501853553357996, + "objective/train/tokens_used": 605318624, + "theoretical_loss": 3.8501853553357996, + "tokens_seen": 584858624 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004155366098294885, + "loss": 3.167, + "theoretical_loss": 3.8501853553357996, + "tokens_seen": 584858624 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041552657973921767, + "loss": 3.1242, + "theoretical_loss": 3.850140137854114, + "tokens_seen": 584924160 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041551654964894685, + "loss": 3.0391, + "theoretical_loss": 3.850094926856765, + "tokens_seen": 584989696 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041550651955867603, + "loss": 3.0764, + "theoretical_loss": 3.8500497223420966, + "tokens_seen": 585055232 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004154964894684052, + "loss": 2.9983, + "theoretical_loss": 3.8500045243084537, + "tokens_seen": 585120768 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004154864593781344, + "loss": 3.1123, + "theoretical_loss": 3.849959332754181, + "tokens_seen": 585186304 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041547642928786363, + "loss": 3.0599, + "theoretical_loss": 3.849914147677624, + "tokens_seen": 585251840 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041546639919759276, + "loss": 3.0972, + "theoretical_loss": 3.849868969077129, + "tokens_seen": 585317376 + }, + { + "epoch": 7.01, + "learning_rate": 0.000415456369107322, + "loss": 3.1052, + "theoretical_loss": 3.8498237969510436, + "tokens_seen": 585382912 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004154463390170511, + "loss": 2.9626, + "theoretical_loss": 3.849778631297715, + "tokens_seen": 585448448 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041543630892678035, + "loss": 3.003, + "theoretical_loss": 3.8497334721154903, + "tokens_seen": 585513984 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041542627883650954, + "loss": 3.0898, + "theoretical_loss": 3.84968831940272, + "tokens_seen": 585579520 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004154162487462387, + "loss": 3.0613, + "theoretical_loss": 3.8496431731577516, + "tokens_seen": 585645056 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004154062186559679, + "loss": 3.1146, + "theoretical_loss": 3.849598033378936, + "tokens_seen": 585710592 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041539618856569713, + "loss": 3.079, + "theoretical_loss": 3.849552900064624, + "tokens_seen": 585776128 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041538615847542626, + "loss": 2.964, + "theoretical_loss": 3.8495077732131655, + "tokens_seen": 585841664 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004153761283851555, + "loss": 3.1112, + "theoretical_loss": 3.8494626528229134, + "tokens_seen": 585907200 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004153660982948846, + "loss": 3.1205, + "theoretical_loss": 3.8494175388922196, + "tokens_seen": 585972736 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041535606820461386, + "loss": 3.0829, + "theoretical_loss": 3.849372431419437, + "tokens_seen": 586038272 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041534603811434304, + "loss": 3.0165, + "theoretical_loss": 3.8493273304029194, + "tokens_seen": 586103808 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004153360080240722, + "loss": 3.1544, + "theoretical_loss": 3.8492822358410206, + "tokens_seen": 586169344 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004153259779338014, + "loss": 3.0288, + "theoretical_loss": 3.8492371477320955, + "tokens_seen": 586234880 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004153159478435306, + "loss": 3.1119, + "theoretical_loss": 3.8491920660744996, + "tokens_seen": 586300416 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041530591775325976, + "loss": 3.2108, + "theoretical_loss": 3.849146990866589, + "tokens_seen": 586365952 + }, + { + "epoch": 7.01, + "learning_rate": 0.000415295887662989, + "loss": 3.0408, + "theoretical_loss": 3.8491019221067195, + "tokens_seen": 586431488 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1417744, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9558894634246826, + "objective/train/theoretical_loss": 3.849056859793249, + "objective/train/tokens_used": 606957024, + "theoretical_loss": 3.849056859793249, + "tokens_seen": 586497024 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004152858575727181, + "loss": 3.0123, + "theoretical_loss": 3.849056859793249, + "tokens_seen": 586497024 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041527582748244736, + "loss": 2.994, + "theoretical_loss": 3.8490118039245353, + "tokens_seen": 586562560 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004152657973921765, + "loss": 3.0092, + "theoretical_loss": 3.848966754498936, + "tokens_seen": 586628096 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004152557673019057, + "loss": 3.2033, + "theoretical_loss": 3.848921711514811, + "tokens_seen": 586693632 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004152457372116349, + "loss": 2.9258, + "theoretical_loss": 3.848876674970519, + "tokens_seen": 586759168 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004152357071213641, + "loss": 3.0153, + "theoretical_loss": 3.848831644864421, + "tokens_seen": 586824704 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041522567703109327, + "loss": 2.9754, + "theoretical_loss": 3.8487866211948774, + "tokens_seen": 586890240 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004152156469408225, + "loss": 2.897, + "theoretical_loss": 3.8487416039602493, + "tokens_seen": 586955776 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041520561685055163, + "loss": 3.0761, + "theoretical_loss": 3.8486965931588992, + "tokens_seen": 587021312 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041519558676028086, + "loss": 3.0466, + "theoretical_loss": 3.8486515887891892, + "tokens_seen": 587086848 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041518555667001, + "loss": 3.0811, + "theoretical_loss": 3.8486065908494824, + "tokens_seen": 587152384 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004151755265797392, + "loss": 3.0423, + "theoretical_loss": 3.8485615993381432, + "tokens_seen": 587217920 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004151654964894684, + "loss": 3.0864, + "theoretical_loss": 3.8485166142535356, + "tokens_seen": 587283456 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004151554663991976, + "loss": 2.9703, + "theoretical_loss": 3.8484716355940245, + "tokens_seen": 587348992 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004151454363089268, + "loss": 2.942, + "theoretical_loss": 3.8484266633579756, + "tokens_seen": 587414528 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041513540621865595, + "loss": 3.0768, + "theoretical_loss": 3.848381697543755, + "tokens_seen": 587480064 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004151253761283852, + "loss": 3.0899, + "theoretical_loss": 3.8483367381497295, + "tokens_seen": 587545600 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041511534603811437, + "loss": 2.9586, + "theoretical_loss": 3.8482917851742666, + "tokens_seen": 587611136 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041510531594784355, + "loss": 3.0824, + "theoretical_loss": 3.848246838615734, + "tokens_seen": 587676672 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041509528585757273, + "loss": 3.0455, + "theoretical_loss": 3.8482018984724995, + "tokens_seen": 587742208 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041508525576730196, + "loss": 3.112, + "theoretical_loss": 3.8481569647429343, + "tokens_seen": 587807744 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004150752256770311, + "loss": 3.1304, + "theoretical_loss": 3.8481120374254063, + "tokens_seen": 587873280 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004150651955867603, + "loss": 3.1271, + "theoretical_loss": 3.848067116518287, + "tokens_seen": 587938816 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041505516549648945, + "loss": 3.0699, + "theoretical_loss": 3.848022202019946, + "tokens_seen": 588004352 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004150451354062187, + "loss": 3.1167, + "theoretical_loss": 3.847977293928756, + "tokens_seen": 588069888 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1422646, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0882301330566406, + "objective/train/theoretical_loss": 3.8479323922430893, + "objective/train/tokens_used": 608595424, + "theoretical_loss": 3.8479323922430893, + "tokens_seen": 588135424 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041503510531594787, + "loss": 3.1148, + "theoretical_loss": 3.8479323922430893, + "tokens_seen": 588135424 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041502507522567705, + "loss": 2.9491, + "theoretical_loss": 3.8478874969613175, + "tokens_seen": 588200960 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041501504513540623, + "loss": 3.1109, + "theoretical_loss": 3.847842608081815, + "tokens_seen": 588266496 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004150050150451354, + "loss": 3.0541, + "theoretical_loss": 3.8477977256029554, + "tokens_seen": 588332032 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004149949849548646, + "loss": 2.9529, + "theoretical_loss": 3.847752849523112, + "tokens_seen": 588397568 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041498495486459383, + "loss": 3.0145, + "theoretical_loss": 3.847707979840662, + "tokens_seen": 588463104 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041497492477432296, + "loss": 3.1806, + "theoretical_loss": 3.8476631165539796, + "tokens_seen": 588528640 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004149648946840522, + "loss": 3.0378, + "theoretical_loss": 3.8476182596614414, + "tokens_seen": 588594176 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004149548645937813, + "loss": 3.0538, + "theoretical_loss": 3.8475734091614253, + "tokens_seen": 588659712 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041494483450351055, + "loss": 2.992, + "theoretical_loss": 3.847528565052307, + "tokens_seen": 588725248 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041493480441323974, + "loss": 2.9839, + "theoretical_loss": 3.847483727332466, + "tokens_seen": 588790784 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004149247743229689, + "loss": 3.0814, + "theoretical_loss": 3.8474388960002797, + "tokens_seen": 588856320 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004149147442326981, + "loss": 3.0243, + "theoretical_loss": 3.8473940710541283, + "tokens_seen": 588921856 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041490471414242733, + "loss": 3.0216, + "theoretical_loss": 3.847349252492392, + "tokens_seen": 588987392 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041489468405215646, + "loss": 3.1021, + "theoretical_loss": 3.84730444031345, + "tokens_seen": 589052928 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004148846539618857, + "loss": 3.0469, + "theoretical_loss": 3.847259634515684, + "tokens_seen": 589118464 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004148746238716148, + "loss": 3.1792, + "theoretical_loss": 3.847214835097476, + "tokens_seen": 589184000 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041486459378134406, + "loss": 2.9473, + "theoretical_loss": 3.8471700420572077, + "tokens_seen": 589249536 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041485456369107324, + "loss": 3.0712, + "theoretical_loss": 3.8471252553932618, + "tokens_seen": 589315072 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004148445336008024, + "loss": 3.0125, + "theoretical_loss": 3.847080475104022, + "tokens_seen": 589380608 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004148345035105316, + "loss": 3.0459, + "theoretical_loss": 3.847035701187872, + "tokens_seen": 589446144 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004148244734202608, + "loss": 3.0752, + "theoretical_loss": 3.8469909336431964, + "tokens_seen": 589511680 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041481444332998996, + "loss": 3.0015, + "theoretical_loss": 3.846946172468381, + "tokens_seen": 589577216 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004148044132397192, + "loss": 3.0714, + "theoretical_loss": 3.846901417661811, + "tokens_seen": 589642752 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004147943831494483, + "loss": 3.0585, + "theoretical_loss": 3.846856669221872, + "tokens_seen": 589708288 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1425541, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.104495048522949, + "objective/train/theoretical_loss": 3.846811927146952, + "objective/train/tokens_used": 610233824, + "theoretical_loss": 3.846811927146952, + "tokens_seen": 589773824 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041478435305917756, + "loss": 3.0998, + "theoretical_loss": 3.846811927146952, + "tokens_seen": 589773824 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004147743229689067, + "loss": 3.0816, + "theoretical_loss": 3.846767191435438, + "tokens_seen": 589839360 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004147642928786359, + "loss": 3.1407, + "theoretical_loss": 3.8467224620857183, + "tokens_seen": 589904896 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004147542627883651, + "loss": 3.1746, + "theoretical_loss": 3.846677739096182, + "tokens_seen": 589970432 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004147442326980943, + "loss": 3.0823, + "theoretical_loss": 3.846633022465218, + "tokens_seen": 590035968 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041473420260782347, + "loss": 3.0268, + "theoretical_loss": 3.846588312191215, + "tokens_seen": 590101504 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004147241725175527, + "loss": 3.0572, + "theoretical_loss": 3.8465436082725653, + "tokens_seen": 590167040 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041471414242728183, + "loss": 3.0808, + "theoretical_loss": 3.846498910707659, + "tokens_seen": 590232576 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041470411233701106, + "loss": 3.1353, + "theoretical_loss": 3.8464542194948876, + "tokens_seen": 590298112 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004146940822467402, + "loss": 3.073, + "theoretical_loss": 3.8464095346326435, + "tokens_seen": 590363648 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004146840521564694, + "loss": 2.9984, + "theoretical_loss": 3.8463648561193198, + "tokens_seen": 590429184 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004146740220661986, + "loss": 3.1623, + "theoretical_loss": 3.84632018395331, + "tokens_seen": 590494720 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004146639919759278, + "loss": 3.1382, + "theoretical_loss": 3.846275518133007, + "tokens_seen": 590560256 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041465396188565697, + "loss": 3.1119, + "theoretical_loss": 3.846230858656806, + "tokens_seen": 590625792 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041464393179538615, + "loss": 2.9511, + "theoretical_loss": 3.846186205523102, + "tokens_seen": 590691328 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041463390170511533, + "loss": 3.0928, + "theoretical_loss": 3.846141558730291, + "tokens_seen": 590756864 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041462387161484457, + "loss": 3.0924, + "theoretical_loss": 3.8460969182767695, + "tokens_seen": 590822400 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004146138415245737, + "loss": 3.2076, + "theoretical_loss": 3.8460522841609333, + "tokens_seen": 590887936 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041460381143430293, + "loss": 3.0571, + "theoretical_loss": 3.8460076563811807, + "tokens_seen": 590953472 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004145937813440321, + "loss": 3.0981, + "theoretical_loss": 3.8459630349359104, + "tokens_seen": 591019008 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004145837512537613, + "loss": 3.0315, + "theoretical_loss": 3.8459184198235192, + "tokens_seen": 591084544 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004145737211634905, + "loss": 3.1098, + "theoretical_loss": 3.8458738110424076, + "tokens_seen": 591150080 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041456369107321965, + "loss": 3.1199, + "theoretical_loss": 3.845829208590975, + "tokens_seen": 591215616 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041455366098294884, + "loss": 3.1694, + "theoretical_loss": 3.8457846124676225, + "tokens_seen": 591281152 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041454363089267807, + "loss": 3.118, + "theoretical_loss": 3.8457400226707494, + "tokens_seen": 591346688 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1429332, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.166201114654541, + "objective/train/theoretical_loss": 3.8456954391987592, + "objective/train/tokens_used": 611872224, + "theoretical_loss": 3.8456954391987592, + "tokens_seen": 591412224 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004145336008024072, + "loss": 3.2399, + "theoretical_loss": 3.8456954391987592, + "tokens_seen": 591412224 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041452357071213643, + "loss": 3.006, + "theoretical_loss": 3.8456508620500527, + "tokens_seen": 591477760 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041451354062186556, + "loss": 3.1476, + "theoretical_loss": 3.8456062912230324, + "tokens_seen": 591543296 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004145035105315948, + "loss": 2.9157, + "theoretical_loss": 3.8455617267161024, + "tokens_seen": 591608832 + }, + { + "epoch": 7.01, + "learning_rate": 0.000414493480441324, + "loss": 3.0796, + "theoretical_loss": 3.8455171685276666, + "tokens_seen": 591674368 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041448345035105316, + "loss": 3.1026, + "theoretical_loss": 3.845472616656129, + "tokens_seen": 591739904 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041447342026078234, + "loss": 2.9955, + "theoretical_loss": 3.845428071099895, + "tokens_seen": 591805440 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004144633901705115, + "loss": 3.0734, + "theoretical_loss": 3.8453835318573697, + "tokens_seen": 591870976 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004144533600802407, + "loss": 3.1371, + "theoretical_loss": 3.8453389989269593, + "tokens_seen": 591936512 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041444332998996994, + "loss": 3.1045, + "theoretical_loss": 3.8452944723070708, + "tokens_seen": 592002048 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041443329989969906, + "loss": 3.1255, + "theoretical_loss": 3.845249951996111, + "tokens_seen": 592067584 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004144232698094283, + "loss": 3.0226, + "theoretical_loss": 3.8452054379924894, + "tokens_seen": 592133120 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041441323971915753, + "loss": 3.0412, + "theoretical_loss": 3.845160930294613, + "tokens_seen": 592198656 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041440320962888666, + "loss": 3.1299, + "theoretical_loss": 3.845116428900891, + "tokens_seen": 592264192 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004143931795386159, + "loss": 3.0716, + "theoretical_loss": 3.8450719338097326, + "tokens_seen": 592329728 + }, + { + "epoch": 7.01, + "learning_rate": 0.000414383149448345, + "loss": 2.9994, + "theoretical_loss": 3.8450274450195496, + "tokens_seen": 592395264 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041437311935807426, + "loss": 3.0672, + "theoretical_loss": 3.8449829625287517, + "tokens_seen": 592460800 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041436308926780344, + "loss": 3.0353, + "theoretical_loss": 3.8449384863357503, + "tokens_seen": 592526336 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004143530591775326, + "loss": 3.1405, + "theoretical_loss": 3.8448940164389573, + "tokens_seen": 592591872 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004143430290872618, + "loss": 3.0423, + "theoretical_loss": 3.844849552836786, + "tokens_seen": 592657408 + }, + { + "epoch": 7.01, + "learning_rate": 0.000414332998996991, + "loss": 3.0234, + "theoretical_loss": 3.844805095527648, + "tokens_seen": 592722944 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041432296890672016, + "loss": 3.1297, + "theoretical_loss": 3.844760644509959, + "tokens_seen": 592788480 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004143129388164494, + "loss": 3.1922, + "theoretical_loss": 3.8447161997821313, + "tokens_seen": 592854016 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004143029087261785, + "loss": 2.9869, + "theoretical_loss": 3.844671761342581, + "tokens_seen": 592919552 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041429287863590776, + "loss": 3.0396, + "theoretical_loss": 3.8446273291897226, + "tokens_seen": 592985088 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1434251, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1962852478027344, + "objective/train/theoretical_loss": 3.844582903321973, + "objective/train/tokens_used": 613510624, + "theoretical_loss": 3.844582903321973, + "tokens_seen": 593050624 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004142828485456369, + "loss": 3.1304, + "theoretical_loss": 3.844582903321973, + "tokens_seen": 593050624 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004142728184553661, + "loss": 3.0102, + "theoretical_loss": 3.844538483737748, + "tokens_seen": 593116160 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004142627883650953, + "loss": 3.1824, + "theoretical_loss": 3.8444940704354655, + "tokens_seen": 593181696 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004142527582748245, + "loss": 3.1153, + "theoretical_loss": 3.844449663413542, + "tokens_seen": 593247232 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041424272818455367, + "loss": 3.022, + "theoretical_loss": 3.844405262670397, + "tokens_seen": 593312768 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004142326980942829, + "loss": 2.9785, + "theoretical_loss": 3.8443608682044488, + "tokens_seen": 593378304 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041422266800401203, + "loss": 3.0413, + "theoretical_loss": 3.8443164800141165, + "tokens_seen": 593443840 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041421263791374126, + "loss": 3.1206, + "theoretical_loss": 3.8442720980978207, + "tokens_seen": 593509376 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004142026078234704, + "loss": 3.0194, + "theoretical_loss": 3.844227722453981, + "tokens_seen": 593574912 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004141925777331996, + "loss": 3.083, + "theoretical_loss": 3.8441833530810197, + "tokens_seen": 593640448 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004141825476429288, + "loss": 3.032, + "theoretical_loss": 3.8441389899773584, + "tokens_seen": 593705984 + }, + { + "epoch": 7.01, + "learning_rate": 0.000414172517552658, + "loss": 3.1056, + "theoretical_loss": 3.8440946331414185, + "tokens_seen": 593771520 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041416248746238717, + "loss": 3.1464, + "theoretical_loss": 3.844050282571623, + "tokens_seen": 593837056 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041415245737211635, + "loss": 3.0847, + "theoretical_loss": 3.8440059382663962, + "tokens_seen": 593902592 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041414242728184553, + "loss": 3.139, + "theoretical_loss": 3.8439616002241612, + "tokens_seen": 593968128 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041413239719157477, + "loss": 3.0764, + "theoretical_loss": 3.8439172684433434, + "tokens_seen": 594033664 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004141223671013039, + "loss": 3.0834, + "theoretical_loss": 3.843872942922367, + "tokens_seen": 594099200 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041411233701103313, + "loss": 3.1169, + "theoretical_loss": 3.843828623659658, + "tokens_seen": 594164736 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004141023069207623, + "loss": 2.9998, + "theoretical_loss": 3.8437843106536427, + "tokens_seen": 594230272 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004140922768304915, + "loss": 2.9994, + "theoretical_loss": 3.8437400039027483, + "tokens_seen": 594295808 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004140822467402207, + "loss": 3.0871, + "theoretical_loss": 3.843695703405402, + "tokens_seen": 594361344 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041407221664994985, + "loss": 3.094, + "theoretical_loss": 3.8436514091600316, + "tokens_seen": 594426880 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041406218655967904, + "loss": 3.1839, + "theoretical_loss": 3.843607121165065, + "tokens_seen": 594492416 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041405215646940827, + "loss": 3.1456, + "theoretical_loss": 3.843562839418933, + "tokens_seen": 594557952 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004140421263791374, + "loss": 3.151, + "theoretical_loss": 3.843518563920064, + "tokens_seen": 594623488 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1437214, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.075615644454956, + "objective/train/theoretical_loss": 3.8434742946668883, + "objective/train/tokens_used": 615149024, + "theoretical_loss": 3.8434742946668883, + "tokens_seen": 594689024 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041403209628886663, + "loss": 3.0439, + "theoretical_loss": 3.8434742946668883, + "tokens_seen": 594689024 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041402206619859576, + "loss": 3.0375, + "theoretical_loss": 3.843430031657837, + "tokens_seen": 594754560 + }, + { + "epoch": 7.01, + "learning_rate": 0.000414012036108325, + "loss": 3.1789, + "theoretical_loss": 3.843385774891342, + "tokens_seen": 594820096 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004140020060180542, + "loss": 3.0255, + "theoretical_loss": 3.843341524365834, + "tokens_seen": 594885632 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041399197592778336, + "loss": 3.1269, + "theoretical_loss": 3.8432972800797462, + "tokens_seen": 594951168 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041398194583751254, + "loss": 3.0512, + "theoretical_loss": 3.843253042031512, + "tokens_seen": 595016704 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004139719157472417, + "loss": 2.8896, + "theoretical_loss": 3.8432088102195645, + "tokens_seen": 595082240 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004139618856569709, + "loss": 3.0367, + "theoretical_loss": 3.8431645846423383, + "tokens_seen": 595147776 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041395185556670014, + "loss": 3.0955, + "theoretical_loss": 3.843120365298268, + "tokens_seen": 595213312 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041394182547642926, + "loss": 3.0848, + "theoretical_loss": 3.8430761521857892, + "tokens_seen": 595278848 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004139317953861585, + "loss": 3.0177, + "theoretical_loss": 3.8430319453033372, + "tokens_seen": 595344384 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004139217652958877, + "loss": 3.0219, + "theoretical_loss": 3.842987744649349, + "tokens_seen": 595409920 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041391173520561686, + "loss": 3.0102, + "theoretical_loss": 3.8429435502222615, + "tokens_seen": 595475456 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041390170511534604, + "loss": 3.0468, + "theoretical_loss": 3.842899362020512, + "tokens_seen": 595540992 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004138916750250752, + "loss": 3.1247, + "theoretical_loss": 3.8428551800425392, + "tokens_seen": 595606528 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004138816449348044, + "loss": 3.1156, + "theoretical_loss": 3.842811004286781, + "tokens_seen": 595672064 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041387161484453364, + "loss": 3.1306, + "theoretical_loss": 3.8427668347516777, + "tokens_seen": 595737600 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041386158475426277, + "loss": 3.0534, + "theoretical_loss": 3.842722671435669, + "tokens_seen": 595803136 + }, + { + "epoch": 7.01, + "learning_rate": 0.000413851554663992, + "loss": 2.9715, + "theoretical_loss": 3.8426785143371944, + "tokens_seen": 595868672 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041384152457372113, + "loss": 3.099, + "theoretical_loss": 3.842634363454696, + "tokens_seen": 595934208 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041383149448345036, + "loss": 3.0882, + "theoretical_loss": 3.842590218786615, + "tokens_seen": 595999744 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041382146439317954, + "loss": 3.0499, + "theoretical_loss": 3.8425460803313927, + "tokens_seen": 596065280 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004138114343029087, + "loss": 3.1181, + "theoretical_loss": 3.842501948087473, + "tokens_seen": 596130816 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004138014042126379, + "loss": 3.0042, + "theoretical_loss": 3.842457822053298, + "tokens_seen": 596196352 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004137913741223671, + "loss": 3.0548, + "theoretical_loss": 3.8424137022273124, + "tokens_seen": 596261888 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1442014, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1021785736083984, + "objective/train/theoretical_loss": 3.84236958860796, + "objective/train/tokens_used": 616787424, + "theoretical_loss": 3.84236958860796, + "tokens_seen": 596327424 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041378134403209627, + "loss": 2.966, + "theoretical_loss": 3.84236958860796, + "tokens_seen": 596327424 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004137713139418255, + "loss": 3.0802, + "theoretical_loss": 3.8423254811936856, + "tokens_seen": 596392960 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041376128385155463, + "loss": 2.9892, + "theoretical_loss": 3.8422813799829356, + "tokens_seen": 596458496 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041375125376128387, + "loss": 3.0588, + "theoretical_loss": 3.842237284974155, + "tokens_seen": 596524032 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041374122367101305, + "loss": 3.0177, + "theoretical_loss": 3.842193196165791, + "tokens_seen": 596589568 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041373119358074223, + "loss": 3.1167, + "theoretical_loss": 3.84214911355629, + "tokens_seen": 596655104 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004137211634904714, + "loss": 3.0682, + "theoretical_loss": 3.8421050371441003, + "tokens_seen": 596720640 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004137111334002006, + "loss": 2.9793, + "theoretical_loss": 3.8420609669276713, + "tokens_seen": 596786176 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041370110330992977, + "loss": 3.0426, + "theoretical_loss": 3.842016902905449, + "tokens_seen": 596851712 + }, + { + "epoch": 7.01, + "learning_rate": 0.000413691073219659, + "loss": 3.0982, + "theoretical_loss": 3.8419728450758854, + "tokens_seen": 596917248 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041368104312938813, + "loss": 3.125, + "theoretical_loss": 3.8419287934374293, + "tokens_seen": 596982784 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041367101303911737, + "loss": 3.1775, + "theoretical_loss": 3.8418847479885314, + "tokens_seen": 597048320 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041366098294884655, + "loss": 3.1329, + "theoretical_loss": 3.8418407087276423, + "tokens_seen": 597113856 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041365095285857573, + "loss": 2.9455, + "theoretical_loss": 3.8417966756532147, + "tokens_seen": 597179392 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041364092276830497, + "loss": 3.1208, + "theoretical_loss": 3.8417526487636993, + "tokens_seen": 597244928 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004136308926780341, + "loss": 2.9799, + "theoretical_loss": 3.8417086280575505, + "tokens_seen": 597310464 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041362086258776333, + "loss": 3.0622, + "theoretical_loss": 3.8416646135332204, + "tokens_seen": 597376000 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004136108324974925, + "loss": 3.0811, + "theoretical_loss": 3.8416206051891626, + "tokens_seen": 597441536 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004136008024072217, + "loss": 3.0738, + "theoretical_loss": 3.841576603023833, + "tokens_seen": 597507072 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004135907723169509, + "loss": 3.071, + "theoretical_loss": 3.8415326070356857, + "tokens_seen": 597572608 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041358074222668005, + "loss": 3.1109, + "theoretical_loss": 3.8414886172231753, + "tokens_seen": 597638144 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041357071213640924, + "loss": 2.96, + "theoretical_loss": 3.841444633584759, + "tokens_seen": 597703680 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041356068204613847, + "loss": 3.1406, + "theoretical_loss": 3.8414006561188936, + "tokens_seen": 597769216 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004135506519558676, + "loss": 3.1056, + "theoretical_loss": 3.8413566848240355, + "tokens_seen": 597834752 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041354062186559683, + "loss": 3.0955, + "theoretical_loss": 3.8413127196986427, + "tokens_seen": 597900288 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1445084, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0146145820617676, + "objective/train/theoretical_loss": 3.8412687607411735, + "objective/train/tokens_used": 618425824, + "theoretical_loss": 3.8412687607411735, + "tokens_seen": 597965824 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041353059177532596, + "loss": 3.1039, + "theoretical_loss": 3.8412687607411735, + "tokens_seen": 597965824 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004135205616850552, + "loss": 3.0222, + "theoretical_loss": 3.8412248079500872, + "tokens_seen": 598031360 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004135105315947844, + "loss": 3.1004, + "theoretical_loss": 3.841180861323843, + "tokens_seen": 598096896 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041350050150451356, + "loss": 3.1073, + "theoretical_loss": 3.8411369208609, + "tokens_seen": 598162432 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041349047141424274, + "loss": 3.0578, + "theoretical_loss": 3.84109298655972, + "tokens_seen": 598227968 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004134804413239719, + "loss": 3.0089, + "theoretical_loss": 3.841049058418762, + "tokens_seen": 598293504 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004134704112337011, + "loss": 3.1214, + "theoretical_loss": 3.8410051364364906, + "tokens_seen": 598359040 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041346038114343034, + "loss": 3.1697, + "theoretical_loss": 3.8409612206113657, + "tokens_seen": 598424576 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041345035105315946, + "loss": 3.0265, + "theoretical_loss": 3.8409173109418506, + "tokens_seen": 598490112 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004134403209628887, + "loss": 3.0188, + "theoretical_loss": 3.8408734074264093, + "tokens_seen": 598555648 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004134302908726179, + "loss": 2.9995, + "theoretical_loss": 3.840829510063504, + "tokens_seen": 598621184 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041342026078234706, + "loss": 3.1519, + "theoretical_loss": 3.8407856188516005, + "tokens_seen": 598686720 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041341023069207624, + "loss": 3.0956, + "theoretical_loss": 3.840741733789163, + "tokens_seen": 598752256 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004134002006018054, + "loss": 3.0778, + "theoretical_loss": 3.8406978548746578, + "tokens_seen": 598817792 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004133901705115346, + "loss": 3.0611, + "theoretical_loss": 3.8406539821065504, + "tokens_seen": 598883328 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041338014042126384, + "loss": 3.0175, + "theoretical_loss": 3.840610115483307, + "tokens_seen": 598948864 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041337011033099297, + "loss": 3.1278, + "theoretical_loss": 3.840566255003395, + "tokens_seen": 599014400 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004133600802407222, + "loss": 3.0578, + "theoretical_loss": 3.8405224006652823, + "tokens_seen": 599079936 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041335005015045133, + "loss": 3.079, + "theoretical_loss": 3.840478552467437, + "tokens_seen": 599145472 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041334002006018056, + "loss": 3.0433, + "theoretical_loss": 3.840434710408328, + "tokens_seen": 599211008 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041332998996990975, + "loss": 2.9868, + "theoretical_loss": 3.840390874486424, + "tokens_seen": 599276544 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004133199598796389, + "loss": 3.0727, + "theoretical_loss": 3.840347044700196, + "tokens_seen": 599342080 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004133099297893681, + "loss": 3.0832, + "theoretical_loss": 3.8403032210481127, + "tokens_seen": 599407616 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004132998996990973, + "loss": 3.0458, + "theoretical_loss": 3.8402594035286475, + "tokens_seen": 599473152 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041328986960882647, + "loss": 3.1039, + "theoretical_loss": 3.84021559214027, + "tokens_seen": 599538688 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1449004, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1385202407836914, + "objective/train/theoretical_loss": 3.840171786881452, + "objective/train/tokens_used": 620064224, + "theoretical_loss": 3.840171786881452, + "tokens_seen": 599604224 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004132798395185557, + "loss": 3.0046, + "theoretical_loss": 3.840171786881452, + "tokens_seen": 599604224 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041326980942828483, + "loss": 3.1456, + "theoretical_loss": 3.8401279877506678, + "tokens_seen": 599669760 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041325977933801407, + "loss": 3.0818, + "theoretical_loss": 3.840084194746389, + "tokens_seen": 599735296 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041324974924774325, + "loss": 3.0332, + "theoretical_loss": 3.8400404078670904, + "tokens_seen": 599800832 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041323971915747243, + "loss": 3.0645, + "theoretical_loss": 3.839996627111246, + "tokens_seen": 599866368 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004132296890672016, + "loss": 3.1204, + "theoretical_loss": 3.8399528524773303, + "tokens_seen": 599931904 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004132196589769308, + "loss": 3.1643, + "theoretical_loss": 3.8399090839638186, + "tokens_seen": 599997440 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041320962888665997, + "loss": 3.199, + "theoretical_loss": 3.839865321569187, + "tokens_seen": 600062976 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004131995987963892, + "loss": 3.0512, + "theoretical_loss": 3.8398215652919125, + "tokens_seen": 600128512 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041318956870611833, + "loss": 3.0106, + "theoretical_loss": 3.839777815130471, + "tokens_seen": 600194048 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041317953861584757, + "loss": 3.0022, + "theoretical_loss": 3.839734071083341, + "tokens_seen": 600259584 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004131695085255767, + "loss": 3.0998, + "theoretical_loss": 3.839690333148999, + "tokens_seen": 600325120 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041315947843530593, + "loss": 3.0779, + "theoretical_loss": 3.839646601325926, + "tokens_seen": 600390656 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004131494483450351, + "loss": 3.1287, + "theoretical_loss": 3.8396028756125995, + "tokens_seen": 600456192 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004131394182547643, + "loss": 2.9722, + "theoretical_loss": 3.8395591560074998, + "tokens_seen": 600521728 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004131293881644935, + "loss": 3.1446, + "theoretical_loss": 3.8395154425091063, + "tokens_seen": 600587264 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004131193580742227, + "loss": 3.1533, + "theoretical_loss": 3.839471735115901, + "tokens_seen": 600652800 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041310932798395184, + "loss": 3.0295, + "theoretical_loss": 3.8394280338263647, + "tokens_seen": 600718336 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004130992978936811, + "loss": 3.0649, + "theoretical_loss": 3.839384338638979, + "tokens_seen": 600783872 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004130892678034102, + "loss": 3.0735, + "theoretical_loss": 3.8393406495522266, + "tokens_seen": 600849408 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041307923771313944, + "loss": 3.192, + "theoretical_loss": 3.839296966564591, + "tokens_seen": 600914944 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004130692076228686, + "loss": 2.9526, + "theoretical_loss": 3.839253289674555, + "tokens_seen": 600980480 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004130591775325978, + "loss": 3.1467, + "theoretical_loss": 3.839209618880602, + "tokens_seen": 601046016 + }, + { + "epoch": 7.01, + "learning_rate": 0.000413049147442327, + "loss": 2.974, + "theoretical_loss": 3.839165954181219, + "tokens_seen": 601111552 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041303911735205616, + "loss": 3.0161, + "theoretical_loss": 3.8391222955748887, + "tokens_seen": 601177088 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1453487, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.154001474380493, + "objective/train/theoretical_loss": 3.839078643060098, + "objective/train/tokens_used": 621702624, + "theoretical_loss": 3.839078643060098, + "tokens_seen": 601242624 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041302908726178534, + "loss": 3.0463, + "theoretical_loss": 3.839078643060098, + "tokens_seen": 601242624 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004130190571715146, + "loss": 3.1141, + "theoretical_loss": 3.8390349966353323, + "tokens_seen": 601308160 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004130090270812437, + "loss": 3.0417, + "theoretical_loss": 3.838991356299079, + "tokens_seen": 601373696 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041299899699097294, + "loss": 3.0576, + "theoretical_loss": 3.838947722049826, + "tokens_seen": 601439232 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041298896690070207, + "loss": 3.1349, + "theoretical_loss": 3.8389040938860597, + "tokens_seen": 601504768 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004129789368104313, + "loss": 3.0279, + "theoretical_loss": 3.8388604718062695, + "tokens_seen": 601570304 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004129689067201605, + "loss": 3.2001, + "theoretical_loss": 3.838816855808944, + "tokens_seen": 601635840 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041295887662988966, + "loss": 2.9977, + "theoretical_loss": 3.8387732458925727, + "tokens_seen": 601701376 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041294884653961884, + "loss": 3.0912, + "theoretical_loss": 3.838729642055646, + "tokens_seen": 601766912 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004129388164493481, + "loss": 3.053, + "theoretical_loss": 3.8386860442966535, + "tokens_seen": 601832448 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004129287863590772, + "loss": 3.1162, + "theoretical_loss": 3.8386424526140868, + "tokens_seen": 601897984 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041291875626880644, + "loss": 3.0635, + "theoretical_loss": 3.8385988670064384, + "tokens_seen": 601963520 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004129087261785356, + "loss": 3.0951, + "theoretical_loss": 3.8385552874721984, + "tokens_seen": 602029056 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004128986960882648, + "loss": 3.076, + "theoretical_loss": 3.8385117140098615, + "tokens_seen": 602094592 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041288866599799404, + "loss": 3.1231, + "theoretical_loss": 3.8384681466179202, + "tokens_seen": 602160128 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041287863590772317, + "loss": 3.0939, + "theoretical_loss": 3.838424585294868, + "tokens_seen": 602225664 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004128686058174524, + "loss": 3.0258, + "theoretical_loss": 3.8383810300391996, + "tokens_seen": 602291200 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041285857572718153, + "loss": 2.9984, + "theoretical_loss": 3.8383374808494093, + "tokens_seen": 602356736 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041284854563691076, + "loss": 2.9789, + "theoretical_loss": 3.8382939377239933, + "tokens_seen": 602422272 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041283851554663995, + "loss": 2.9545, + "theoretical_loss": 3.8382504006614475, + "tokens_seen": 602487808 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004128284854563691, + "loss": 2.9584, + "theoretical_loss": 3.8382068696602674, + "tokens_seen": 602553344 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004128184553660983, + "loss": 2.9421, + "theoretical_loss": 3.838163344718951, + "tokens_seen": 602618880 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004128084252758275, + "loss": 3.0243, + "theoretical_loss": 3.838119825835995, + "tokens_seen": 602684416 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041279839518555667, + "loss": 3.0796, + "theoretical_loss": 3.838076313009898, + "tokens_seen": 602749952 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004127883650952859, + "loss": 2.9555, + "theoretical_loss": 3.838032806239158, + "tokens_seen": 602815488 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1456895, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.068000316619873, + "objective/train/theoretical_loss": 3.8379893055222754, + "objective/train/tokens_used": 623341024, + "theoretical_loss": 3.8379893055222754, + "tokens_seen": 602881024 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041277833500501503, + "loss": 2.9828, + "theoretical_loss": 3.8379893055222754, + "tokens_seen": 602881024 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041276830491474427, + "loss": 3.173, + "theoretical_loss": 3.8379458108577493, + "tokens_seen": 602946560 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041275827482447345, + "loss": 3.0294, + "theoretical_loss": 3.837902322244079, + "tokens_seen": 603012096 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041274824473420263, + "loss": 3.1543, + "theoretical_loss": 3.837858839679766, + "tokens_seen": 603077632 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004127382146439318, + "loss": 3.0201, + "theoretical_loss": 3.837815363163312, + "tokens_seen": 603143168 + }, + { + "epoch": 7.01, + "learning_rate": 0.000412728184553661, + "loss": 3.0738, + "theoretical_loss": 3.837771892693218, + "tokens_seen": 603208704 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041271815446339017, + "loss": 3.1112, + "theoretical_loss": 3.8377284282679867, + "tokens_seen": 603274240 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004127081243731194, + "loss": 3.0969, + "theoretical_loss": 3.837684969886121, + "tokens_seen": 603339776 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041269809428284854, + "loss": 3.1254, + "theoretical_loss": 3.8376415175461247, + "tokens_seen": 603405312 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041268806419257777, + "loss": 3.0902, + "theoretical_loss": 3.837598071246501, + "tokens_seen": 603470848 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004126780341023069, + "loss": 3.0313, + "theoretical_loss": 3.837554630985755, + "tokens_seen": 603536384 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041266800401203613, + "loss": 3.0783, + "theoretical_loss": 3.8375111967623914, + "tokens_seen": 603601920 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004126579739217653, + "loss": 2.9739, + "theoretical_loss": 3.8374677685749154, + "tokens_seen": 603667456 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004126479438314945, + "loss": 3.1084, + "theoretical_loss": 3.837424346421834, + "tokens_seen": 603732992 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004126379137412237, + "loss": 3.0746, + "theoretical_loss": 3.8373809303016526, + "tokens_seen": 603798528 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004126278836509529, + "loss": 3.0424, + "theoretical_loss": 3.837337520212879, + "tokens_seen": 603864064 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041261785356068204, + "loss": 3.0698, + "theoretical_loss": 3.837294116154022, + "tokens_seen": 603929600 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004126078234704113, + "loss": 3.1156, + "theoretical_loss": 3.8372507181235873, + "tokens_seen": 603995136 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004125977933801404, + "loss": 3.0481, + "theoretical_loss": 3.837207326120086, + "tokens_seen": 604060672 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041258776328986964, + "loss": 3.0337, + "theoretical_loss": 3.837163940142026, + "tokens_seen": 604126208 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004125777331995988, + "loss": 3.1297, + "theoretical_loss": 3.837120560187918, + "tokens_seen": 604191744 + }, + { + "epoch": 7.01, + "learning_rate": 0.000412567703109328, + "loss": 3.0946, + "theoretical_loss": 3.8370771862562707, + "tokens_seen": 604257280 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004125576730190572, + "loss": 3.1299, + "theoretical_loss": 3.8370338183455974, + "tokens_seen": 604322816 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041254764292878636, + "loss": 3.1286, + "theoretical_loss": 3.8369904564544073, + "tokens_seen": 604388352 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041253761283851554, + "loss": 2.8924, + "theoretical_loss": 3.836947100581213, + "tokens_seen": 604453888 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1459773, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0732760429382324, + "objective/train/theoretical_loss": 3.8369037507245283, + "objective/train/tokens_used": 624979424, + "theoretical_loss": 3.8369037507245283, + "tokens_seen": 604519424 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004125275827482448, + "loss": 3.0506, + "theoretical_loss": 3.8369037507245283, + "tokens_seen": 604519424 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004125175526579739, + "loss": 3.0734, + "theoretical_loss": 3.836860406882864, + "tokens_seen": 604584960 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041250752256770314, + "loss": 2.9782, + "theoretical_loss": 3.8368170690547343, + "tokens_seen": 604650496 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041249749247743227, + "loss": 3.0571, + "theoretical_loss": 3.836773737238654, + "tokens_seen": 604716032 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004124874623871615, + "loss": 3.0335, + "theoretical_loss": 3.8367304114331375, + "tokens_seen": 604781568 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004124774322968907, + "loss": 3.0837, + "theoretical_loss": 3.8366870916366986, + "tokens_seen": 604847104 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041246740220661986, + "loss": 3.1008, + "theoretical_loss": 3.836643777847855, + "tokens_seen": 604912640 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041245737211634904, + "loss": 3.1143, + "theoretical_loss": 3.8366004700651213, + "tokens_seen": 604978176 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004124473420260783, + "loss": 3.0268, + "theoretical_loss": 3.8365571682870145, + "tokens_seen": 605043712 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004124373119358074, + "loss": 3.1473, + "theoretical_loss": 3.836513872512052, + "tokens_seen": 605109248 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041242728184553664, + "loss": 3.125, + "theoretical_loss": 3.836470582738751, + "tokens_seen": 605174784 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041241725175526577, + "loss": 3.0911, + "theoretical_loss": 3.8364272989656305, + "tokens_seen": 605240320 + }, + { + "epoch": 7.01, + "learning_rate": 0.000412407221664995, + "loss": 3.1637, + "theoretical_loss": 3.8363840211912086, + "tokens_seen": 605305856 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004123971915747242, + "loss": 3.0269, + "theoretical_loss": 3.8363407494140054, + "tokens_seen": 605371392 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041238716148445337, + "loss": 3.1115, + "theoretical_loss": 3.83629748363254, + "tokens_seen": 605436928 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041237713139418255, + "loss": 3.0749, + "theoretical_loss": 3.836254223845333, + "tokens_seen": 605502464 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041236710130391173, + "loss": 3.1461, + "theoretical_loss": 3.8362109700509057, + "tokens_seen": 605568000 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004123570712136409, + "loss": 3.1118, + "theoretical_loss": 3.8361677222477786, + "tokens_seen": 605633536 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041234704112337015, + "loss": 3.1486, + "theoretical_loss": 3.8361244804344743, + "tokens_seen": 605699072 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041233701103309927, + "loss": 3.0819, + "theoretical_loss": 3.836081244609515, + "tokens_seen": 605764608 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004123269809428285, + "loss": 3.1268, + "theoretical_loss": 3.8360380147714235, + "tokens_seen": 605830144 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041231695085255763, + "loss": 2.9885, + "theoretical_loss": 3.8359947909187238, + "tokens_seen": 605895680 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041230692076228687, + "loss": 3.0137, + "theoretical_loss": 3.8359515730499396, + "tokens_seen": 605961216 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041229689067201605, + "loss": 3.0566, + "theoretical_loss": 3.8359083611635953, + "tokens_seen": 606026752 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041228686058174523, + "loss": 3.1114, + "theoretical_loss": 3.8358651552582166, + "tokens_seen": 606092288 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1463245, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0986082553863525, + "objective/train/theoretical_loss": 3.835821955332328, + "objective/train/tokens_used": 626617824, + "theoretical_loss": 3.835821955332328, + "tokens_seen": 606157824 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004122768304914744, + "loss": 3.0654, + "theoretical_loss": 3.835821955332328, + "tokens_seen": 606157824 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041226680040120365, + "loss": 2.9057, + "theoretical_loss": 3.835778761384457, + "tokens_seen": 606223360 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004122567703109328, + "loss": 3.1355, + "theoretical_loss": 3.835735573413129, + "tokens_seen": 606288896 + }, + { + "epoch": 7.01, + "learning_rate": 0.000412246740220662, + "loss": 3.1143, + "theoretical_loss": 3.835692391416872, + "tokens_seen": 606354432 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041223671013039114, + "loss": 3.1106, + "theoretical_loss": 3.8356492153942137, + "tokens_seen": 606419968 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004122266800401204, + "loss": 3.1319, + "theoretical_loss": 3.8356060453436815, + "tokens_seen": 606485504 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041221664994984955, + "loss": 2.9855, + "theoretical_loss": 3.8355628812638045, + "tokens_seen": 606551040 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041220661985957874, + "loss": 3.0935, + "theoretical_loss": 3.835519723153112, + "tokens_seen": 606616576 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004121965897693079, + "loss": 2.9812, + "theoretical_loss": 3.8354765710101346, + "tokens_seen": 606682112 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004121865596790371, + "loss": 3.2031, + "theoretical_loss": 3.8354334248334014, + "tokens_seen": 606747648 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004121765295887663, + "loss": 2.94, + "theoretical_loss": 3.835390284621443, + "tokens_seen": 606813184 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004121664994984955, + "loss": 3.0325, + "theoretical_loss": 3.835347150372792, + "tokens_seen": 606878720 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004121564694082247, + "loss": 3.2123, + "theoretical_loss": 3.83530402208598, + "tokens_seen": 606944256 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004121464393179539, + "loss": 3.0162, + "theoretical_loss": 3.835260899759538, + "tokens_seen": 607009792 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004121364092276831, + "loss": 3.1078, + "theoretical_loss": 3.8352177833920003, + "tokens_seen": 607075328 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041212637913741224, + "loss": 2.9691, + "theoretical_loss": 3.8351746729819, + "tokens_seen": 607140864 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004121163490471415, + "loss": 3.0182, + "theoretical_loss": 3.835131568527771, + "tokens_seen": 607206400 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004121063189568706, + "loss": 3.0522, + "theoretical_loss": 3.8350884700281473, + "tokens_seen": 607271936 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041209628886659984, + "loss": 3.1359, + "theoretical_loss": 3.8350453774815643, + "tokens_seen": 607337472 + }, + { + "epoch": 7.01, + "learning_rate": 0.000412086258776329, + "loss": 3.1327, + "theoretical_loss": 3.8350022908865578, + "tokens_seen": 607403008 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004120762286860582, + "loss": 3.1189, + "theoretical_loss": 3.8349592102416628, + "tokens_seen": 607468544 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004120661985957874, + "loss": 3.1274, + "theoretical_loss": 3.8349161355454173, + "tokens_seen": 607534080 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041205616850551656, + "loss": 3.0642, + "theoretical_loss": 3.834873066796357, + "tokens_seen": 607599616 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041204613841524574, + "loss": 2.9812, + "theoretical_loss": 3.8348300039930194, + "tokens_seen": 607665152 + }, + { + "epoch": 7.01, + "learning_rate": 0.000412036108324975, + "loss": 3.1279, + "theoretical_loss": 3.834786947133944, + "tokens_seen": 607730688 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1468178, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1205263137817383, + "objective/train/theoretical_loss": 3.8347438962176676, + "objective/train/tokens_used": 628256224, + "theoretical_loss": 3.8347438962176676, + "tokens_seen": 607796224 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004120260782347041, + "loss": 3.0285, + "theoretical_loss": 3.8347438962176676, + "tokens_seen": 607796224 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041201604814443334, + "loss": 3.095, + "theoretical_loss": 3.8347008512427307, + "tokens_seen": 607861760 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041200601805416247, + "loss": 3.14, + "theoretical_loss": 3.8346578122076727, + "tokens_seen": 607927296 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004119959879638917, + "loss": 3.1764, + "theoretical_loss": 3.8346147791110328, + "tokens_seen": 607992832 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004119859578736209, + "loss": 3.0218, + "theoretical_loss": 3.8345717519513527, + "tokens_seen": 608058368 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041197592778335006, + "loss": 3.0483, + "theoretical_loss": 3.8345287307271736, + "tokens_seen": 608123904 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041196589769307924, + "loss": 3.0546, + "theoretical_loss": 3.834485715437036, + "tokens_seen": 608189440 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004119558676028085, + "loss": 3.1189, + "theoretical_loss": 3.8344427060794835, + "tokens_seen": 608254976 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004119458375125376, + "loss": 3.0688, + "theoretical_loss": 3.834399702653058, + "tokens_seen": 608320512 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041193580742226684, + "loss": 3.1545, + "theoretical_loss": 3.834356705156303, + "tokens_seen": 608386048 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041192577733199597, + "loss": 3.1066, + "theoretical_loss": 3.834313713587762, + "tokens_seen": 608451584 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004119157472417252, + "loss": 3.1323, + "theoretical_loss": 3.83427072794598, + "tokens_seen": 608517120 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004119057171514544, + "loss": 3.1681, + "theoretical_loss": 3.834227748229501, + "tokens_seen": 608582656 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041189568706118357, + "loss": 3.0756, + "theoretical_loss": 3.8341847744368702, + "tokens_seen": 608648192 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041188565697091275, + "loss": 2.896, + "theoretical_loss": 3.8341418065666337, + "tokens_seen": 608713728 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041187562688064193, + "loss": 3.1174, + "theoretical_loss": 3.8340988446173387, + "tokens_seen": 608779264 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004118655967903711, + "loss": 3.1264, + "theoretical_loss": 3.83405588858753, + "tokens_seen": 608844800 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041185556670010035, + "loss": 3.0628, + "theoretical_loss": 3.834012938475757, + "tokens_seen": 608910336 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041184553660982947, + "loss": 3.1045, + "theoretical_loss": 3.8339699942805665, + "tokens_seen": 608975872 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004118355065195587, + "loss": 3.1318, + "theoretical_loss": 3.833927056000507, + "tokens_seen": 609041408 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041182547642928783, + "loss": 2.9651, + "theoretical_loss": 3.8338841236341272, + "tokens_seen": 609106944 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041181544633901707, + "loss": 3.1063, + "theoretical_loss": 3.8338411971799773, + "tokens_seen": 609172480 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041180541624874625, + "loss": 3.0967, + "theoretical_loss": 3.8337982766366063, + "tokens_seen": 609238016 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041179538615847543, + "loss": 3.1491, + "theoretical_loss": 3.833755362002565, + "tokens_seen": 609303552 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004117853560682046, + "loss": 3.155, + "theoretical_loss": 3.8337124532764046, + "tokens_seen": 609369088 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1471221, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.063894748687744, + "objective/train/theoretical_loss": 3.8336695504566762, + "objective/train/tokens_used": 629894624, + "theoretical_loss": 3.8336695504566762, + "tokens_seen": 609434624 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041177532597793385, + "loss": 3.0518, + "theoretical_loss": 3.8336695504566762, + "tokens_seen": 609434624 + }, + { + "epoch": 7.01, + "learning_rate": 0.000411765295887663, + "loss": 3.0347, + "theoretical_loss": 3.8336266535419314, + "tokens_seen": 609500160 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004117552657973922, + "loss": 3.02, + "theoretical_loss": 3.833583762530723, + "tokens_seen": 609565696 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041174523570712134, + "loss": 3.1325, + "theoretical_loss": 3.8335408774216044, + "tokens_seen": 609631232 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004117352056168506, + "loss": 2.9083, + "theoretical_loss": 3.833497998213129, + "tokens_seen": 609696768 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041172517552657975, + "loss": 3.0401, + "theoretical_loss": 3.8334551249038498, + "tokens_seen": 609762304 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041171514543630894, + "loss": 3.0571, + "theoretical_loss": 3.8334122574923226, + "tokens_seen": 609827840 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004117051153460381, + "loss": 3.0986, + "theoretical_loss": 3.8333693959771016, + "tokens_seen": 609893376 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004116950852557673, + "loss": 3.112, + "theoretical_loss": 3.833326540356742, + "tokens_seen": 609958912 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004116850551654965, + "loss": 3.0494, + "theoretical_loss": 3.833283690629801, + "tokens_seen": 610024448 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004116750250752257, + "loss": 3.12, + "theoretical_loss": 3.8332408467948347, + "tokens_seen": 610089984 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041166499498495484, + "loss": 3.1368, + "theoretical_loss": 3.8331980088504, + "tokens_seen": 610155520 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004116549648946841, + "loss": 2.9471, + "theoretical_loss": 3.833155176795054, + "tokens_seen": 610221056 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004116449348044132, + "loss": 3.0914, + "theoretical_loss": 3.8331123506273546, + "tokens_seen": 610286592 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041163490471414244, + "loss": 2.8597, + "theoretical_loss": 3.833069530345862, + "tokens_seen": 610352128 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004116248746238716, + "loss": 3.0281, + "theoretical_loss": 3.8330267159491336, + "tokens_seen": 610417664 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004116148445336008, + "loss": 3.02, + "theoretical_loss": 3.8329839074357297, + "tokens_seen": 610483200 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041160481444333, + "loss": 3.0542, + "theoretical_loss": 3.8329411048042106, + "tokens_seen": 610548736 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004115947843530592, + "loss": 3.0542, + "theoretical_loss": 3.832898308053137, + "tokens_seen": 610614272 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041158475426278834, + "loss": 3.0804, + "theoretical_loss": 3.832855517181069, + "tokens_seen": 610679808 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004115747241725176, + "loss": 2.9662, + "theoretical_loss": 3.832812732186569, + "tokens_seen": 610745344 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004115646940822467, + "loss": 3.0228, + "theoretical_loss": 3.8327699530681985, + "tokens_seen": 610810880 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041155466399197594, + "loss": 3.0829, + "theoretical_loss": 3.832727179824521, + "tokens_seen": 610876416 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004115446339017051, + "loss": 3.041, + "theoretical_loss": 3.8326844124540993, + "tokens_seen": 610941952 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004115346038114343, + "loss": 3.098, + "theoretical_loss": 3.8326416509554972, + "tokens_seen": 611007488 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1476172, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2061378955841064, + "objective/train/theoretical_loss": 3.832598895327278, + "objective/train/tokens_used": 631533024, + "theoretical_loss": 3.832598895327278, + "tokens_seen": 611073024 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004115245737211635, + "loss": 3.1486, + "theoretical_loss": 3.832598895327278, + "tokens_seen": 611073024 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041151454363089267, + "loss": 3.0466, + "theoretical_loss": 3.8325561455680073, + "tokens_seen": 611138560 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041150451354062185, + "loss": 3.0076, + "theoretical_loss": 3.8325134016762497, + "tokens_seen": 611204096 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004114944834503511, + "loss": 3.03, + "theoretical_loss": 3.832470663650571, + "tokens_seen": 611269632 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004114844533600802, + "loss": 3.0985, + "theoretical_loss": 3.832427931489538, + "tokens_seen": 611335168 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041147442326980944, + "loss": 3.0427, + "theoretical_loss": 3.8323852051917164, + "tokens_seen": 611400704 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041146439317953857, + "loss": 3.041, + "theoretical_loss": 3.8323424847556735, + "tokens_seen": 611466240 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004114543630892678, + "loss": 3.0895, + "theoretical_loss": 3.8322997701799775, + "tokens_seen": 611531776 + }, + { + "epoch": 7.01, + "learning_rate": 0.000411444332998997, + "loss": 3.0027, + "theoretical_loss": 3.8322570614631966, + "tokens_seen": 611597312 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041143430290872617, + "loss": 3.0402, + "theoretical_loss": 3.832214358603899, + "tokens_seen": 611662848 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041142427281845535, + "loss": 3.1092, + "theoretical_loss": 3.832171661600654, + "tokens_seen": 611728384 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004114142427281846, + "loss": 3.0175, + "theoretical_loss": 3.8321289704520307, + "tokens_seen": 611793920 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041140421263791377, + "loss": 3.073, + "theoretical_loss": 3.832086285156601, + "tokens_seen": 611859456 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041139418254764295, + "loss": 3.0852, + "theoretical_loss": 3.8320436057129337, + "tokens_seen": 611924992 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041138415245737213, + "loss": 2.9687, + "theoretical_loss": 3.832000932119602, + "tokens_seen": 611990528 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004113741223671013, + "loss": 3.0554, + "theoretical_loss": 3.8319582643751753, + "tokens_seen": 612056064 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041136409227683055, + "loss": 3.0696, + "theoretical_loss": 3.8319156024782273, + "tokens_seen": 612121600 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041135406218655967, + "loss": 3.084, + "theoretical_loss": 3.83187294642733, + "tokens_seen": 612187136 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004113440320962889, + "loss": 2.9738, + "theoretical_loss": 3.8318302962210575, + "tokens_seen": 612252672 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041133400200601803, + "loss": 3.1847, + "theoretical_loss": 3.8317876518579825, + "tokens_seen": 612318208 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041132397191574727, + "loss": 3.006, + "theoretical_loss": 3.83174501333668, + "tokens_seen": 612383744 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041131394182547645, + "loss": 3.0453, + "theoretical_loss": 3.8317023806557238, + "tokens_seen": 612449280 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041130391173520563, + "loss": 3.1465, + "theoretical_loss": 3.83165975381369, + "tokens_seen": 612514816 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004112938816449348, + "loss": 2.9308, + "theoretical_loss": 3.831617132809154, + "tokens_seen": 612580352 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041128385155466405, + "loss": 2.9148, + "theoretical_loss": 3.831574517640692, + "tokens_seen": 612645888 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1479116, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8883538246154785, + "objective/train/theoretical_loss": 3.8315319083068804, + "objective/train/tokens_used": 633171424, + "theoretical_loss": 3.8315319083068804, + "tokens_seen": 612711424 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004112738214643932, + "loss": 3.0627, + "theoretical_loss": 3.8315319083068804, + "tokens_seen": 612711424 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004112637913741224, + "loss": 3.0771, + "theoretical_loss": 3.831489304806296, + "tokens_seen": 612776960 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041125376128385154, + "loss": 3.1326, + "theoretical_loss": 3.831446707137518, + "tokens_seen": 612842496 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004112437311935808, + "loss": 2.9604, + "theoretical_loss": 3.8314041152991236, + "tokens_seen": 612908032 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041123370110330995, + "loss": 3.0742, + "theoretical_loss": 3.8313615292896914, + "tokens_seen": 612973568 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041122367101303914, + "loss": 3.04, + "theoretical_loss": 3.831318949107801, + "tokens_seen": 613039104 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004112136409227683, + "loss": 3.1266, + "theoretical_loss": 3.831276374752032, + "tokens_seen": 613104640 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004112036108324975, + "loss": 3.0303, + "theoretical_loss": 3.831233806220964, + "tokens_seen": 613170176 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004111935807422267, + "loss": 3.0192, + "theoretical_loss": 3.8311912435131785, + "tokens_seen": 613235712 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004111835506519559, + "loss": 3.0388, + "theoretical_loss": 3.831148686627256, + "tokens_seen": 613301248 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041117352056168504, + "loss": 3.1283, + "theoretical_loss": 3.8311061355617793, + "tokens_seen": 613366784 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004111634904714143, + "loss": 3.0955, + "theoretical_loss": 3.8310635903153294, + "tokens_seen": 613432320 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004111534603811434, + "loss": 3.0181, + "theoretical_loss": 3.8310210508864895, + "tokens_seen": 613497856 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041114343029087264, + "loss": 3.0742, + "theoretical_loss": 3.8309785172738424, + "tokens_seen": 613563392 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004111334002006018, + "loss": 3.1143, + "theoretical_loss": 3.8309359894759725, + "tokens_seen": 613628928 + }, + { + "epoch": 7.01, + "learning_rate": 0.000411123370110331, + "loss": 2.9584, + "theoretical_loss": 3.8308934674914634, + "tokens_seen": 613694464 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004111133400200602, + "loss": 3.059, + "theoretical_loss": 3.8308509513189, + "tokens_seen": 613760000 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004111033099297894, + "loss": 3.0479, + "theoretical_loss": 3.830808440956867, + "tokens_seen": 613825536 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041109327983951854, + "loss": 3.1818, + "theoretical_loss": 3.8307659364039504, + "tokens_seen": 613891072 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004110832497492478, + "loss": 3.0374, + "theoretical_loss": 3.8307234376587367, + "tokens_seen": 613956608 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004110732196589769, + "loss": 3.1834, + "theoretical_loss": 3.830680944719812, + "tokens_seen": 614022144 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041106318956870614, + "loss": 3.1765, + "theoretical_loss": 3.8306384575857635, + "tokens_seen": 614087680 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004110531594784353, + "loss": 3.0894, + "theoretical_loss": 3.8305959762551787, + "tokens_seen": 614153216 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004110431293881645, + "loss": 3.1025, + "theoretical_loss": 3.830553500726647, + "tokens_seen": 614218752 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004110330992978937, + "loss": 3.0155, + "theoretical_loss": 3.8305110309987547, + "tokens_seen": 614284288 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1482810, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.866050958633423, + "objective/train/theoretical_loss": 3.830468567070093, + "objective/train/tokens_used": 634809824, + "theoretical_loss": 3.830468567070093, + "tokens_seen": 614349824 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041102306920762287, + "loss": 2.9586, + "theoretical_loss": 3.830468567070093, + "tokens_seen": 614349824 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041101303911735205, + "loss": 3.0414, + "theoretical_loss": 3.8304261089392506, + "tokens_seen": 614415360 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004110030090270813, + "loss": 3.1505, + "theoretical_loss": 3.830383656604818, + "tokens_seen": 614480896 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004109929789368104, + "loss": 3.02, + "theoretical_loss": 3.830341210065385, + "tokens_seen": 614546432 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041098294884653965, + "loss": 3.1332, + "theoretical_loss": 3.830298769319543, + "tokens_seen": 614611968 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041097291875626877, + "loss": 3.0571, + "theoretical_loss": 3.8302563343658846, + "tokens_seen": 614677504 + }, + { + "epoch": 7.01, + "learning_rate": 0.000410962888665998, + "loss": 2.989, + "theoretical_loss": 3.8302139052030006, + "tokens_seen": 614743040 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004109528585757272, + "loss": 3.1235, + "theoretical_loss": 3.830171481829484, + "tokens_seen": 614808576 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041094282848545637, + "loss": 3.0338, + "theoretical_loss": 3.830129064243928, + "tokens_seen": 614874112 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041093279839518555, + "loss": 3.1133, + "theoretical_loss": 3.830086652444926, + "tokens_seen": 614939648 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004109227683049148, + "loss": 3.1581, + "theoretical_loss": 3.8300442464310724, + "tokens_seen": 615005184 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004109127382146439, + "loss": 3.0004, + "theoretical_loss": 3.830001846200961, + "tokens_seen": 615070720 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041090270812437315, + "loss": 3.098, + "theoretical_loss": 3.8299594517531874, + "tokens_seen": 615136256 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004108926780341023, + "loss": 3.1145, + "theoretical_loss": 3.829917063086347, + "tokens_seen": 615201792 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004108826479438315, + "loss": 3.1067, + "theoretical_loss": 3.8298746801990355, + "tokens_seen": 615267328 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004108726178535607, + "loss": 2.9806, + "theoretical_loss": 3.82983230308985, + "tokens_seen": 615332864 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041086258776328987, + "loss": 3.1206, + "theoretical_loss": 3.8297899317573867, + "tokens_seen": 615398400 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041085255767301905, + "loss": 3.0829, + "theoretical_loss": 3.829747566200244, + "tokens_seen": 615463936 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041084252758274823, + "loss": 3.0806, + "theoretical_loss": 3.829705206417019, + "tokens_seen": 615529472 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004108324974924774, + "loss": 3.0558, + "theoretical_loss": 3.8296628524063108, + "tokens_seen": 615595008 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041082246740220665, + "loss": 2.9656, + "theoretical_loss": 3.829620504166718, + "tokens_seen": 615660544 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004108124373119358, + "loss": 3.0077, + "theoretical_loss": 3.8295781616968396, + "tokens_seen": 615726080 + }, + { + "epoch": 7.01, + "learning_rate": 0.000410802407221665, + "loss": 3.1025, + "theoretical_loss": 3.8295358249952764, + "tokens_seen": 615791616 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004107923771313942, + "loss": 3.0595, + "theoretical_loss": 3.829493494060629, + "tokens_seen": 615857152 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004107823470411234, + "loss": 3.0873, + "theoretical_loss": 3.829451168891497, + "tokens_seen": 615922688 + }, + { + "epoch": 7.01, + "objective/train/docs_used": 1487798, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.195228099822998, + "objective/train/theoretical_loss": 3.829408849486483, + "objective/train/tokens_used": 636448224, + "theoretical_loss": 3.829408849486483, + "tokens_seen": 615988224 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041077231695085256, + "loss": 3.0524, + "theoretical_loss": 3.829408849486483, + "tokens_seen": 615988224 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041076228686058174, + "loss": 3.0673, + "theoretical_loss": 3.8293665358441884, + "tokens_seen": 616053760 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004107522567703109, + "loss": 3.0922, + "theoretical_loss": 3.829324227963215, + "tokens_seen": 616119296 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041074222668004015, + "loss": 3.0443, + "theoretical_loss": 3.829281925842167, + "tokens_seen": 616184832 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004107321965897693, + "loss": 3.1221, + "theoretical_loss": 3.8292396294796465, + "tokens_seen": 616250368 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004107221664994985, + "loss": 2.9767, + "theoretical_loss": 3.8291973388742577, + "tokens_seen": 616315904 + }, + { + "epoch": 7.01, + "learning_rate": 0.00041071213640922764, + "loss": 3.1164, + "theoretical_loss": 3.8291550540246053, + "tokens_seen": 616381440 + }, + { + "epoch": 7.01, + "learning_rate": 0.0004107021063189569, + "loss": 3.0939, + "theoretical_loss": 3.8291127749292935, + "tokens_seen": 616446976 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041069207622868606, + "loss": 3.0351, + "theoretical_loss": 3.829070501586928, + "tokens_seen": 616512512 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041068204613841524, + "loss": 3.0775, + "theoretical_loss": 3.8290282339961146, + "tokens_seen": 616578048 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004106720160481444, + "loss": 3.0816, + "theoretical_loss": 3.8289859721554595, + "tokens_seen": 616643584 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004106619859578736, + "loss": 2.984, + "theoretical_loss": 3.828943716063569, + "tokens_seen": 616709120 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041065195586760284, + "loss": 3.1275, + "theoretical_loss": 3.828901465719051, + "tokens_seen": 616774656 + }, + { + "epoch": 7.02, + "learning_rate": 0.000410641925777332, + "loss": 2.901, + "theoretical_loss": 3.828859221120513, + "tokens_seen": 616840192 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004106318956870612, + "loss": 3.067, + "theoretical_loss": 3.828816982266563, + "tokens_seen": 616905728 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004106218655967904, + "loss": 2.9447, + "theoretical_loss": 3.8287747491558095, + "tokens_seen": 616971264 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004106118355065196, + "loss": 3.1774, + "theoretical_loss": 3.828732521786862, + "tokens_seen": 617036800 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041060180541624874, + "loss": 2.9677, + "theoretical_loss": 3.8286903001583306, + "tokens_seen": 617102336 + }, + { + "epoch": 7.02, + "learning_rate": 0.000410591775325978, + "loss": 3.0245, + "theoretical_loss": 3.828648084268824, + "tokens_seen": 617167872 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004105817452357071, + "loss": 3.1368, + "theoretical_loss": 3.8286058741169553, + "tokens_seen": 617233408 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041057171514543634, + "loss": 3.1117, + "theoretical_loss": 3.828563669701333, + "tokens_seen": 617298944 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004105616850551655, + "loss": 3.1245, + "theoretical_loss": 3.82852147102057, + "tokens_seen": 617364480 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004105516549648947, + "loss": 2.9692, + "theoretical_loss": 3.828479278073278, + "tokens_seen": 617430016 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004105416248746239, + "loss": 3.0889, + "theoretical_loss": 3.82843709085807, + "tokens_seen": 617495552 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041053159478435307, + "loss": 2.9769, + "theoretical_loss": 3.8283949093735585, + "tokens_seen": 617561088 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1490554, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.103069543838501, + "objective/train/theoretical_loss": 3.8283527336183574, + "objective/train/tokens_used": 638086624, + "theoretical_loss": 3.8283527336183574, + "tokens_seen": 617626624 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041052156469408225, + "loss": 3.1239, + "theoretical_loss": 3.8283527336183574, + "tokens_seen": 617626624 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004105115346038115, + "loss": 3.1048, + "theoretical_loss": 3.8283105635910806, + "tokens_seen": 617692160 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004105015045135406, + "loss": 3.062, + "theoretical_loss": 3.8282683992903426, + "tokens_seen": 617757696 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041049147442326985, + "loss": 3.078, + "theoretical_loss": 3.8282262407147583, + "tokens_seen": 617823232 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041048144433299897, + "loss": 3.1658, + "theoretical_loss": 3.828184087862943, + "tokens_seen": 617888768 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004104714142427282, + "loss": 3.1365, + "theoretical_loss": 3.828141940733513, + "tokens_seen": 617954304 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004104613841524574, + "loss": 3.0456, + "theoretical_loss": 3.8280997993250843, + "tokens_seen": 618019840 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041045135406218657, + "loss": 3.0908, + "theoretical_loss": 3.8280576636362746, + "tokens_seen": 618085376 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041044132397191575, + "loss": 3.0609, + "theoretical_loss": 3.8280155336657002, + "tokens_seen": 618150912 + }, + { + "epoch": 7.02, + "learning_rate": 0.000410431293881645, + "loss": 2.9765, + "theoretical_loss": 3.82797340941198, + "tokens_seen": 618216448 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004104212637913741, + "loss": 3.1139, + "theoretical_loss": 3.827931290873731, + "tokens_seen": 618281984 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041041123370110335, + "loss": 3.1761, + "theoretical_loss": 3.827889178049573, + "tokens_seen": 618347520 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004104012036108325, + "loss": 2.9915, + "theoretical_loss": 3.827847070938126, + "tokens_seen": 618413056 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004103911735205617, + "loss": 3.1691, + "theoretical_loss": 3.8278049695380076, + "tokens_seen": 618478592 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004103811434302909, + "loss": 3.0829, + "theoretical_loss": 3.82776287384784, + "tokens_seen": 618544128 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041037111334002007, + "loss": 3.1102, + "theoretical_loss": 3.8277207838662433, + "tokens_seen": 618609664 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041036108324974925, + "loss": 3.0568, + "theoretical_loss": 3.827678699591839, + "tokens_seen": 618675200 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041035105315947844, + "loss": 3.0719, + "theoretical_loss": 3.827636621023248, + "tokens_seen": 618740736 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004103410230692076, + "loss": 3.0903, + "theoretical_loss": 3.827594548159093, + "tokens_seen": 618806272 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041033099297893685, + "loss": 3.0984, + "theoretical_loss": 3.827552480997997, + "tokens_seen": 618871808 + }, + { + "epoch": 7.02, + "learning_rate": 0.000410320962888666, + "loss": 2.975, + "theoretical_loss": 3.8275104195385827, + "tokens_seen": 618937344 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004103109327983952, + "loss": 3.1521, + "theoretical_loss": 3.8274683637794737, + "tokens_seen": 619002880 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004103009027081244, + "loss": 3.217, + "theoretical_loss": 3.827426313719294, + "tokens_seen": 619068416 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004102908726178536, + "loss": 3.157, + "theoretical_loss": 3.827384269356668, + "tokens_seen": 619133952 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041028084252758276, + "loss": 3.0335, + "theoretical_loss": 3.827342230690222, + "tokens_seen": 619199488 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1493588, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.099539279937744, + "objective/train/theoretical_loss": 3.82730019771858, + "objective/train/tokens_used": 639725024, + "theoretical_loss": 3.82730019771858, + "tokens_seen": 619265024 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041027081243731194, + "loss": 3.0898, + "theoretical_loss": 3.82730019771858, + "tokens_seen": 619265024 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004102607823470411, + "loss": 2.992, + "theoretical_loss": 3.8272581704403685, + "tokens_seen": 619330560 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041025075225677035, + "loss": 3.0873, + "theoretical_loss": 3.8272161488542142, + "tokens_seen": 619396096 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004102407221664995, + "loss": 3.1173, + "theoretical_loss": 3.827174132958744, + "tokens_seen": 619461632 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004102306920762287, + "loss": 2.9817, + "theoretical_loss": 3.827132122752585, + "tokens_seen": 619527168 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041022066198595784, + "loss": 3.0795, + "theoretical_loss": 3.8270901182343655, + "tokens_seen": 619592704 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004102106318956871, + "loss": 2.9789, + "theoretical_loss": 3.827048119402714, + "tokens_seen": 619658240 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041020060180541626, + "loss": 3.0338, + "theoretical_loss": 3.8270061262562587, + "tokens_seen": 619723776 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041019057171514544, + "loss": 2.9949, + "theoretical_loss": 3.8269641387936293, + "tokens_seen": 619789312 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004101805416248746, + "loss": 3.1108, + "theoretical_loss": 3.8269221570134553, + "tokens_seen": 619854848 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004101705115346038, + "loss": 3.1405, + "theoretical_loss": 3.826880180914368, + "tokens_seen": 619920384 + }, + { + "epoch": 7.02, + "learning_rate": 0.000410160481444333, + "loss": 3.1226, + "theoretical_loss": 3.826838210494997, + "tokens_seen": 619985920 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004101504513540622, + "loss": 3.0391, + "theoretical_loss": 3.8267962457539744, + "tokens_seen": 620051456 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041014042126379135, + "loss": 2.9978, + "theoretical_loss": 3.826754286689931, + "tokens_seen": 620116992 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004101303911735206, + "loss": 3.0912, + "theoretical_loss": 3.8267123333015, + "tokens_seen": 620182528 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041012036108324976, + "loss": 3.037, + "theoretical_loss": 3.826670385587313, + "tokens_seen": 620248064 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041011033099297894, + "loss": 2.9748, + "theoretical_loss": 3.826628443546004, + "tokens_seen": 620313600 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004101003009027081, + "loss": 2.9817, + "theoretical_loss": 3.8265865071762057, + "tokens_seen": 620379136 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004100902708124373, + "loss": 2.9926, + "theoretical_loss": 3.8265445764765533, + "tokens_seen": 620444672 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004100802407221665, + "loss": 3.0388, + "theoretical_loss": 3.8265026514456806, + "tokens_seen": 620510208 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004100702106318957, + "loss": 3.0812, + "theoretical_loss": 3.826460732082223, + "tokens_seen": 620575744 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041006018054162485, + "loss": 3.1126, + "theoretical_loss": 3.8264188183848153, + "tokens_seen": 620641280 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004100501504513541, + "loss": 3.0985, + "theoretical_loss": 3.826376910352094, + "tokens_seen": 620706816 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004100401203610832, + "loss": 3.0604, + "theoretical_loss": 3.8263350079826957, + "tokens_seen": 620772352 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041003009027081245, + "loss": 3.0992, + "theoretical_loss": 3.826293111275257, + "tokens_seen": 620837888 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1497563, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0676932334899902, + "objective/train/theoretical_loss": 3.826251220228415, + "objective/train/tokens_used": 641363424, + "theoretical_loss": 3.826251220228415, + "tokens_seen": 620903424 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041002006018054163, + "loss": 3.1063, + "theoretical_loss": 3.826251220228415, + "tokens_seen": 620903424 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004100100300902708, + "loss": 2.9519, + "theoretical_loss": 3.826209334840809, + "tokens_seen": 620968960 + }, + { + "epoch": 7.02, + "learning_rate": 0.00041, + "loss": 3.0429, + "theoretical_loss": 3.8261674551110754, + "tokens_seen": 621034496 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040998996990972917, + "loss": 2.9747, + "theoretical_loss": 3.826125581037854, + "tokens_seen": 621100032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040997993981945835, + "loss": 2.985, + "theoretical_loss": 3.826083712619784, + "tokens_seen": 621165568 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004099699097291876, + "loss": 3.0336, + "theoretical_loss": 3.826041849855505, + "tokens_seen": 621231104 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004099598796389167, + "loss": 2.9982, + "theoretical_loss": 3.8259999927436574, + "tokens_seen": 621296640 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040994984954864595, + "loss": 3.0911, + "theoretical_loss": 3.825958141282882, + "tokens_seen": 621362176 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040993981945837513, + "loss": 2.9639, + "theoretical_loss": 3.825916295471819, + "tokens_seen": 621427712 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004099297893681043, + "loss": 3.0559, + "theoretical_loss": 3.825874455309111, + "tokens_seen": 621493248 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004099197592778335, + "loss": 3.0254, + "theoretical_loss": 3.8258326207934, + "tokens_seen": 621558784 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004099097291875627, + "loss": 3.0228, + "theoretical_loss": 3.8257907919233283, + "tokens_seen": 621624320 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004098996990972919, + "loss": 3.0766, + "theoretical_loss": 3.825748968697539, + "tokens_seen": 621689856 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004098896690070211, + "loss": 3.0689, + "theoretical_loss": 3.8257071511146754, + "tokens_seen": 621755392 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004098796389167503, + "loss": 3.141, + "theoretical_loss": 3.8256653391733817, + "tokens_seen": 621820928 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040986960882647945, + "loss": 3.0572, + "theoretical_loss": 3.8256235328723025, + "tokens_seen": 621886464 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040985957873620864, + "loss": 3.0608, + "theoretical_loss": 3.8255817322100825, + "tokens_seen": 621952000 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004098495486459378, + "loss": 3.0622, + "theoretical_loss": 3.825539937185367, + "tokens_seen": 622017536 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040983951855566705, + "loss": 2.9501, + "theoretical_loss": 3.825498147796802, + "tokens_seen": 622083072 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004098294884653962, + "loss": 2.9649, + "theoretical_loss": 3.825456364043034, + "tokens_seen": 622148608 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004098194583751254, + "loss": 3.0123, + "theoretical_loss": 3.825414585922709, + "tokens_seen": 622214144 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004098094282848546, + "loss": 3.073, + "theoretical_loss": 3.825372813434475, + "tokens_seen": 622279680 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004097993981945838, + "loss": 3.0117, + "theoretical_loss": 3.825331046576979, + "tokens_seen": 622345216 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040978936810431296, + "loss": 3.0877, + "theoretical_loss": 3.8252892853488705, + "tokens_seen": 622410752 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040977933801404214, + "loss": 3.017, + "theoretical_loss": 3.8252475297487964, + "tokens_seen": 622476288 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1502358, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.206096649169922, + "objective/train/theoretical_loss": 3.8252057797754073, + "objective/train/tokens_used": 643001824, + "theoretical_loss": 3.8252057797754073, + "tokens_seen": 622541824 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004097693079237713, + "loss": 3.0738, + "theoretical_loss": 3.8252057797754073, + "tokens_seen": 622541824 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040975927783350055, + "loss": 3.1927, + "theoretical_loss": 3.8251640354273517, + "tokens_seen": 622607360 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004097492477432297, + "loss": 3.1363, + "theoretical_loss": 3.82512229670328, + "tokens_seen": 622672896 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004097392176529589, + "loss": 3.0675, + "theoretical_loss": 3.8250805636018432, + "tokens_seen": 622738432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040972918756268804, + "loss": 3.0942, + "theoretical_loss": 3.825038836121692, + "tokens_seen": 622803968 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004097191574724173, + "loss": 3.0829, + "theoretical_loss": 3.824997114261478, + "tokens_seen": 622869504 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040970912738214646, + "loss": 2.9905, + "theoretical_loss": 3.8249553980198523, + "tokens_seen": 622935040 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040969909729187564, + "loss": 3.0441, + "theoretical_loss": 3.824913687395468, + "tokens_seen": 623000576 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004096890672016048, + "loss": 3.0152, + "theoretical_loss": 3.824871982386978, + "tokens_seen": 623066112 + }, + { + "epoch": 7.02, + "learning_rate": 0.000409679037111334, + "loss": 3.0942, + "theoretical_loss": 3.8248302829930347, + "tokens_seen": 623131648 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004096690070210632, + "loss": 3.1629, + "theoretical_loss": 3.824788589212293, + "tokens_seen": 623197184 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004096589769307924, + "loss": 2.9811, + "theoretical_loss": 3.8247469010434063, + "tokens_seen": 623262720 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040964894684052155, + "loss": 3.1536, + "theoretical_loss": 3.8247052184850303, + "tokens_seen": 623328256 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004096389167502508, + "loss": 3.0162, + "theoretical_loss": 3.824663541535819, + "tokens_seen": 623393792 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040962888665997996, + "loss": 3.1228, + "theoretical_loss": 3.8246218701944286, + "tokens_seen": 623459328 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040961885656970914, + "loss": 2.9294, + "theoretical_loss": 3.824580204459515, + "tokens_seen": 623524864 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004096088264794383, + "loss": 3.147, + "theoretical_loss": 3.824538544329735, + "tokens_seen": 623590400 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004095987963891675, + "loss": 3.1436, + "theoretical_loss": 3.824496889803746, + "tokens_seen": 623655936 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004095887662988967, + "loss": 2.9688, + "theoretical_loss": 3.824455240880204, + "tokens_seen": 623721472 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004095787362086259, + "loss": 3.0472, + "theoretical_loss": 3.8244135975577684, + "tokens_seen": 623787008 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040956870611835505, + "loss": 3.0413, + "theoretical_loss": 3.824371959835097, + "tokens_seen": 623852544 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004095586760280843, + "loss": 3.1349, + "theoretical_loss": 3.824330327710849, + "tokens_seen": 623918080 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004095486459378134, + "loss": 3.062, + "theoretical_loss": 3.8242887011836832, + "tokens_seen": 623983616 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040953861584754265, + "loss": 3.1018, + "theoretical_loss": 3.8242470802522597, + "tokens_seen": 624049152 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040952858575727183, + "loss": 3.1212, + "theoretical_loss": 3.824205464915239, + "tokens_seen": 624114688 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1505157, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1323907375335693, + "objective/train/theoretical_loss": 3.824163855171281, + "objective/train/tokens_used": 644640224, + "theoretical_loss": 3.824163855171281, + "tokens_seen": 624180224 + }, + { + "epoch": 7.02, + "learning_rate": 0.000409518555667001, + "loss": 3.0579, + "theoretical_loss": 3.824163855171281, + "tokens_seen": 624180224 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004095085255767302, + "loss": 2.9657, + "theoretical_loss": 3.824122251019048, + "tokens_seen": 624245760 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040949849548645937, + "loss": 3.033, + "theoretical_loss": 3.8240806524572006, + "tokens_seen": 624311296 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040948846539618855, + "loss": 3.0691, + "theoretical_loss": 3.8240390594844014, + "tokens_seen": 624376832 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004094784353059178, + "loss": 3.0116, + "theoretical_loss": 3.823997472099313, + "tokens_seen": 624442368 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004094684052156469, + "loss": 3.1168, + "theoretical_loss": 3.8239558903005975, + "tokens_seen": 624507904 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040945837512537615, + "loss": 3.1069, + "theoretical_loss": 3.8239143140869203, + "tokens_seen": 624573440 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040944834503510533, + "loss": 3.1174, + "theoretical_loss": 3.8238727434569433, + "tokens_seen": 624638976 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004094383149448345, + "loss": 3.0977, + "theoretical_loss": 3.823831178409332, + "tokens_seen": 624704512 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004094282848545637, + "loss": 3.045, + "theoretical_loss": 3.8237896189427514, + "tokens_seen": 624770048 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004094182547642929, + "loss": 2.9942, + "theoretical_loss": 3.823748065055866, + "tokens_seen": 624835584 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040940822467402206, + "loss": 3.1826, + "theoretical_loss": 3.8237065167473423, + "tokens_seen": 624901120 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004093981945837513, + "loss": 3.1443, + "theoretical_loss": 3.8236649740158466, + "tokens_seen": 624966656 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004093881644934804, + "loss": 3.0091, + "theoretical_loss": 3.8236234368600446, + "tokens_seen": 625032192 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040937813440320965, + "loss": 3.1081, + "theoretical_loss": 3.823581905278604, + "tokens_seen": 625097728 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004093681043129388, + "loss": 3.0485, + "theoretical_loss": 3.823540379270193, + "tokens_seen": 625163264 + }, + { + "epoch": 7.02, + "learning_rate": 0.000409358074222668, + "loss": 3.0006, + "theoretical_loss": 3.823498858833479, + "tokens_seen": 625228800 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004093480441323972, + "loss": 3.1776, + "theoretical_loss": 3.8234573439671307, + "tokens_seen": 625294336 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004093380140421264, + "loss": 3.1503, + "theoretical_loss": 3.8234158346698166, + "tokens_seen": 625359872 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040932798395185556, + "loss": 3.1599, + "theoretical_loss": 3.823374330940207, + "tokens_seen": 625425408 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004093179538615848, + "loss": 3.1123, + "theoretical_loss": 3.823332832776971, + "tokens_seen": 625490944 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004093079237713139, + "loss": 2.9994, + "theoretical_loss": 3.8232913401787796, + "tokens_seen": 625556480 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040929789368104316, + "loss": 3.0932, + "theoretical_loss": 3.8232498531443033, + "tokens_seen": 625622016 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004092878635907723, + "loss": 3.1428, + "theoretical_loss": 3.8232083716722136, + "tokens_seen": 625687552 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004092778335005015, + "loss": 3.1432, + "theoretical_loss": 3.823166895761182, + "tokens_seen": 625753088 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1509779, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.053635835647583, + "objective/train/theoretical_loss": 3.8231254254098808, + "objective/train/tokens_used": 646278624, + "theoretical_loss": 3.8231254254098808, + "tokens_seen": 625818624 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004092678034102307, + "loss": 3.1199, + "theoretical_loss": 3.8231254254098808, + "tokens_seen": 625818624 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004092577733199599, + "loss": 3.0382, + "theoretical_loss": 3.823083960616982, + "tokens_seen": 625884160 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040924774322968906, + "loss": 3.1284, + "theoretical_loss": 3.8230425013811598, + "tokens_seen": 625949696 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040923771313941824, + "loss": 3.1057, + "theoretical_loss": 3.8230010477010867, + "tokens_seen": 626015232 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004092276830491474, + "loss": 2.9536, + "theoretical_loss": 3.8229595995754373, + "tokens_seen": 626080768 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040921765295887666, + "loss": 3.1831, + "theoretical_loss": 3.8229181570028863, + "tokens_seen": 626146304 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004092076228686058, + "loss": 3.1139, + "theoretical_loss": 3.8228767199821077, + "tokens_seen": 626211840 + }, + { + "epoch": 7.02, + "learning_rate": 0.000409197592778335, + "loss": 3.0619, + "theoretical_loss": 3.8228352885117776, + "tokens_seen": 626277376 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040918756268806415, + "loss": 3.1432, + "theoretical_loss": 3.8227938625905717, + "tokens_seen": 626342912 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004091775325977934, + "loss": 3.0713, + "theoretical_loss": 3.8227524422171664, + "tokens_seen": 626408448 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040916750250752257, + "loss": 3.0909, + "theoretical_loss": 3.8227110273902376, + "tokens_seen": 626473984 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040915747241725175, + "loss": 3.1177, + "theoretical_loss": 3.8226696181084634, + "tokens_seen": 626539520 + }, + { + "epoch": 7.02, + "learning_rate": 0.000409147442326981, + "loss": 3.0373, + "theoretical_loss": 3.8226282143705212, + "tokens_seen": 626605056 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040913741223671016, + "loss": 3.0608, + "theoretical_loss": 3.822586816175089, + "tokens_seen": 626670592 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040912738214643934, + "loss": 2.9975, + "theoretical_loss": 3.822545423520846, + "tokens_seen": 626736128 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004091173520561685, + "loss": 3.0898, + "theoretical_loss": 3.8225040364064697, + "tokens_seen": 626801664 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004091073219658977, + "loss": 3.0902, + "theoretical_loss": 3.8224626548306406, + "tokens_seen": 626867200 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004090972918756269, + "loss": 2.9404, + "theoretical_loss": 3.8224212787920386, + "tokens_seen": 626932736 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004090872617853561, + "loss": 3.0599, + "theoretical_loss": 3.822379908289344, + "tokens_seen": 626998272 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040907723169508525, + "loss": 3.0972, + "theoretical_loss": 3.822338543321237, + "tokens_seen": 627063808 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004090672016048145, + "loss": 3.0086, + "theoretical_loss": 3.8222971838863997, + "tokens_seen": 627129344 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004090571715145436, + "loss": 3.1128, + "theoretical_loss": 3.8222558299835137, + "tokens_seen": 627194880 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040904714142427285, + "loss": 3.1146, + "theoretical_loss": 3.8222144816112604, + "tokens_seen": 627260416 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040903711133400203, + "loss": 3.1719, + "theoretical_loss": 3.8221731387683233, + "tokens_seen": 627325952 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004090270812437312, + "loss": 3.0667, + "theoretical_loss": 3.822131801453385, + "tokens_seen": 627391488 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8800454139709473, + "objective/train/theoretical_loss": 3.822090469665129, + "objective/train/tokens_used": 647917024, + "theoretical_loss": 3.822090469665129, + "tokens_seen": 627457024 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004090170511534604, + "loss": 3.0267, + "theoretical_loss": 3.822090469665129, + "tokens_seen": 627457024 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040900702106318957, + "loss": 3.0645, + "theoretical_loss": 3.8220491434022392, + "tokens_seen": 627522560 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040899699097291875, + "loss": 3.0612, + "theoretical_loss": 3.8220078226634007, + "tokens_seen": 627588096 + }, + { + "epoch": 7.02, + "learning_rate": 0.000408986960882648, + "loss": 3.0412, + "theoretical_loss": 3.8219665074472977, + "tokens_seen": 627653632 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004089769307923771, + "loss": 3.153, + "theoretical_loss": 3.8219251977526154, + "tokens_seen": 627719168 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040896690070210635, + "loss": 3.1415, + "theoretical_loss": 3.8218838935780406, + "tokens_seen": 627784704 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040895687061183553, + "loss": 3.0354, + "theoretical_loss": 3.8218425949222583, + "tokens_seen": 627850240 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004089468405215647, + "loss": 3.0786, + "theoretical_loss": 3.8218013017839554, + "tokens_seen": 627915776 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004089368104312939, + "loss": 2.9794, + "theoretical_loss": 3.8217600141618195, + "tokens_seen": 627981312 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004089267803410231, + "loss": 3.0995, + "theoretical_loss": 3.821718732054538, + "tokens_seen": 628046848 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040891675025075226, + "loss": 2.9997, + "theoretical_loss": 3.821677455460799, + "tokens_seen": 628112384 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004089067201604815, + "loss": 3.2475, + "theoretical_loss": 3.8216361843792903, + "tokens_seen": 628177920 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004088966900702106, + "loss": 3.0226, + "theoretical_loss": 3.821594918808702, + "tokens_seen": 628243456 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040888665997993985, + "loss": 3.0503, + "theoretical_loss": 3.821553658747723, + "tokens_seen": 628308992 + }, + { + "epoch": 7.02, + "learning_rate": 0.000408876629889669, + "loss": 2.9278, + "theoretical_loss": 3.821512404195042, + "tokens_seen": 628374528 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004088665997993982, + "loss": 3.0106, + "theoretical_loss": 3.821471155149351, + "tokens_seen": 628440064 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004088565697091274, + "loss": 3.0477, + "theoretical_loss": 3.8214299116093398, + "tokens_seen": 628505600 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004088465396188566, + "loss": 3.1122, + "theoretical_loss": 3.8213886735736993, + "tokens_seen": 628571136 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040883650952858576, + "loss": 3.1052, + "theoretical_loss": 3.8213474410411217, + "tokens_seen": 628636672 + }, + { + "epoch": 7.02, + "learning_rate": 0.000408826479438315, + "loss": 3.0687, + "theoretical_loss": 3.8213062140102987, + "tokens_seen": 628702208 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004088164493480441, + "loss": 3.0633, + "theoretical_loss": 3.8212649924799225, + "tokens_seen": 628767744 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040880641925777336, + "loss": 3.1167, + "theoretical_loss": 3.821223776448687, + "tokens_seen": 628833280 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004087963891675025, + "loss": 2.9292, + "theoretical_loss": 3.821182565915285, + "tokens_seen": 628898816 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004087863590772317, + "loss": 3.0355, + "theoretical_loss": 3.8211413608784106, + "tokens_seen": 628964352 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004087763289869609, + "loss": 3.0354, + "theoretical_loss": 3.8211001613367577, + "tokens_seen": 629029888 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0217220783233643, + "objective/train/theoretical_loss": 3.8210589672890203, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.8210589672890203, + "tokens_seen": 629095424 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004087662988966901, + "loss": 3.1055, + "theoretical_loss": 3.8210589672890203, + "tokens_seen": 629095424 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040875626880641926, + "loss": 3.1322, + "theoretical_loss": 3.8210177787338955, + "tokens_seen": 629160960 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040874623871614844, + "loss": 3.0385, + "theoretical_loss": 3.8209765956700776, + "tokens_seen": 629226496 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004087362086258776, + "loss": 3.0303, + "theoretical_loss": 3.820935418096263, + "tokens_seen": 629292032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040872617853560686, + "loss": 3.0343, + "theoretical_loss": 3.8208942460111484, + "tokens_seen": 629357568 + }, + { + "epoch": 7.02, + "learning_rate": 0.000408716148445336, + "loss": 3.1043, + "theoretical_loss": 3.82085307941343, + "tokens_seen": 629423104 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004087061183550652, + "loss": 3.0442, + "theoretical_loss": 3.8208119183018066, + "tokens_seen": 629488640 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040869608826479435, + "loss": 3.0874, + "theoretical_loss": 3.8207707626749743, + "tokens_seen": 629554176 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004086860581745236, + "loss": 3.1032, + "theoretical_loss": 3.8207296125316335, + "tokens_seen": 629619712 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040867602808425277, + "loss": 3.0608, + "theoretical_loss": 3.8206884678704807, + "tokens_seen": 629685248 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040866599799398195, + "loss": 3.0334, + "theoretical_loss": 3.8206473286902165, + "tokens_seen": 629750784 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040865596790371113, + "loss": 3.0302, + "theoretical_loss": 3.82060619498954, + "tokens_seen": 629816320 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040864593781344036, + "loss": 3.0878, + "theoretical_loss": 3.820565066767151, + "tokens_seen": 629881856 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004086359077231695, + "loss": 3.0516, + "theoretical_loss": 3.820523944021751, + "tokens_seen": 629947392 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004086258776328987, + "loss": 3.0833, + "theoretical_loss": 3.8204828267520403, + "tokens_seen": 630012928 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040861584754262785, + "loss": 3.123, + "theoretical_loss": 3.8204417149567202, + "tokens_seen": 630078464 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004086058174523571, + "loss": 3.1763, + "theoretical_loss": 3.8204006086344933, + "tokens_seen": 630144000 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040859578736208627, + "loss": 3.1301, + "theoretical_loss": 3.820359507784061, + "tokens_seen": 630209536 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040858575727181545, + "loss": 3.0378, + "theoretical_loss": 3.8203184124041263, + "tokens_seen": 630275072 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040857572718154463, + "loss": 3.1469, + "theoretical_loss": 3.820277322493392, + "tokens_seen": 630340608 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004085656970912738, + "loss": 2.9765, + "theoretical_loss": 3.820236238050563, + "tokens_seen": 630406144 + }, + { + "epoch": 7.02, + "learning_rate": 0.000408555667001003, + "loss": 3.1372, + "theoretical_loss": 3.8201951590743413, + "tokens_seen": 630471680 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040854563691073223, + "loss": 3.0123, + "theoretical_loss": 3.8201540855634333, + "tokens_seen": 630537216 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040853560682046136, + "loss": 3.0961, + "theoretical_loss": 3.8201130175165434, + "tokens_seen": 630602752 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004085255767301906, + "loss": 3.0529, + "theoretical_loss": 3.8200719549323763, + "tokens_seen": 630668288 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.202115774154663, + "objective/train/theoretical_loss": 3.820030897809639, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.820030897809639, + "tokens_seen": 630733824 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004085155466399197, + "loss": 3.1556, + "theoretical_loss": 3.820030897809639, + "tokens_seen": 630733824 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040850551654964895, + "loss": 3.0817, + "theoretical_loss": 3.8199898461470365, + "tokens_seen": 630799360 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040849548645937814, + "loss": 2.9984, + "theoretical_loss": 3.819948799943276, + "tokens_seen": 630864896 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004084854563691073, + "loss": 3.0867, + "theoretical_loss": 3.8199077591970654, + "tokens_seen": 630930432 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004084754262788365, + "loss": 3.0754, + "theoretical_loss": 3.8198667239071113, + "tokens_seen": 630995968 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040846539618856573, + "loss": 3.0367, + "theoretical_loss": 3.8198256940721222, + "tokens_seen": 631061504 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040845536609829486, + "loss": 2.9656, + "theoretical_loss": 3.819784669690806, + "tokens_seen": 631127040 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004084453360080241, + "loss": 3.1339, + "theoretical_loss": 3.8197436507618723, + "tokens_seen": 631192576 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004084353059177532, + "loss": 3.0863, + "theoretical_loss": 3.8197026372840304, + "tokens_seen": 631258112 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040842527582748246, + "loss": 3.0674, + "theoretical_loss": 3.819661629255989, + "tokens_seen": 631323648 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040841524573721164, + "loss": 3.0059, + "theoretical_loss": 3.8196206266764596, + "tokens_seen": 631389184 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004084052156469408, + "loss": 3.1048, + "theoretical_loss": 3.819579629544153, + "tokens_seen": 631454720 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040839518555667005, + "loss": 3.0618, + "theoretical_loss": 3.819538637857779, + "tokens_seen": 631520256 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004083851554663992, + "loss": 3.0244, + "theoretical_loss": 3.81949765161605, + "tokens_seen": 631585792 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004083751253761284, + "loss": 2.9891, + "theoretical_loss": 3.819456670817678, + "tokens_seen": 631651328 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004083650952858576, + "loss": 3.0304, + "theoretical_loss": 3.819415695461376, + "tokens_seen": 631716864 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004083550651955868, + "loss": 3.0161, + "theoretical_loss": 3.8193747255458548, + "tokens_seen": 631782400 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040834503510531596, + "loss": 3.0975, + "theoretical_loss": 3.8193337610698297, + "tokens_seen": 631847936 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004083350050150452, + "loss": 3.0691, + "theoretical_loss": 3.819292802032013, + "tokens_seen": 631913472 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004083249749247743, + "loss": 3.0596, + "theoretical_loss": 3.819251848431121, + "tokens_seen": 631979008 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040831494483450356, + "loss": 3.1359, + "theoretical_loss": 3.819210900265866, + "tokens_seen": 632044544 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004083049147442327, + "loss": 3.0876, + "theoretical_loss": 3.819169957534964, + "tokens_seen": 632110080 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004082948846539619, + "loss": 2.98, + "theoretical_loss": 3.8191290202371304, + "tokens_seen": 632175616 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004082848545636911, + "loss": 3.0917, + "theoretical_loss": 3.8190880883710814, + "tokens_seen": 632241152 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004082748244734203, + "loss": 3.1197, + "theoretical_loss": 3.819047161935533, + "tokens_seen": 632306688 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.931941509246826, + "objective/train/theoretical_loss": 3.819006240929202, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.819006240929202, + "tokens_seen": 632372224 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040826479438314946, + "loss": 3.1013, + "theoretical_loss": 3.819006240929202, + "tokens_seen": 632372224 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040825476429287864, + "loss": 3.1127, + "theoretical_loss": 3.818965325350806, + "tokens_seen": 632437760 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004082447342026078, + "loss": 2.9829, + "theoretical_loss": 3.8189244151990622, + "tokens_seen": 632503296 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040823470411233706, + "loss": 3.109, + "theoretical_loss": 3.818883510472689, + "tokens_seen": 632568832 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004082246740220662, + "loss": 3.0841, + "theoretical_loss": 3.818842611170405, + "tokens_seen": 632634368 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004082146439317954, + "loss": 3.0542, + "theoretical_loss": 3.8188017172909285, + "tokens_seen": 632699904 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040820461384152455, + "loss": 3.028, + "theoretical_loss": 3.8187608288329793, + "tokens_seen": 632765440 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004081945837512538, + "loss": 3.0326, + "theoretical_loss": 3.8187199457952774, + "tokens_seen": 632830976 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040818455366098297, + "loss": 3.0987, + "theoretical_loss": 3.8186790681765435, + "tokens_seen": 632896512 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040817452357071215, + "loss": 2.9057, + "theoretical_loss": 3.8186381959754976, + "tokens_seen": 632962048 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040816449348044133, + "loss": 3.1248, + "theoretical_loss": 3.818597329190861, + "tokens_seen": 633027584 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040815446339017056, + "loss": 2.9284, + "theoretical_loss": 3.818556467821355, + "tokens_seen": 633093120 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004081444332998997, + "loss": 3.0637, + "theoretical_loss": 3.818515611865702, + "tokens_seen": 633158656 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004081344032096289, + "loss": 3.0155, + "theoretical_loss": 3.8184747613226246, + "tokens_seen": 633224192 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040812437311935805, + "loss": 3.1324, + "theoretical_loss": 3.8184339161908456, + "tokens_seen": 633289728 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004081143430290873, + "loss": 3.0656, + "theoretical_loss": 3.818393076469088, + "tokens_seen": 633355264 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040810431293881647, + "loss": 3.0061, + "theoretical_loss": 3.8183522421560756, + "tokens_seen": 633420800 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040809428284854565, + "loss": 3.0253, + "theoretical_loss": 3.818311413250533, + "tokens_seen": 633486336 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040808425275827483, + "loss": 3.0356, + "theoretical_loss": 3.818270589751184, + "tokens_seen": 633551872 + }, + { + "epoch": 7.02, + "learning_rate": 0.000408074222668004, + "loss": 3.0845, + "theoretical_loss": 3.818229771656754, + "tokens_seen": 633617408 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004080641925777332, + "loss": 3.1416, + "theoretical_loss": 3.8181889589659694, + "tokens_seen": 633682944 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040805416248746243, + "loss": 3.1309, + "theoretical_loss": 3.818148151677555, + "tokens_seen": 633748480 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040804413239719156, + "loss": 3.1434, + "theoretical_loss": 3.818107349790237, + "tokens_seen": 633814016 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004080341023069208, + "loss": 3.097, + "theoretical_loss": 3.8180665533027436, + "tokens_seen": 633879552 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004080240722166499, + "loss": 3.0981, + "theoretical_loss": 3.8180257622138, + "tokens_seen": 633945088 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9158191680908203, + "objective/train/theoretical_loss": 3.8179849765221356, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.8179849765221356, + "tokens_seen": 634010624 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040801404212637915, + "loss": 3.0361, + "theoretical_loss": 3.8179849765221356, + "tokens_seen": 634010624 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040800401203610834, + "loss": 3.1214, + "theoretical_loss": 3.8179441962264775, + "tokens_seen": 634076160 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004079939819458375, + "loss": 3.0771, + "theoretical_loss": 3.8179034213255547, + "tokens_seen": 634141696 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004079839518555667, + "loss": 3.1217, + "theoretical_loss": 3.8178626518180954, + "tokens_seen": 634207232 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040797392176529593, + "loss": 3.082, + "theoretical_loss": 3.81782188770283, + "tokens_seen": 634272768 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040796389167502506, + "loss": 3.0364, + "theoretical_loss": 3.8177811289784866, + "tokens_seen": 634338304 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004079538615847543, + "loss": 3.0921, + "theoretical_loss": 3.8177403756437975, + "tokens_seen": 634403840 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004079438314944834, + "loss": 3.0737, + "theoretical_loss": 3.8176996276974924, + "tokens_seen": 634469376 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040793380140421266, + "loss": 3.0516, + "theoretical_loss": 3.817658885138302, + "tokens_seen": 634534912 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040792377131394184, + "loss": 3.0106, + "theoretical_loss": 3.8176181479649585, + "tokens_seen": 634600448 + }, + { + "epoch": 7.02, + "learning_rate": 0.000407913741223671, + "loss": 3.0829, + "theoretical_loss": 3.817577416176193, + "tokens_seen": 634665984 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004079037111334002, + "loss": 3.123, + "theoretical_loss": 3.817536689770739, + "tokens_seen": 634731520 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004078936810431294, + "loss": 3.0519, + "theoretical_loss": 3.817495968747328, + "tokens_seen": 634797056 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040788365095285856, + "loss": 3.0276, + "theoretical_loss": 3.8174552531046944, + "tokens_seen": 634862592 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004078736208625878, + "loss": 3.1181, + "theoretical_loss": 3.817414542841571, + "tokens_seen": 634928128 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004078635907723169, + "loss": 2.9802, + "theoretical_loss": 3.8173738379566933, + "tokens_seen": 634993664 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040785356068204616, + "loss": 3.1206, + "theoretical_loss": 3.817333138448794, + "tokens_seen": 635059200 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004078435305917753, + "loss": 3.0488, + "theoretical_loss": 3.8172924443166085, + "tokens_seen": 635124736 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004078335005015045, + "loss": 3.0572, + "theoretical_loss": 3.817251755558873, + "tokens_seen": 635190272 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004078234704112337, + "loss": 2.976, + "theoretical_loss": 3.817211072174323, + "tokens_seen": 635255808 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004078134403209629, + "loss": 3.0426, + "theoretical_loss": 3.8171703941616935, + "tokens_seen": 635321344 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040780341023069207, + "loss": 2.9401, + "theoretical_loss": 3.817129721519723, + "tokens_seen": 635386880 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004077933801404213, + "loss": 3.047, + "theoretical_loss": 3.817089054247148, + "tokens_seen": 635452416 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040778335005015043, + "loss": 3.1592, + "theoretical_loss": 3.8170483923427048, + "tokens_seen": 635517952 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040777331995987966, + "loss": 3.0977, + "theoretical_loss": 3.8170077358051326, + "tokens_seen": 635583488 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.057966947555542, + "objective/train/theoretical_loss": 3.8169670846331702, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.8169670846331702, + "tokens_seen": 635649024 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004077632898696088, + "loss": 3.127, + "theoretical_loss": 3.8169670846331702, + "tokens_seen": 635649024 + }, + { + "epoch": 7.02, + "learning_rate": 0.000407753259779338, + "loss": 3.0447, + "theoretical_loss": 3.816926438825555, + "tokens_seen": 635714560 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004077432296890672, + "loss": 3.0785, + "theoretical_loss": 3.816885798381027, + "tokens_seen": 635780096 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004077331995987964, + "loss": 3.1599, + "theoretical_loss": 3.816845163298326, + "tokens_seen": 635845632 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040772316950852557, + "loss": 3.073, + "theoretical_loss": 3.8168045335761915, + "tokens_seen": 635911168 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040771313941825475, + "loss": 3.0338, + "theoretical_loss": 3.816763909213364, + "tokens_seen": 635976704 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040770310932798393, + "loss": 3.0799, + "theoretical_loss": 3.816723290208585, + "tokens_seen": 636042240 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040769307923771317, + "loss": 2.9906, + "theoretical_loss": 3.8166826765605952, + "tokens_seen": 636107776 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004076830491474423, + "loss": 2.9673, + "theoretical_loss": 3.816642068268137, + "tokens_seen": 636173312 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040767301905717153, + "loss": 3.0233, + "theoretical_loss": 3.816601465329952, + "tokens_seen": 636238848 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004076629889669007, + "loss": 3.1406, + "theoretical_loss": 3.8165608677447835, + "tokens_seen": 636304384 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004076529588766299, + "loss": 3.0258, + "theoretical_loss": 3.816520275511374, + "tokens_seen": 636369920 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004076429287863591, + "loss": 3.1634, + "theoretical_loss": 3.816479688628467, + "tokens_seen": 636435456 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040763289869608825, + "loss": 3.0324, + "theoretical_loss": 3.8164391070948067, + "tokens_seen": 636500992 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004076228686058175, + "loss": 3.1152, + "theoretical_loss": 3.816398530909137, + "tokens_seen": 636566528 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040761283851554667, + "loss": 3.0253, + "theoretical_loss": 3.816357960070203, + "tokens_seen": 636632064 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040760280842527585, + "loss": 3.1802, + "theoretical_loss": 3.8163173945767497, + "tokens_seen": 636697600 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040759277833500503, + "loss": 3.1231, + "theoretical_loss": 3.816276834427523, + "tokens_seen": 636763136 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004075827482447342, + "loss": 3.0827, + "theoretical_loss": 3.8162362796212683, + "tokens_seen": 636828672 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004075727181544634, + "loss": 3.0455, + "theoretical_loss": 3.8161957301567324, + "tokens_seen": 636894208 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040756268806419263, + "loss": 2.9735, + "theoretical_loss": 3.8161551860326623, + "tokens_seen": 636959744 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040755265797392176, + "loss": 3.0336, + "theoretical_loss": 3.8161146472478054, + "tokens_seen": 637025280 + }, + { + "epoch": 7.02, + "learning_rate": 0.000407542627883651, + "loss": 3.0277, + "theoretical_loss": 3.8160741138009087, + "tokens_seen": 637090816 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004075325977933801, + "loss": 3.1864, + "theoretical_loss": 3.816033585690721, + "tokens_seen": 637156352 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040752256770310935, + "loss": 3.1146, + "theoretical_loss": 3.8159930629159904, + "tokens_seen": 637221888 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2467246055603027, + "objective/train/theoretical_loss": 3.815952545475466, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.815952545475466, + "tokens_seen": 637287424 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040751253761283854, + "loss": 3.1516, + "theoretical_loss": 3.815952545475466, + "tokens_seen": 637287424 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004075025075225677, + "loss": 2.9374, + "theoretical_loss": 3.8159120333678977, + "tokens_seen": 637352960 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004074924774322969, + "loss": 3.161, + "theoretical_loss": 3.8158715265920344, + "tokens_seen": 637418496 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040748244734202613, + "loss": 3.0144, + "theoretical_loss": 3.815831025146627, + "tokens_seen": 637484032 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040747241725175526, + "loss": 3.1115, + "theoretical_loss": 3.815790529030426, + "tokens_seen": 637549568 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004074623871614845, + "loss": 2.9958, + "theoretical_loss": 3.8157500382421823, + "tokens_seen": 637615104 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004074523570712136, + "loss": 2.9598, + "theoretical_loss": 3.815709552780648, + "tokens_seen": 637680640 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040744232698094286, + "loss": 3.0826, + "theoretical_loss": 3.8156690726445746, + "tokens_seen": 637746176 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040743229689067204, + "loss": 3.0202, + "theoretical_loss": 3.815628597832714, + "tokens_seen": 637811712 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004074222668004012, + "loss": 3.1353, + "theoretical_loss": 3.8155881283438196, + "tokens_seen": 637877248 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004074122367101304, + "loss": 3.1222, + "theoretical_loss": 3.815547664176644, + "tokens_seen": 637942784 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004074022066198596, + "loss": 3.12, + "theoretical_loss": 3.8155072053299417, + "tokens_seen": 638008320 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040739217652958876, + "loss": 3.0967, + "theoretical_loss": 3.815466751802466, + "tokens_seen": 638073856 + }, + { + "epoch": 7.02, + "learning_rate": 0.000407382146439318, + "loss": 3.0843, + "theoretical_loss": 3.8154263035929716, + "tokens_seen": 638139392 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004073721163490471, + "loss": 2.9478, + "theoretical_loss": 3.8153858607002133, + "tokens_seen": 638204928 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040736208625877636, + "loss": 3.1075, + "theoretical_loss": 3.8153454231229462, + "tokens_seen": 638270464 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004073520561685055, + "loss": 3.0129, + "theoretical_loss": 3.815304990859926, + "tokens_seen": 638336000 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004073420260782347, + "loss": 3.0204, + "theoretical_loss": 3.8152645639099094, + "tokens_seen": 638401536 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004073319959879639, + "loss": 2.9986, + "theoretical_loss": 3.815224142271653, + "tokens_seen": 638467072 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004073219658976931, + "loss": 3.0537, + "theoretical_loss": 3.8151837259439123, + "tokens_seen": 638532608 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040731193580742227, + "loss": 3.0873, + "theoretical_loss": 3.815143314925446, + "tokens_seen": 638598144 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004073019057171515, + "loss": 3.1717, + "theoretical_loss": 3.8151029092150117, + "tokens_seen": 638663680 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040729187562688063, + "loss": 3.0147, + "theoretical_loss": 3.8150625088113674, + "tokens_seen": 638729216 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040728184553660986, + "loss": 3.101, + "theoretical_loss": 3.8150221137132716, + "tokens_seen": 638794752 + }, + { + "epoch": 7.02, + "learning_rate": 0.000407271815446339, + "loss": 2.999, + "theoretical_loss": 3.814981723919484, + "tokens_seen": 638860288 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.133014678955078, + "objective/train/theoretical_loss": 3.8149413394287635, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.8149413394287635, + "tokens_seen": 638925824 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004072617853560682, + "loss": 3.1157, + "theoretical_loss": 3.8149413394287635, + "tokens_seen": 638925824 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004072517552657974, + "loss": 3.0546, + "theoretical_loss": 3.81490096023987, + "tokens_seen": 638991360 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004072417251755266, + "loss": 3.1315, + "theoretical_loss": 3.814860586351564, + "tokens_seen": 639056896 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040723169508525577, + "loss": 2.9474, + "theoretical_loss": 3.8148202177626063, + "tokens_seen": 639122432 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040722166499498495, + "loss": 3.1665, + "theoretical_loss": 3.814779854471757, + "tokens_seen": 639187968 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040721163490471413, + "loss": 3.0667, + "theoretical_loss": 3.814739496477779, + "tokens_seen": 639253504 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040720160481444337, + "loss": 3.02, + "theoretical_loss": 3.814699143779434, + "tokens_seen": 639319040 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004071915747241725, + "loss": 3.1581, + "theoretical_loss": 3.8146587963754843, + "tokens_seen": 639384576 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040718154463390173, + "loss": 3.0103, + "theoretical_loss": 3.814618454264692, + "tokens_seen": 639450112 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004071715145436309, + "loss": 3.0673, + "theoretical_loss": 3.814578117445821, + "tokens_seen": 639515648 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004071614844533601, + "loss": 3.1132, + "theoretical_loss": 3.8145377859176355, + "tokens_seen": 639581184 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040715145436308927, + "loss": 3.1032, + "theoretical_loss": 3.814497459678898, + "tokens_seen": 639646720 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040714142427281845, + "loss": 3.1119, + "theoretical_loss": 3.8144571387283737, + "tokens_seen": 639712256 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040713139418254763, + "loss": 3.0547, + "theoretical_loss": 3.8144168230648283, + "tokens_seen": 639777792 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040712136409227687, + "loss": 3.2567, + "theoretical_loss": 3.8143765126870255, + "tokens_seen": 639843328 + }, + { + "epoch": 7.02, + "learning_rate": 0.000407111334002006, + "loss": 3.0491, + "theoretical_loss": 3.8143362075937324, + "tokens_seen": 639908864 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040710130391173523, + "loss": 3.1579, + "theoretical_loss": 3.814295907783715, + "tokens_seen": 639974400 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040709127382146436, + "loss": 3.0557, + "theoretical_loss": 3.814255613255739, + "tokens_seen": 640039936 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004070812437311936, + "loss": 3.0063, + "theoretical_loss": 3.8142153240085714, + "tokens_seen": 640105472 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004070712136409228, + "loss": 3.0265, + "theoretical_loss": 3.81417504004098, + "tokens_seen": 640171008 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040706118355065196, + "loss": 3.1177, + "theoretical_loss": 3.814134761351733, + "tokens_seen": 640236544 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040705115346038114, + "loss": 2.956, + "theoretical_loss": 3.814094487939598, + "tokens_seen": 640302080 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004070411233701103, + "loss": 3.0969, + "theoretical_loss": 3.8140542198033436, + "tokens_seen": 640367616 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004070310932798395, + "loss": 3.1119, + "theoretical_loss": 3.8140139569417393, + "tokens_seen": 640433152 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040702106318956874, + "loss": 3.0478, + "theoretical_loss": 3.8139736993535536, + "tokens_seen": 640498688 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.133199453353882, + "objective/train/theoretical_loss": 3.8139334470375568, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.8139334470375568, + "tokens_seen": 640564224 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040701103309929786, + "loss": 3.0855, + "theoretical_loss": 3.8139334470375568, + "tokens_seen": 640564224 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004070010030090271, + "loss": 3.0703, + "theoretical_loss": 3.8138931999925196, + "tokens_seen": 640629760 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004069909729187563, + "loss": 3.1379, + "theoretical_loss": 3.813852958217212, + "tokens_seen": 640695296 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040698094282848546, + "loss": 3.091, + "theoretical_loss": 3.8138127217104056, + "tokens_seen": 640760832 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040697091273821464, + "loss": 3.0723, + "theoretical_loss": 3.813772490470872, + "tokens_seen": 640826368 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004069608826479438, + "loss": 3.0518, + "theoretical_loss": 3.8137322644973826, + "tokens_seen": 640891904 + }, + { + "epoch": 7.02, + "learning_rate": 0.000406950852557673, + "loss": 2.9272, + "theoretical_loss": 3.81369204378871, + "tokens_seen": 640957440 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040694082246740224, + "loss": 3.1026, + "theoretical_loss": 3.8136518283436267, + "tokens_seen": 641022976 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040693079237713137, + "loss": 2.9726, + "theoretical_loss": 3.813611618160906, + "tokens_seen": 641088512 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004069207622868606, + "loss": 3.1379, + "theoretical_loss": 3.8135714132393215, + "tokens_seen": 641154048 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040691073219658973, + "loss": 3.1861, + "theoretical_loss": 3.813531213577647, + "tokens_seen": 641219584 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040690070210631896, + "loss": 3.0268, + "theoretical_loss": 3.813491019174657, + "tokens_seen": 641285120 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004068906720160482, + "loss": 3.0896, + "theoretical_loss": 3.8134508300291263, + "tokens_seen": 641350656 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004068806419257773, + "loss": 2.998, + "theoretical_loss": 3.81341064613983, + "tokens_seen": 641416192 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040687061183550656, + "loss": 3.0922, + "theoretical_loss": 3.8133704675055435, + "tokens_seen": 641481728 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004068605817452357, + "loss": 3.0715, + "theoretical_loss": 3.8133302941250435, + "tokens_seen": 641547264 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004068505516549649, + "loss": 3.1828, + "theoretical_loss": 3.8132901259971064, + "tokens_seen": 641612800 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004068405215646941, + "loss": 3.1869, + "theoretical_loss": 3.8132499631205077, + "tokens_seen": 641678336 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004068304914744233, + "loss": 3.0284, + "theoretical_loss": 3.813209805494026, + "tokens_seen": 641743872 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040682046138415247, + "loss": 3.0521, + "theoretical_loss": 3.8131696531164385, + "tokens_seen": 641809408 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004068104312938817, + "loss": 3.0181, + "theoretical_loss": 3.813129505986524, + "tokens_seen": 641874944 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040680040120361083, + "loss": 3.0878, + "theoretical_loss": 3.8130893641030594, + "tokens_seen": 641940480 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040679037111334006, + "loss": 3.0869, + "theoretical_loss": 3.813049227464825, + "tokens_seen": 642006016 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004067803410230692, + "loss": 3.0481, + "theoretical_loss": 3.813009096070599, + "tokens_seen": 642071552 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004067703109327984, + "loss": 3.1358, + "theoretical_loss": 3.812968969919162, + "tokens_seen": 642137088 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0767900943756104, + "objective/train/theoretical_loss": 3.812928849009294, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.812928849009294, + "tokens_seen": 642202624 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004067602808425276, + "loss": 3.1062, + "theoretical_loss": 3.812928849009294, + "tokens_seen": 642202624 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004067502507522568, + "loss": 3.0686, + "theoretical_loss": 3.8128887333397747, + "tokens_seen": 642268160 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040674022066198597, + "loss": 3.1287, + "theoretical_loss": 3.812848622909386, + "tokens_seen": 642333696 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040673019057171515, + "loss": 3.0701, + "theoretical_loss": 3.812808517716909, + "tokens_seen": 642399232 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040672016048144433, + "loss": 3.0505, + "theoretical_loss": 3.8127684177611254, + "tokens_seen": 642464768 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040671013039117357, + "loss": 3.078, + "theoretical_loss": 3.812728323040817, + "tokens_seen": 642530304 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004067001003009027, + "loss": 3.1187, + "theoretical_loss": 3.8126882335547663, + "tokens_seen": 642595840 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040669007021063193, + "loss": 3.0237, + "theoretical_loss": 3.812648149301757, + "tokens_seen": 642661376 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004066800401203611, + "loss": 3.0765, + "theoretical_loss": 3.8126080702805716, + "tokens_seen": 642726912 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004066700100300903, + "loss": 3.0945, + "theoretical_loss": 3.8125679964899946, + "tokens_seen": 642792448 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040665997993981947, + "loss": 3.0313, + "theoretical_loss": 3.8125279279288096, + "tokens_seen": 642857984 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040664994984954865, + "loss": 3.1273, + "theoretical_loss": 3.812487864595802, + "tokens_seen": 642923520 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040663991975927783, + "loss": 2.9984, + "theoretical_loss": 3.8124478064897556, + "tokens_seen": 642989056 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040662988966900707, + "loss": 3.0499, + "theoretical_loss": 3.812407753609457, + "tokens_seen": 643054592 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004066198595787362, + "loss": 3.0057, + "theoretical_loss": 3.812367705953691, + "tokens_seen": 643120128 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040660982948846543, + "loss": 2.9637, + "theoretical_loss": 3.812327663521245, + "tokens_seen": 643185664 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040659979939819456, + "loss": 3.1478, + "theoretical_loss": 3.8122876263109045, + "tokens_seen": 643251200 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004065897693079238, + "loss": 3.0144, + "theoretical_loss": 3.812247594321457, + "tokens_seen": 643316736 + }, + { + "epoch": 7.02, + "learning_rate": 0.000406579739217653, + "loss": 3.1511, + "theoretical_loss": 3.81220756755169, + "tokens_seen": 643382272 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040656970912738216, + "loss": 3.102, + "theoretical_loss": 3.812167546000391, + "tokens_seen": 643447808 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040655967903711134, + "loss": 3.0222, + "theoretical_loss": 3.8121275296663484, + "tokens_seen": 643513344 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004065496489468405, + "loss": 2.9662, + "theoretical_loss": 3.8120875185483514, + "tokens_seen": 643578880 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004065396188565697, + "loss": 3.1122, + "theoretical_loss": 3.8120475126451883, + "tokens_seen": 643644416 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040652958876629894, + "loss": 3.0822, + "theoretical_loss": 3.8120075119556485, + "tokens_seen": 643709952 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040651955867602806, + "loss": 3.0278, + "theoretical_loss": 3.8119675164785223, + "tokens_seen": 643775488 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2879045009613037, + "objective/train/theoretical_loss": 3.8119275262126, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.8119275262126, + "tokens_seen": 643841024 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004065095285857573, + "loss": 3.1818, + "theoretical_loss": 3.8119275262126, + "tokens_seen": 643841024 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004064994984954865, + "loss": 3.1496, + "theoretical_loss": 3.811887541156672, + "tokens_seen": 643906560 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040648946840521566, + "loss": 3.0333, + "theoretical_loss": 3.8118475613095297, + "tokens_seen": 643972096 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040647943831494484, + "loss": 3.0749, + "theoretical_loss": 3.811807586669964, + "tokens_seen": 644037632 + }, + { + "epoch": 7.02, + "learning_rate": 0.000406469408224674, + "loss": 3.1135, + "theoretical_loss": 3.811767617236767, + "tokens_seen": 644103168 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004064593781344032, + "loss": 3.1436, + "theoretical_loss": 3.8117276530087314, + "tokens_seen": 644168704 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040644934804413244, + "loss": 2.9954, + "theoretical_loss": 3.8116876939846493, + "tokens_seen": 644234240 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040643931795386157, + "loss": 3.1361, + "theoretical_loss": 3.8116477401633144, + "tokens_seen": 644299776 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004064292878635908, + "loss": 2.9817, + "theoretical_loss": 3.81160779154352, + "tokens_seen": 644365312 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040641925777331993, + "loss": 3.02, + "theoretical_loss": 3.81156784812406, + "tokens_seen": 644430848 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040640922768304916, + "loss": 3.0179, + "theoretical_loss": 3.811527909903728, + "tokens_seen": 644496384 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040639919759277834, + "loss": 3.1588, + "theoretical_loss": 3.8114879768813195, + "tokens_seen": 644561920 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004063891675025075, + "loss": 3.0096, + "theoretical_loss": 3.8114480490556293, + "tokens_seen": 644627456 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004063791374122367, + "loss": 3.1825, + "theoretical_loss": 3.8114081264254525, + "tokens_seen": 644692992 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004063691073219659, + "loss": 2.9392, + "theoretical_loss": 3.811368208989586, + "tokens_seen": 644758528 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040635907723169507, + "loss": 3.04, + "theoretical_loss": 3.811328296746825, + "tokens_seen": 644824064 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004063490471414243, + "loss": 3.0656, + "theoretical_loss": 3.8112883896959673, + "tokens_seen": 644889600 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040633901705115343, + "loss": 2.975, + "theoretical_loss": 3.811248487835809, + "tokens_seen": 644955136 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040632898696088267, + "loss": 3.0541, + "theoretical_loss": 3.8112085911651485, + "tokens_seen": 645020672 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040631895687061185, + "loss": 3.1275, + "theoretical_loss": 3.811168699682783, + "tokens_seen": 645086208 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040630892678034103, + "loss": 3.2011, + "theoretical_loss": 3.811128813387511, + "tokens_seen": 645151744 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004062988966900702, + "loss": 3.1214, + "theoretical_loss": 3.811088932278132, + "tokens_seen": 645217280 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004062888665997994, + "loss": 3.0914, + "theoretical_loss": 3.811049056353444, + "tokens_seen": 645282816 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040627883650952857, + "loss": 2.9232, + "theoretical_loss": 3.8110091856122468, + "tokens_seen": 645348352 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004062688064192578, + "loss": 3.1226, + "theoretical_loss": 3.81096932005334, + "tokens_seen": 645413888 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.05462384223938, + "objective/train/theoretical_loss": 3.810929459675525, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.810929459675525, + "tokens_seen": 645479424 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040625877632898693, + "loss": 3.0885, + "theoretical_loss": 3.810929459675525, + "tokens_seen": 645479424 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040624874623871617, + "loss": 3.1287, + "theoretical_loss": 3.8108896044776013, + "tokens_seen": 645544960 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004062387161484453, + "loss": 3.0827, + "theoretical_loss": 3.8108497544583706, + "tokens_seen": 645610496 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040622868605817453, + "loss": 2.9918, + "theoretical_loss": 3.8108099096166344, + "tokens_seen": 645676032 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004062186559679037, + "loss": 3.087, + "theoretical_loss": 3.810770069951195, + "tokens_seen": 645741568 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004062086258776329, + "loss": 2.9767, + "theoretical_loss": 3.810730235460853, + "tokens_seen": 645807104 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004061985957873621, + "loss": 3.1002, + "theoretical_loss": 3.8106904061444133, + "tokens_seen": 645872640 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040618856569709126, + "loss": 3.1381, + "theoretical_loss": 3.8106505820006777, + "tokens_seen": 645938176 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040617853560682044, + "loss": 3.0377, + "theoretical_loss": 3.8106107630284507, + "tokens_seen": 646003712 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040616850551654967, + "loss": 3.1545, + "theoretical_loss": 3.810570949226535, + "tokens_seen": 646069248 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004061584754262788, + "loss": 3.125, + "theoretical_loss": 3.810531140593735, + "tokens_seen": 646134784 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040614844533600804, + "loss": 3.0401, + "theoretical_loss": 3.8104913371288562, + "tokens_seen": 646200320 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040613841524573727, + "loss": 3.0874, + "theoretical_loss": 3.8104515388307036, + "tokens_seen": 646265856 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004061283851554664, + "loss": 3.1923, + "theoretical_loss": 3.810411745698082, + "tokens_seen": 646331392 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040611835506519563, + "loss": 3.0689, + "theoretical_loss": 3.810371957729798, + "tokens_seen": 646396928 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040610832497492476, + "loss": 3.1169, + "theoretical_loss": 3.8103321749246577, + "tokens_seen": 646462464 + }, + { + "epoch": 7.02, + "learning_rate": 0.000406098294884654, + "loss": 3.0817, + "theoretical_loss": 3.810292397281467, + "tokens_seen": 646528000 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004060882647943832, + "loss": 3.2238, + "theoretical_loss": 3.810252624799034, + "tokens_seen": 646593536 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040607823470411236, + "loss": 3.0832, + "theoretical_loss": 3.8102128574761656, + "tokens_seen": 646659072 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040606820461384154, + "loss": 3.0679, + "theoretical_loss": 3.8101730953116704, + "tokens_seen": 646724608 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004060581745235707, + "loss": 3.0581, + "theoretical_loss": 3.8101333383043556, + "tokens_seen": 646790144 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004060481444332999, + "loss": 3.1561, + "theoretical_loss": 3.8100935864530303, + "tokens_seen": 646855680 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040603811434302914, + "loss": 3.0696, + "theoretical_loss": 3.8100538397565042, + "tokens_seen": 646921216 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040602808425275826, + "loss": 3.1189, + "theoretical_loss": 3.810014098213586, + "tokens_seen": 646986752 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004060180541624875, + "loss": 3.109, + "theoretical_loss": 3.8099743618230852, + "tokens_seen": 647052288 + }, + { + "epoch": 7.02, + "objective/train/docs_used": 1512704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2009267807006836, + "objective/train/theoretical_loss": 3.809934630583813, + "objective/train/tokens_used": 647942624, + "theoretical_loss": 3.809934630583813, + "tokens_seen": 647117824 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004060080240722167, + "loss": 3.1657, + "theoretical_loss": 3.809934630583813, + "tokens_seen": 647117824 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040599799398194586, + "loss": 3.0737, + "theoretical_loss": 3.8098949044945796, + "tokens_seen": 647183360 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040598796389167504, + "loss": 2.9884, + "theoretical_loss": 3.809855183554196, + "tokens_seen": 647248896 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004059779338014042, + "loss": 3.0568, + "theoretical_loss": 3.809815467761474, + "tokens_seen": 647314432 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004059679037111334, + "loss": 3.0595, + "theoretical_loss": 3.809775757115225, + "tokens_seen": 647379968 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040595787362086264, + "loss": 3.0926, + "theoretical_loss": 3.809736051614261, + "tokens_seen": 647445504 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040594784353059177, + "loss": 3.0614, + "theoretical_loss": 3.809696351257395, + "tokens_seen": 647511040 + }, + { + "epoch": 7.02, + "learning_rate": 0.000405937813440321, + "loss": 3.1862, + "theoretical_loss": 3.80965665604344, + "tokens_seen": 647576576 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040592778335005013, + "loss": 3.1076, + "theoretical_loss": 3.809616965971209, + "tokens_seen": 647642112 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040591775325977936, + "loss": 3.1288, + "theoretical_loss": 3.809577281039517, + "tokens_seen": 647707648 + }, + { + "epoch": 7.02, + "learning_rate": 0.00040590772316950854, + "loss": 3.0984, + "theoretical_loss": 3.809537601247176, + "tokens_seen": 647773184 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004058976930792377, + "loss": 3.1255, + "theoretical_loss": 3.809497926593003, + "tokens_seen": 647838720 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004058876629889669, + "loss": 3.1453, + "theoretical_loss": 3.8094582570758115, + "tokens_seen": 647904256 + }, + { + "epoch": 7.02, + "learning_rate": 0.0004058776328986961, + "loss": 3.0174, + "theoretical_loss": 3.809422930736024, + "tokens_seen": 647962624 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040586760280842527, + "loss": 2.8799, + "theoretical_loss": 3.809383270927702, + "tokens_seen": 648028160 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004058575727181545, + "loss": 2.9219, + "theoretical_loss": 3.8093436162529386, + "tokens_seen": 648093696 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040584754262788363, + "loss": 2.9858, + "theoretical_loss": 3.8093039667105506, + "tokens_seen": 648159232 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040583751253761287, + "loss": 2.9566, + "theoretical_loss": 3.809264322299355, + "tokens_seen": 648224768 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040582748244734205, + "loss": 2.9407, + "theoretical_loss": 3.8092246830181677, + "tokens_seen": 648290304 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040581745235707123, + "loss": 2.9557, + "theoretical_loss": 3.809185048865808, + "tokens_seen": 648355840 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004058074222668004, + "loss": 2.9809, + "theoretical_loss": 3.809145419841093, + "tokens_seen": 648421376 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004057973921765296, + "loss": 2.9835, + "theoretical_loss": 3.809105795942842, + "tokens_seen": 648486912 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040578736208625877, + "loss": 2.8483, + "theoretical_loss": 3.809066177169873, + "tokens_seen": 648552448 + }, + { + "epoch": 8.0, + "learning_rate": 0.000405777331995988, + "loss": 2.9886, + "theoretical_loss": 3.809026563521005, + "tokens_seen": 648617984 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040576730190571713, + "loss": 2.9758, + "theoretical_loss": 3.8089869549950595, + "tokens_seen": 648683520 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 1563148, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1946511268615723, + "objective/train/theoretical_loss": 3.8089473515908545, + "objective/train/tokens_used": 669209056, + "theoretical_loss": 3.8089473515908545, + "tokens_seen": 648749056 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040575727181544637, + "loss": 3.0978, + "theoretical_loss": 3.8089473515908545, + "tokens_seen": 648749056 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004057472417251755, + "loss": 2.9742, + "theoretical_loss": 3.8089077533072118, + "tokens_seen": 648814592 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040573721163490473, + "loss": 3.0051, + "theoretical_loss": 3.808868160142951, + "tokens_seen": 648880128 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004057271815446339, + "loss": 3.0163, + "theoretical_loss": 3.808828572096894, + "tokens_seen": 648945664 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004057171514543631, + "loss": 2.9084, + "theoretical_loss": 3.808788989167862, + "tokens_seen": 649011200 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004057071213640923, + "loss": 2.9569, + "theoretical_loss": 3.808749411354678, + "tokens_seen": 649076736 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040569709127382146, + "loss": 3.0259, + "theoretical_loss": 3.8087098386561635, + "tokens_seen": 649142272 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040568706118355064, + "loss": 3.0752, + "theoretical_loss": 3.808670271071141, + "tokens_seen": 649207808 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004056770310932799, + "loss": 3.0783, + "theoretical_loss": 3.808630708598434, + "tokens_seen": 649273344 + }, + { + "epoch": 8.0, + "learning_rate": 0.000405667001003009, + "loss": 3.0072, + "theoretical_loss": 3.8085911512368664, + "tokens_seen": 649338880 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040565697091273824, + "loss": 3.042, + "theoretical_loss": 3.8085515989852623, + "tokens_seen": 649404416 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004056469408224674, + "loss": 2.9642, + "theoretical_loss": 3.808512051842445, + "tokens_seen": 649469952 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004056369107321966, + "loss": 3.0888, + "theoretical_loss": 3.8084725098072396, + "tokens_seen": 649535488 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004056268806419258, + "loss": 2.9784, + "theoretical_loss": 3.808432972878472, + "tokens_seen": 649601024 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040561685055165496, + "loss": 3.0273, + "theoretical_loss": 3.8083934410549665, + "tokens_seen": 649666560 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040560682046138414, + "loss": 3.0494, + "theoretical_loss": 3.80835391433555, + "tokens_seen": 649732096 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004055967903711134, + "loss": 3.0037, + "theoretical_loss": 3.8083143927190477, + "tokens_seen": 649797632 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004055867602808425, + "loss": 3.1382, + "theoretical_loss": 3.808274876204287, + "tokens_seen": 649863168 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040557673019057174, + "loss": 2.9693, + "theoretical_loss": 3.808235364790095, + "tokens_seen": 649928704 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040556670010030087, + "loss": 2.8991, + "theoretical_loss": 3.8081958584752984, + "tokens_seen": 649994240 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004055566700100301, + "loss": 3.0612, + "theoretical_loss": 3.808156357258726, + "tokens_seen": 650059776 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004055466399197593, + "loss": 3.0374, + "theoretical_loss": 3.8081168611392053, + "tokens_seen": 650125312 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040553660982948846, + "loss": 3.0165, + "theoretical_loss": 3.8080773701155657, + "tokens_seen": 650190848 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040552657973921764, + "loss": 2.8642, + "theoretical_loss": 3.808037884186635, + "tokens_seen": 650256384 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004055165496489469, + "loss": 3.0194, + "theoretical_loss": 3.8079984033512435, + "tokens_seen": 650321920 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 1566294, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.639150381088257, + "objective/train/theoretical_loss": 3.80795892760822, + "objective/train/tokens_used": 670847456, + "theoretical_loss": 3.80795892760822, + "tokens_seen": 650387456 + }, + { + "epoch": 8.0, + "learning_rate": 0.000405506519558676, + "loss": 3.0118, + "theoretical_loss": 3.80795892760822, + "tokens_seen": 650387456 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040549648946840524, + "loss": 3.0162, + "theoretical_loss": 3.807919456956396, + "tokens_seen": 650452992 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040548645937813437, + "loss": 2.9858, + "theoretical_loss": 3.807879991394601, + "tokens_seen": 650518528 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004054764292878636, + "loss": 3.0309, + "theoretical_loss": 3.8078405309216663, + "tokens_seen": 650584064 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004054663991975928, + "loss": 3.0343, + "theoretical_loss": 3.807801075536423, + "tokens_seen": 650649600 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040545636910732197, + "loss": 3.0191, + "theoretical_loss": 3.8077616252377022, + "tokens_seen": 650715136 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040544633901705115, + "loss": 3.0045, + "theoretical_loss": 3.8077221800243377, + "tokens_seen": 650780672 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040543630892678033, + "loss": 2.8987, + "theoretical_loss": 3.80768273989516, + "tokens_seen": 650846208 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004054262788365095, + "loss": 3.0084, + "theoretical_loss": 3.8076433048490035, + "tokens_seen": 650911744 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040541624874623874, + "loss": 2.9644, + "theoretical_loss": 3.8076038748847, + "tokens_seen": 650977280 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040540621865596787, + "loss": 3.0533, + "theoretical_loss": 3.8075644500010846, + "tokens_seen": 651042816 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004053961885656971, + "loss": 3.0376, + "theoretical_loss": 3.8075250301969903, + "tokens_seen": 651108352 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004053861584754263, + "loss": 2.9781, + "theoretical_loss": 3.8074856154712515, + "tokens_seen": 651173888 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040537612838515547, + "loss": 3.0644, + "theoretical_loss": 3.807446205822704, + "tokens_seen": 651239424 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004053660982948847, + "loss": 3.0617, + "theoretical_loss": 3.807406801250181, + "tokens_seen": 651304960 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040535606820461383, + "loss": 2.9419, + "theoretical_loss": 3.8073674017525203, + "tokens_seen": 651370496 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040534603811434307, + "loss": 2.9616, + "theoretical_loss": 3.807328007328556, + "tokens_seen": 651436032 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040533600802407225, + "loss": 3.0328, + "theoretical_loss": 3.8072886179771257, + "tokens_seen": 651501568 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040532597793380143, + "loss": 3.0257, + "theoretical_loss": 3.807249233697065, + "tokens_seen": 651567104 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004053159478435306, + "loss": 3.0049, + "theoretical_loss": 3.807209854487212, + "tokens_seen": 651632640 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004053059177532598, + "loss": 2.93, + "theoretical_loss": 3.8071704803464033, + "tokens_seen": 651698176 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040529588766298897, + "loss": 3.0001, + "theoretical_loss": 3.8071311112734767, + "tokens_seen": 651763712 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004052858575727182, + "loss": 2.966, + "theoretical_loss": 3.8070917472672714, + "tokens_seen": 651829248 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040527582748244733, + "loss": 2.8559, + "theoretical_loss": 3.8070523883266256, + "tokens_seen": 651894784 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040526579739217657, + "loss": 2.9748, + "theoretical_loss": 3.807013034450377, + "tokens_seen": 651960320 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 1569967, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2018237113952637, + "objective/train/theoretical_loss": 3.8069736856373675, + "objective/train/tokens_used": 672485856, + "theoretical_loss": 3.8069736856373675, + "tokens_seen": 652025856 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004052557673019057, + "loss": 3.0617, + "theoretical_loss": 3.8069736856373675, + "tokens_seen": 652025856 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040524573721163493, + "loss": 3.0399, + "theoretical_loss": 3.8069343418864343, + "tokens_seen": 652091392 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004052357071213641, + "loss": 3.0239, + "theoretical_loss": 3.806895003196419, + "tokens_seen": 652156928 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004052256770310933, + "loss": 2.885, + "theoretical_loss": 3.806855669566162, + "tokens_seen": 652222464 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004052156469408225, + "loss": 2.9304, + "theoretical_loss": 3.806816340994504, + "tokens_seen": 652288000 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040520561685055166, + "loss": 3.037, + "theoretical_loss": 3.806777017480286, + "tokens_seen": 652353536 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040519558676028084, + "loss": 3.1169, + "theoretical_loss": 3.80673769902235, + "tokens_seen": 652419072 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004051855566700101, + "loss": 3.0118, + "theoretical_loss": 3.806698385619538, + "tokens_seen": 652484608 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004051755265797392, + "loss": 3.0555, + "theoretical_loss": 3.8066590772706923, + "tokens_seen": 652550144 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040516549648946844, + "loss": 3.0752, + "theoretical_loss": 3.8066197739746555, + "tokens_seen": 652615680 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004051554663991976, + "loss": 2.8772, + "theoretical_loss": 3.8065804757302715, + "tokens_seen": 652681216 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004051454363089268, + "loss": 3.0175, + "theoretical_loss": 3.8065411825363835, + "tokens_seen": 652746752 + }, + { + "epoch": 8.0, + "learning_rate": 0.000405135406218656, + "loss": 2.9597, + "theoretical_loss": 3.806501894391835, + "tokens_seen": 652812288 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040512537612838516, + "loss": 2.9934, + "theoretical_loss": 3.8064626112954714, + "tokens_seen": 652877824 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040511534603811434, + "loss": 2.8732, + "theoretical_loss": 3.806423333246136, + "tokens_seen": 652943360 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004051053159478436, + "loss": 2.972, + "theoretical_loss": 3.8063840602426753, + "tokens_seen": 653008896 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004050952858575727, + "loss": 3.0319, + "theoretical_loss": 3.8063447922839337, + "tokens_seen": 653074432 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040508525576730194, + "loss": 3.0208, + "theoretical_loss": 3.8063055293687578, + "tokens_seen": 653139968 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040507522567703107, + "loss": 2.9025, + "theoretical_loss": 3.806266271495993, + "tokens_seen": 653205504 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004050651955867603, + "loss": 2.9204, + "theoretical_loss": 3.8062270186644866, + "tokens_seen": 653271040 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004050551654964895, + "loss": 3.0151, + "theoretical_loss": 3.806187770873086, + "tokens_seen": 653336576 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040504513540621866, + "loss": 3.0716, + "theoretical_loss": 3.8061485281206373, + "tokens_seen": 653402112 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040503510531594784, + "loss": 2.9755, + "theoretical_loss": 3.806109290405989, + "tokens_seen": 653467648 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004050250752256771, + "loss": 3.0124, + "theoretical_loss": 3.8060700577279896, + "tokens_seen": 653533184 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004050150451354062, + "loss": 3.08, + "theoretical_loss": 3.806030830085487, + "tokens_seen": 653598720 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 1575114, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.027125597000122, + "objective/train/theoretical_loss": 3.8059916074773303, + "objective/train/tokens_used": 674124256, + "theoretical_loss": 3.8059916074773303, + "tokens_seen": 653664256 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040500501504513544, + "loss": 3.0406, + "theoretical_loss": 3.8059916074773303, + "tokens_seen": 653664256 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040499498495486457, + "loss": 2.9417, + "theoretical_loss": 3.8059523899023686, + "tokens_seen": 653729792 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004049849548645938, + "loss": 3.0269, + "theoretical_loss": 3.8059131773594514, + "tokens_seen": 653795328 + }, + { + "epoch": 8.0, + "learning_rate": 0.000404974924774323, + "loss": 3.0106, + "theoretical_loss": 3.805873969847429, + "tokens_seen": 653860864 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040496489468405217, + "loss": 2.8642, + "theoretical_loss": 3.8058347673651522, + "tokens_seen": 653926400 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040495486459378135, + "loss": 2.9202, + "theoretical_loss": 3.8057955699114707, + "tokens_seen": 653991936 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040494483450351053, + "loss": 2.9471, + "theoretical_loss": 3.805756377485237, + "tokens_seen": 654057472 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004049348044132397, + "loss": 2.9887, + "theoretical_loss": 3.8057171900853017, + "tokens_seen": 654123008 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040492477432296894, + "loss": 2.9057, + "theoretical_loss": 3.805678007710517, + "tokens_seen": 654188544 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040491474423269807, + "loss": 3.1189, + "theoretical_loss": 3.8056388303597357, + "tokens_seen": 654254080 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004049047141424273, + "loss": 2.9867, + "theoretical_loss": 3.8055996580318086, + "tokens_seen": 654319616 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040489468405215643, + "loss": 2.9265, + "theoretical_loss": 3.805560490725591, + "tokens_seen": 654385152 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040488465396188567, + "loss": 2.9132, + "theoretical_loss": 3.8055213284399354, + "tokens_seen": 654450688 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040487462387161485, + "loss": 2.9164, + "theoretical_loss": 3.8054821711736952, + "tokens_seen": 654516224 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040486459378134403, + "loss": 3.1034, + "theoretical_loss": 3.8054430189257253, + "tokens_seen": 654581760 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004048545636910732, + "loss": 3.0391, + "theoretical_loss": 3.8054038716948795, + "tokens_seen": 654647296 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040484453360080245, + "loss": 3.0711, + "theoretical_loss": 3.8053647294800133, + "tokens_seen": 654712832 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004048345035105316, + "loss": 2.9775, + "theoretical_loss": 3.805325592279982, + "tokens_seen": 654778368 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004048244734202608, + "loss": 2.7941, + "theoretical_loss": 3.8052864600936402, + "tokens_seen": 654843904 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040481444332998994, + "loss": 3.0503, + "theoretical_loss": 3.805247332919846, + "tokens_seen": 654909440 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040480441323971917, + "loss": 3.0302, + "theoretical_loss": 3.805208210757454, + "tokens_seen": 654974976 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040479438314944835, + "loss": 2.9643, + "theoretical_loss": 3.805169093605322, + "tokens_seen": 655040512 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040478435305917753, + "loss": 2.9774, + "theoretical_loss": 3.8051299814623065, + "tokens_seen": 655106048 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004047743229689067, + "loss": 2.9089, + "theoretical_loss": 3.8050908743272656, + "tokens_seen": 655171584 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004047642928786359, + "loss": 2.9034, + "theoretical_loss": 3.8050517721990573, + "tokens_seen": 655237120 + }, + { + "debugging/Self-BLEU-5": 0.6868702010722929, + "debugging/distinct-1-grams": 0.7337255873650311, + "debugging/distinct-2-grams": 0.9448987176439994, + "debugging/entropy-1-grams": 6.322116353271571, + "debugging/entropy-2-grams": 7.597009204995782, + "debugging/length": 587.2857142857143, + "debugging/num_segments": 28, + "epoch": 8.0, + "objective/train/docs_used": 1577811, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0758790969848633, + "objective/train/theoretical_loss": 3.805012675076539, + "objective/train/tokens_used": 675762656, + "theoretical_loss": 3.805012675076539, + "tokens_seen": 655302656 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004047542627883651, + "loss": 3.0744, + "theoretical_loss": 3.805012675076539, + "tokens_seen": 655302656 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004047442326980943, + "loss": 3.1221, + "theoretical_loss": 3.8049735829585702, + "tokens_seen": 655368192 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040473420260782344, + "loss": 3.0091, + "theoretical_loss": 3.8049344958440097, + "tokens_seen": 655433728 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004047241725175527, + "loss": 2.9378, + "theoretical_loss": 3.8048954137317175, + "tokens_seen": 655499264 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004047141424272818, + "loss": 3.0725, + "theoretical_loss": 3.804856336620552, + "tokens_seen": 655564800 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040470411233701104, + "loss": 3.0231, + "theoretical_loss": 3.804817264509375, + "tokens_seen": 655630336 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004046940822467402, + "loss": 3.0202, + "theoretical_loss": 3.804778197397046, + "tokens_seen": 655695872 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004046840521564694, + "loss": 2.8958, + "theoretical_loss": 3.8047391352824262, + "tokens_seen": 655761408 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004046740220661986, + "loss": 3.0478, + "theoretical_loss": 3.8047000781643767, + "tokens_seen": 655826944 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004046639919759278, + "loss": 3.0095, + "theoretical_loss": 3.80466102604176, + "tokens_seen": 655892480 + }, + { + "epoch": 8.0, + "learning_rate": 0.000404653961885657, + "loss": 3.0174, + "theoretical_loss": 3.8046219789134366, + "tokens_seen": 655958016 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004046439317953862, + "loss": 3.0988, + "theoretical_loss": 3.8045829367782704, + "tokens_seen": 656023552 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040463390170511536, + "loss": 3.0102, + "theoretical_loss": 3.8045438996351235, + "tokens_seen": 656089088 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040462387161484454, + "loss": 3.002, + "theoretical_loss": 3.8045048674828594, + "tokens_seen": 656154624 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004046138415245738, + "loss": 3.0162, + "theoretical_loss": 3.804465840320341, + "tokens_seen": 656220160 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004046038114343029, + "loss": 3.0826, + "theoretical_loss": 3.8044268181464327, + "tokens_seen": 656285696 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040459378134403214, + "loss": 2.9372, + "theoretical_loss": 3.804387800959999, + "tokens_seen": 656351232 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040458375125376127, + "loss": 3.1039, + "theoretical_loss": 3.8043487887599037, + "tokens_seen": 656416768 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004045737211634905, + "loss": 3.0843, + "theoretical_loss": 3.8043097815450126, + "tokens_seen": 656482304 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004045636910732197, + "loss": 2.99, + "theoretical_loss": 3.8042707793141908, + "tokens_seen": 656547840 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040455366098294886, + "loss": 2.9611, + "theoretical_loss": 3.804231782066304, + "tokens_seen": 656613376 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040454363089267804, + "loss": 2.9911, + "theoretical_loss": 3.804192789800218, + "tokens_seen": 656678912 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004045336008024073, + "loss": 3.0961, + "theoretical_loss": 3.8041538025148003, + "tokens_seen": 656744448 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004045235707121364, + "loss": 3.0933, + "theoretical_loss": 3.804114820208917, + "tokens_seen": 656809984 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004045235707121364, + "loss": 2.9729, + "theoretical_loss": 3.8040758428814354, + "tokens_seen": 656875520 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 1582872, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0849356651306152, + "objective/train/theoretical_loss": 3.8040368705312235, + "objective/train/tokens_used": 677401056, + "theoretical_loss": 3.8040368705312235, + "tokens_seen": 656941056 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040451354062186564, + "loss": 2.9973, + "theoretical_loss": 3.8040368705312235, + "tokens_seen": 656941056 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040450351053159477, + "loss": 2.8934, + "theoretical_loss": 3.803997903157149, + "tokens_seen": 657006592 + }, + { + "epoch": 8.0, + "learning_rate": 0.000404493480441324, + "loss": 3.074, + "theoretical_loss": 3.80395894075808, + "tokens_seen": 657072128 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004044834503510532, + "loss": 2.9072, + "theoretical_loss": 3.803919983332886, + "tokens_seen": 657137664 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040447342026078237, + "loss": 3.0267, + "theoretical_loss": 3.8038810308804347, + "tokens_seen": 657203200 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040446339017051155, + "loss": 3.0174, + "theoretical_loss": 3.8038420833995974, + "tokens_seen": 657268736 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040445336008024073, + "loss": 2.9964, + "theoretical_loss": 3.8038031408892428, + "tokens_seen": 657334272 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004044433299899699, + "loss": 2.9602, + "theoretical_loss": 3.803764203348241, + "tokens_seen": 657399808 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040443329989969915, + "loss": 3.0484, + "theoretical_loss": 3.8037252707754634, + "tokens_seen": 657465344 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040442326980942827, + "loss": 3.0755, + "theoretical_loss": 3.8036863431697796, + "tokens_seen": 657530880 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004044132397191575, + "loss": 3.0494, + "theoretical_loss": 3.803647420530063, + "tokens_seen": 657596416 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040440320962888663, + "loss": 2.9394, + "theoretical_loss": 3.803608502855183, + "tokens_seen": 657661952 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040439317953861587, + "loss": 2.9592, + "theoretical_loss": 3.803569590144013, + "tokens_seen": 657727488 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040438314944834505, + "loss": 3.027, + "theoretical_loss": 3.8035306823954254, + "tokens_seen": 657793024 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040437311935807423, + "loss": 3.0486, + "theoretical_loss": 3.8034917796082923, + "tokens_seen": 657858560 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004043630892678034, + "loss": 3.02, + "theoretical_loss": 3.8034528817814874, + "tokens_seen": 657924096 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040435305917753265, + "loss": 2.9658, + "theoretical_loss": 3.803413988913884, + "tokens_seen": 657989632 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004043430290872618, + "loss": 2.879, + "theoretical_loss": 3.803375101004356, + "tokens_seen": 658055168 + }, + { + "epoch": 8.0, + "learning_rate": 0.000404332998996991, + "loss": 2.8787, + "theoretical_loss": 3.803336218051778, + "tokens_seen": 658120704 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040432296890672014, + "loss": 2.9649, + "theoretical_loss": 3.803297340055024, + "tokens_seen": 658186240 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040431293881644937, + "loss": 2.946, + "theoretical_loss": 3.803258467012969, + "tokens_seen": 658251776 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040430290872617855, + "loss": 2.9334, + "theoretical_loss": 3.8032195989244895, + "tokens_seen": 658317312 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040429287863590773, + "loss": 3.1102, + "theoretical_loss": 3.80318073578846, + "tokens_seen": 658382848 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004042828485456369, + "loss": 2.9778, + "theoretical_loss": 3.8031418776037578, + "tokens_seen": 658448384 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004042728184553661, + "loss": 3.032, + "theoretical_loss": 3.803103024369258, + "tokens_seen": 658513920 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 1585709, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0676233768463135, + "objective/train/theoretical_loss": 3.8030641760838377, + "objective/train/tokens_used": 679039456, + "theoretical_loss": 3.8030641760838377, + "tokens_seen": 658579456 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004042627883650953, + "loss": 2.8802, + "theoretical_loss": 3.8030641760838377, + "tokens_seen": 658579456 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004042527582748245, + "loss": 3.0441, + "theoretical_loss": 3.803025332746375, + "tokens_seen": 658644992 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040424272818455364, + "loss": 3.0609, + "theoretical_loss": 3.8029864943557463, + "tokens_seen": 658710528 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004042326980942829, + "loss": 3.0322, + "theoretical_loss": 3.8029476609108306, + "tokens_seen": 658776064 + }, + { + "epoch": 8.0, + "learning_rate": 0.000404222668004012, + "loss": 3.0671, + "theoretical_loss": 3.802908832410506, + "tokens_seen": 658841600 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040421263791374124, + "loss": 2.9718, + "theoretical_loss": 3.8028700088536502, + "tokens_seen": 658907136 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004042026078234704, + "loss": 3.0549, + "theoretical_loss": 3.802831190239143, + "tokens_seen": 658972672 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004041925777331996, + "loss": 3.0053, + "theoretical_loss": 3.802792376565864, + "tokens_seen": 659038208 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004041825476429288, + "loss": 3.0694, + "theoretical_loss": 3.802753567832693, + "tokens_seen": 659103744 + }, + { + "epoch": 8.0, + "learning_rate": 0.000404172517552658, + "loss": 2.9355, + "theoretical_loss": 3.8027147640385093, + "tokens_seen": 659169280 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040416248746238714, + "loss": 2.9868, + "theoretical_loss": 3.802675965182194, + "tokens_seen": 659234816 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004041524573721164, + "loss": 3.0998, + "theoretical_loss": 3.802637171262628, + "tokens_seen": 659300352 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004041424272818455, + "loss": 3.083, + "theoretical_loss": 3.802598382278692, + "tokens_seen": 659365888 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040413239719157474, + "loss": 3.0402, + "theoretical_loss": 3.802559598229268, + "tokens_seen": 659431424 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004041223671013039, + "loss": 3.0528, + "theoretical_loss": 3.802520819113238, + "tokens_seen": 659496960 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004041123370110331, + "loss": 2.9757, + "theoretical_loss": 3.802482044929484, + "tokens_seen": 659562496 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004041023069207623, + "loss": 2.9538, + "theoretical_loss": 3.8024432756768896, + "tokens_seen": 659628032 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040409227683049147, + "loss": 2.9746, + "theoretical_loss": 3.802404511354336, + "tokens_seen": 659693568 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040408224674022065, + "loss": 2.9427, + "theoretical_loss": 3.802365751960709, + "tokens_seen": 659759104 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004040722166499499, + "loss": 2.8353, + "theoretical_loss": 3.8023269974948897, + "tokens_seen": 659824640 + }, + { + "epoch": 8.0, + "learning_rate": 0.000404062186559679, + "loss": 3.0584, + "theoretical_loss": 3.8022882479557647, + "tokens_seen": 659890176 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040405215646940824, + "loss": 3.0789, + "theoretical_loss": 3.8022495033422166, + "tokens_seen": 659955712 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040404212637913737, + "loss": 3.0157, + "theoretical_loss": 3.8022107636531315, + "tokens_seen": 660021248 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004040320962888666, + "loss": 2.9582, + "theoretical_loss": 3.8021720288873944, + "tokens_seen": 660086784 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004040220661985958, + "loss": 2.7778, + "theoretical_loss": 3.8021332990438905, + "tokens_seen": 660152320 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 1589363, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9366538524627686, + "objective/train/theoretical_loss": 3.802094574121506, + "objective/train/tokens_used": 680677856, + "theoretical_loss": 3.802094574121506, + "tokens_seen": 660217856 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040401203610832497, + "loss": 3.0128, + "theoretical_loss": 3.802094574121506, + "tokens_seen": 660217856 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040400200601805415, + "loss": 3.0895, + "theoretical_loss": 3.8020558541191267, + "tokens_seen": 660283392 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004039919759277834, + "loss": 3.0679, + "theoretical_loss": 3.802017139035641, + "tokens_seen": 660348928 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004039819458375125, + "loss": 3.0031, + "theoretical_loss": 3.8019784288699334, + "tokens_seen": 660414464 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040397191574724175, + "loss": 2.8591, + "theoretical_loss": 3.801939723620893, + "tokens_seen": 660480000 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004039618856569709, + "loss": 2.8693, + "theoretical_loss": 3.8019010232874066, + "tokens_seen": 660545536 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004039518555667001, + "loss": 2.8249, + "theoretical_loss": 3.8018623278683634, + "tokens_seen": 660611072 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004039418254764293, + "loss": 2.8691, + "theoretical_loss": 3.8018236373626513, + "tokens_seen": 660676608 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040393179538615847, + "loss": 3.1103, + "theoretical_loss": 3.801784951769159, + "tokens_seen": 660742144 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040392176529588765, + "loss": 3.0733, + "theoretical_loss": 3.8017462710867767, + "tokens_seen": 660807680 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040391173520561683, + "loss": 3.047, + "theoretical_loss": 3.801707595314392, + "tokens_seen": 660873216 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040390170511534607, + "loss": 3.0016, + "theoretical_loss": 3.8016689244508965, + "tokens_seen": 660938752 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040389167502507525, + "loss": 3.0886, + "theoretical_loss": 3.8016302584951798, + "tokens_seen": 661004288 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040388164493480443, + "loss": 3.074, + "theoretical_loss": 3.8015915974461327, + "tokens_seen": 661069824 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004038716148445336, + "loss": 2.9712, + "theoretical_loss": 3.8015529413026465, + "tokens_seen": 661135360 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040386158475426285, + "loss": 2.998, + "theoretical_loss": 3.8015142900636123, + "tokens_seen": 661200896 + }, + { + "epoch": 8.0, + "learning_rate": 0.000403851554663992, + "loss": 2.8454, + "theoretical_loss": 3.8014756437279216, + "tokens_seen": 661266432 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004038415245737212, + "loss": 3.0461, + "theoretical_loss": 3.8014370022944663, + "tokens_seen": 661331968 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040383149448345034, + "loss": 3.0047, + "theoretical_loss": 3.80139836576214, + "tokens_seen": 661397504 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040382146439317957, + "loss": 3.0113, + "theoretical_loss": 3.8013597341298344, + "tokens_seen": 661463040 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040381143430290875, + "loss": 2.9973, + "theoretical_loss": 3.8013211073964435, + "tokens_seen": 661528576 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040380140421263794, + "loss": 2.9492, + "theoretical_loss": 3.80128248556086, + "tokens_seen": 661594112 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004037913741223671, + "loss": 3.0295, + "theoretical_loss": 3.8012438686219783, + "tokens_seen": 661659648 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004037813440320963, + "loss": 2.9866, + "theoretical_loss": 3.801205256578693, + "tokens_seen": 661725184 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004037713139418255, + "loss": 2.9844, + "theoretical_loss": 3.8011666494298977, + "tokens_seen": 661790720 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 1594222, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.855337142944336, + "objective/train/theoretical_loss": 3.801128047174488, + "objective/train/tokens_used": 682316256, + "theoretical_loss": 3.801128047174488, + "tokens_seen": 661856256 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004037612838515547, + "loss": 2.9943, + "theoretical_loss": 3.801128047174488, + "tokens_seen": 661856256 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040375125376128384, + "loss": 3.007, + "theoretical_loss": 3.801089449811359, + "tokens_seen": 661921792 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004037412236710131, + "loss": 3.0523, + "theoretical_loss": 3.801050857339407, + "tokens_seen": 661987328 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004037311935807422, + "loss": 3.0228, + "theoretical_loss": 3.8010122697575275, + "tokens_seen": 662052864 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040372116349047144, + "loss": 3.0785, + "theoretical_loss": 3.8009736870646167, + "tokens_seen": 662118400 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004037111334002006, + "loss": 2.9625, + "theoretical_loss": 3.800935109259572, + "tokens_seen": 662183936 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004037011033099298, + "loss": 2.9784, + "theoretical_loss": 3.80089653634129, + "tokens_seen": 662249472 + }, + { + "epoch": 8.0, + "learning_rate": 0.000403691073219659, + "loss": 3.1266, + "theoretical_loss": 3.8008579683086685, + "tokens_seen": 662315008 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004036810431293882, + "loss": 3.0517, + "theoretical_loss": 3.800819405160605, + "tokens_seen": 662380544 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040367101303911734, + "loss": 2.972, + "theoretical_loss": 3.800780846895998, + "tokens_seen": 662446080 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004036609829488466, + "loss": 3.0439, + "theoretical_loss": 3.800742293513746, + "tokens_seen": 662511616 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004036509528585757, + "loss": 3.0264, + "theoretical_loss": 3.8007037450127474, + "tokens_seen": 662577152 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040364092276830494, + "loss": 3.0673, + "theoretical_loss": 3.8006652013919027, + "tokens_seen": 662642688 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004036308926780341, + "loss": 2.9322, + "theoretical_loss": 3.80062666265011, + "tokens_seen": 662708224 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004036208625877633, + "loss": 3.0781, + "theoretical_loss": 3.800588128786271, + "tokens_seen": 662773760 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004036108324974925, + "loss": 3.0359, + "theoretical_loss": 3.8005495997992838, + "tokens_seen": 662839296 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040360080240722167, + "loss": 3.0558, + "theoretical_loss": 3.800511075688051, + "tokens_seen": 662904832 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040359077231695085, + "loss": 3.0276, + "theoretical_loss": 3.8004725564514734, + "tokens_seen": 662970368 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004035807422266801, + "loss": 3.0391, + "theoretical_loss": 3.800434042088451, + "tokens_seen": 663035904 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004035707121364092, + "loss": 3.065, + "theoretical_loss": 3.8003955325978875, + "tokens_seen": 663101440 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040356068204613844, + "loss": 2.9492, + "theoretical_loss": 3.8003570279786834, + "tokens_seen": 663166976 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040355065195586757, + "loss": 2.9486, + "theoretical_loss": 3.8003185282297425, + "tokens_seen": 663232512 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004035406218655968, + "loss": 2.9716, + "theoretical_loss": 3.8002800333499667, + "tokens_seen": 663298048 + }, + { + "epoch": 8.0, + "learning_rate": 0.000403530591775326, + "loss": 3.012, + "theoretical_loss": 3.8002415433382595, + "tokens_seen": 663363584 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040352056168505517, + "loss": 3.0788, + "theoretical_loss": 3.8002030581935244, + "tokens_seen": 663429120 + }, + { + "epoch": 8.0, + "objective/train/docs_used": 1597126, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1106107234954834, + "objective/train/theoretical_loss": 3.800164577914665, + "objective/train/tokens_used": 683954656, + "theoretical_loss": 3.800164577914665, + "tokens_seen": 663494656 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040351053159478435, + "loss": 3.049, + "theoretical_loss": 3.800164577914665, + "tokens_seen": 663494656 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004035005015045136, + "loss": 3.0421, + "theoretical_loss": 3.8001261025005864, + "tokens_seen": 663560192 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004034904714142427, + "loss": 3.1219, + "theoretical_loss": 3.800087631950192, + "tokens_seen": 663625728 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040348044132397195, + "loss": 3.0096, + "theoretical_loss": 3.8000491662623883, + "tokens_seen": 663691264 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004034704112337011, + "loss": 2.8579, + "theoretical_loss": 3.8000107054360797, + "tokens_seen": 663756800 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004034603811434303, + "loss": 3.1166, + "theoretical_loss": 3.799972249470171, + "tokens_seen": 663822336 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004034503510531595, + "loss": 2.991, + "theoretical_loss": 3.79993379836357, + "tokens_seen": 663887872 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040344032096288867, + "loss": 2.9363, + "theoretical_loss": 3.799895352115182, + "tokens_seen": 663953408 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040343029087261785, + "loss": 2.9647, + "theoretical_loss": 3.799856910723914, + "tokens_seen": 664018944 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040342026078234703, + "loss": 3.0565, + "theoretical_loss": 3.7998184741886725, + "tokens_seen": 664084480 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004034102306920762, + "loss": 3.0181, + "theoretical_loss": 3.7997800425083663, + "tokens_seen": 664150016 + }, + { + "epoch": 8.0, + "learning_rate": 0.00040340020060180545, + "loss": 3.0394, + "theoretical_loss": 3.799741615681902, + "tokens_seen": 664215552 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004033901705115346, + "loss": 2.9107, + "theoretical_loss": 3.799703193708188, + "tokens_seen": 664281088 + }, + { + "epoch": 8.0, + "learning_rate": 0.0004033801404212638, + "loss": 3.042, + "theoretical_loss": 3.799664776586133, + "tokens_seen": 664346624 + }, + { + "epoch": 8.0, + "learning_rate": 0.000403370110330993, + "loss": 2.9723, + "theoretical_loss": 3.799626364314646, + "tokens_seen": 664412160 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004033600802407222, + "loss": 3.1152, + "theoretical_loss": 3.799587956892636, + "tokens_seen": 664477696 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040335005015045136, + "loss": 2.9794, + "theoretical_loss": 3.7995495543190123, + "tokens_seen": 664543232 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040334002006018054, + "loss": 3.0943, + "theoretical_loss": 3.7995111565926853, + "tokens_seen": 664608768 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004033299899699097, + "loss": 3.0719, + "theoretical_loss": 3.799472763712565, + "tokens_seen": 664674304 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040331995987963895, + "loss": 3.0182, + "theoretical_loss": 3.7994343756775617, + "tokens_seen": 664739840 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004033099297893681, + "loss": 3.0717, + "theoretical_loss": 3.7993959924865868, + "tokens_seen": 664805376 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004032998996990973, + "loss": 2.9433, + "theoretical_loss": 3.799357614138552, + "tokens_seen": 664870912 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040328986960882644, + "loss": 2.7361, + "theoretical_loss": 3.799319240632368, + "tokens_seen": 664936448 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004032798395185557, + "loss": 3.054, + "theoretical_loss": 3.7992808719669475, + "tokens_seen": 665001984 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040326980942828486, + "loss": 3.0157, + "theoretical_loss": 3.799242508141203, + "tokens_seen": 665067520 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1602001, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.963728666305542, + "objective/train/theoretical_loss": 3.799204149154047, + "objective/train/tokens_used": 685593056, + "theoretical_loss": 3.799204149154047, + "tokens_seen": 665133056 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040325977933801404, + "loss": 3.0182, + "theoretical_loss": 3.799204149154047, + "tokens_seen": 665133056 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004032497492477432, + "loss": 3.0807, + "theoretical_loss": 3.7991657950043916, + "tokens_seen": 665198592 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004032397191574724, + "loss": 3.0164, + "theoretical_loss": 3.799127445691152, + "tokens_seen": 665264128 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004032296890672016, + "loss": 2.9823, + "theoretical_loss": 3.799089101213241, + "tokens_seen": 665329664 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004032196589769308, + "loss": 2.9324, + "theoretical_loss": 3.7990507615695726, + "tokens_seen": 665395200 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040320962888665995, + "loss": 2.9823, + "theoretical_loss": 3.7990124267590613, + "tokens_seen": 665460736 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004031995987963892, + "loss": 3.0706, + "theoretical_loss": 3.798974096780623, + "tokens_seen": 665526272 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040318956870611836, + "loss": 3.0441, + "theoretical_loss": 3.7989357716331718, + "tokens_seen": 665591808 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040317953861584754, + "loss": 2.9806, + "theoretical_loss": 3.7988974513156233, + "tokens_seen": 665657344 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004031695085255767, + "loss": 3.0529, + "theoretical_loss": 3.7988591358268935, + "tokens_seen": 665722880 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004031594784353059, + "loss": 3.1016, + "theoretical_loss": 3.798820825165899, + "tokens_seen": 665788416 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040314944834503514, + "loss": 3.005, + "theoretical_loss": 3.7987825193315556, + "tokens_seen": 665853952 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004031394182547643, + "loss": 2.9328, + "theoretical_loss": 3.7987442183227813, + "tokens_seen": 665919488 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004031293881644935, + "loss": 3.0276, + "theoretical_loss": 3.798705922138493, + "tokens_seen": 665985024 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004031193580742227, + "loss": 3.0168, + "theoretical_loss": 3.798667630777608, + "tokens_seen": 666050560 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040310932798395187, + "loss": 2.9005, + "theoretical_loss": 3.7986293442390444, + "tokens_seen": 666116096 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040309929789368105, + "loss": 3.0107, + "theoretical_loss": 3.7985910625217203, + "tokens_seen": 666181632 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004030892678034103, + "loss": 3.1383, + "theoretical_loss": 3.7985527856245547, + "tokens_seen": 666247168 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004030792377131394, + "loss": 2.9499, + "theoretical_loss": 3.7985145135464666, + "tokens_seen": 666312704 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040306920762286864, + "loss": 3.0618, + "theoretical_loss": 3.7984762462863757, + "tokens_seen": 666378240 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040305917753259777, + "loss": 2.9724, + "theoretical_loss": 3.7984379838432014, + "tokens_seen": 666443776 + }, + { + "epoch": 8.01, + "learning_rate": 0.000403049147442327, + "loss": 2.9603, + "theoretical_loss": 3.7983997262158637, + "tokens_seen": 666509312 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004030391173520562, + "loss": 3.0326, + "theoretical_loss": 3.7983614734032827, + "tokens_seen": 666574848 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040302908726178537, + "loss": 3.0043, + "theoretical_loss": 3.7983232254043804, + "tokens_seen": 666640384 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040301905717151455, + "loss": 3.0102, + "theoretical_loss": 3.7982849822180764, + "tokens_seen": 666705920 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1604941, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.006728410720825, + "objective/train/theoretical_loss": 3.798246743843293, + "objective/train/tokens_used": 687231456, + "theoretical_loss": 3.798246743843293, + "tokens_seen": 666771456 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004030090270812438, + "loss": 3.0849, + "theoretical_loss": 3.798246743843293, + "tokens_seen": 666771456 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004029989969909729, + "loss": 3.0341, + "theoretical_loss": 3.7982085102789522, + "tokens_seen": 666836992 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040298896690070215, + "loss": 3.1028, + "theoretical_loss": 3.7981702815239755, + "tokens_seen": 666902528 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004029789368104313, + "loss": 2.8975, + "theoretical_loss": 3.798132057577286, + "tokens_seen": 666968064 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004029689067201605, + "loss": 2.9302, + "theoretical_loss": 3.798093838437806, + "tokens_seen": 667033600 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004029588766298897, + "loss": 2.9201, + "theoretical_loss": 3.7980556241044594, + "tokens_seen": 667099136 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040294884653961887, + "loss": 3.0156, + "theoretical_loss": 3.7980174145761687, + "tokens_seen": 667164672 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040293881644934805, + "loss": 2.8745, + "theoretical_loss": 3.797979209851859, + "tokens_seen": 667230208 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040292878635907723, + "loss": 3.0151, + "theoretical_loss": 3.7979410099304545, + "tokens_seen": 667295744 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004029187562688064, + "loss": 2.9914, + "theoretical_loss": 3.797902814810879, + "tokens_seen": 667361280 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040290872617853565, + "loss": 2.9998, + "theoretical_loss": 3.797864624492057, + "tokens_seen": 667426816 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004028986960882648, + "loss": 3.057, + "theoretical_loss": 3.797826438972915, + "tokens_seen": 667492352 + }, + { + "epoch": 8.01, + "learning_rate": 0.000402888665997994, + "loss": 2.9366, + "theoretical_loss": 3.7977882582523783, + "tokens_seen": 667557888 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004028786359077232, + "loss": 3.1521, + "theoretical_loss": 3.7977500823293724, + "tokens_seen": 667623424 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004028686058174524, + "loss": 2.9246, + "theoretical_loss": 3.797711911202825, + "tokens_seen": 667688960 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040285857572718156, + "loss": 2.923, + "theoretical_loss": 3.797673744871661, + "tokens_seen": 667754496 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040284854563691074, + "loss": 2.9612, + "theoretical_loss": 3.7976355833348077, + "tokens_seen": 667820032 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004028385155466399, + "loss": 2.95, + "theoretical_loss": 3.7975974265911936, + "tokens_seen": 667885568 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040282848545636915, + "loss": 2.9751, + "theoretical_loss": 3.7975592746397453, + "tokens_seen": 667951104 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004028184553660983, + "loss": 3.0472, + "theoretical_loss": 3.797521127479391, + "tokens_seen": 668016640 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004028084252758275, + "loss": 2.977, + "theoretical_loss": 3.79748298510906, + "tokens_seen": 668082176 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040279839518555664, + "loss": 3.0815, + "theoretical_loss": 3.7974448475276805, + "tokens_seen": 668147712 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004027883650952859, + "loss": 3.0886, + "theoretical_loss": 3.797406714734181, + "tokens_seen": 668213248 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040277833500501506, + "loss": 2.9563, + "theoretical_loss": 3.7973685867274916, + "tokens_seen": 668278784 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040276830491474424, + "loss": 2.9636, + "theoretical_loss": 3.797330463506542, + "tokens_seen": 668344320 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1608752, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0573925971984863, + "objective/train/theoretical_loss": 3.797292345070262, + "objective/train/tokens_used": 688869856, + "theoretical_loss": 3.797292345070262, + "tokens_seen": 668409856 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004027582748244734, + "loss": 2.9817, + "theoretical_loss": 3.797292345070262, + "tokens_seen": 668409856 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004027482447342026, + "loss": 3.0137, + "theoretical_loss": 3.7972542314175826, + "tokens_seen": 668475392 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004027382146439318, + "loss": 3.0722, + "theoretical_loss": 3.7972161225474337, + "tokens_seen": 668540928 + }, + { + "epoch": 8.01, + "learning_rate": 0.000402728184553661, + "loss": 2.9732, + "theoretical_loss": 3.797178018458747, + "tokens_seen": 668606464 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040271815446339015, + "loss": 2.9659, + "theoretical_loss": 3.797139919150455, + "tokens_seen": 668672000 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004027081243731194, + "loss": 3.1329, + "theoretical_loss": 3.797101824621487, + "tokens_seen": 668737536 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040269809428284856, + "loss": 3.1373, + "theoretical_loss": 3.797063734870778, + "tokens_seen": 668803072 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040268806419257774, + "loss": 2.9974, + "theoretical_loss": 3.797025649897259, + "tokens_seen": 668868608 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004026780341023069, + "loss": 2.9108, + "theoretical_loss": 3.796987569699863, + "tokens_seen": 668934144 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004026680040120361, + "loss": 3.0442, + "theoretical_loss": 3.796949494277523, + "tokens_seen": 668999680 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004026579739217653, + "loss": 3.0786, + "theoretical_loss": 3.796911423629173, + "tokens_seen": 669065216 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004026479438314945, + "loss": 3.0034, + "theoretical_loss": 3.7968733577537472, + "tokens_seen": 669130752 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040263791374122365, + "loss": 3.0502, + "theoretical_loss": 3.796835296650179, + "tokens_seen": 669196288 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004026278836509529, + "loss": 3.0617, + "theoretical_loss": 3.796797240317403, + "tokens_seen": 669261824 + }, + { + "epoch": 8.01, + "learning_rate": 0.000402617853560682, + "loss": 3.0667, + "theoretical_loss": 3.796759188754355, + "tokens_seen": 669327360 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040260782347041125, + "loss": 2.9903, + "theoretical_loss": 3.79672114195997, + "tokens_seen": 669392896 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040259779338014043, + "loss": 3.1065, + "theoretical_loss": 3.796683099933183, + "tokens_seen": 669458432 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004025877632898696, + "loss": 3.0314, + "theoretical_loss": 3.7966450626729302, + "tokens_seen": 669523968 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004025777331995988, + "loss": 3.0056, + "theoretical_loss": 3.7966070301781483, + "tokens_seen": 669589504 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040256770310932797, + "loss": 3.0385, + "theoretical_loss": 3.7965690024477734, + "tokens_seen": 669655040 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040255767301905715, + "loss": 3.1289, + "theoretical_loss": 3.7965309794807425, + "tokens_seen": 669720576 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004025476429287864, + "loss": 2.9284, + "theoretical_loss": 3.7964929612759937, + "tokens_seen": 669786112 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004025376128385155, + "loss": 2.9287, + "theoretical_loss": 3.796454947832464, + "tokens_seen": 669851648 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040252758274824475, + "loss": 3.1145, + "theoretical_loss": 3.7964169391490907, + "tokens_seen": 669917184 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040251755265797393, + "loss": 2.9484, + "theoretical_loss": 3.7963789352248134, + "tokens_seen": 669982720 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1613667, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.035874843597412, + "objective/train/theoretical_loss": 3.7963409360585705, + "objective/train/tokens_used": 690508256, + "theoretical_loss": 3.7963409360585705, + "tokens_seen": 670048256 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004025075225677031, + "loss": 2.9222, + "theoretical_loss": 3.7963409360585705, + "tokens_seen": 670048256 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004024974924774323, + "loss": 3.0285, + "theoretical_loss": 3.7963029416493, + "tokens_seen": 670113792 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004024874623871615, + "loss": 2.941, + "theoretical_loss": 3.7962649519959433, + "tokens_seen": 670179328 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040247743229689066, + "loss": 3.0274, + "theoretical_loss": 3.7962269670974376, + "tokens_seen": 670244864 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004024674022066199, + "loss": 2.9342, + "theoretical_loss": 3.7961889869527248, + "tokens_seen": 670310400 + }, + { + "epoch": 8.01, + "learning_rate": 0.000402457372116349, + "loss": 2.952, + "theoretical_loss": 3.7961510115607444, + "tokens_seen": 670375936 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040244734202607825, + "loss": 3.0429, + "theoretical_loss": 3.7961130409204373, + "tokens_seen": 670441472 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004024373119358074, + "loss": 3.0465, + "theoretical_loss": 3.796075075030745, + "tokens_seen": 670507008 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004024272818455366, + "loss": 2.9823, + "theoretical_loss": 3.796037113890608, + "tokens_seen": 670572544 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004024172517552658, + "loss": 2.9271, + "theoretical_loss": 3.795999157498969, + "tokens_seen": 670638080 + }, + { + "epoch": 8.01, + "learning_rate": 0.000402407221664995, + "loss": 3.0574, + "theoretical_loss": 3.7959612058547694, + "tokens_seen": 670703616 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004023971915747242, + "loss": 2.9943, + "theoretical_loss": 3.795923258956952, + "tokens_seen": 670769152 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004023871614844534, + "loss": 3.0702, + "theoretical_loss": 3.7958853168044593, + "tokens_seen": 670834688 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004023771313941826, + "loss": 3.1034, + "theoretical_loss": 3.795847379396234, + "tokens_seen": 670900224 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040236710130391176, + "loss": 3.0586, + "theoretical_loss": 3.7958094467312202, + "tokens_seen": 670965760 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040235707121364094, + "loss": 3.0866, + "theoretical_loss": 3.795771518808362, + "tokens_seen": 671031296 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004023470411233701, + "loss": 2.9834, + "theoretical_loss": 3.7957335956266025, + "tokens_seen": 671096832 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040233701103309935, + "loss": 3.0276, + "theoretical_loss": 3.7956956771848867, + "tokens_seen": 671162368 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004023269809428285, + "loss": 3.0786, + "theoretical_loss": 3.7956577634821596, + "tokens_seen": 671227904 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004023169508525577, + "loss": 3.0147, + "theoretical_loss": 3.7956198545173656, + "tokens_seen": 671293440 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040230692076228684, + "loss": 3.0003, + "theoretical_loss": 3.7955819502894506, + "tokens_seen": 671358976 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004022968906720161, + "loss": 3.0337, + "theoretical_loss": 3.7955440507973606, + "tokens_seen": 671424512 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040228686058174526, + "loss": 3.0127, + "theoretical_loss": 3.795506156040041, + "tokens_seen": 671490048 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040227683049147444, + "loss": 3.0698, + "theoretical_loss": 3.7954682660164396, + "tokens_seen": 671555584 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004022668004012036, + "loss": 3.1116, + "theoretical_loss": 3.7954303807255014, + "tokens_seen": 671621120 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1616644, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0977630615234375, + "objective/train/theoretical_loss": 3.7953925001661752, + "objective/train/tokens_used": 692146656, + "theoretical_loss": 3.7953925001661752, + "tokens_seen": 671686656 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004022567703109328, + "loss": 2.9348, + "theoretical_loss": 3.7953925001661752, + "tokens_seen": 671686656 + }, + { + "epoch": 8.01, + "learning_rate": 0.000402246740220662, + "loss": 3.0344, + "theoretical_loss": 3.7953546243374077, + "tokens_seen": 671752192 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004022367101303912, + "loss": 3.0501, + "theoretical_loss": 3.7953167532381467, + "tokens_seen": 671817728 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040222668004012035, + "loss": 3.0071, + "theoretical_loss": 3.79527888686734, + "tokens_seen": 671883264 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004022166499498496, + "loss": 2.9789, + "theoretical_loss": 3.7952410252239375, + "tokens_seen": 671948800 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040220661985957876, + "loss": 2.9634, + "theoretical_loss": 3.7952031683068865, + "tokens_seen": 672014336 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040219658976930794, + "loss": 3.0642, + "theoretical_loss": 3.7951653161151366, + "tokens_seen": 672079872 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004021865596790371, + "loss": 3.0946, + "theoretical_loss": 3.795127468647638, + "tokens_seen": 672145408 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004021765295887663, + "loss": 3.0176, + "theoretical_loss": 3.7950896259033398, + "tokens_seen": 672210944 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004021664994984955, + "loss": 2.9871, + "theoretical_loss": 3.7950517878811922, + "tokens_seen": 672276480 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004021564694082247, + "loss": 3.027, + "theoretical_loss": 3.7950139545801465, + "tokens_seen": 672342016 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040214643931795385, + "loss": 3.0201, + "theoretical_loss": 3.794976125999152, + "tokens_seen": 672407552 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004021364092276831, + "loss": 2.9463, + "theoretical_loss": 3.794938302137161, + "tokens_seen": 672473088 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004021263791374122, + "loss": 2.9872, + "theoretical_loss": 3.7949004829931248, + "tokens_seen": 672538624 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040211634904714145, + "loss": 2.986, + "theoretical_loss": 3.7948626685659956, + "tokens_seen": 672604160 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040210631895687063, + "loss": 2.9883, + "theoretical_loss": 3.7948248588547253, + "tokens_seen": 672669696 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004020962888665998, + "loss": 2.972, + "theoretical_loss": 3.7947870538582658, + "tokens_seen": 672735232 + }, + { + "epoch": 8.01, + "learning_rate": 0.000402086258776329, + "loss": 2.9653, + "theoretical_loss": 3.7947492535755707, + "tokens_seen": 672800768 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040207622868605817, + "loss": 2.9317, + "theoretical_loss": 3.7947114580055934, + "tokens_seen": 672866304 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040206619859578735, + "loss": 2.9163, + "theoretical_loss": 3.7946736671472867, + "tokens_seen": 672931840 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004020561685055166, + "loss": 3.0128, + "theoretical_loss": 3.7946358809996044, + "tokens_seen": 672997376 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004020461384152457, + "loss": 2.9819, + "theoretical_loss": 3.794598099561502, + "tokens_seen": 673062912 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040203610832497495, + "loss": 3.0567, + "theoretical_loss": 3.794560322831932, + "tokens_seen": 673128448 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040202607823470413, + "loss": 2.9028, + "theoretical_loss": 3.794522550809851, + "tokens_seen": 673193984 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004020160481444333, + "loss": 3.0231, + "theoretical_loss": 3.7944847834942133, + "tokens_seen": 673259520 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1621500, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0954275131225586, + "objective/train/theoretical_loss": 3.7944470208839745, + "objective/train/tokens_used": 693785056, + "theoretical_loss": 3.7944470208839745, + "tokens_seen": 673325056 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004020060180541625, + "loss": 3.0729, + "theoretical_loss": 3.7944470208839745, + "tokens_seen": 673325056 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004019959879638917, + "loss": 3.0751, + "theoretical_loss": 3.794409262978091, + "tokens_seen": 673390592 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040198595787362086, + "loss": 3.0124, + "theoretical_loss": 3.7943715097755177, + "tokens_seen": 673456128 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004019759277833501, + "loss": 3.0461, + "theoretical_loss": 3.794333761275213, + "tokens_seen": 673521664 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004019658976930792, + "loss": 3.0203, + "theoretical_loss": 3.7942960174761327, + "tokens_seen": 673587200 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040195586760280845, + "loss": 3.0441, + "theoretical_loss": 3.794258278377234, + "tokens_seen": 673652736 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004019458375125376, + "loss": 3.047, + "theoretical_loss": 3.794220543977474, + "tokens_seen": 673718272 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004019358074222668, + "loss": 2.9075, + "theoretical_loss": 3.7941828142758114, + "tokens_seen": 673783808 + }, + { + "epoch": 8.01, + "learning_rate": 0.000401925777331996, + "loss": 3.0496, + "theoretical_loss": 3.794145089271204, + "tokens_seen": 673849344 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004019157472417252, + "loss": 3.0929, + "theoretical_loss": 3.79410736896261, + "tokens_seen": 673914880 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040190571715145436, + "loss": 3.0229, + "theoretical_loss": 3.7940696533489895, + "tokens_seen": 673980416 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004018956870611836, + "loss": 3.0509, + "theoretical_loss": 3.7940319424293003, + "tokens_seen": 674045952 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004018856569709127, + "loss": 3.0622, + "theoretical_loss": 3.7939942362025025, + "tokens_seen": 674111488 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040187562688064196, + "loss": 3.0475, + "theoretical_loss": 3.793956534667556, + "tokens_seen": 674177024 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004018655967903711, + "loss": 2.9672, + "theoretical_loss": 3.79391883782342, + "tokens_seen": 674242560 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004018555667001003, + "loss": 3.0502, + "theoretical_loss": 3.793881145669057, + "tokens_seen": 674308096 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004018455366098295, + "loss": 2.9076, + "theoretical_loss": 3.793843458203426, + "tokens_seen": 674373632 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004018355065195587, + "loss": 3.0006, + "theoretical_loss": 3.793805775425489, + "tokens_seen": 674439168 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040182547642928786, + "loss": 3.0256, + "theoretical_loss": 3.793768097334208, + "tokens_seen": 674504704 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040181544633901704, + "loss": 3.1109, + "theoretical_loss": 3.793730423928544, + "tokens_seen": 674570240 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004018054162487462, + "loss": 2.9339, + "theoretical_loss": 3.7936927552074584, + "tokens_seen": 674635776 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040179538615847546, + "loss": 3.0159, + "theoretical_loss": 3.7936550911699154, + "tokens_seen": 674701312 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004017853560682046, + "loss": 3.0382, + "theoretical_loss": 3.7936174318148774, + "tokens_seen": 674766848 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004017753259779338, + "loss": 2.9907, + "theoretical_loss": 3.7935797771413067, + "tokens_seen": 674832384 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040176529588766295, + "loss": 2.9472, + "theoretical_loss": 3.793542127148168, + "tokens_seen": 674897920 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1624386, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7871644496917725, + "objective/train/theoretical_loss": 3.793504481834424, + "objective/train/tokens_used": 695423456, + "theoretical_loss": 3.793504481834424, + "tokens_seen": 674963456 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004017552657973922, + "loss": 2.9898, + "theoretical_loss": 3.793504481834424, + "tokens_seen": 674963456 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040174523570712137, + "loss": 3.1374, + "theoretical_loss": 3.793466841199039, + "tokens_seen": 675028992 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040173520561685055, + "loss": 2.9707, + "theoretical_loss": 3.7934292052409786, + "tokens_seen": 675094528 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040172517552657973, + "loss": 3.0592, + "theoretical_loss": 3.793391573959206, + "tokens_seen": 675160064 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040171514543630896, + "loss": 2.9991, + "theoretical_loss": 3.7933539473526876, + "tokens_seen": 675225600 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004017051153460381, + "loss": 3.0384, + "theoretical_loss": 3.7933163254203883, + "tokens_seen": 675291136 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004016950852557673, + "loss": 3.0519, + "theoretical_loss": 3.7932787081612744, + "tokens_seen": 675356672 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040168505516549645, + "loss": 2.9562, + "theoretical_loss": 3.793241095574311, + "tokens_seen": 675422208 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004016750250752257, + "loss": 2.9483, + "theoretical_loss": 3.7932034876584657, + "tokens_seen": 675487744 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040166499498495487, + "loss": 2.989, + "theoretical_loss": 3.793165884412705, + "tokens_seen": 675553280 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040165496489468405, + "loss": 3.0566, + "theoretical_loss": 3.793128285835995, + "tokens_seen": 675618816 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004016449348044133, + "loss": 2.9658, + "theoretical_loss": 3.7930906919273046, + "tokens_seen": 675684352 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004016349047141424, + "loss": 3.0255, + "theoretical_loss": 3.793053102685601, + "tokens_seen": 675749888 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040162487462387165, + "loss": 2.9692, + "theoretical_loss": 3.7930155181098515, + "tokens_seen": 675815424 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040161484453360083, + "loss": 3.0182, + "theoretical_loss": 3.7929779381990256, + "tokens_seen": 675880960 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040160481444333, + "loss": 3.0543, + "theoretical_loss": 3.792940362952092, + "tokens_seen": 675946496 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004015947843530592, + "loss": 2.9664, + "theoretical_loss": 3.792902792368019, + "tokens_seen": 676012032 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040158475426278837, + "loss": 3.0204, + "theoretical_loss": 3.7928652264457767, + "tokens_seen": 676077568 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040157472417251755, + "loss": 3.1327, + "theoretical_loss": 3.7928276651843342, + "tokens_seen": 676143104 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004015646940822468, + "loss": 3.0995, + "theoretical_loss": 3.792790108582663, + "tokens_seen": 676208640 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004015546639919759, + "loss": 3.043, + "theoretical_loss": 3.7927525566397318, + "tokens_seen": 676274176 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040154463390170515, + "loss": 2.9983, + "theoretical_loss": 3.792715009354512, + "tokens_seen": 676339712 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040153460381143433, + "loss": 2.9894, + "theoretical_loss": 3.7926774667259746, + "tokens_seen": 676405248 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004015245737211635, + "loss": 3.1004, + "theoretical_loss": 3.7926399287530908, + "tokens_seen": 676470784 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004015145436308927, + "loss": 2.9366, + "theoretical_loss": 3.792602395434833, + "tokens_seen": 676536320 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1628134, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.986851930618286, + "objective/train/theoretical_loss": 3.7925648667701726, + "objective/train/tokens_used": 697061856, + "theoretical_loss": 3.7925648667701726, + "tokens_seen": 676601856 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004015045135406219, + "loss": 3.032, + "theoretical_loss": 3.7925648667701726, + "tokens_seen": 676601856 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040149448345035106, + "loss": 3.0474, + "theoretical_loss": 3.792527342758082, + "tokens_seen": 676667392 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004014844533600803, + "loss": 2.9985, + "theoretical_loss": 3.792489823397534, + "tokens_seen": 676732928 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004014744232698094, + "loss": 2.9623, + "theoretical_loss": 3.7924523086875013, + "tokens_seen": 676798464 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040146439317953865, + "loss": 3.0844, + "theoretical_loss": 3.792414798626958, + "tokens_seen": 676864000 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004014543630892678, + "loss": 3.1257, + "theoretical_loss": 3.792377293214877, + "tokens_seen": 676929536 + }, + { + "epoch": 8.01, + "learning_rate": 0.000401444332998997, + "loss": 2.9626, + "theoretical_loss": 3.7923397924502327, + "tokens_seen": 676995072 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004014343029087262, + "loss": 2.9712, + "theoretical_loss": 3.7923022963319992, + "tokens_seen": 677060608 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004014242728184554, + "loss": 3.0283, + "theoretical_loss": 3.792264804859151, + "tokens_seen": 677126144 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040141424272818456, + "loss": 2.9644, + "theoretical_loss": 3.7922273180306636, + "tokens_seen": 677191680 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004014042126379138, + "loss": 3.0219, + "theoretical_loss": 3.7921898358455115, + "tokens_seen": 677257216 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004013941825476429, + "loss": 3.0715, + "theoretical_loss": 3.7921523583026717, + "tokens_seen": 677322752 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040138415245737216, + "loss": 2.9695, + "theoretical_loss": 3.7921148854011184, + "tokens_seen": 677388288 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004013741223671013, + "loss": 3.0279, + "theoretical_loss": 3.792077417139829, + "tokens_seen": 677453824 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004013640922768305, + "loss": 3.0742, + "theoretical_loss": 3.7920399535177793, + "tokens_seen": 677519360 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004013540621865597, + "loss": 3.0174, + "theoretical_loss": 3.792002494533947, + "tokens_seen": 677584896 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004013440320962889, + "loss": 2.9827, + "theoretical_loss": 3.791965040187309, + "tokens_seen": 677650432 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040133400200601806, + "loss": 3.0622, + "theoretical_loss": 3.791927590476843, + "tokens_seen": 677715968 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040132397191574724, + "loss": 3.0757, + "theoretical_loss": 3.7918901454015264, + "tokens_seen": 677781504 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004013139418254764, + "loss": 3.0537, + "theoretical_loss": 3.791852704960338, + "tokens_seen": 677847040 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040130391173520566, + "loss": 3.0015, + "theoretical_loss": 3.791815269152256, + "tokens_seen": 677912576 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004012938816449348, + "loss": 3.0701, + "theoretical_loss": 3.7917778379762597, + "tokens_seen": 677978112 + }, + { + "epoch": 8.01, + "learning_rate": 0.000401283851554664, + "loss": 2.9985, + "theoretical_loss": 3.7917404114313276, + "tokens_seen": 678043648 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040127382146439315, + "loss": 2.9804, + "theoretical_loss": 3.7917029895164394, + "tokens_seen": 678109184 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004012637913741224, + "loss": 3.1098, + "theoretical_loss": 3.7916655722305754, + "tokens_seen": 678174720 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1632944, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.944676637649536, + "objective/train/theoretical_loss": 3.7916281595727153, + "objective/train/tokens_used": 698700256, + "theoretical_loss": 3.7916281595727153, + "tokens_seen": 678240256 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040125376128385157, + "loss": 3.0302, + "theoretical_loss": 3.7916281595727153, + "tokens_seen": 678240256 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040124373119358075, + "loss": 2.9819, + "theoretical_loss": 3.7915907515418397, + "tokens_seen": 678305792 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040123370110330993, + "loss": 3.1395, + "theoretical_loss": 3.7915533481369295, + "tokens_seen": 678371328 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040122367101303916, + "loss": 3.0192, + "theoretical_loss": 3.7915159493569655, + "tokens_seen": 678436864 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004012136409227683, + "loss": 3.0434, + "theoretical_loss": 3.7914785552009294, + "tokens_seen": 678502400 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004012036108324975, + "loss": 2.9563, + "theoretical_loss": 3.791441165667803, + "tokens_seen": 678567936 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040119358074222665, + "loss": 3.0323, + "theoretical_loss": 3.7914037807565677, + "tokens_seen": 678633472 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004011835506519559, + "loss": 2.9577, + "theoretical_loss": 3.7913664004662073, + "tokens_seen": 678699008 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040117352056168507, + "loss": 3.0327, + "theoretical_loss": 3.7913290247957034, + "tokens_seen": 678764544 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040116349047141425, + "loss": 3.0, + "theoretical_loss": 3.7912916537440395, + "tokens_seen": 678830080 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040115346038114343, + "loss": 3.0164, + "theoretical_loss": 3.791254287310199, + "tokens_seen": 678895616 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004011434302908726, + "loss": 2.9802, + "theoretical_loss": 3.7912169254931656, + "tokens_seen": 678961152 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004011334002006018, + "loss": 3.0874, + "theoretical_loss": 3.791179568291923, + "tokens_seen": 679026688 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040112337011033103, + "loss": 2.9593, + "theoretical_loss": 3.7911422157054555, + "tokens_seen": 679092224 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040111334002006016, + "loss": 3.049, + "theoretical_loss": 3.791104867732748, + "tokens_seen": 679157760 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004011033099297894, + "loss": 3.0604, + "theoretical_loss": 3.791067524372786, + "tokens_seen": 679223296 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004010932798395185, + "loss": 3.0026, + "theoretical_loss": 3.791030185624554, + "tokens_seen": 679288832 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040108324974924775, + "loss": 2.9924, + "theoretical_loss": 3.790992851487038, + "tokens_seen": 679354368 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040107321965897693, + "loss": 3.0377, + "theoretical_loss": 3.7909555219592237, + "tokens_seen": 679419904 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004010631895687061, + "loss": 2.9012, + "theoretical_loss": 3.7909181970400976, + "tokens_seen": 679485440 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004010531594784353, + "loss": 2.9758, + "theoretical_loss": 3.7908808767286466, + "tokens_seen": 679550976 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040104312938816453, + "loss": 3.1, + "theoretical_loss": 3.7908435610238564, + "tokens_seen": 679616512 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040103309929789366, + "loss": 3.0092, + "theoretical_loss": 3.790806249924716, + "tokens_seen": 679682048 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004010230692076229, + "loss": 3.0311, + "theoretical_loss": 3.790768943430211, + "tokens_seen": 679747584 + }, + { + "epoch": 8.01, + "learning_rate": 0.000401013039117352, + "loss": 3.0226, + "theoretical_loss": 3.7907316415393315, + "tokens_seen": 679813120 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1636148, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9628124237060547, + "objective/train/theoretical_loss": 3.7906943442510643, + "objective/train/tokens_used": 700338656, + "theoretical_loss": 3.7906943442510643, + "tokens_seen": 679878656 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040100300902708126, + "loss": 3.0962, + "theoretical_loss": 3.7906943442510643, + "tokens_seen": 679878656 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040099297893681044, + "loss": 3.0531, + "theoretical_loss": 3.7906570515643976, + "tokens_seen": 679944192 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004009829488465396, + "loss": 2.9828, + "theoretical_loss": 3.790619763478321, + "tokens_seen": 680009728 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004009729187562688, + "loss": 2.9487, + "theoretical_loss": 3.790582479991823, + "tokens_seen": 680075264 + }, + { + "epoch": 8.01, + "learning_rate": 0.000400962888665998, + "loss": 3.1185, + "theoretical_loss": 3.790545201103894, + "tokens_seen": 680140800 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040095285857572716, + "loss": 2.9494, + "theoretical_loss": 3.7905079268135227, + "tokens_seen": 680206336 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004009428284854564, + "loss": 3.0338, + "theoretical_loss": 3.7904706571197, + "tokens_seen": 680271872 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004009327983951855, + "loss": 3.0265, + "theoretical_loss": 3.7904333920214164, + "tokens_seen": 680337408 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040092276830491476, + "loss": 3.1296, + "theoretical_loss": 3.790396131517662, + "tokens_seen": 680402944 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004009127382146439, + "loss": 3.0502, + "theoretical_loss": 3.790358875607428, + "tokens_seen": 680468480 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004009027081243731, + "loss": 3.0652, + "theoretical_loss": 3.7903216242897066, + "tokens_seen": 680534016 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040089267803410236, + "loss": 3.0618, + "theoretical_loss": 3.7902843775634882, + "tokens_seen": 680599552 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004008826479438315, + "loss": 2.9504, + "theoretical_loss": 3.790247135427766, + "tokens_seen": 680665088 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004008726178535607, + "loss": 2.9727, + "theoretical_loss": 3.7902098978815313, + "tokens_seen": 680730624 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004008625877632899, + "loss": 3.0343, + "theoretical_loss": 3.7901726649237775, + "tokens_seen": 680796160 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004008525576730191, + "loss": 3.0473, + "theoretical_loss": 3.7901354365534976, + "tokens_seen": 680861696 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040084252758274826, + "loss": 3.0613, + "theoretical_loss": 3.790098212769685, + "tokens_seen": 680927232 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040083249749247744, + "loss": 3.0308, + "theoretical_loss": 3.7900609935713323, + "tokens_seen": 680992768 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004008224674022066, + "loss": 3.0901, + "theoretical_loss": 3.7900237789574343, + "tokens_seen": 681058304 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040081243731193586, + "loss": 3.1125, + "theoretical_loss": 3.7899865689269854, + "tokens_seen": 681123840 + }, + { + "epoch": 8.01, + "learning_rate": 0.000400802407221665, + "loss": 3.0727, + "theoretical_loss": 3.7899493634789794, + "tokens_seen": 681189376 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004007923771313942, + "loss": 3.0199, + "theoretical_loss": 3.7899121626124117, + "tokens_seen": 681254912 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040078234704112335, + "loss": 3.1133, + "theoretical_loss": 3.789874966326278, + "tokens_seen": 681320448 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004007723169508526, + "loss": 3.0158, + "theoretical_loss": 3.7898377746195724, + "tokens_seen": 681385984 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040076228686058177, + "loss": 3.046, + "theoretical_loss": 3.7898005874912917, + "tokens_seen": 681451520 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1640777, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.087345838546753, + "objective/train/theoretical_loss": 3.789763404940432, + "objective/train/tokens_used": 701977056, + "theoretical_loss": 3.789763404940432, + "tokens_seen": 681517056 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040075225677031095, + "loss": 3.0671, + "theoretical_loss": 3.789763404940432, + "tokens_seen": 681517056 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040074222668004013, + "loss": 3.0537, + "theoretical_loss": 3.78972622696599, + "tokens_seen": 681582592 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040073219658976936, + "loss": 3.1228, + "theoretical_loss": 3.7896890535669616, + "tokens_seen": 681648128 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004007221664994985, + "loss": 3.0406, + "theoretical_loss": 3.7896518847423453, + "tokens_seen": 681713664 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004007121364092277, + "loss": 2.9972, + "theoretical_loss": 3.7896147204911372, + "tokens_seen": 681779200 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040070210631895685, + "loss": 3.0144, + "theoretical_loss": 3.7895775608123348, + "tokens_seen": 681844736 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004006920762286861, + "loss": 3.0565, + "theoretical_loss": 3.7895404057049378, + "tokens_seen": 681910272 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040068204613841527, + "loss": 3.0411, + "theoretical_loss": 3.7895032551679435, + "tokens_seen": 681975808 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040067201604814445, + "loss": 3.0136, + "theoretical_loss": 3.78946610920035, + "tokens_seen": 682041344 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040066198595787363, + "loss": 3.0596, + "theoretical_loss": 3.7894289678011575, + "tokens_seen": 682106880 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004006519558676028, + "loss": 3.0396, + "theoretical_loss": 3.7893918309693646, + "tokens_seen": 682172416 + }, + { + "epoch": 8.01, + "learning_rate": 0.000400641925777332, + "loss": 3.0548, + "theoretical_loss": 3.7893546987039715, + "tokens_seen": 682237952 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040063189568706123, + "loss": 2.9898, + "theoretical_loss": 3.789317571003977, + "tokens_seen": 682303488 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040062186559679036, + "loss": 3.107, + "theoretical_loss": 3.7892804478683826, + "tokens_seen": 682369024 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004006118355065196, + "loss": 3.0183, + "theoretical_loss": 3.7892433292961885, + "tokens_seen": 682434560 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004006018054162487, + "loss": 2.9879, + "theoretical_loss": 3.789206215286395, + "tokens_seen": 682500096 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040059177532597795, + "loss": 3.0619, + "theoretical_loss": 3.7891691058380044, + "tokens_seen": 682565632 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040058174523570713, + "loss": 3.0043, + "theoretical_loss": 3.789132000950017, + "tokens_seen": 682631168 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004005717151454363, + "loss": 3.0492, + "theoretical_loss": 3.7890949006214356, + "tokens_seen": 682696704 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004005616850551655, + "loss": 2.9636, + "theoretical_loss": 3.7890578048512618, + "tokens_seen": 682762240 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040055165496489473, + "loss": 3.0771, + "theoretical_loss": 3.789020713638498, + "tokens_seen": 682827776 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040054162487462386, + "loss": 3.0554, + "theoretical_loss": 3.788983626982147, + "tokens_seen": 682893312 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004005315947843531, + "loss": 2.928, + "theoretical_loss": 3.7889465448812127, + "tokens_seen": 682958848 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004005215646940822, + "loss": 2.9601, + "theoretical_loss": 3.7889094673346975, + "tokens_seen": 683024384 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040051153460381146, + "loss": 3.0624, + "theoretical_loss": 3.7888723943416056, + "tokens_seen": 683089920 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1644043, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2214767932891846, + "objective/train/theoretical_loss": 3.7888353259009406, + "objective/train/tokens_used": 703615456, + "theoretical_loss": 3.7888353259009406, + "tokens_seen": 683155456 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040050150451354064, + "loss": 2.9219, + "theoretical_loss": 3.7888353259009406, + "tokens_seen": 683155456 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004004914744232698, + "loss": 3.0802, + "theoretical_loss": 3.7887982620117073, + "tokens_seen": 683220992 + }, + { + "epoch": 8.01, + "learning_rate": 0.000400481444332999, + "loss": 2.9482, + "theoretical_loss": 3.78876120267291, + "tokens_seen": 683286528 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004004714142427282, + "loss": 3.0433, + "theoretical_loss": 3.7887241478835545, + "tokens_seen": 683352064 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040046138415245736, + "loss": 3.1008, + "theoretical_loss": 3.7886870976426446, + "tokens_seen": 683417600 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004004513540621866, + "loss": 3.0677, + "theoretical_loss": 3.7886500519491877, + "tokens_seen": 683483136 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004004413239719157, + "loss": 2.9283, + "theoretical_loss": 3.788613010802188, + "tokens_seen": 683548672 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040043129388164496, + "loss": 2.9774, + "theoretical_loss": 3.7885759742006524, + "tokens_seen": 683614208 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004004212637913741, + "loss": 2.9892, + "theoretical_loss": 3.788538942143587, + "tokens_seen": 683679744 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004004112337011033, + "loss": 3.0227, + "theoretical_loss": 3.7885019146300003, + "tokens_seen": 683745280 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004004012036108325, + "loss": 3.0404, + "theoretical_loss": 3.788464891658897, + "tokens_seen": 683810816 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004003911735205617, + "loss": 3.0065, + "theoretical_loss": 3.7884278732292866, + "tokens_seen": 683876352 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040038114343029087, + "loss": 3.0467, + "theoretical_loss": 3.7883908593401765, + "tokens_seen": 683941888 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004003711133400201, + "loss": 3.0129, + "theoretical_loss": 3.788353849990574, + "tokens_seen": 684007424 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040036108324974923, + "loss": 2.9903, + "theoretical_loss": 3.7883168451794873, + "tokens_seen": 684072960 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040035105315947846, + "loss": 3.039, + "theoretical_loss": 3.7882798449059263, + "tokens_seen": 684138496 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004003410230692076, + "loss": 3.0202, + "theoretical_loss": 3.7882428491688995, + "tokens_seen": 684204032 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004003309929789368, + "loss": 2.9364, + "theoretical_loss": 3.788205857967416, + "tokens_seen": 684269568 + }, + { + "epoch": 8.01, + "learning_rate": 0.000400320962888666, + "loss": 3.017, + "theoretical_loss": 3.7881688713004857, + "tokens_seen": 684335104 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004003109327983952, + "loss": 2.9984, + "theoretical_loss": 3.7881318891671185, + "tokens_seen": 684400640 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040030090270812437, + "loss": 2.9647, + "theoretical_loss": 3.7880949115663243, + "tokens_seen": 684466176 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040029087261785355, + "loss": 3.0557, + "theoretical_loss": 3.788057938497114, + "tokens_seen": 684531712 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040028084252758273, + "loss": 2.9132, + "theoretical_loss": 3.788020969958499, + "tokens_seen": 684597248 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040027081243731197, + "loss": 3.0518, + "theoretical_loss": 3.7879840059494896, + "tokens_seen": 684662784 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004002607823470411, + "loss": 2.9584, + "theoretical_loss": 3.787947046469098, + "tokens_seen": 684728320 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1647830, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9487104415893555, + "objective/train/theoretical_loss": 3.7879100915163355, + "objective/train/tokens_used": 705253856, + "theoretical_loss": 3.7879100915163355, + "tokens_seen": 684793856 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040025075225677033, + "loss": 3.0134, + "theoretical_loss": 3.7879100915163355, + "tokens_seen": 684793856 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040024072216649946, + "loss": 2.9669, + "theoretical_loss": 3.7878731410902144, + "tokens_seen": 684859392 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004002306920762287, + "loss": 3.0733, + "theoretical_loss": 3.7878361951897475, + "tokens_seen": 684924928 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040022066198595787, + "loss": 3.0524, + "theoretical_loss": 3.7877992538139473, + "tokens_seen": 684990464 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040021063189568705, + "loss": 2.9843, + "theoretical_loss": 3.7877623169618264, + "tokens_seen": 685056000 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040020060180541623, + "loss": 2.9882, + "theoretical_loss": 3.787725384632399, + "tokens_seen": 685121536 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040019057171514547, + "loss": 3.0116, + "theoretical_loss": 3.787688456824678, + "tokens_seen": 685187072 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004001805416248746, + "loss": 3.1039, + "theoretical_loss": 3.787651533537678, + "tokens_seen": 685252608 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040017051153460383, + "loss": 3.0449, + "theoretical_loss": 3.787614614770413, + "tokens_seen": 685318144 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040016048144433296, + "loss": 3.0087, + "theoretical_loss": 3.787577700521898, + "tokens_seen": 685383680 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004001504513540622, + "loss": 3.0007, + "theoretical_loss": 3.787540790791147, + "tokens_seen": 685449216 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040014042126379143, + "loss": 2.9237, + "theoretical_loss": 3.787503885577176, + "tokens_seen": 685514752 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040013039117352056, + "loss": 2.9741, + "theoretical_loss": 3.787466984879001, + "tokens_seen": 685580288 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004001203610832498, + "loss": 3.1479, + "theoretical_loss": 3.7874300886956362, + "tokens_seen": 685645824 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004001103309929789, + "loss": 3.1457, + "theoretical_loss": 3.7873931970260992, + "tokens_seen": 685711360 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040010030090270815, + "loss": 3.0978, + "theoretical_loss": 3.7873563098694065, + "tokens_seen": 685776896 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040009027081243733, + "loss": 3.0198, + "theoretical_loss": 3.7873194272245736, + "tokens_seen": 685842432 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004000802407221665, + "loss": 3.0734, + "theoretical_loss": 3.7872825490906186, + "tokens_seen": 685907968 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004000702106318957, + "loss": 3.1168, + "theoretical_loss": 3.7872456754665587, + "tokens_seen": 685973504 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040006018054162493, + "loss": 2.9782, + "theoretical_loss": 3.787208806351411, + "tokens_seen": 686039040 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040005015045135406, + "loss": 2.9709, + "theoretical_loss": 3.787171941744195, + "tokens_seen": 686104576 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004000401203610833, + "loss": 3.0548, + "theoretical_loss": 3.7871350816439273, + "tokens_seen": 686170112 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004000300902708124, + "loss": 3.0811, + "theoretical_loss": 3.7870982260496278, + "tokens_seen": 686235648 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040002006018054166, + "loss": 2.9525, + "theoretical_loss": 3.787061374960315, + "tokens_seen": 686301184 + }, + { + "epoch": 8.01, + "learning_rate": 0.00040001003009027084, + "loss": 3.0023, + "theoretical_loss": 3.787024528375007, + "tokens_seen": 686366720 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1650693, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.107738733291626, + "objective/train/theoretical_loss": 3.786987686292725, + "objective/train/tokens_used": 706892256, + "theoretical_loss": 3.786987686292725, + "tokens_seen": 686432256 + }, + { + "epoch": 8.01, + "learning_rate": 0.0004, + "loss": 2.9732, + "theoretical_loss": 3.786987686292725, + "tokens_seen": 686432256 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003999899699097292, + "loss": 2.9879, + "theoretical_loss": 3.786950848712488, + "tokens_seen": 686497792 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003999799398194584, + "loss": 3.022, + "theoretical_loss": 3.7869140156333163, + "tokens_seen": 686563328 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039996990972918756, + "loss": 3.1364, + "theoretical_loss": 3.7868771870542313, + "tokens_seen": 686628864 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003999598796389168, + "loss": 3.0544, + "theoretical_loss": 3.786840362974252, + "tokens_seen": 686694400 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003999498495486459, + "loss": 3.1481, + "theoretical_loss": 3.7868035433924003, + "tokens_seen": 686759936 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039993981945837516, + "loss": 3.0372, + "theoretical_loss": 3.786766728307698, + "tokens_seen": 686825472 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003999297893681043, + "loss": 3.0255, + "theoretical_loss": 3.786729917719166, + "tokens_seen": 686891008 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003999197592778335, + "loss": 3.0018, + "theoretical_loss": 3.786693111625827, + "tokens_seen": 686956544 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003999097291875627, + "loss": 3.0756, + "theoretical_loss": 3.7866563100267028, + "tokens_seen": 687022080 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003998996990972919, + "loss": 2.9649, + "theoretical_loss": 3.786619512920816, + "tokens_seen": 687087616 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039988966900702107, + "loss": 2.962, + "theoretical_loss": 3.78658272030719, + "tokens_seen": 687153152 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003998796389167503, + "loss": 3.0218, + "theoretical_loss": 3.786545932184848, + "tokens_seen": 687218688 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039986960882647943, + "loss": 3.1059, + "theoretical_loss": 3.7865091485528124, + "tokens_seen": 687284224 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039985957873620866, + "loss": 2.8713, + "theoretical_loss": 3.7864723694101086, + "tokens_seen": 687349760 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003998495486459378, + "loss": 3.0228, + "theoretical_loss": 3.7864355947557597, + "tokens_seen": 687415296 + }, + { + "epoch": 8.01, + "learning_rate": 0.000399839518555667, + "loss": 3.022, + "theoretical_loss": 3.7863988245887903, + "tokens_seen": 687480832 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003998294884653962, + "loss": 3.0417, + "theoretical_loss": 3.7863620589082254, + "tokens_seen": 687546368 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003998194583751254, + "loss": 3.1025, + "theoretical_loss": 3.7863252977130903, + "tokens_seen": 687611904 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039980942828485457, + "loss": 2.9505, + "theoretical_loss": 3.786288541002409, + "tokens_seen": 687677440 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039979939819458375, + "loss": 3.0256, + "theoretical_loss": 3.7862517887752087, + "tokens_seen": 687742976 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039978936810431293, + "loss": 3.0743, + "theoretical_loss": 3.786215041030515, + "tokens_seen": 687808512 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039977933801404217, + "loss": 3.0109, + "theoretical_loss": 3.7861782977673535, + "tokens_seen": 687874048 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003997693079237713, + "loss": 3.0191, + "theoretical_loss": 3.786141558984751, + "tokens_seen": 687939584 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039975927783350053, + "loss": 2.9978, + "theoretical_loss": 3.7861048246817353, + "tokens_seen": 688005120 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1655468, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1608974933624268, + "objective/train/theoretical_loss": 3.786068094857332, + "objective/train/tokens_used": 708530656, + "theoretical_loss": 3.786068094857332, + "tokens_seen": 688070656 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039974924774322966, + "loss": 3.0056, + "theoretical_loss": 3.786068094857332, + "tokens_seen": 688070656 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003997392176529589, + "loss": 3.068, + "theoretical_loss": 3.78603136951057, + "tokens_seen": 688136192 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039972918756268807, + "loss": 2.9885, + "theoretical_loss": 3.785994648640476, + "tokens_seen": 688201728 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039971915747241725, + "loss": 3.1095, + "theoretical_loss": 3.7859579322460792, + "tokens_seen": 688267264 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039970912738214643, + "loss": 3.0573, + "theoretical_loss": 3.785921220326407, + "tokens_seen": 688332800 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039969909729187567, + "loss": 3.0248, + "theoretical_loss": 3.7858845128804886, + "tokens_seen": 688398336 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003996890672016048, + "loss": 3.029, + "theoretical_loss": 3.785847809907353, + "tokens_seen": 688463872 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039967903711133403, + "loss": 2.9404, + "theoretical_loss": 3.785811111406029, + "tokens_seen": 688529408 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039966900702106316, + "loss": 2.9956, + "theoretical_loss": 3.7857744173755465, + "tokens_seen": 688594944 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003996589769307924, + "loss": 3.0829, + "theoretical_loss": 3.7857377278149364, + "tokens_seen": 688660480 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003996489468405216, + "loss": 3.0076, + "theoretical_loss": 3.785701042723227, + "tokens_seen": 688726016 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039963891675025076, + "loss": 3.0859, + "theoretical_loss": 3.78566436209945, + "tokens_seen": 688791552 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039962888665997994, + "loss": 3.1174, + "theoretical_loss": 3.7856276859426363, + "tokens_seen": 688857088 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003996188565697091, + "loss": 3.1449, + "theoretical_loss": 3.7855910142518168, + "tokens_seen": 688922624 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003996088264794383, + "loss": 2.9902, + "theoretical_loss": 3.7855543470260224, + "tokens_seen": 688988160 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039959879638916754, + "loss": 3.0783, + "theoretical_loss": 3.7855176842642857, + "tokens_seen": 689053696 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039958876629889666, + "loss": 3.0085, + "theoretical_loss": 3.785481025965638, + "tokens_seen": 689119232 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003995787362086259, + "loss": 2.9397, + "theoretical_loss": 3.785444372129112, + "tokens_seen": 689184768 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003995687061183551, + "loss": 3.0748, + "theoretical_loss": 3.78540772275374, + "tokens_seen": 689250304 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039955867602808426, + "loss": 2.9932, + "theoretical_loss": 3.7853710778385556, + "tokens_seen": 689315840 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039954864593781344, + "loss": 3.0385, + "theoretical_loss": 3.7853344373825912, + "tokens_seen": 689381376 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003995386158475426, + "loss": 3.0513, + "theoretical_loss": 3.785297801384881, + "tokens_seen": 689446912 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003995285857572718, + "loss": 3.0367, + "theoretical_loss": 3.7852611698444587, + "tokens_seen": 689512448 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039951855566700104, + "loss": 3.0873, + "theoretical_loss": 3.785224542760358, + "tokens_seen": 689577984 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039950852557673017, + "loss": 2.9894, + "theoretical_loss": 3.785187920131614, + "tokens_seen": 689643520 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1658475, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.925386905670166, + "objective/train/theoretical_loss": 3.785151301957261, + "objective/train/tokens_used": 710169056, + "theoretical_loss": 3.785151301957261, + "tokens_seen": 689709056 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003994984954864594, + "loss": 3.0019, + "theoretical_loss": 3.785151301957261, + "tokens_seen": 689709056 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039948846539618853, + "loss": 3.011, + "theoretical_loss": 3.785114688236334, + "tokens_seen": 689774592 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039947843530591776, + "loss": 3.0258, + "theoretical_loss": 3.785078078967868, + "tokens_seen": 689840128 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039946840521564694, + "loss": 3.0447, + "theoretical_loss": 3.7850414741508995, + "tokens_seen": 689905664 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003994583751253761, + "loss": 2.9962, + "theoretical_loss": 3.785004873784464, + "tokens_seen": 689971200 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003994483450351053, + "loss": 2.981, + "theoretical_loss": 3.784968277867598, + "tokens_seen": 690036736 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003994383149448345, + "loss": 3.0468, + "theoretical_loss": 3.784931686399337, + "tokens_seen": 690102272 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039942828485456367, + "loss": 3.1008, + "theoretical_loss": 3.7848950993787196, + "tokens_seen": 690167808 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003994182547642929, + "loss": 2.9536, + "theoretical_loss": 3.7848585168047815, + "tokens_seen": 690233344 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039940822467402203, + "loss": 3.1194, + "theoretical_loss": 3.7848219386765605, + "tokens_seen": 690298880 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039939819458375127, + "loss": 2.9757, + "theoretical_loss": 3.7847853649930947, + "tokens_seen": 690364416 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003993881644934805, + "loss": 3.0684, + "theoretical_loss": 3.784748795753422, + "tokens_seen": 690429952 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039937813440320963, + "loss": 3.0094, + "theoretical_loss": 3.7847122309565804, + "tokens_seen": 690495488 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039936810431293886, + "loss": 3.1103, + "theoretical_loss": 3.784675670601609, + "tokens_seen": 690561024 + }, + { + "epoch": 8.01, + "learning_rate": 0.000399358074222668, + "loss": 3.0311, + "theoretical_loss": 3.784639114687546, + "tokens_seen": 690626560 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003993480441323972, + "loss": 2.9384, + "theoretical_loss": 3.784602563213432, + "tokens_seen": 690692096 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003993380140421264, + "loss": 2.9592, + "theoretical_loss": 3.784566016178305, + "tokens_seen": 690757632 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003993279839518556, + "loss": 2.9966, + "theoretical_loss": 3.7845294735812054, + "tokens_seen": 690823168 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039931795386158477, + "loss": 3.0624, + "theoretical_loss": 3.7844929354211736, + "tokens_seen": 690888704 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039930792377131395, + "loss": 2.9907, + "theoretical_loss": 3.78445640169725, + "tokens_seen": 690954240 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039929789368104313, + "loss": 3.0381, + "theoretical_loss": 3.7844198724084754, + "tokens_seen": 691019776 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039928786359077237, + "loss": 3.0254, + "theoretical_loss": 3.7843833475538906, + "tokens_seen": 691085312 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003992778335005015, + "loss": 2.9922, + "theoretical_loss": 3.7843468271325365, + "tokens_seen": 691150848 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039926780341023073, + "loss": 3.0346, + "theoretical_loss": 3.7843103111434555, + "tokens_seen": 691216384 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039925777331995986, + "loss": 3.0368, + "theoretical_loss": 3.784273799585689, + "tokens_seen": 691281920 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1662206, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1148383617401123, + "objective/train/theoretical_loss": 3.78423729245828, + "objective/train/tokens_used": 711807456, + "theoretical_loss": 3.78423729245828, + "tokens_seen": 691347456 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003992477432296891, + "loss": 3.0113, + "theoretical_loss": 3.78423729245828, + "tokens_seen": 691347456 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039923771313941827, + "loss": 3.0473, + "theoretical_loss": 3.7842007897602694, + "tokens_seen": 691412992 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039922768304914745, + "loss": 3.0764, + "theoretical_loss": 3.7841642914907014, + "tokens_seen": 691478528 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039921765295887663, + "loss": 3.0211, + "theoretical_loss": 3.7841277976486194, + "tokens_seen": 691544064 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039920762286860587, + "loss": 2.8833, + "theoretical_loss": 3.784091308233066, + "tokens_seen": 691609600 + }, + { + "epoch": 8.01, + "learning_rate": 0.000399197592778335, + "loss": 2.937, + "theoretical_loss": 3.7840548232430846, + "tokens_seen": 691675136 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039918756268806423, + "loss": 3.062, + "theoretical_loss": 3.7840183426777196, + "tokens_seen": 691740672 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039917753259779336, + "loss": 3.0562, + "theoretical_loss": 3.783981866536016, + "tokens_seen": 691806208 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003991675025075226, + "loss": 3.116, + "theoretical_loss": 3.7839453948170174, + "tokens_seen": 691871744 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003991574724172518, + "loss": 3.0544, + "theoretical_loss": 3.7839089275197693, + "tokens_seen": 691937280 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039914744232698096, + "loss": 2.9624, + "theoretical_loss": 3.7838724646433164, + "tokens_seen": 692002816 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039913741223671014, + "loss": 3.03, + "theoretical_loss": 3.7838360061867045, + "tokens_seen": 692068352 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003991273821464393, + "loss": 2.9198, + "theoretical_loss": 3.78379955214898, + "tokens_seen": 692133888 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003991173520561685, + "loss": 3.1238, + "theoretical_loss": 3.783763102529188, + "tokens_seen": 692199424 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039910732196589774, + "loss": 3.0727, + "theoretical_loss": 3.7837266573263744, + "tokens_seen": 692264960 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039909729187562686, + "loss": 3.1291, + "theoretical_loss": 3.7836902165395876, + "tokens_seen": 692330496 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003990872617853561, + "loss": 2.9686, + "theoretical_loss": 3.7836537801678736, + "tokens_seen": 692396032 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003990772316950853, + "loss": 3.0705, + "theoretical_loss": 3.7836173482102793, + "tokens_seen": 692461568 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039906720160481446, + "loss": 2.9969, + "theoretical_loss": 3.7835809206658526, + "tokens_seen": 692527104 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039905717151454364, + "loss": 3.1019, + "theoretical_loss": 3.783544497533642, + "tokens_seen": 692592640 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003990471414242728, + "loss": 2.9887, + "theoretical_loss": 3.7835080788126945, + "tokens_seen": 692658176 + }, + { + "epoch": 8.01, + "learning_rate": 0.000399037111334002, + "loss": 3.0477, + "theoretical_loss": 3.7834716645020596, + "tokens_seen": 692723712 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039902708124373124, + "loss": 3.0455, + "theoretical_loss": 3.7834352546007857, + "tokens_seen": 692789248 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039901705115346037, + "loss": 3.0933, + "theoretical_loss": 3.783398849107921, + "tokens_seen": 692854784 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003990070210631896, + "loss": 3.0238, + "theoretical_loss": 3.7833624480225163, + "tokens_seen": 692920320 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1667204, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.818321704864502, + "objective/train/theoretical_loss": 3.7833260513436198, + "objective/train/tokens_used": 713445856, + "theoretical_loss": 3.7833260513436198, + "tokens_seen": 692985856 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039899699097291873, + "loss": 2.9333, + "theoretical_loss": 3.7833260513436198, + "tokens_seen": 692985856 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039898696088264796, + "loss": 2.9699, + "theoretical_loss": 3.7832896590702827, + "tokens_seen": 693051392 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039897693079237714, + "loss": 2.9102, + "theoretical_loss": 3.7832532712015543, + "tokens_seen": 693116928 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003989669007021063, + "loss": 2.9474, + "theoretical_loss": 3.7832168877364856, + "tokens_seen": 693182464 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003989568706118355, + "loss": 3.0684, + "theoretical_loss": 3.7831805086741266, + "tokens_seen": 693248000 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003989468405215647, + "loss": 3.0389, + "theoretical_loss": 3.78314413401353, + "tokens_seen": 693313536 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039893681043129387, + "loss": 3.0776, + "theoretical_loss": 3.783107763753746, + "tokens_seen": 693379072 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003989267803410231, + "loss": 3.0585, + "theoretical_loss": 3.783071397893826, + "tokens_seen": 693444608 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039891675025075223, + "loss": 2.9425, + "theoretical_loss": 3.783035036432823, + "tokens_seen": 693510144 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039890672016048147, + "loss": 3.0206, + "theoretical_loss": 3.782998679369789, + "tokens_seen": 693575680 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039889669007021065, + "loss": 3.0133, + "theoretical_loss": 3.7829623267037755, + "tokens_seen": 693641216 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039888665997993983, + "loss": 2.983, + "theoretical_loss": 3.7829259784338367, + "tokens_seen": 693706752 + }, + { + "epoch": 8.01, + "learning_rate": 0.000398876629889669, + "loss": 2.9038, + "theoretical_loss": 3.7828896345590257, + "tokens_seen": 693772288 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003988665997993982, + "loss": 3.0381, + "theoretical_loss": 3.782853295078395, + "tokens_seen": 693837824 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039885656970912737, + "loss": 2.974, + "theoretical_loss": 3.782816959990999, + "tokens_seen": 693903360 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003988465396188566, + "loss": 3.0161, + "theoretical_loss": 3.7827806292958917, + "tokens_seen": 693968896 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039883650952858573, + "loss": 3.0201, + "theoretical_loss": 3.782744302992127, + "tokens_seen": 694034432 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039882647943831497, + "loss": 2.9777, + "theoretical_loss": 3.7827079810787603, + "tokens_seen": 694099968 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003988164493480441, + "loss": 3.0785, + "theoretical_loss": 3.782671663554846, + "tokens_seen": 694165504 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039880641925777333, + "loss": 3.1092, + "theoretical_loss": 3.782635350419439, + "tokens_seen": 694231040 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003987963891675025, + "loss": 3.114, + "theoretical_loss": 3.7825990416715958, + "tokens_seen": 694296576 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003987863590772317, + "loss": 3.0073, + "theoretical_loss": 3.7825627373103714, + "tokens_seen": 694362112 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003987763289869609, + "loss": 3.0748, + "theoretical_loss": 3.7825264373348215, + "tokens_seen": 694427648 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039876629889669006, + "loss": 3.0637, + "theoretical_loss": 3.7824901417440033, + "tokens_seen": 694493184 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039875626880641924, + "loss": 2.9898, + "theoretical_loss": 3.7824538505369736, + "tokens_seen": 694558720 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1670028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.850348711013794, + "objective/train/theoretical_loss": 3.782417563712789, + "objective/train/tokens_used": 715084256, + "theoretical_loss": 3.782417563712789, + "tokens_seen": 694624256 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039874623871614847, + "loss": 2.9985, + "theoretical_loss": 3.782417563712789, + "tokens_seen": 694624256 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003987362086258776, + "loss": 2.9973, + "theoretical_loss": 3.7823812812705064, + "tokens_seen": 694689792 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039872617853560683, + "loss": 3.0166, + "theoretical_loss": 3.782345003209184, + "tokens_seen": 694755328 + }, + { + "epoch": 8.01, + "learning_rate": 0.000398716148445336, + "loss": 3.042, + "theoretical_loss": 3.782308729527879, + "tokens_seen": 694820864 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003987061183550652, + "loss": 2.9964, + "theoretical_loss": 3.7822724602256503, + "tokens_seen": 694886400 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003986960882647944, + "loss": 3.0746, + "theoretical_loss": 3.7822361953015555, + "tokens_seen": 694951936 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039868605817452356, + "loss": 3.0774, + "theoretical_loss": 3.7821999347546535, + "tokens_seen": 695017472 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039867602808425274, + "loss": 2.9998, + "theoretical_loss": 3.782163678584004, + "tokens_seen": 695083008 + }, + { + "epoch": 8.01, + "learning_rate": 0.000398665997993982, + "loss": 2.9783, + "theoretical_loss": 3.7821274267886653, + "tokens_seen": 695148544 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003986559679037111, + "loss": 3.0323, + "theoretical_loss": 3.782091179367698, + "tokens_seen": 695214080 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039864593781344034, + "loss": 3.0456, + "theoretical_loss": 3.7820549363201605, + "tokens_seen": 695279616 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003986359077231695, + "loss": 2.9607, + "theoretical_loss": 3.782018697645115, + "tokens_seen": 695345152 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003986258776328987, + "loss": 3.018, + "theoretical_loss": 3.78198246334162, + "tokens_seen": 695410688 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039861584754262794, + "loss": 3.0765, + "theoretical_loss": 3.7819462334087373, + "tokens_seen": 695476224 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039860581745235706, + "loss": 3.0231, + "theoretical_loss": 3.781910007845528, + "tokens_seen": 695541760 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003985957873620863, + "loss": 2.9601, + "theoretical_loss": 3.7818737866510532, + "tokens_seen": 695607296 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003985857572718155, + "loss": 3.0379, + "theoretical_loss": 3.7818375698243742, + "tokens_seen": 695672832 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039857572718154466, + "loss": 3.1, + "theoretical_loss": 3.7818013573645533, + "tokens_seen": 695738368 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039856569709127384, + "loss": 3.0173, + "theoretical_loss": 3.7817651492706528, + "tokens_seen": 695803904 + }, + { + "epoch": 8.01, + "learning_rate": 0.000398555667001003, + "loss": 3.0205, + "theoretical_loss": 3.7817289455417344, + "tokens_seen": 695869440 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003985456369107322, + "loss": 3.0052, + "theoretical_loss": 3.781692746176862, + "tokens_seen": 695934976 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039853560682046144, + "loss": 3.0511, + "theoretical_loss": 3.7816565511750984, + "tokens_seen": 696000512 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039852557673019057, + "loss": 3.0216, + "theoretical_loss": 3.7816203605355057, + "tokens_seen": 696066048 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003985155466399198, + "loss": 3.0572, + "theoretical_loss": 3.781584174257149, + "tokens_seen": 696131584 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039850551654964893, + "loss": 3.0315, + "theoretical_loss": 3.7815479923390916, + "tokens_seen": 696197120 + }, + { + "epoch": 8.01, + "objective/train/docs_used": 1674934, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0255625247955322, + "objective/train/theoretical_loss": 3.7815118147803983, + "objective/train/tokens_used": 716722656, + "theoretical_loss": 3.7815118147803983, + "tokens_seen": 696262656 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039849548645937816, + "loss": 2.9232, + "theoretical_loss": 3.7815118147803983, + "tokens_seen": 696262656 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039848545636910734, + "loss": 3.0942, + "theoretical_loss": 3.781475641580133, + "tokens_seen": 696328192 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003984754262788365, + "loss": 2.8862, + "theoretical_loss": 3.7814394727373606, + "tokens_seen": 696393728 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003984653961885657, + "loss": 3.0993, + "theoretical_loss": 3.7814033082511465, + "tokens_seen": 696459264 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003984553660982949, + "loss": 2.9622, + "theoretical_loss": 3.7813671481205553, + "tokens_seen": 696524800 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039844533600802407, + "loss": 2.9664, + "theoretical_loss": 3.781330992344653, + "tokens_seen": 696590336 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003984353059177533, + "loss": 2.9801, + "theoretical_loss": 3.7812948409225067, + "tokens_seen": 696655872 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039842527582748243, + "loss": 2.9827, + "theoretical_loss": 3.7812586938531814, + "tokens_seen": 696721408 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039841524573721167, + "loss": 3.006, + "theoretical_loss": 3.781222551135744, + "tokens_seen": 696786944 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039840521564694085, + "loss": 3.0429, + "theoretical_loss": 3.781186412769261, + "tokens_seen": 696852480 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039839518555667003, + "loss": 3.0026, + "theoretical_loss": 3.7811502787527997, + "tokens_seen": 696918016 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003983851554663992, + "loss": 3.1424, + "theoretical_loss": 3.781114149085428, + "tokens_seen": 696983552 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003983751253761284, + "loss": 3.0549, + "theoretical_loss": 3.7810780237662125, + "tokens_seen": 697049088 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039836509528585757, + "loss": 3.0392, + "theoretical_loss": 3.781041902794222, + "tokens_seen": 697114624 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003983550651955868, + "loss": 2.9957, + "theoretical_loss": 3.781005786168525, + "tokens_seen": 697180160 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039834503510531593, + "loss": 3.1234, + "theoretical_loss": 3.780969673888189, + "tokens_seen": 697245696 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039833500501504517, + "loss": 3.0299, + "theoretical_loss": 3.7809335659522842, + "tokens_seen": 697311232 + }, + { + "epoch": 8.01, + "learning_rate": 0.0003983249749247743, + "loss": 3.1379, + "theoretical_loss": 3.7808974623598783, + "tokens_seen": 697376768 + }, + { + "epoch": 8.01, + "learning_rate": 0.00039831494483450353, + "loss": 2.9675, + "theoretical_loss": 3.7808613631100414, + "tokens_seen": 697442304 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003983049147442327, + "loss": 2.9305, + "theoretical_loss": 3.780825268201843, + "tokens_seen": 697507840 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003982948846539619, + "loss": 3.0152, + "theoretical_loss": 3.780789177634354, + "tokens_seen": 697573376 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003982848545636911, + "loss": 2.8834, + "theoretical_loss": 3.7807530914066434, + "tokens_seen": 697638912 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039827482447342026, + "loss": 2.957, + "theoretical_loss": 3.7807170095177822, + "tokens_seen": 697704448 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039826479438314944, + "loss": 2.9996, + "theoretical_loss": 3.7806809319668417, + "tokens_seen": 697769984 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039825476429287867, + "loss": 3.0905, + "theoretical_loss": 3.780644858752892, + "tokens_seen": 697835520 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1677760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.874699592590332, + "objective/train/theoretical_loss": 3.7806087898750054, + "objective/train/tokens_used": 718361056, + "theoretical_loss": 3.7806087898750054, + "tokens_seen": 697901056 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003982447342026078, + "loss": 2.94, + "theoretical_loss": 3.7806087898750054, + "tokens_seen": 697901056 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039823470411233703, + "loss": 3.0022, + "theoretical_loss": 3.780572725332253, + "tokens_seen": 697966592 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003982246740220662, + "loss": 3.0033, + "theoretical_loss": 3.780536665123708, + "tokens_seen": 698032128 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003982146439317954, + "loss": 2.9619, + "theoretical_loss": 3.7805006092484406, + "tokens_seen": 698097664 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003982046138415246, + "loss": 3.092, + "theoretical_loss": 3.7804645577055247, + "tokens_seen": 698163200 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039819458375125376, + "loss": 2.9187, + "theoretical_loss": 3.7804285104940334, + "tokens_seen": 698228736 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039818455366098294, + "loss": 3.0225, + "theoretical_loss": 3.7803924676130394, + "tokens_seen": 698294272 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003981745235707122, + "loss": 3.0224, + "theoretical_loss": 3.780356429061616, + "tokens_seen": 698359808 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003981644934804413, + "loss": 3.0658, + "theoretical_loss": 3.780320394838837, + "tokens_seen": 698425344 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039815446339017054, + "loss": 3.0196, + "theoretical_loss": 3.780284364943776, + "tokens_seen": 698490880 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039814443329989966, + "loss": 3.0083, + "theoretical_loss": 3.780248339375508, + "tokens_seen": 698556416 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003981344032096289, + "loss": 3.0714, + "theoretical_loss": 3.780212318133107, + "tokens_seen": 698621952 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003981243731193581, + "loss": 3.1196, + "theoretical_loss": 3.780176301215648, + "tokens_seen": 698687488 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039811434302908726, + "loss": 3.0948, + "theoretical_loss": 3.7801402886222064, + "tokens_seen": 698753024 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039810431293881644, + "loss": 2.8838, + "theoretical_loss": 3.780104280351857, + "tokens_seen": 698818560 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003980942828485457, + "loss": 2.9928, + "theoretical_loss": 3.780068276403676, + "tokens_seen": 698884096 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003980842527582748, + "loss": 3.0883, + "theoretical_loss": 3.7800322767767387, + "tokens_seen": 698949632 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039807422266800404, + "loss": 3.0671, + "theoretical_loss": 3.7799962814701225, + "tokens_seen": 699015168 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039806419257773317, + "loss": 2.9645, + "theoretical_loss": 3.7799602904829026, + "tokens_seen": 699080704 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003980541624874624, + "loss": 3.0836, + "theoretical_loss": 3.7799243038141572, + "tokens_seen": 699146240 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003980441323971916, + "loss": 3.0648, + "theoretical_loss": 3.7798883214629626, + "tokens_seen": 699211776 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039803410230692077, + "loss": 3.0373, + "theoretical_loss": 3.779852343428396, + "tokens_seen": 699277312 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039802407221664995, + "loss": 3.0468, + "theoretical_loss": 3.7798163697095353, + "tokens_seen": 699342848 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039801404212637913, + "loss": 2.8821, + "theoretical_loss": 3.7797804003054587, + "tokens_seen": 699408384 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003980040120361083, + "loss": 3.0274, + "theoretical_loss": 3.7797444352152443, + "tokens_seen": 699473920 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1681395, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9923887252807617, + "objective/train/theoretical_loss": 3.779708474437971, + "objective/train/tokens_used": 719999456, + "theoretical_loss": 3.779708474437971, + "tokens_seen": 699539456 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039799398194583754, + "loss": 3.0922, + "theoretical_loss": 3.779708474437971, + "tokens_seen": 699539456 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039798395185556667, + "loss": 3.1245, + "theoretical_loss": 3.7796725179727164, + "tokens_seen": 699604992 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003979739217652959, + "loss": 3.0998, + "theoretical_loss": 3.779636565818561, + "tokens_seen": 699670528 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039796389167502503, + "loss": 3.0774, + "theoretical_loss": 3.7796006179745834, + "tokens_seen": 699736064 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039795386158475427, + "loss": 3.0674, + "theoretical_loss": 3.7795646744398637, + "tokens_seen": 699801600 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039794383149448345, + "loss": 3.0276, + "theoretical_loss": 3.779528735213481, + "tokens_seen": 699867136 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039793380140421263, + "loss": 3.0205, + "theoretical_loss": 3.7794928002945163, + "tokens_seen": 699932672 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003979237713139418, + "loss": 3.0243, + "theoretical_loss": 3.77945686968205, + "tokens_seen": 699998208 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039791374122367105, + "loss": 3.105, + "theoretical_loss": 3.779420943375163, + "tokens_seen": 700063744 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003979037111334002, + "loss": 2.9631, + "theoretical_loss": 3.779385021372936, + "tokens_seen": 700129280 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003978936810431294, + "loss": 2.9186, + "theoretical_loss": 3.77934910367445, + "tokens_seen": 700194816 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003978836509528586, + "loss": 3.1036, + "theoretical_loss": 3.7793131902787875, + "tokens_seen": 700260352 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039787362086258777, + "loss": 3.08, + "theoretical_loss": 3.7792772811850304, + "tokens_seen": 700325888 + }, + { + "epoch": 8.02, + "learning_rate": 0.000397863590772317, + "loss": 2.9652, + "theoretical_loss": 3.7792413763922603, + "tokens_seen": 700391424 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039785356068204613, + "loss": 3.0599, + "theoretical_loss": 3.77920547589956, + "tokens_seen": 700456960 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039784353059177537, + "loss": 3.0878, + "theoretical_loss": 3.7791695797060125, + "tokens_seen": 700522496 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003978335005015045, + "loss": 2.9801, + "theoretical_loss": 3.7791336878107, + "tokens_seen": 700588032 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039782347041123373, + "loss": 2.8813, + "theoretical_loss": 3.7790978002127065, + "tokens_seen": 700653568 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003978134403209629, + "loss": 3.0682, + "theoretical_loss": 3.7790619169111155, + "tokens_seen": 700719104 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003978034102306921, + "loss": 3.112, + "theoretical_loss": 3.7790260379050107, + "tokens_seen": 700784640 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003977933801404213, + "loss": 3.0419, + "theoretical_loss": 3.778990163193477, + "tokens_seen": 700850176 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039778335005015046, + "loss": 3.0319, + "theoretical_loss": 3.7789542927755977, + "tokens_seen": 700915712 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039777331995987964, + "loss": 3.0214, + "theoretical_loss": 3.778918426650458, + "tokens_seen": 700981248 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039776328986960887, + "loss": 3.0114, + "theoretical_loss": 3.7788825648171436, + "tokens_seen": 701046784 + }, + { + "epoch": 8.02, + "learning_rate": 0.000397753259779338, + "loss": 3.0557, + "theoretical_loss": 3.778846707274739, + "tokens_seen": 701112320 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1686651, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0661773681640625, + "objective/train/theoretical_loss": 3.7788108540223293, + "objective/train/tokens_used": 721637856, + "theoretical_loss": 3.7788108540223293, + "tokens_seen": 701177856 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039774322968906723, + "loss": 3.0446, + "theoretical_loss": 3.7788108540223293, + "tokens_seen": 701177856 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003977331995987964, + "loss": 3.0565, + "theoretical_loss": 3.7787750050590017, + "tokens_seen": 701243392 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003977231695085256, + "loss": 3.1053, + "theoretical_loss": 3.778739160383841, + "tokens_seen": 701308928 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003977131394182548, + "loss": 2.9662, + "theoretical_loss": 3.778703319995935, + "tokens_seen": 701374464 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039770310932798396, + "loss": 3.0566, + "theoretical_loss": 3.7786674838943695, + "tokens_seen": 701440000 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039769307923771314, + "loss": 3.0337, + "theoretical_loss": 3.7786316520782313, + "tokens_seen": 701505536 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003976830491474424, + "loss": 2.9497, + "theoretical_loss": 3.778595824546608, + "tokens_seen": 701571072 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003976730190571715, + "loss": 3.0673, + "theoretical_loss": 3.778560001298587, + "tokens_seen": 701636608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039766298896690074, + "loss": 2.9736, + "theoretical_loss": 3.7785241823332565, + "tokens_seen": 701702144 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039765295887662987, + "loss": 2.995, + "theoretical_loss": 3.778488367649704, + "tokens_seen": 701767680 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003976429287863591, + "loss": 3.031, + "theoretical_loss": 3.7784525572470185, + "tokens_seen": 701833216 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003976328986960883, + "loss": 2.9332, + "theoretical_loss": 3.778416751124288, + "tokens_seen": 701898752 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039762286860581746, + "loss": 3.001, + "theoretical_loss": 3.778380949280602, + "tokens_seen": 701964288 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039761283851554664, + "loss": 3.0396, + "theoretical_loss": 3.7783451517150493, + "tokens_seen": 702029824 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003976028084252759, + "loss": 2.9869, + "theoretical_loss": 3.7783093584267196, + "tokens_seen": 702095360 + }, + { + "epoch": 8.02, + "learning_rate": 0.000397592778335005, + "loss": 3.1345, + "theoretical_loss": 3.7782735694147025, + "tokens_seen": 702160896 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039758274824473424, + "loss": 2.9355, + "theoretical_loss": 3.7782377846780877, + "tokens_seen": 702226432 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039757271815446337, + "loss": 3.0843, + "theoretical_loss": 3.7782020042159665, + "tokens_seen": 702291968 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003975626880641926, + "loss": 3.0513, + "theoretical_loss": 3.7781662280274286, + "tokens_seen": 702357504 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003975526579739218, + "loss": 2.9568, + "theoretical_loss": 3.7781304561115654, + "tokens_seen": 702423040 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039754262788365097, + "loss": 3.022, + "theoretical_loss": 3.778094688467468, + "tokens_seen": 702488576 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039753259779338015, + "loss": 2.9614, + "theoretical_loss": 3.7780589250942276, + "tokens_seen": 702554112 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039752256770310933, + "loss": 2.9314, + "theoretical_loss": 3.778023165990936, + "tokens_seen": 702619648 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003975125376128385, + "loss": 2.9543, + "theoretical_loss": 3.777987411156685, + "tokens_seen": 702685184 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039750250752256774, + "loss": 3.029, + "theoretical_loss": 3.7779516605905674, + "tokens_seen": 702750720 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1689449, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.059896945953369, + "objective/train/theoretical_loss": 3.7779159142916754, + "objective/train/tokens_used": 723276256, + "theoretical_loss": 3.7779159142916754, + "tokens_seen": 702816256 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039749247743229687, + "loss": 3.0512, + "theoretical_loss": 3.7779159142916754, + "tokens_seen": 702816256 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003974824473420261, + "loss": 2.976, + "theoretical_loss": 3.7778801722591018, + "tokens_seen": 702881792 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039747241725175523, + "loss": 3.0912, + "theoretical_loss": 3.7778444344919393, + "tokens_seen": 702947328 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039746238716148447, + "loss": 3.0267, + "theoretical_loss": 3.777808700989282, + "tokens_seen": 703012864 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039745235707121365, + "loss": 3.0462, + "theoretical_loss": 3.7777729717502235, + "tokens_seen": 703078400 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039744232698094283, + "loss": 2.9501, + "theoretical_loss": 3.7777372467738575, + "tokens_seen": 703143936 + }, + { + "epoch": 8.02, + "learning_rate": 0.000397432296890672, + "loss": 3.0619, + "theoretical_loss": 3.7777015260592774, + "tokens_seen": 703209472 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039742226680040125, + "loss": 2.9344, + "theoretical_loss": 3.7776658096055793, + "tokens_seen": 703275008 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003974122367101304, + "loss": 2.9662, + "theoretical_loss": 3.777630097411857, + "tokens_seen": 703340544 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003974022066198596, + "loss": 2.9787, + "theoretical_loss": 3.7775943894772057, + "tokens_seen": 703406080 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039739217652958874, + "loss": 3.0767, + "theoretical_loss": 3.7775586858007206, + "tokens_seen": 703471616 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039738214643931797, + "loss": 3.0411, + "theoretical_loss": 3.7775229863814976, + "tokens_seen": 703537152 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039737211634904715, + "loss": 3.0521, + "theoretical_loss": 3.777487291218632, + "tokens_seen": 703602688 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039736208625877633, + "loss": 3.0566, + "theoretical_loss": 3.7774516003112204, + "tokens_seen": 703668224 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003973520561685055, + "loss": 2.9711, + "theoretical_loss": 3.777415913658359, + "tokens_seen": 703733760 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003973420260782347, + "loss": 3.1163, + "theoretical_loss": 3.777380231259145, + "tokens_seen": 703799296 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003973319959879639, + "loss": 3.0256, + "theoretical_loss": 3.777344553112675, + "tokens_seen": 703864832 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003973219658976931, + "loss": 3.0824, + "theoretical_loss": 3.777308879218046, + "tokens_seen": 703930368 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039731193580742224, + "loss": 3.0062, + "theoretical_loss": 3.7772732095743558, + "tokens_seen": 703995904 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003973019057171515, + "loss": 2.968, + "theoretical_loss": 3.7772375441807022, + "tokens_seen": 704061440 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003972918756268806, + "loss": 3.1056, + "theoretical_loss": 3.7772018830361835, + "tokens_seen": 704126976 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039728184553660984, + "loss": 3.0655, + "theoretical_loss": 3.7771662261398973, + "tokens_seen": 704192512 + }, + { + "epoch": 8.02, + "learning_rate": 0.000397271815446339, + "loss": 2.9797, + "theoretical_loss": 3.7771305734909424, + "tokens_seen": 704258048 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003972617853560682, + "loss": 2.9857, + "theoretical_loss": 3.7770949250884187, + "tokens_seen": 704323584 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003972517552657974, + "loss": 3.0527, + "theoretical_loss": 3.7770592809314243, + "tokens_seen": 704389120 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1694245, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0314347743988037, + "objective/train/theoretical_loss": 3.7770236410190594, + "objective/train/tokens_used": 724914656, + "theoretical_loss": 3.7770236410190594, + "tokens_seen": 704454656 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003972417251755266, + "loss": 3.0263, + "theoretical_loss": 3.7770236410190594, + "tokens_seen": 704454656 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039723169508525574, + "loss": 3.0046, + "theoretical_loss": 3.7769880053504234, + "tokens_seen": 704520192 + }, + { + "epoch": 8.02, + "learning_rate": 0.000397221664994985, + "loss": 3.0034, + "theoretical_loss": 3.776952373924616, + "tokens_seen": 704585728 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003972116349047141, + "loss": 3.0489, + "theoretical_loss": 3.7769167467407376, + "tokens_seen": 704651264 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039720160481444334, + "loss": 2.9507, + "theoretical_loss": 3.7768811237978888, + "tokens_seen": 704716800 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003971915747241725, + "loss": 3.0803, + "theoretical_loss": 3.776845505095171, + "tokens_seen": 704782336 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003971815446339017, + "loss": 2.9987, + "theoretical_loss": 3.7768098906316845, + "tokens_seen": 704847872 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003971715145436309, + "loss": 3.0579, + "theoretical_loss": 3.776774280406531, + "tokens_seen": 704913408 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039716148445336007, + "loss": 3.0728, + "theoretical_loss": 3.776738674418812, + "tokens_seen": 704978944 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039715145436308925, + "loss": 3.125, + "theoretical_loss": 3.7767030726676296, + "tokens_seen": 705044480 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003971414242728185, + "loss": 2.9178, + "theoretical_loss": 3.7766674751520855, + "tokens_seen": 705110016 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039713139418254766, + "loss": 3.0477, + "theoretical_loss": 3.776631881871283, + "tokens_seen": 705175552 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039712136409227684, + "loss": 3.0692, + "theoretical_loss": 3.7765962928243244, + "tokens_seen": 705241088 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003971113340020061, + "loss": 3.1185, + "theoretical_loss": 3.7765607080103125, + "tokens_seen": 705306624 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003971013039117352, + "loss": 3.0561, + "theoretical_loss": 3.7765251274283513, + "tokens_seen": 705372160 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039709127382146444, + "loss": 2.9362, + "theoretical_loss": 3.7764895510775434, + "tokens_seen": 705437696 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039708124373119357, + "loss": 3.1023, + "theoretical_loss": 3.7764539789569938, + "tokens_seen": 705503232 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003970712136409228, + "loss": 2.9612, + "theoretical_loss": 3.776418411065805, + "tokens_seen": 705568768 + }, + { + "epoch": 8.02, + "learning_rate": 0.000397061183550652, + "loss": 3.093, + "theoretical_loss": 3.776382847403083, + "tokens_seen": 705634304 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039705115346038117, + "loss": 3.0388, + "theoretical_loss": 3.7763472879679316, + "tokens_seen": 705699840 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039704112337011035, + "loss": 3.007, + "theoretical_loss": 3.7763117327594555, + "tokens_seen": 705765376 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039703109327983953, + "loss": 3.0293, + "theoretical_loss": 3.776276181776761, + "tokens_seen": 705830912 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003970210631895687, + "loss": 3.0742, + "theoretical_loss": 3.7762406350189517, + "tokens_seen": 705896448 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039701103309929794, + "loss": 2.9456, + "theoretical_loss": 3.7762050924851356, + "tokens_seen": 705961984 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039700100300902707, + "loss": 3.0186, + "theoretical_loss": 3.776169554174417, + "tokens_seen": 706027520 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1697061, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1317431926727295, + "objective/train/theoretical_loss": 3.7761340200859026, + "objective/train/tokens_used": 726553056, + "theoretical_loss": 3.7761340200859026, + "tokens_seen": 706093056 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003969909729187563, + "loss": 2.8845, + "theoretical_loss": 3.7761340200859026, + "tokens_seen": 706093056 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039698094282848543, + "loss": 2.9742, + "theoretical_loss": 3.7760984902186996, + "tokens_seen": 706158592 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039697091273821467, + "loss": 3.0043, + "theoretical_loss": 3.7760629645719144, + "tokens_seen": 706224128 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039696088264794385, + "loss": 3.009, + "theoretical_loss": 3.776027443144654, + "tokens_seen": 706289664 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039695085255767303, + "loss": 2.9761, + "theoretical_loss": 3.775991925936026, + "tokens_seen": 706355200 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003969408224674022, + "loss": 3.1187, + "theoretical_loss": 3.7759564129451375, + "tokens_seen": 706420736 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039693079237713145, + "loss": 3.0353, + "theoretical_loss": 3.7759209041710964, + "tokens_seen": 706486272 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003969207622868606, + "loss": 2.9391, + "theoretical_loss": 3.775885399613012, + "tokens_seen": 706551808 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003969107321965898, + "loss": 3.0536, + "theoretical_loss": 3.775849899269992, + "tokens_seen": 706617344 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039690070210631894, + "loss": 3.0587, + "theoretical_loss": 3.775814403141145, + "tokens_seen": 706682880 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039689067201604817, + "loss": 3.0287, + "theoretical_loss": 3.77577891122558, + "tokens_seen": 706748416 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039688064192577735, + "loss": 3.07, + "theoretical_loss": 3.775743423522407, + "tokens_seen": 706813952 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039687061183550653, + "loss": 3.0476, + "theoretical_loss": 3.7757079400307347, + "tokens_seen": 706879488 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003968605817452357, + "loss": 3.1286, + "theoretical_loss": 3.775672460749673, + "tokens_seen": 706945024 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003968505516549649, + "loss": 3.0923, + "theoretical_loss": 3.775636985678333, + "tokens_seen": 707010560 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003968405215646941, + "loss": 3.0103, + "theoretical_loss": 3.7756015148158237, + "tokens_seen": 707076096 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003968304914744233, + "loss": 3.0843, + "theoretical_loss": 3.7755660481612563, + "tokens_seen": 707141632 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039682046138415244, + "loss": 3.0302, + "theoretical_loss": 3.7755305857137413, + "tokens_seen": 707207168 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003968104312938817, + "loss": 3.0025, + "theoretical_loss": 3.775495127472391, + "tokens_seen": 707272704 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003968004012036108, + "loss": 3.0555, + "theoretical_loss": 3.7754596734363157, + "tokens_seen": 707338240 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039679037111334004, + "loss": 3.0616, + "theoretical_loss": 3.7754242236046274, + "tokens_seen": 707403776 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003967803410230692, + "loss": 2.9242, + "theoretical_loss": 3.7753887779764383, + "tokens_seen": 707469312 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003967703109327984, + "loss": 3.0757, + "theoretical_loss": 3.7753533365508605, + "tokens_seen": 707534848 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003967602808425276, + "loss": 2.9698, + "theoretical_loss": 3.7753178993270065, + "tokens_seen": 707600384 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003967502507522568, + "loss": 3.0745, + "theoretical_loss": 3.775282466303989, + "tokens_seen": 707665920 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1700837, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.085143566131592, + "objective/train/theoretical_loss": 3.7752470374809213, + "objective/train/tokens_used": 728191456, + "theoretical_loss": 3.7752470374809213, + "tokens_seen": 707731456 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039674022066198594, + "loss": 3.0919, + "theoretical_loss": 3.7752470374809213, + "tokens_seen": 707731456 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003967301905717152, + "loss": 2.9628, + "theoretical_loss": 3.775211612856917, + "tokens_seen": 707796992 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003967201604814443, + "loss": 3.1311, + "theoretical_loss": 3.775176192431089, + "tokens_seen": 707862528 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039671013039117354, + "loss": 3.0723, + "theoretical_loss": 3.7751407762025506, + "tokens_seen": 707928064 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003967001003009027, + "loss": 3.0981, + "theoretical_loss": 3.7751053641704178, + "tokens_seen": 707993600 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003966900702106319, + "loss": 2.9964, + "theoretical_loss": 3.7750699563338035, + "tokens_seen": 708059136 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003966800401203611, + "loss": 3.0709, + "theoretical_loss": 3.775034552691823, + "tokens_seen": 708124672 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039667001003009027, + "loss": 2.9876, + "theoretical_loss": 3.7749991532435914, + "tokens_seen": 708190208 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039665997993981945, + "loss": 2.9222, + "theoretical_loss": 3.7749637579882234, + "tokens_seen": 708255744 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003966499498495487, + "loss": 3.0329, + "theoretical_loss": 3.7749283669248346, + "tokens_seen": 708321280 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003966399197592778, + "loss": 3.003, + "theoretical_loss": 3.774892980052541, + "tokens_seen": 708386816 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039662988966900704, + "loss": 3.0258, + "theoretical_loss": 3.774857597370458, + "tokens_seen": 708452352 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039661985957873617, + "loss": 3.0845, + "theoretical_loss": 3.774822218877703, + "tokens_seen": 708517888 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003966098294884654, + "loss": 2.9835, + "theoretical_loss": 3.7747868445733914, + "tokens_seen": 708583424 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003965997993981946, + "loss": 2.994, + "theoretical_loss": 3.7747514744566404, + "tokens_seen": 708648960 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039658976930792377, + "loss": 3.0002, + "theoretical_loss": 3.774716108526567, + "tokens_seen": 708714496 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039657973921765295, + "loss": 2.9913, + "theoretical_loss": 3.7746807467822894, + "tokens_seen": 708780032 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003965697091273822, + "loss": 3.1808, + "theoretical_loss": 3.774645389222924, + "tokens_seen": 708845568 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003965596790371113, + "loss": 3.0576, + "theoretical_loss": 3.7746100358475894, + "tokens_seen": 708911104 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039654964894684055, + "loss": 2.9454, + "theoretical_loss": 3.774574686655404, + "tokens_seen": 708976640 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003965396188565697, + "loss": 2.9372, + "theoretical_loss": 3.774539341645485, + "tokens_seen": 709042176 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003965295887662989, + "loss": 3.0677, + "theoretical_loss": 3.774504000816952, + "tokens_seen": 709107712 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003965195586760281, + "loss": 2.9094, + "theoretical_loss": 3.7744686641689245, + "tokens_seen": 709173248 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039650952858575727, + "loss": 3.0276, + "theoretical_loss": 3.7744333317005205, + "tokens_seen": 709238784 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039649949849548645, + "loss": 2.9962, + "theoretical_loss": 3.7743980034108606, + "tokens_seen": 709304320 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.141862392425537, + "objective/train/theoretical_loss": 3.7743626792990637, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7743626792990637, + "tokens_seen": 709369856 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039648946840521563, + "loss": 3.0939, + "theoretical_loss": 3.7743626792990637, + "tokens_seen": 709369856 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003964794383149448, + "loss": 3.0574, + "theoretical_loss": 3.77432735936425, + "tokens_seen": 709435392 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039646940822467405, + "loss": 2.9835, + "theoretical_loss": 3.77429204360554, + "tokens_seen": 709500928 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003964593781344032, + "loss": 3.1134, + "theoretical_loss": 3.7742567320220544, + "tokens_seen": 709566464 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003964493480441324, + "loss": 2.9629, + "theoretical_loss": 3.7742214246129135, + "tokens_seen": 709632000 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003964393179538616, + "loss": 3.0474, + "theoretical_loss": 3.774186121377239, + "tokens_seen": 709697536 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003964292878635908, + "loss": 3.0769, + "theoretical_loss": 3.774150822314152, + "tokens_seen": 709763072 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039641925777331996, + "loss": 2.958, + "theoretical_loss": 3.774115527422774, + "tokens_seen": 709828608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039640922768304914, + "loss": 2.9968, + "theoretical_loss": 3.774080236702227, + "tokens_seen": 709894144 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003963991975927783, + "loss": 3.0025, + "theoretical_loss": 3.774044950151633, + "tokens_seen": 709959680 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039638916750250755, + "loss": 3.008, + "theoretical_loss": 3.7740096677701147, + "tokens_seen": 710025216 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039637913741223673, + "loss": 2.9746, + "theoretical_loss": 3.7739743895567948, + "tokens_seen": 710090752 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003963691073219659, + "loss": 3.0136, + "theoretical_loss": 3.7739391155107955, + "tokens_seen": 710156288 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003963590772316951, + "loss": 2.9681, + "theoretical_loss": 3.7739038456312413, + "tokens_seen": 710221824 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003963490471414243, + "loss": 2.9389, + "theoretical_loss": 3.7738685799172544, + "tokens_seen": 710287360 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003963390170511535, + "loss": 2.9983, + "theoretical_loss": 3.7738333183679593, + "tokens_seen": 710352896 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039632898696088264, + "loss": 3.0307, + "theoretical_loss": 3.77379806098248, + "tokens_seen": 710418432 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003963189568706119, + "loss": 3.125, + "theoretical_loss": 3.7737628077599403, + "tokens_seen": 710483968 + }, + { + "epoch": 8.02, + "learning_rate": 0.000396308926780341, + "loss": 3.0785, + "theoretical_loss": 3.773727558699466, + "tokens_seen": 710549504 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039629889669007024, + "loss": 3.0528, + "theoretical_loss": 3.77369231380018, + "tokens_seen": 710615040 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003962888665997994, + "loss": 3.0291, + "theoretical_loss": 3.7736570730612087, + "tokens_seen": 710680576 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003962788365095286, + "loss": 3.0819, + "theoretical_loss": 3.773621836481677, + "tokens_seen": 710746112 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003962688064192578, + "loss": 3.0223, + "theoretical_loss": 3.7735866040607107, + "tokens_seen": 710811648 + }, + { + "epoch": 8.02, + "learning_rate": 0.000396258776328987, + "loss": 3.1104, + "theoretical_loss": 3.7735513757974353, + "tokens_seen": 710877184 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039624874623871614, + "loss": 3.0428, + "theoretical_loss": 3.773516151690978, + "tokens_seen": 710942720 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1161739826202393, + "objective/train/theoretical_loss": 3.7734809317404636, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7734809317404636, + "tokens_seen": 711008256 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003962387161484454, + "loss": 3.0499, + "theoretical_loss": 3.7734809317404636, + "tokens_seen": 711008256 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003962286860581745, + "loss": 2.9393, + "theoretical_loss": 3.7734457159450194, + "tokens_seen": 711073792 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039621865596790374, + "loss": 2.9898, + "theoretical_loss": 3.773410504303773, + "tokens_seen": 711139328 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003962086258776329, + "loss": 3.0663, + "theoretical_loss": 3.7733752968158507, + "tokens_seen": 711204864 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003961985957873621, + "loss": 2.9154, + "theoretical_loss": 3.7733400934803805, + "tokens_seen": 711270400 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003961885656970913, + "loss": 3.0581, + "theoretical_loss": 3.7733048942964897, + "tokens_seen": 711335936 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039617853560682047, + "loss": 3.0448, + "theoretical_loss": 3.773269699263307, + "tokens_seen": 711401472 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039616850551654965, + "loss": 2.9739, + "theoretical_loss": 3.7732345083799594, + "tokens_seen": 711467008 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003961584754262789, + "loss": 3.046, + "theoretical_loss": 3.7731993216455764, + "tokens_seen": 711532544 + }, + { + "epoch": 8.02, + "learning_rate": 0.000396148445336008, + "loss": 3.1162, + "theoretical_loss": 3.7731641390592867, + "tokens_seen": 711598080 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039613841524573724, + "loss": 3.2166, + "theoretical_loss": 3.7731289606202187, + "tokens_seen": 711663616 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039612838515546637, + "loss": 2.9924, + "theoretical_loss": 3.773093786327503, + "tokens_seen": 711729152 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003961183550651956, + "loss": 3.0499, + "theoretical_loss": 3.7730586161802675, + "tokens_seen": 711794688 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003961083249749248, + "loss": 3.051, + "theoretical_loss": 3.773023450177643, + "tokens_seen": 711860224 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039609829488465397, + "loss": 3.0253, + "theoretical_loss": 3.7729882883187593, + "tokens_seen": 711925760 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039608826479438315, + "loss": 3.0097, + "theoretical_loss": 3.7729531306027466, + "tokens_seen": 711991296 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003960782347041124, + "loss": 3.087, + "theoretical_loss": 3.7729179770287367, + "tokens_seen": 712056832 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003960682046138415, + "loss": 3.1203, + "theoretical_loss": 3.7728828275958586, + "tokens_seen": 712122368 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039605817452357075, + "loss": 3.1605, + "theoretical_loss": 3.7728476823032446, + "tokens_seen": 712187904 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003960481444332999, + "loss": 2.9913, + "theoretical_loss": 3.7728125411500253, + "tokens_seen": 712253440 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003960381143430291, + "loss": 3.0557, + "theoretical_loss": 3.7727774041353337, + "tokens_seen": 712318976 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003960280842527583, + "loss": 2.9859, + "theoretical_loss": 3.7727422712583003, + "tokens_seen": 712384512 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039601805416248747, + "loss": 2.9995, + "theoretical_loss": 3.7727071425180583, + "tokens_seen": 712450048 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039600802407221665, + "loss": 2.8145, + "theoretical_loss": 3.77267201791374, + "tokens_seen": 712515584 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039599799398194583, + "loss": 3.0408, + "theoretical_loss": 3.7726368974444773, + "tokens_seen": 712581120 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9949710369110107, + "objective/train/theoretical_loss": 3.7726017811094037, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7726017811094037, + "tokens_seen": 712646656 + }, + { + "epoch": 8.02, + "learning_rate": 0.000395987963891675, + "loss": 2.9567, + "theoretical_loss": 3.7726017811094037, + "tokens_seen": 712646656 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039597793380140425, + "loss": 3.0632, + "theoretical_loss": 3.7725666689076522, + "tokens_seen": 712712192 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003959679037111334, + "loss": 3.0738, + "theoretical_loss": 3.772531560838357, + "tokens_seen": 712777728 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003959578736208626, + "loss": 3.1777, + "theoretical_loss": 3.7724964569006514, + "tokens_seen": 712843264 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003959478435305918, + "loss": 3.0649, + "theoretical_loss": 3.772461357093669, + "tokens_seen": 712908800 + }, + { + "epoch": 8.02, + "learning_rate": 0.000395937813440321, + "loss": 3.0197, + "theoretical_loss": 3.7724262614165447, + "tokens_seen": 712974336 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039592778335005016, + "loss": 3.0433, + "theoretical_loss": 3.772391169868413, + "tokens_seen": 713039872 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039591775325977934, + "loss": 3.0084, + "theoretical_loss": 3.772356082448408, + "tokens_seen": 713105408 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003959077231695085, + "loss": 3.0312, + "theoretical_loss": 3.7723209991556654, + "tokens_seen": 713170944 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039589769307923775, + "loss": 3.0559, + "theoretical_loss": 3.7722859199893204, + "tokens_seen": 713236480 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003958876629889669, + "loss": 2.98, + "theoretical_loss": 3.7722508449485086, + "tokens_seen": 713302016 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003958776328986961, + "loss": 2.9742, + "theoretical_loss": 3.7722157740323654, + "tokens_seen": 713367552 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039586760280842524, + "loss": 3.0511, + "theoretical_loss": 3.772180707240028, + "tokens_seen": 713433088 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003958575727181545, + "loss": 3.0604, + "theoretical_loss": 3.772145644570631, + "tokens_seen": 713498624 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039584754262788366, + "loss": 3.0582, + "theoretical_loss": 3.772110586023313, + "tokens_seen": 713564160 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039583751253761284, + "loss": 3.0967, + "theoretical_loss": 3.772075531597209, + "tokens_seen": 713629696 + }, + { + "epoch": 8.02, + "learning_rate": 0.000395827482447342, + "loss": 3.0855, + "theoretical_loss": 3.772040481291458, + "tokens_seen": 713695232 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003958174523570712, + "loss": 2.9719, + "theoretical_loss": 3.7720054351051955, + "tokens_seen": 713760768 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003958074222668004, + "loss": 3.0182, + "theoretical_loss": 3.7719703930375608, + "tokens_seen": 713826304 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003957973921765296, + "loss": 2.8835, + "theoretical_loss": 3.771935355087691, + "tokens_seen": 713891840 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039578736208625875, + "loss": 3.1579, + "theoretical_loss": 3.771900321254724, + "tokens_seen": 713957376 + }, + { + "epoch": 8.02, + "learning_rate": 0.000395777331995988, + "loss": 3.0755, + "theoretical_loss": 3.771865291537799, + "tokens_seen": 714022912 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039576730190571716, + "loss": 3.0402, + "theoretical_loss": 3.771830265936054, + "tokens_seen": 714088448 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039575727181544634, + "loss": 2.8873, + "theoretical_loss": 3.7717952444486285, + "tokens_seen": 714153984 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003957472417251755, + "loss": 3.0217, + "theoretical_loss": 3.7717602270746617, + "tokens_seen": 714219520 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1933400630950928, + "objective/train/theoretical_loss": 3.7717252138132924, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7717252138132924, + "tokens_seen": 714285056 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003957372116349047, + "loss": 3.1084, + "theoretical_loss": 3.7717252138132924, + "tokens_seen": 714285056 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003957271815446339, + "loss": 3.0205, + "theoretical_loss": 3.771690204663661, + "tokens_seen": 714350592 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003957171514543631, + "loss": 3.0188, + "theoretical_loss": 3.7716551996249073, + "tokens_seen": 714416128 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039570712136409225, + "loss": 2.8949, + "theoretical_loss": 3.771620198696171, + "tokens_seen": 714481664 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003956970912738215, + "loss": 3.035, + "theoretical_loss": 3.771585201876594, + "tokens_seen": 714547200 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003956870611835506, + "loss": 3.0687, + "theoretical_loss": 3.7715502091653157, + "tokens_seen": 714612736 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039567703109327985, + "loss": 2.9762, + "theoretical_loss": 3.7715152205614784, + "tokens_seen": 714678272 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039566700100300903, + "loss": 3.0766, + "theoretical_loss": 3.771480236064222, + "tokens_seen": 714743808 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003956569709127382, + "loss": 3.0955, + "theoretical_loss": 3.7714452556726883, + "tokens_seen": 714809344 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003956469408224674, + "loss": 2.9204, + "theoretical_loss": 3.77141027938602, + "tokens_seen": 714874880 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039563691073219657, + "loss": 2.9949, + "theoretical_loss": 3.7713753072033587, + "tokens_seen": 714940416 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003956268806419258, + "loss": 3.0959, + "theoretical_loss": 3.771340339123846, + "tokens_seen": 715005952 + }, + { + "epoch": 8.02, + "learning_rate": 0.000395616850551655, + "loss": 3.0525, + "theoretical_loss": 3.7713053751466257, + "tokens_seen": 715071488 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039560682046138417, + "loss": 3.0479, + "theoretical_loss": 3.7712704152708394, + "tokens_seen": 715137024 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039559679037111335, + "loss": 3.0025, + "theoretical_loss": 3.7712354594956317, + "tokens_seen": 715202560 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003955867602808426, + "loss": 3.1567, + "theoretical_loss": 3.7712005078201445, + "tokens_seen": 715268096 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003955767301905717, + "loss": 2.9232, + "theoretical_loss": 3.771165560243522, + "tokens_seen": 715333632 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039556670010030095, + "loss": 3.0002, + "theoretical_loss": 3.771130616764908, + "tokens_seen": 715399168 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003955566700100301, + "loss": 3.043, + "theoretical_loss": 3.771095677383447, + "tokens_seen": 715464704 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003955466399197593, + "loss": 2.9771, + "theoretical_loss": 3.7710607420982827, + "tokens_seen": 715530240 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003955366098294885, + "loss": 2.9277, + "theoretical_loss": 3.7710258109085597, + "tokens_seen": 715595776 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039552657973921767, + "loss": 3.031, + "theoretical_loss": 3.7709908838134236, + "tokens_seen": 715661312 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039551654964894685, + "loss": 3.0188, + "theoretical_loss": 3.77095596081202, + "tokens_seen": 715726848 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039550651955867603, + "loss": 3.0199, + "theoretical_loss": 3.7709210419034926, + "tokens_seen": 715792384 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003954964894684052, + "loss": 2.989, + "theoretical_loss": 3.770886127086988, + "tokens_seen": 715857920 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1003291606903076, + "objective/train/theoretical_loss": 3.7708512163616525, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7708512163616525, + "tokens_seen": 715923456 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039548645937813445, + "loss": 3.0832, + "theoretical_loss": 3.7708512163616525, + "tokens_seen": 715923456 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003954764292878636, + "loss": 2.9839, + "theoretical_loss": 3.7708163097266314, + "tokens_seen": 715988992 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003954663991975928, + "loss": 3.0372, + "theoretical_loss": 3.7707814071810715, + "tokens_seen": 716054528 + }, + { + "epoch": 8.02, + "learning_rate": 0.000395456369107322, + "loss": 3.038, + "theoretical_loss": 3.7707465087241205, + "tokens_seen": 716120064 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003954463390170512, + "loss": 3.0214, + "theoretical_loss": 3.7707116143549237, + "tokens_seen": 716185600 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039543630892678036, + "loss": 3.0738, + "theoretical_loss": 3.7706767240726284, + "tokens_seen": 716251136 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039542627883650954, + "loss": 3.022, + "theoretical_loss": 3.7706418378763837, + "tokens_seen": 716316672 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003954162487462387, + "loss": 3.0595, + "theoretical_loss": 3.7706069557653357, + "tokens_seen": 716382208 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039540621865596795, + "loss": 2.9098, + "theoretical_loss": 3.770572077738633, + "tokens_seen": 716447744 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003953961885656971, + "loss": 3.0154, + "theoretical_loss": 3.7705372037954237, + "tokens_seen": 716513280 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003953861584754263, + "loss": 3.0162, + "theoretical_loss": 3.7705023339348567, + "tokens_seen": 716578816 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039537612838515544, + "loss": 3.0762, + "theoretical_loss": 3.77046746815608, + "tokens_seen": 716644352 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003953660982948847, + "loss": 3.1269, + "theoretical_loss": 3.770432606458243, + "tokens_seen": 716709888 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039535606820461386, + "loss": 3.0171, + "theoretical_loss": 3.770397748840495, + "tokens_seen": 716775424 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039534603811434304, + "loss": 2.9973, + "theoretical_loss": 3.770362895301985, + "tokens_seen": 716840960 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003953360080240722, + "loss": 3.0933, + "theoretical_loss": 3.7703280458418638, + "tokens_seen": 716906496 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003953259779338014, + "loss": 3.0683, + "theoretical_loss": 3.77029320045928, + "tokens_seen": 716972032 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003953159478435306, + "loss": 2.9503, + "theoretical_loss": 3.770258359153385, + "tokens_seen": 717037568 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003953059177532598, + "loss": 3.0295, + "theoretical_loss": 3.770223521923329, + "tokens_seen": 717103104 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039529588766298895, + "loss": 2.97, + "theoretical_loss": 3.7701886887682625, + "tokens_seen": 717168640 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003952858575727182, + "loss": 2.9852, + "theoretical_loss": 3.770153859687337, + "tokens_seen": 717234176 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039527582748244736, + "loss": 3.0932, + "theoretical_loss": 3.770119034679703, + "tokens_seen": 717299712 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039526579739217654, + "loss": 3.0137, + "theoretical_loss": 3.770084213744513, + "tokens_seen": 717365248 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003952557673019057, + "loss": 3.079, + "theoretical_loss": 3.7700493968809177, + "tokens_seen": 717430784 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003952457372116349, + "loss": 2.9719, + "theoretical_loss": 3.77001458408807, + "tokens_seen": 717496320 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0355122089385986, + "objective/train/theoretical_loss": 3.7699797753651225, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7699797753651225, + "tokens_seen": 717561856 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003952357071213641, + "loss": 3.0235, + "theoretical_loss": 3.7699797753651225, + "tokens_seen": 717561856 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003952256770310933, + "loss": 2.9798, + "theoretical_loss": 3.7699449707112267, + "tokens_seen": 717627392 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039521564694082245, + "loss": 3.0647, + "theoretical_loss": 3.769910170125536, + "tokens_seen": 717692928 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003952056168505517, + "loss": 3.0194, + "theoretical_loss": 3.7698753736072037, + "tokens_seen": 717758464 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003951955867602808, + "loss": 2.9763, + "theoretical_loss": 3.769840581155382, + "tokens_seen": 717824000 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039518555667001005, + "loss": 2.9531, + "theoretical_loss": 3.769805792769226, + "tokens_seen": 717889536 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039517552657973923, + "loss": 2.9849, + "theoretical_loss": 3.7697710084478886, + "tokens_seen": 717955072 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003951654964894684, + "loss": 3.0815, + "theoretical_loss": 3.769736228190524, + "tokens_seen": 718020608 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003951554663991976, + "loss": 3.0011, + "theoretical_loss": 3.769701451996286, + "tokens_seen": 718086144 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039514543630892677, + "loss": 3.065, + "theoretical_loss": 3.7696666798643306, + "tokens_seen": 718151680 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039513540621865595, + "loss": 3.1157, + "theoretical_loss": 3.7696319117938115, + "tokens_seen": 718217216 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003951253761283852, + "loss": 3.0129, + "theoretical_loss": 3.769597147783884, + "tokens_seen": 718282752 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003951153460381143, + "loss": 2.9563, + "theoretical_loss": 3.7695623878337035, + "tokens_seen": 718348288 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039510531594784355, + "loss": 3.0014, + "theoretical_loss": 3.769527631942425, + "tokens_seen": 718413824 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039509528585757273, + "loss": 3.0052, + "theoretical_loss": 3.7694928801092056, + "tokens_seen": 718479360 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003950852557673019, + "loss": 3.0029, + "theoretical_loss": 3.7694581323332006, + "tokens_seen": 718544896 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003950752256770311, + "loss": 3.0741, + "theoretical_loss": 3.769423388613567, + "tokens_seen": 718610432 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003950651955867603, + "loss": 3.0585, + "theoretical_loss": 3.7693886489494597, + "tokens_seen": 718675968 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039505516549648946, + "loss": 2.9272, + "theoretical_loss": 3.7693539133400376, + "tokens_seen": 718741504 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003950451354062187, + "loss": 3.0397, + "theoretical_loss": 3.7693191817844562, + "tokens_seen": 718807040 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003950351053159478, + "loss": 2.9683, + "theoretical_loss": 3.769284454281874, + "tokens_seen": 718872576 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039502507522567705, + "loss": 2.9912, + "theoretical_loss": 3.769249730831449, + "tokens_seen": 718938112 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003950150451354062, + "loss": 3.0873, + "theoretical_loss": 3.7692150114323373, + "tokens_seen": 719003648 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003950050150451354, + "loss": 3.1306, + "theoretical_loss": 3.7691802960836984, + "tokens_seen": 719069184 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003949949849548646, + "loss": 3.0203, + "theoretical_loss": 3.7691455847846895, + "tokens_seen": 719134720 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.988893747329712, + "objective/train/theoretical_loss": 3.7691108775344704, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7691108775344704, + "tokens_seen": 719200256 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003949849548645938, + "loss": 2.917, + "theoretical_loss": 3.7691108775344704, + "tokens_seen": 719200256 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039497492477432296, + "loss": 3.0782, + "theoretical_loss": 3.7690761743321994, + "tokens_seen": 719265792 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003949648946840522, + "loss": 3.0782, + "theoretical_loss": 3.769041475177036, + "tokens_seen": 719331328 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003949548645937813, + "loss": 3.0328, + "theoretical_loss": 3.769006780068139, + "tokens_seen": 719396864 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039494483450351056, + "loss": 3.1395, + "theoretical_loss": 3.7689720890046687, + "tokens_seen": 719462400 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003949348044132397, + "loss": 3.0037, + "theoretical_loss": 3.7689374019857844, + "tokens_seen": 719527936 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003949247743229689, + "loss": 3.1421, + "theoretical_loss": 3.7689027190106463, + "tokens_seen": 719593472 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003949147442326981, + "loss": 2.9586, + "theoretical_loss": 3.768868040078415, + "tokens_seen": 719659008 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003949047141424273, + "loss": 3.0831, + "theoretical_loss": 3.768833365188251, + "tokens_seen": 719724544 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003948946840521565, + "loss": 3.041, + "theoretical_loss": 3.7687986943393152, + "tokens_seen": 719790080 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039488465396188564, + "loss": 2.9488, + "theoretical_loss": 3.768764027530769, + "tokens_seen": 719855616 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003948746238716149, + "loss": 2.9137, + "theoretical_loss": 3.7687293647617732, + "tokens_seen": 719921152 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039486459378134406, + "loss": 2.9423, + "theoretical_loss": 3.7686947060314893, + "tokens_seen": 719986688 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039485456369107324, + "loss": 3.1365, + "theoretical_loss": 3.76866005133908, + "tokens_seen": 720052224 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003948445336008024, + "loss": 3.1262, + "theoretical_loss": 3.7686254006837068, + "tokens_seen": 720117760 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003948345035105316, + "loss": 2.9177, + "theoretical_loss": 3.7685907540645327, + "tokens_seen": 720183296 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003948244734202608, + "loss": 3.0366, + "theoretical_loss": 3.76855611148072, + "tokens_seen": 720248832 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039481444332999, + "loss": 2.9826, + "theoretical_loss": 3.7685214729314307, + "tokens_seen": 720314368 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039480441323971915, + "loss": 2.9846, + "theoretical_loss": 3.7684868384158294, + "tokens_seen": 720379904 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003947943831494484, + "loss": 3.0312, + "theoretical_loss": 3.7684522079330787, + "tokens_seen": 720445440 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039478435305917756, + "loss": 3.1588, + "theoretical_loss": 3.7684175814823417, + "tokens_seen": 720510976 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039477432296890674, + "loss": 3.075, + "theoretical_loss": 3.7683829590627838, + "tokens_seen": 720576512 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003947642928786359, + "loss": 3.0378, + "theoretical_loss": 3.768348340673568, + "tokens_seen": 720642048 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003947542627883651, + "loss": 3.1098, + "theoretical_loss": 3.768313726313859, + "tokens_seen": 720707584 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003947442326980943, + "loss": 3.1144, + "theoretical_loss": 3.768279115982821, + "tokens_seen": 720773120 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9813129901885986, + "objective/train/theoretical_loss": 3.7682445096796187, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7682445096796187, + "tokens_seen": 720838656 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003947342026078235, + "loss": 2.9584, + "theoretical_loss": 3.7682445096796187, + "tokens_seen": 720838656 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039472417251755265, + "loss": 2.9008, + "theoretical_loss": 3.7682099074034188, + "tokens_seen": 720904192 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003947141424272819, + "loss": 3.0521, + "theoretical_loss": 3.768175309153385, + "tokens_seen": 720969728 + }, + { + "epoch": 8.02, + "learning_rate": 0.000394704112337011, + "loss": 3.1304, + "theoretical_loss": 3.768140714928683, + "tokens_seen": 721035264 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039469408224674025, + "loss": 3.0805, + "theoretical_loss": 3.7681061247284804, + "tokens_seen": 721100800 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039468405215646943, + "loss": 3.071, + "theoretical_loss": 3.768071538551941, + "tokens_seen": 721166336 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003946740220661986, + "loss": 3.1369, + "theoretical_loss": 3.768036956398233, + "tokens_seen": 721231872 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003946639919759278, + "loss": 2.9868, + "theoretical_loss": 3.768002378266522, + "tokens_seen": 721297408 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039465396188565697, + "loss": 3.0295, + "theoretical_loss": 3.767967804155975, + "tokens_seen": 721362944 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039464393179538615, + "loss": 3.0951, + "theoretical_loss": 3.767933234065759, + "tokens_seen": 721428480 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003946339017051154, + "loss": 2.9439, + "theoretical_loss": 3.767898667995042, + "tokens_seen": 721494016 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003946238716148445, + "loss": 3.0555, + "theoretical_loss": 3.7678641059429916, + "tokens_seen": 721559552 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039461384152457375, + "loss": 3.0921, + "theoretical_loss": 3.767829547908775, + "tokens_seen": 721625088 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039460381143430293, + "loss": 3.1137, + "theoretical_loss": 3.7677949938915605, + "tokens_seen": 721690624 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003945937813440321, + "loss": 3.0012, + "theoretical_loss": 3.7677604438905163, + "tokens_seen": 721756160 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003945837512537613, + "loss": 2.9964, + "theoretical_loss": 3.767725897904811, + "tokens_seen": 721821696 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003945737211634905, + "loss": 3.0775, + "theoretical_loss": 3.7676913559336143, + "tokens_seen": 721887232 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039456369107321966, + "loss": 3.0473, + "theoretical_loss": 3.767656817976094, + "tokens_seen": 721952768 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003945536609829489, + "loss": 3.0676, + "theoretical_loss": 3.767622284031421, + "tokens_seen": 722018304 + }, + { + "epoch": 8.02, + "learning_rate": 0.000394543630892678, + "loss": 2.9665, + "theoretical_loss": 3.767587754098763, + "tokens_seen": 722083840 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039453360080240725, + "loss": 2.9918, + "theoretical_loss": 3.767553228177291, + "tokens_seen": 722149376 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003945235707121364, + "loss": 2.9797, + "theoretical_loss": 3.7675187062661752, + "tokens_seen": 722214912 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003945135406218656, + "loss": 3.0346, + "theoretical_loss": 3.7674841883645858, + "tokens_seen": 722280448 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003945035105315948, + "loss": 3.0123, + "theoretical_loss": 3.7674496744716928, + "tokens_seen": 722345984 + }, + { + "epoch": 8.02, + "learning_rate": 0.000394493480441324, + "loss": 3.0066, + "theoretical_loss": 3.7674151645866676, + "tokens_seen": 722411520 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0978918075561523, + "objective/train/theoretical_loss": 3.7673806587086807, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7673806587086807, + "tokens_seen": 722477056 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039448345035105316, + "loss": 2.9799, + "theoretical_loss": 3.7673806587086807, + "tokens_seen": 722477056 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003944734202607824, + "loss": 3.1406, + "theoretical_loss": 3.767346156836904, + "tokens_seen": 722542592 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003944633901705115, + "loss": 2.9123, + "theoretical_loss": 3.7673116589705096, + "tokens_seen": 722608128 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039445336008024076, + "loss": 3.0315, + "theoretical_loss": 3.767277165108668, + "tokens_seen": 722673664 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003944433299899699, + "loss": 3.1031, + "theoretical_loss": 3.767242675250552, + "tokens_seen": 722739200 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003944332998996991, + "loss": 3.0689, + "theoretical_loss": 3.7672081893953333, + "tokens_seen": 722804736 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003944232698094283, + "loss": 3.1313, + "theoretical_loss": 3.7671737075421854, + "tokens_seen": 722870272 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003944132397191575, + "loss": 3.0873, + "theoretical_loss": 3.7671392296902804, + "tokens_seen": 722935808 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039440320962888666, + "loss": 2.9943, + "theoretical_loss": 3.7671047558387913, + "tokens_seen": 723001344 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039439317953861584, + "loss": 3.0065, + "theoretical_loss": 3.7670702859868914, + "tokens_seen": 723066880 + }, + { + "epoch": 8.02, + "learning_rate": 0.000394383149448345, + "loss": 3.0022, + "theoretical_loss": 3.767035820133755, + "tokens_seen": 723132416 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039437311935807426, + "loss": 2.9683, + "theoretical_loss": 3.7670013582785553, + "tokens_seen": 723197952 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003943630892678034, + "loss": 3.0648, + "theoretical_loss": 3.7669669004204662, + "tokens_seen": 723263488 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003943530591775326, + "loss": 3.0902, + "theoretical_loss": 3.7669324465586618, + "tokens_seen": 723329024 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039434302908726175, + "loss": 3.0397, + "theoretical_loss": 3.7668979966923173, + "tokens_seen": 723394560 + }, + { + "epoch": 8.02, + "learning_rate": 0.000394332998996991, + "loss": 2.9952, + "theoretical_loss": 3.7668635508206068, + "tokens_seen": 723460096 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039432296890672017, + "loss": 3.0131, + "theoretical_loss": 3.766829108942706, + "tokens_seen": 723525632 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039431293881644935, + "loss": 3.0754, + "theoretical_loss": 3.766794671057789, + "tokens_seen": 723591168 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039430290872617853, + "loss": 3.0024, + "theoretical_loss": 3.766760237165032, + "tokens_seen": 723656704 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039429287863590776, + "loss": 2.9857, + "theoretical_loss": 3.7667258072636116, + "tokens_seen": 723722240 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003942828485456369, + "loss": 2.8887, + "theoretical_loss": 3.7666913813527025, + "tokens_seen": 723787776 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003942728184553661, + "loss": 3.079, + "theoretical_loss": 3.766656959431481, + "tokens_seen": 723853312 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039426278836509525, + "loss": 3.0136, + "theoretical_loss": 3.7666225414991246, + "tokens_seen": 723918848 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003942527582748245, + "loss": 2.951, + "theoretical_loss": 3.766588127554809, + "tokens_seen": 723984384 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039424272818455367, + "loss": 3.0088, + "theoretical_loss": 3.766553717597712, + "tokens_seen": 724049920 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.877624034881592, + "objective/train/theoretical_loss": 3.7665193116270097, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7665193116270097, + "tokens_seen": 724115456 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039423269809428285, + "loss": 2.9675, + "theoretical_loss": 3.7665193116270097, + "tokens_seen": 724115456 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039422266800401203, + "loss": 3.0522, + "theoretical_loss": 3.76648490964188, + "tokens_seen": 724180992 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003942126379137412, + "loss": 3.0892, + "theoretical_loss": 3.7664505116415015, + "tokens_seen": 724246528 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003942026078234704, + "loss": 2.9751, + "theoretical_loss": 3.766416117625051, + "tokens_seen": 724312064 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039419257773319963, + "loss": 3.0305, + "theoretical_loss": 3.7663817275917073, + "tokens_seen": 724377600 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039418254764292876, + "loss": 3.0708, + "theoretical_loss": 3.7663473415406488, + "tokens_seen": 724443136 + }, + { + "epoch": 8.02, + "learning_rate": 0.000394172517552658, + "loss": 3.093, + "theoretical_loss": 3.7663129594710534, + "tokens_seen": 724508672 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003941624874623871, + "loss": 3.0276, + "theoretical_loss": 3.7662785813821005, + "tokens_seen": 724574208 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039415245737211635, + "loss": 3.0744, + "theoretical_loss": 3.76624420727297, + "tokens_seen": 724639744 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003941424272818456, + "loss": 2.9535, + "theoretical_loss": 3.7662098371428403, + "tokens_seen": 724705280 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003941323971915747, + "loss": 2.9172, + "theoretical_loss": 3.7661754709908917, + "tokens_seen": 724770816 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039412236710130395, + "loss": 3.0295, + "theoretical_loss": 3.7661411088163037, + "tokens_seen": 724836352 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039411233701103313, + "loss": 2.9424, + "theoretical_loss": 3.766106750618256, + "tokens_seen": 724901888 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003941023069207623, + "loss": 3.0787, + "theoretical_loss": 3.7660723963959297, + "tokens_seen": 724967424 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003940922768304915, + "loss": 3.0157, + "theoretical_loss": 3.7660380461485055, + "tokens_seen": 725032960 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003940822467402207, + "loss": 3.131, + "theoretical_loss": 3.766003699875164, + "tokens_seen": 725098496 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039407221664994986, + "loss": 3.0169, + "theoretical_loss": 3.765969357575086, + "tokens_seen": 725164032 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003940621865596791, + "loss": 3.0154, + "theoretical_loss": 3.765935019247453, + "tokens_seen": 725229568 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003940521564694082, + "loss": 3.0075, + "theoretical_loss": 3.7659006848914474, + "tokens_seen": 725295104 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039404212637913745, + "loss": 3.0369, + "theoretical_loss": 3.76586635450625, + "tokens_seen": 725360640 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003940320962888666, + "loss": 3.0661, + "theoretical_loss": 3.765832028091043, + "tokens_seen": 725426176 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003940220661985958, + "loss": 3.1069, + "theoretical_loss": 3.765797705645009, + "tokens_seen": 725491712 + }, + { + "epoch": 8.02, + "learning_rate": 0.000394012036108325, + "loss": 3.0505, + "theoretical_loss": 3.765763387167331, + "tokens_seen": 725557248 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003940020060180542, + "loss": 3.1076, + "theoretical_loss": 3.7657290726571904, + "tokens_seen": 725622784 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039399197592778336, + "loss": 3.0444, + "theoretical_loss": 3.765694762113771, + "tokens_seen": 725688320 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.909982919692993, + "objective/train/theoretical_loss": 3.765660455536257, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.765660455536257, + "tokens_seen": 725753856 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003939819458375126, + "loss": 3.0619, + "theoretical_loss": 3.765660455536257, + "tokens_seen": 725753856 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003939719157472417, + "loss": 3.0113, + "theoretical_loss": 3.765626152923831, + "tokens_seen": 725819392 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039396188565697096, + "loss": 3.0208, + "theoretical_loss": 3.7655918542756766, + "tokens_seen": 725884928 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003939518555667001, + "loss": 3.0199, + "theoretical_loss": 3.7655575595909783, + "tokens_seen": 725950464 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003939418254764293, + "loss": 3.1374, + "theoretical_loss": 3.76552326886892, + "tokens_seen": 726016000 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003939317953861585, + "loss": 3.0588, + "theoretical_loss": 3.7654889821086868, + "tokens_seen": 726081536 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003939217652958877, + "loss": 2.9898, + "theoretical_loss": 3.765454699309463, + "tokens_seen": 726147072 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039391173520561686, + "loss": 2.9629, + "theoretical_loss": 3.7654204204704325, + "tokens_seen": 726212608 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039390170511534604, + "loss": 3.0165, + "theoretical_loss": 3.7653861455907824, + "tokens_seen": 726278144 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003938916750250752, + "loss": 3.0742, + "theoretical_loss": 3.7653518746696975, + "tokens_seen": 726343680 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039388164493480446, + "loss": 2.976, + "theoretical_loss": 3.765317607706363, + "tokens_seen": 726409216 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003938716148445336, + "loss": 3.0573, + "theoretical_loss": 3.765283344699965, + "tokens_seen": 726474752 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003938615847542628, + "loss": 3.0314, + "theoretical_loss": 3.76524908564969, + "tokens_seen": 726540288 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039385155466399195, + "loss": 3.0326, + "theoretical_loss": 3.7652148305547244, + "tokens_seen": 726605824 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003938415245737212, + "loss": 2.9505, + "theoretical_loss": 3.765180579414255, + "tokens_seen": 726671360 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039383149448345037, + "loss": 3.022, + "theoretical_loss": 3.765146332227468, + "tokens_seen": 726736896 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039382146439317955, + "loss": 3.0445, + "theoretical_loss": 3.765112088993551, + "tokens_seen": 726802432 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039381143430290873, + "loss": 2.986, + "theoretical_loss": 3.7650778497116915, + "tokens_seen": 726867968 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039380140421263796, + "loss": 3.0139, + "theoretical_loss": 3.765043614381077, + "tokens_seen": 726933504 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003937913741223671, + "loss": 3.0949, + "theoretical_loss": 3.765009383000895, + "tokens_seen": 726999040 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003937813440320963, + "loss": 3.0615, + "theoretical_loss": 3.7649751555703346, + "tokens_seen": 727064576 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039377131394182545, + "loss": 2.9577, + "theoretical_loss": 3.764940932088583, + "tokens_seen": 727130112 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003937612838515547, + "loss": 2.9508, + "theoretical_loss": 3.7649067125548292, + "tokens_seen": 727195648 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039375125376128387, + "loss": 3.1321, + "theoretical_loss": 3.7648724969682625, + "tokens_seen": 727261184 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039374122367101305, + "loss": 3.0291, + "theoretical_loss": 3.7648382853280715, + "tokens_seen": 727326720 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1701792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7902650833129883, + "objective/train/theoretical_loss": 3.7648040776334453, + "objective/train/tokens_used": 728937952, + "theoretical_loss": 3.7648040776334453, + "tokens_seen": 727392256 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039373119358074223, + "loss": 3.0056, + "theoretical_loss": 3.7648040776334453, + "tokens_seen": 727392256 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003937211634904714, + "loss": 2.9453, + "theoretical_loss": 3.7647698738835738, + "tokens_seen": 727457792 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003937111334002006, + "loss": 3.0593, + "theoretical_loss": 3.764735674077647, + "tokens_seen": 727523328 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039370110330992983, + "loss": 3.0139, + "theoretical_loss": 3.764701478214854, + "tokens_seen": 727588864 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039369107321965896, + "loss": 3.1184, + "theoretical_loss": 3.7646672862943866, + "tokens_seen": 727654400 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003936810431293882, + "loss": 3.0053, + "theoretical_loss": 3.7646330983154335, + "tokens_seen": 727719936 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003936710130391173, + "loss": 3.1332, + "theoretical_loss": 3.7645989142771867, + "tokens_seen": 727785472 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039366098294884655, + "loss": 3.1204, + "theoretical_loss": 3.764564734178837, + "tokens_seen": 727851008 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039365095285857573, + "loss": 3.0375, + "theoretical_loss": 3.7645305580195747, + "tokens_seen": 727916544 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003936409227683049, + "loss": 3.0444, + "theoretical_loss": 3.7644963857985925, + "tokens_seen": 727982080 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003936308926780341, + "loss": 3.1703, + "theoretical_loss": 3.7644622175150815, + "tokens_seen": 728047616 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039362086258776333, + "loss": 3.0638, + "theoretical_loss": 3.764428053168234, + "tokens_seen": 728113152 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039361083249749246, + "loss": 3.0816, + "theoretical_loss": 3.764393892757241, + "tokens_seen": 728178688 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003936008024072217, + "loss": 3.0189, + "theoretical_loss": 3.764359736281296, + "tokens_seen": 728244224 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003935907723169508, + "loss": 3.0852, + "theoretical_loss": 3.764325583739592, + "tokens_seen": 728309760 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039358074222668006, + "loss": 2.9548, + "theoretical_loss": 3.7642914351313213, + "tokens_seen": 728375296 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039357071213640924, + "loss": 3.0413, + "theoretical_loss": 3.7642572904556766, + "tokens_seen": 728440832 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003935606820461384, + "loss": 3.1057, + "theoretical_loss": 3.764223149711852, + "tokens_seen": 728506368 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003935506519558676, + "loss": 3.0844, + "theoretical_loss": 3.764189012899041, + "tokens_seen": 728571904 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003935406218655968, + "loss": 3.0975, + "theoretical_loss": 3.764154880016436, + "tokens_seen": 728637440 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039353059177532596, + "loss": 3.0311, + "theoretical_loss": 3.7641207510632335, + "tokens_seen": 728702976 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003935205616850552, + "loss": 2.902, + "theoretical_loss": 3.7640866260386265, + "tokens_seen": 728768512 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003935105315947843, + "loss": 3.0041, + "theoretical_loss": 3.7640525049418097, + "tokens_seen": 728834048 + }, + { + "epoch": 8.02, + "learning_rate": 0.00039350050150451356, + "loss": 3.0043, + "theoretical_loss": 3.7640183877719773, + "tokens_seen": 728899584 + }, + { + "epoch": 8.02, + "learning_rate": 0.0003934904714142427, + "loss": 3.0332, + "theoretical_loss": 3.763988005473146, + "tokens_seen": 728957952 + }, + { + "epoch": 8.02, + "objective/train/docs_used": 1750467, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9299066066741943, + "objective/train/theoretical_loss": 3.76395389572557, + "objective/train/tokens_used": 749483488, + "theoretical_loss": 3.76395389572557, + "tokens_seen": 729023488 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003934804413239719, + "loss": 2.9363, + "theoretical_loss": 3.76395389572557, + "tokens_seen": 729023488 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003934704112337011, + "loss": 2.8356, + "theoretical_loss": 3.7639197899026535, + "tokens_seen": 729089024 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003934603811434303, + "loss": 3.074, + "theoretical_loss": 3.763885688003591, + "tokens_seen": 729154560 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039345035105315946, + "loss": 2.9469, + "theoretical_loss": 3.7638515900275795, + "tokens_seen": 729220096 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003934403209628887, + "loss": 2.8321, + "theoretical_loss": 3.763817495973814, + "tokens_seen": 729285632 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039343029087261783, + "loss": 2.9172, + "theoretical_loss": 3.7637834058414916, + "tokens_seen": 729351168 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039342026078234706, + "loss": 2.9839, + "theoretical_loss": 3.763749319629809, + "tokens_seen": 729416704 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003934102306920762, + "loss": 2.9521, + "theoretical_loss": 3.763715237337963, + "tokens_seen": 729482240 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003934002006018054, + "loss": 2.9464, + "theoretical_loss": 3.7636811589651504, + "tokens_seen": 729547776 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039339017051153466, + "loss": 3.0298, + "theoretical_loss": 3.763647084510569, + "tokens_seen": 729613312 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003933801404212638, + "loss": 3.0709, + "theoretical_loss": 3.7636130139734165, + "tokens_seen": 729678848 + }, + { + "epoch": 9.0, + "learning_rate": 0.000393370110330993, + "loss": 2.9807, + "theoretical_loss": 3.7635789473528907, + "tokens_seen": 729744384 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039336008024072215, + "loss": 2.9455, + "theoretical_loss": 3.7635448846481894, + "tokens_seen": 729809920 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003933500501504514, + "loss": 2.8478, + "theoretical_loss": 3.7635108258585106, + "tokens_seen": 729875456 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039334002006018057, + "loss": 3.013, + "theoretical_loss": 3.7634767709830537, + "tokens_seen": 729940992 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039332998996990975, + "loss": 2.9076, + "theoretical_loss": 3.7634427200210165, + "tokens_seen": 730006528 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039331995987963893, + "loss": 2.843, + "theoretical_loss": 3.7634086729715994, + "tokens_seen": 730072064 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039330992978936816, + "loss": 2.9774, + "theoretical_loss": 3.7633746298340007, + "tokens_seen": 730137600 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003932998996990973, + "loss": 2.8323, + "theoretical_loss": 3.7633405906074193, + "tokens_seen": 730203136 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003932898696088265, + "loss": 2.9164, + "theoretical_loss": 3.7633065552910567, + "tokens_seen": 730268672 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039327983951855565, + "loss": 3.0142, + "theoretical_loss": 3.763272523884111, + "tokens_seen": 730334208 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003932698094282849, + "loss": 3.04, + "theoretical_loss": 3.7632384963857834, + "tokens_seen": 730399744 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039325977933801407, + "loss": 2.9592, + "theoretical_loss": 3.7632044727952745, + "tokens_seen": 730465280 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039324974924774325, + "loss": 2.9165, + "theoretical_loss": 3.763170453111784, + "tokens_seen": 730530816 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039323971915747243, + "loss": 2.9892, + "theoretical_loss": 3.7631364373345146, + "tokens_seen": 730596352 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1754122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8058929443359375, + "objective/train/theoretical_loss": 3.7631024254626655, + "objective/train/tokens_used": 751121888, + "theoretical_loss": 3.7631024254626655, + "tokens_seen": 730661888 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003932296890672016, + "loss": 2.9602, + "theoretical_loss": 3.7631024254626655, + "tokens_seen": 730661888 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003932196589769308, + "loss": 2.9492, + "theoretical_loss": 3.7630684174954387, + "tokens_seen": 730727424 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039320962888666003, + "loss": 2.865, + "theoretical_loss": 3.7630344134320364, + "tokens_seen": 730792960 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039319959879638916, + "loss": 2.8889, + "theoretical_loss": 3.7630004132716603, + "tokens_seen": 730858496 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003931895687061184, + "loss": 2.8571, + "theoretical_loss": 3.7629664170135118, + "tokens_seen": 730924032 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003931795386158475, + "loss": 2.9113, + "theoretical_loss": 3.7629324246567935, + "tokens_seen": 730989568 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039316950852557675, + "loss": 2.888, + "theoretical_loss": 3.7628984362007083, + "tokens_seen": 731055104 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039315947843530593, + "loss": 2.9744, + "theoretical_loss": 3.762864451644459, + "tokens_seen": 731120640 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003931494483450351, + "loss": 2.9073, + "theoretical_loss": 3.7628304709872484, + "tokens_seen": 731186176 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003931394182547643, + "loss": 2.7814, + "theoretical_loss": 3.7627964942282794, + "tokens_seen": 731251712 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039312938816449353, + "loss": 2.9327, + "theoretical_loss": 3.7627625213667564, + "tokens_seen": 731317248 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039311935807422266, + "loss": 2.965, + "theoretical_loss": 3.7627285524018816, + "tokens_seen": 731382784 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003931093279839519, + "loss": 2.9645, + "theoretical_loss": 3.7626945873328603, + "tokens_seen": 731448320 + }, + { + "epoch": 9.0, + "learning_rate": 0.000393099297893681, + "loss": 3.0065, + "theoretical_loss": 3.7626606261588966, + "tokens_seen": 731513856 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039308926780341026, + "loss": 2.8996, + "theoretical_loss": 3.7626266688791947, + "tokens_seen": 731579392 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039307923771313944, + "loss": 2.9681, + "theoretical_loss": 3.7625927154929593, + "tokens_seen": 731644928 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003930692076228686, + "loss": 2.8178, + "theoretical_loss": 3.762558765999395, + "tokens_seen": 731710464 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003930591775325978, + "loss": 3.0638, + "theoretical_loss": 3.7625248203977066, + "tokens_seen": 731776000 + }, + { + "epoch": 9.0, + "learning_rate": 0.000393049147442327, + "loss": 2.9811, + "theoretical_loss": 3.7624908786871005, + "tokens_seen": 731841536 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039303911735205616, + "loss": 2.8972, + "theoretical_loss": 3.762456940866782, + "tokens_seen": 731907072 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003930290872617854, + "loss": 3.0948, + "theoretical_loss": 3.7624230069359563, + "tokens_seen": 731972608 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003930190571715145, + "loss": 2.9366, + "theoretical_loss": 3.7623890768938297, + "tokens_seen": 732038144 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039300902708124376, + "loss": 2.9773, + "theoretical_loss": 3.762355150739609, + "tokens_seen": 732103680 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003929989969909729, + "loss": 2.9479, + "theoretical_loss": 3.7623212284725005, + "tokens_seen": 732169216 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003929889669007021, + "loss": 3.0596, + "theoretical_loss": 3.7622873100917102, + "tokens_seen": 732234752 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1759055, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9374194145202637, + "objective/train/theoretical_loss": 3.7622533955964466, + "objective/train/tokens_used": 752760288, + "theoretical_loss": 3.7622533955964466, + "tokens_seen": 732300288 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003929789368104313, + "loss": 2.8887, + "theoretical_loss": 3.7622533955964466, + "tokens_seen": 732300288 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003929689067201605, + "loss": 2.9442, + "theoretical_loss": 3.7622194849859154, + "tokens_seen": 732365824 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039295887662988967, + "loss": 2.8453, + "theoretical_loss": 3.7621855782593245, + "tokens_seen": 732431360 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003929488465396189, + "loss": 3.0184, + "theoretical_loss": 3.7621516754158826, + "tokens_seen": 732496896 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039293881644934803, + "loss": 2.8685, + "theoretical_loss": 3.7621177764547964, + "tokens_seen": 732562432 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039292878635907726, + "loss": 3.0372, + "theoretical_loss": 3.7620838813752746, + "tokens_seen": 732627968 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003929187562688064, + "loss": 2.992, + "theoretical_loss": 3.762049990176525, + "tokens_seen": 732693504 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003929087261785356, + "loss": 2.8204, + "theoretical_loss": 3.762016102857757, + "tokens_seen": 732759040 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003928986960882648, + "loss": 2.9711, + "theoretical_loss": 3.7619822194181793, + "tokens_seen": 732824576 + }, + { + "epoch": 9.0, + "learning_rate": 0.000392888665997994, + "loss": 3.0824, + "theoretical_loss": 3.7619483398570006, + "tokens_seen": 732890112 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039287863590772317, + "loss": 2.9956, + "theoretical_loss": 3.7619144641734303, + "tokens_seen": 732955648 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039286860581745235, + "loss": 2.9977, + "theoretical_loss": 3.761880592366678, + "tokens_seen": 733021184 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039285857572718153, + "loss": 2.9727, + "theoretical_loss": 3.761846724435954, + "tokens_seen": 733086720 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039284854563691077, + "loss": 2.8091, + "theoretical_loss": 3.7618128603804672, + "tokens_seen": 733152256 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003928385155466399, + "loss": 2.9301, + "theoretical_loss": 3.7617790001994296, + "tokens_seen": 733217792 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039282848545636913, + "loss": 3.011, + "theoretical_loss": 3.7617451438920493, + "tokens_seen": 733283328 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039281845536609825, + "loss": 2.8306, + "theoretical_loss": 3.761711291457539, + "tokens_seen": 733348864 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003928084252758275, + "loss": 2.9477, + "theoretical_loss": 3.7616774428951087, + "tokens_seen": 733414400 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039279839518555667, + "loss": 2.8808, + "theoretical_loss": 3.76164359820397, + "tokens_seen": 733479936 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039278836509528585, + "loss": 2.928, + "theoretical_loss": 3.761609757383334, + "tokens_seen": 733545472 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039277833500501503, + "loss": 3.032, + "theoretical_loss": 3.761575920432412, + "tokens_seen": 733611008 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039276830491474427, + "loss": 2.8973, + "theoretical_loss": 3.7615420873504166, + "tokens_seen": 733676544 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003927582748244734, + "loss": 2.9302, + "theoretical_loss": 3.7615082581365598, + "tokens_seen": 733742080 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039274824473420263, + "loss": 2.904, + "theoretical_loss": 3.7614744327900533, + "tokens_seen": 733807616 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039273821464393176, + "loss": 2.9588, + "theoretical_loss": 3.7614406113101104, + "tokens_seen": 733873152 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1762239, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.985929489135742, + "objective/train/theoretical_loss": 3.761406793695943, + "objective/train/tokens_used": 754398688, + "theoretical_loss": 3.761406793695943, + "tokens_seen": 733938688 + }, + { + "epoch": 9.0, + "learning_rate": 0.000392728184553661, + "loss": 2.9488, + "theoretical_loss": 3.761406793695943, + "tokens_seen": 733938688 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003927181544633902, + "loss": 2.9551, + "theoretical_loss": 3.7613729799467652, + "tokens_seen": 734004224 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039270812437311936, + "loss": 2.9469, + "theoretical_loss": 3.761339170061789, + "tokens_seen": 734069760 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039269809428284854, + "loss": 3.0539, + "theoretical_loss": 3.761305364040229, + "tokens_seen": 734135296 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003926880641925777, + "loss": 2.8254, + "theoretical_loss": 3.7612715618812986, + "tokens_seen": 734200832 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003926780341023069, + "loss": 2.9426, + "theoretical_loss": 3.7612377635842114, + "tokens_seen": 734266368 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039266800401203613, + "loss": 3.0062, + "theoretical_loss": 3.7612039691481822, + "tokens_seen": 734331904 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039265797392176526, + "loss": 2.9371, + "theoretical_loss": 3.7611701785724243, + "tokens_seen": 734397440 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003926479438314945, + "loss": 2.9883, + "theoretical_loss": 3.7611363918561533, + "tokens_seen": 734462976 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039263791374122373, + "loss": 3.058, + "theoretical_loss": 3.7611026089985833, + "tokens_seen": 734528512 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039262788365095286, + "loss": 2.9749, + "theoretical_loss": 3.7610688299989308, + "tokens_seen": 734594048 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003926178535606821, + "loss": 2.9525, + "theoretical_loss": 3.7610350548564093, + "tokens_seen": 734659584 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003926078234704112, + "loss": 2.9227, + "theoretical_loss": 3.761001283570235, + "tokens_seen": 734725120 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039259779338014046, + "loss": 2.8635, + "theoretical_loss": 3.7609675161396243, + "tokens_seen": 734790656 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039258776328986964, + "loss": 2.8446, + "theoretical_loss": 3.760933752563793, + "tokens_seen": 734856192 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003925777331995988, + "loss": 2.8974, + "theoretical_loss": 3.760899992841957, + "tokens_seen": 734921728 + }, + { + "epoch": 9.0, + "learning_rate": 0.000392567703109328, + "loss": 3.0372, + "theoretical_loss": 3.7608662369733326, + "tokens_seen": 734987264 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003925576730190572, + "loss": 2.9298, + "theoretical_loss": 3.760832484957137, + "tokens_seen": 735052800 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039254764292878636, + "loss": 3.0278, + "theoretical_loss": 3.7607987367925864, + "tokens_seen": 735118336 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003925376128385156, + "loss": 2.7971, + "theoretical_loss": 3.760764992478899, + "tokens_seen": 735183872 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003925275827482447, + "loss": 2.9312, + "theoretical_loss": 3.760731252015291, + "tokens_seen": 735249408 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039251755265797396, + "loss": 2.8914, + "theoretical_loss": 3.760697515400981, + "tokens_seen": 735314944 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003925075225677031, + "loss": 2.8497, + "theoretical_loss": 3.760663782635186, + "tokens_seen": 735380480 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003924974924774323, + "loss": 2.9216, + "theoretical_loss": 3.760630053717125, + "tokens_seen": 735446016 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003924874623871615, + "loss": 2.9764, + "theoretical_loss": 3.760596328646016, + "tokens_seen": 735511552 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1765107, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8674466609954834, + "objective/train/theoretical_loss": 3.7605626074210767, + "objective/train/tokens_used": 756037088, + "theoretical_loss": 3.7605626074210767, + "tokens_seen": 735577088 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003924774322968907, + "loss": 2.9424, + "theoretical_loss": 3.7605626074210767, + "tokens_seen": 735577088 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039246740220661987, + "loss": 2.8861, + "theoretical_loss": 3.7605288900415266, + "tokens_seen": 735642624 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003924573721163491, + "loss": 2.9426, + "theoretical_loss": 3.7604951765065846, + "tokens_seen": 735708160 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039244734202607823, + "loss": 2.8836, + "theoretical_loss": 3.76046146681547, + "tokens_seen": 735773696 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039243731193580746, + "loss": 2.8488, + "theoretical_loss": 3.760427760967402, + "tokens_seen": 735839232 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003924272818455366, + "loss": 2.8532, + "theoretical_loss": 3.7603940589616003, + "tokens_seen": 735904768 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003924172517552658, + "loss": 2.95, + "theoretical_loss": 3.760360360797285, + "tokens_seen": 735970304 + }, + { + "epoch": 9.0, + "learning_rate": 0.000392407221664995, + "loss": 3.0059, + "theoretical_loss": 3.7603266664736754, + "tokens_seen": 736035840 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003923971915747242, + "loss": 2.9179, + "theoretical_loss": 3.7602929759899935, + "tokens_seen": 736101376 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039238716148445337, + "loss": 2.8059, + "theoretical_loss": 3.760259289345458, + "tokens_seen": 736166912 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039237713139418255, + "loss": 2.9481, + "theoretical_loss": 3.760225606539291, + "tokens_seen": 736232448 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039236710130391173, + "loss": 3.0041, + "theoretical_loss": 3.7601919275707125, + "tokens_seen": 736297984 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039235707121364097, + "loss": 3.0002, + "theoretical_loss": 3.7601582524389454, + "tokens_seen": 736363520 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003923470411233701, + "loss": 2.8931, + "theoretical_loss": 3.7601245811432094, + "tokens_seen": 736429056 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039233701103309933, + "loss": 3.0821, + "theoretical_loss": 3.760090913682727, + "tokens_seen": 736494592 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039232698094282846, + "loss": 2.9215, + "theoretical_loss": 3.7600572500567204, + "tokens_seen": 736560128 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003923169508525577, + "loss": 3.0165, + "theoretical_loss": 3.760023590264411, + "tokens_seen": 736625664 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039230692076228687, + "loss": 2.8433, + "theoretical_loss": 3.759989934305022, + "tokens_seen": 736691200 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039229689067201605, + "loss": 2.9587, + "theoretical_loss": 3.759956282177776, + "tokens_seen": 736756736 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039228686058174523, + "loss": 2.943, + "theoretical_loss": 3.759922633881895, + "tokens_seen": 736822272 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039227683049147447, + "loss": 2.9759, + "theoretical_loss": 3.7598889894166025, + "tokens_seen": 736887808 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003922668004012036, + "loss": 2.8345, + "theoretical_loss": 3.759855348781122, + "tokens_seen": 736953344 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039225677031093283, + "loss": 2.9846, + "theoretical_loss": 3.759821711974677, + "tokens_seen": 737018880 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039224674022066196, + "loss": 3.0082, + "theoretical_loss": 3.7597880789964906, + "tokens_seen": 737084416 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003922367101303912, + "loss": 2.9535, + "theoretical_loss": 3.759754449845788, + "tokens_seen": 737149952 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1768775, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8531599044799805, + "objective/train/theoretical_loss": 3.759720824521793, + "objective/train/tokens_used": 757675488, + "theoretical_loss": 3.759720824521793, + "tokens_seen": 737215488 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003922266800401204, + "loss": 2.886, + "theoretical_loss": 3.759720824521793, + "tokens_seen": 737215488 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039221664994984956, + "loss": 2.9788, + "theoretical_loss": 3.759687203023729, + "tokens_seen": 737281024 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039220661985957874, + "loss": 3.024, + "theoretical_loss": 3.759653585350822, + "tokens_seen": 737346560 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003921965897693079, + "loss": 2.9846, + "theoretical_loss": 3.7596199715022958, + "tokens_seen": 737412096 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003921865596790371, + "loss": 2.9347, + "theoretical_loss": 3.759586361477376, + "tokens_seen": 737477632 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039217652958876633, + "loss": 3.0138, + "theoretical_loss": 3.7595527552752883, + "tokens_seen": 737543168 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039216649949849546, + "loss": 2.8868, + "theoretical_loss": 3.759519152895258, + "tokens_seen": 737608704 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003921564694082247, + "loss": 2.9801, + "theoretical_loss": 3.7594855543365107, + "tokens_seen": 737674240 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003921464393179539, + "loss": 2.99, + "theoretical_loss": 3.759451959598272, + "tokens_seen": 737739776 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039213640922768306, + "loss": 3.041, + "theoretical_loss": 3.759418368679769, + "tokens_seen": 737805312 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039212637913741224, + "loss": 2.9419, + "theoretical_loss": 3.759384781580228, + "tokens_seen": 737870848 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003921163490471414, + "loss": 2.865, + "theoretical_loss": 3.7593511982988748, + "tokens_seen": 737936384 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003921063189568706, + "loss": 2.968, + "theoretical_loss": 3.7593176188349378, + "tokens_seen": 738001920 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039209628886659984, + "loss": 2.9137, + "theoretical_loss": 3.759284043187643, + "tokens_seen": 738067456 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039208625877632896, + "loss": 2.8974, + "theoretical_loss": 3.759250471356218, + "tokens_seen": 738132992 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003920762286860582, + "loss": 2.958, + "theoretical_loss": 3.7592169033398903, + "tokens_seen": 738198528 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003920661985957873, + "loss": 2.9583, + "theoretical_loss": 3.7591833391378886, + "tokens_seen": 738264064 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039205616850551656, + "loss": 2.9679, + "theoretical_loss": 3.75914977874944, + "tokens_seen": 738329600 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039204613841524574, + "loss": 3.0058, + "theoretical_loss": 3.7591162221737724, + "tokens_seen": 738395136 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003920361083249749, + "loss": 2.9378, + "theoretical_loss": 3.7590826694101156, + "tokens_seen": 738460672 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003920260782347041, + "loss": 3.0465, + "theoretical_loss": 3.7590491204576972, + "tokens_seen": 738526208 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003920160481444333, + "loss": 2.9146, + "theoretical_loss": 3.759015575315747, + "tokens_seen": 738591744 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039200601805416247, + "loss": 2.9386, + "theoretical_loss": 3.758982033983493, + "tokens_seen": 738657280 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003919959879638917, + "loss": 2.9975, + "theoretical_loss": 3.7589484964601656, + "tokens_seen": 738722816 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039198595787362083, + "loss": 2.9814, + "theoretical_loss": 3.7589149627449943, + "tokens_seen": 738788352 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1773851, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8705432415008545, + "objective/train/theoretical_loss": 3.7588814328372084, + "objective/train/tokens_used": 759313888, + "theoretical_loss": 3.7588814328372084, + "tokens_seen": 738853888 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039197592778335007, + "loss": 2.9988, + "theoretical_loss": 3.7588814328372084, + "tokens_seen": 738853888 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039196589769307925, + "loss": 2.8564, + "theoretical_loss": 3.7588479067360385, + "tokens_seen": 738919424 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039195586760280843, + "loss": 2.8618, + "theoretical_loss": 3.7588143844407145, + "tokens_seen": 738984960 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003919458375125376, + "loss": 3.0287, + "theoretical_loss": 3.7587808659504676, + "tokens_seen": 739050496 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003919358074222668, + "loss": 3.034, + "theoretical_loss": 3.7587473512645273, + "tokens_seen": 739116032 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039192577733199597, + "loss": 2.9437, + "theoretical_loss": 3.758713840382125, + "tokens_seen": 739181568 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003919157472417252, + "loss": 2.9434, + "theoretical_loss": 3.758680333302493, + "tokens_seen": 739247104 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039190571715145433, + "loss": 2.7737, + "theoretical_loss": 3.758646830024861, + "tokens_seen": 739312640 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039189568706118357, + "loss": 3.0485, + "theoretical_loss": 3.758613330548462, + "tokens_seen": 739378176 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039188565697091275, + "loss": 2.9787, + "theoretical_loss": 3.7585798348725277, + "tokens_seen": 739443712 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039187562688064193, + "loss": 3.0287, + "theoretical_loss": 3.758546342996289, + "tokens_seen": 739509248 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039186559679037117, + "loss": 2.9351, + "theoretical_loss": 3.758512854918979, + "tokens_seen": 739574784 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003918555667001003, + "loss": 3.0148, + "theoretical_loss": 3.7584793706398303, + "tokens_seen": 739640320 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039184553660982953, + "loss": 2.9903, + "theoretical_loss": 3.7584458901580757, + "tokens_seen": 739705856 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039183550651955866, + "loss": 2.9976, + "theoretical_loss": 3.758412413472948, + "tokens_seen": 739771392 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003918254764292879, + "loss": 2.9765, + "theoretical_loss": 3.75837894058368, + "tokens_seen": 739836928 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039181544633901707, + "loss": 2.9584, + "theoretical_loss": 3.7583454714895055, + "tokens_seen": 739902464 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039180541624874625, + "loss": 2.9815, + "theoretical_loss": 3.7583120061896587, + "tokens_seen": 739968000 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039179538615847543, + "loss": 3.072, + "theoretical_loss": 3.758278544683372, + "tokens_seen": 740033536 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039178535606820467, + "loss": 2.9698, + "theoretical_loss": 3.758245086969881, + "tokens_seen": 740099072 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003917753259779338, + "loss": 2.9578, + "theoretical_loss": 3.758211633048419, + "tokens_seen": 740164608 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039176529588766303, + "loss": 3.0447, + "theoretical_loss": 3.758178182918221, + "tokens_seen": 740230144 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039175526579739216, + "loss": 2.8925, + "theoretical_loss": 3.7581447365785214, + "tokens_seen": 740295680 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003917452357071214, + "loss": 3.0438, + "theoretical_loss": 3.7581112940285557, + "tokens_seen": 740361216 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003917352056168506, + "loss": 2.928, + "theoretical_loss": 3.758077855267558, + "tokens_seen": 740426752 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1776651, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.963792562484741, + "objective/train/theoretical_loss": 3.758044420294765, + "objective/train/tokens_used": 760952288, + "theoretical_loss": 3.758044420294765, + "tokens_seen": 740492288 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039172517552657976, + "loss": 2.9336, + "theoretical_loss": 3.758044420294765, + "tokens_seen": 740492288 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039171514543630894, + "loss": 2.9276, + "theoretical_loss": 3.7580109891094122, + "tokens_seen": 740557824 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003917051153460381, + "loss": 2.7747, + "theoretical_loss": 3.7579775617107343, + "tokens_seen": 740623360 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003916950852557673, + "loss": 2.9084, + "theoretical_loss": 3.757944138097969, + "tokens_seen": 740688896 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039168505516549653, + "loss": 3.0459, + "theoretical_loss": 3.7579107182703506, + "tokens_seen": 740754432 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039167502507522566, + "loss": 2.968, + "theoretical_loss": 3.7578773022271172, + "tokens_seen": 740819968 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003916649949849549, + "loss": 2.945, + "theoretical_loss": 3.7578438899675053, + "tokens_seen": 740885504 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003916549648946841, + "loss": 2.9672, + "theoretical_loss": 3.7578104814907514, + "tokens_seen": 740951040 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039164493480441326, + "loss": 2.9321, + "theoretical_loss": 3.757777076796093, + "tokens_seen": 741016576 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039163490471414244, + "loss": 2.9789, + "theoretical_loss": 3.757743675882767, + "tokens_seen": 741082112 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003916248746238716, + "loss": 2.9988, + "theoretical_loss": 3.757710278750012, + "tokens_seen": 741147648 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003916148445336008, + "loss": 2.9345, + "theoretical_loss": 3.757676885397065, + "tokens_seen": 741213184 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039160481444333004, + "loss": 2.9759, + "theoretical_loss": 3.7576434958231637, + "tokens_seen": 741278720 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039159478435305916, + "loss": 2.8517, + "theoretical_loss": 3.7576101100275485, + "tokens_seen": 741344256 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003915847542627884, + "loss": 2.9726, + "theoretical_loss": 3.757576728009455, + "tokens_seen": 741409792 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003915747241725175, + "loss": 2.9287, + "theoretical_loss": 3.7575433497681243, + "tokens_seen": 741475328 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039156469408224676, + "loss": 3.0057, + "theoretical_loss": 3.757509975302794, + "tokens_seen": 741540864 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039155466399197594, + "loss": 2.989, + "theoretical_loss": 3.757476604612704, + "tokens_seen": 741606400 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003915446339017051, + "loss": 2.9123, + "theoretical_loss": 3.7574432376970934, + "tokens_seen": 741671936 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003915346038114343, + "loss": 3.034, + "theoretical_loss": 3.7574098745552016, + "tokens_seen": 741737472 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003915245737211635, + "loss": 3.0257, + "theoretical_loss": 3.7573765151862686, + "tokens_seen": 741803008 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039151454363089267, + "loss": 3.082, + "theoretical_loss": 3.757343159589534, + "tokens_seen": 741868544 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003915045135406219, + "loss": 3.0042, + "theoretical_loss": 3.7573098077642393, + "tokens_seen": 741934080 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039149448345035103, + "loss": 2.9873, + "theoretical_loss": 3.757276459709624, + "tokens_seen": 741999616 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039148445336008027, + "loss": 2.8802, + "theoretical_loss": 3.757243115424929, + "tokens_seen": 742065152 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1781573, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9696123600006104, + "objective/train/theoretical_loss": 3.757209774909396, + "objective/train/tokens_used": 762590688, + "theoretical_loss": 3.757209774909396, + "tokens_seen": 742130688 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039147442326980945, + "loss": 3.0286, + "theoretical_loss": 3.757209774909396, + "tokens_seen": 742130688 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039146439317953863, + "loss": 2.9987, + "theoretical_loss": 3.7571764381622645, + "tokens_seen": 742196224 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003914543630892678, + "loss": 2.95, + "theoretical_loss": 3.7571431051827773, + "tokens_seen": 742261760 + }, + { + "epoch": 9.0, + "learning_rate": 0.000391444332998997, + "loss": 2.9948, + "theoretical_loss": 3.757109775970175, + "tokens_seen": 742327296 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039143430290872617, + "loss": 2.8793, + "theoretical_loss": 3.7570764505237007, + "tokens_seen": 742392832 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003914242728184554, + "loss": 3.039, + "theoretical_loss": 3.7570431288425947, + "tokens_seen": 742458368 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039141424272818453, + "loss": 2.9834, + "theoretical_loss": 3.7570098109261005, + "tokens_seen": 742523904 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039140421263791377, + "loss": 2.9827, + "theoretical_loss": 3.756976496773461, + "tokens_seen": 742589440 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003913941825476429, + "loss": 2.9702, + "theoretical_loss": 3.7569431863839173, + "tokens_seen": 742654976 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039138415245737213, + "loss": 2.9681, + "theoretical_loss": 3.7569098797567135, + "tokens_seen": 742720512 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003913741223671013, + "loss": 2.8108, + "theoretical_loss": 3.7568765768910923, + "tokens_seen": 742786048 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003913640922768305, + "loss": 2.9758, + "theoretical_loss": 3.7568432777862975, + "tokens_seen": 742851584 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003913540621865597, + "loss": 2.9488, + "theoretical_loss": 3.756809982441572, + "tokens_seen": 742917120 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039134403209628886, + "loss": 3.0222, + "theoretical_loss": 3.75677669085616, + "tokens_seen": 742982656 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039133400200601804, + "loss": 3.0043, + "theoretical_loss": 3.756743403029305, + "tokens_seen": 743048192 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039132397191574727, + "loss": 3.0098, + "theoretical_loss": 3.7567101189602523, + "tokens_seen": 743113728 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003913139418254764, + "loss": 2.9735, + "theoretical_loss": 3.756676838648245, + "tokens_seen": 743179264 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039130391173520563, + "loss": 3.0327, + "theoretical_loss": 3.756643562092529, + "tokens_seen": 743244800 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003912938816449348, + "loss": 3.0202, + "theoretical_loss": 3.7566102892923485, + "tokens_seen": 743310336 + }, + { + "epoch": 9.0, + "learning_rate": 0.000391283851554664, + "loss": 2.8726, + "theoretical_loss": 3.756577020246948, + "tokens_seen": 743375872 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003912738214643932, + "loss": 2.9594, + "theoretical_loss": 3.7565437549555742, + "tokens_seen": 743441408 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039126379137412236, + "loss": 2.8788, + "theoretical_loss": 3.756510493417472, + "tokens_seen": 743506944 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039125376128385154, + "loss": 3.1415, + "theoretical_loss": 3.7564772356318867, + "tokens_seen": 743572480 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003912437311935808, + "loss": 2.8553, + "theoretical_loss": 3.7564439815980646, + "tokens_seen": 743638016 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003912337011033099, + "loss": 3.0607, + "theoretical_loss": 3.756410731315252, + "tokens_seen": 743703552 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1784391, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.126253366470337, + "objective/train/theoretical_loss": 3.7563774847826954, + "objective/train/tokens_used": 764229088, + "theoretical_loss": 3.7563774847826954, + "tokens_seen": 743769088 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039122367101303914, + "loss": 3.0216, + "theoretical_loss": 3.7563774847826954, + "tokens_seen": 743769088 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039121364092276826, + "loss": 2.9771, + "theoretical_loss": 3.756344241999641, + "tokens_seen": 743834624 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003912036108324975, + "loss": 2.9837, + "theoretical_loss": 3.7563110029653366, + "tokens_seen": 743900160 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003911935807422267, + "loss": 2.944, + "theoretical_loss": 3.7562777676790278, + "tokens_seen": 743965696 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039118355065195586, + "loss": 2.9103, + "theoretical_loss": 3.756244536139963, + "tokens_seen": 744031232 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039117352056168504, + "loss": 3.0089, + "theoretical_loss": 3.756211308347389, + "tokens_seen": 744096768 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003911634904714143, + "loss": 2.9125, + "theoretical_loss": 3.7561780843005543, + "tokens_seen": 744162304 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003911534603811434, + "loss": 2.87, + "theoretical_loss": 3.7561448639987063, + "tokens_seen": 744227840 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039114343029087264, + "loss": 2.9349, + "theoretical_loss": 3.7561116474410925, + "tokens_seen": 744293376 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003911334002006018, + "loss": 2.9267, + "theoretical_loss": 3.7560784346269624, + "tokens_seen": 744358912 + }, + { + "epoch": 9.0, + "learning_rate": 0.000391123370110331, + "loss": 2.9787, + "theoretical_loss": 3.7560452255555647, + "tokens_seen": 744424448 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039111334002006024, + "loss": 2.9234, + "theoretical_loss": 3.756012020226147, + "tokens_seen": 744489984 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039110330992978936, + "loss": 2.9597, + "theoretical_loss": 3.755978818637959, + "tokens_seen": 744555520 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003910932798395186, + "loss": 2.9745, + "theoretical_loss": 3.75594562079025, + "tokens_seen": 744621056 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039108324974924773, + "loss": 2.8868, + "theoretical_loss": 3.7559124266822694, + "tokens_seen": 744686592 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039107321965897696, + "loss": 3.0083, + "theoretical_loss": 3.755879236313267, + "tokens_seen": 744752128 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039106318956870614, + "loss": 2.9679, + "theoretical_loss": 3.755846049682492, + "tokens_seen": 744817664 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003910531594784353, + "loss": 2.9556, + "theoretical_loss": 3.7558128667891952, + "tokens_seen": 744883200 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003910431293881645, + "loss": 2.9854, + "theoretical_loss": 3.755779687632627, + "tokens_seen": 744948736 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003910330992978937, + "loss": 2.9652, + "theoretical_loss": 3.7557465122120366, + "tokens_seen": 745014272 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039102306920762287, + "loss": 2.9643, + "theoretical_loss": 3.7557133405266767, + "tokens_seen": 745079808 + }, + { + "epoch": 9.0, + "learning_rate": 0.0003910130391173521, + "loss": 2.9053, + "theoretical_loss": 3.755680172575797, + "tokens_seen": 745145344 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039100300902708123, + "loss": 2.9479, + "theoretical_loss": 3.7556470083586486, + "tokens_seen": 745210880 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039099297893681047, + "loss": 2.9545, + "theoretical_loss": 3.755613847874484, + "tokens_seen": 745276416 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039098294884653965, + "loss": 2.94, + "theoretical_loss": 3.755580691122554, + "tokens_seen": 745341952 + }, + { + "epoch": 9.0, + "objective/train/docs_used": 1788170, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9751358032226562, + "objective/train/theoretical_loss": 3.75554753810211, + "objective/train/tokens_used": 765867488, + "theoretical_loss": 3.75554753810211, + "tokens_seen": 745407488 + }, + { + "epoch": 9.0, + "learning_rate": 0.00039097291875626883, + "loss": 2.8883, + "theoretical_loss": 3.75554753810211, + "tokens_seen": 745407488 + }, + { + "epoch": 9.01, + "learning_rate": 0.000390962888665998, + "loss": 3.0829, + "theoretical_loss": 3.7555143888124047, + "tokens_seen": 745473024 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003909528585757272, + "loss": 3.0805, + "theoretical_loss": 3.7554812432526905, + "tokens_seen": 745538560 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039094282848545637, + "loss": 3.0221, + "theoretical_loss": 3.7554481014222194, + "tokens_seen": 745604096 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003909327983951856, + "loss": 3.0206, + "theoretical_loss": 3.755414963320244, + "tokens_seen": 745669632 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039092276830491473, + "loss": 2.9015, + "theoretical_loss": 3.7553818289460175, + "tokens_seen": 745735168 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039091273821464397, + "loss": 3.0623, + "theoretical_loss": 3.7553486982987936, + "tokens_seen": 745800704 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003909027081243731, + "loss": 2.7934, + "theoretical_loss": 3.7553155713778246, + "tokens_seen": 745866240 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039089267803410233, + "loss": 3.0201, + "theoretical_loss": 3.7552824481823643, + "tokens_seen": 745931776 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003908826479438315, + "loss": 2.966, + "theoretical_loss": 3.755249328711667, + "tokens_seen": 745997312 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003908726178535607, + "loss": 3.0314, + "theoretical_loss": 3.755216212964986, + "tokens_seen": 746062848 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003908625877632899, + "loss": 2.9576, + "theoretical_loss": 3.7551831009415757, + "tokens_seen": 746128384 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039085255767301906, + "loss": 2.9862, + "theoretical_loss": 3.755149992640691, + "tokens_seen": 746193920 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039084252758274824, + "loss": 2.9643, + "theoretical_loss": 3.7551168880615857, + "tokens_seen": 746259456 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039083249749247747, + "loss": 3.0869, + "theoretical_loss": 3.755083787203515, + "tokens_seen": 746324992 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003908224674022066, + "loss": 3.0393, + "theoretical_loss": 3.755050690065734, + "tokens_seen": 746390528 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039081243731193583, + "loss": 2.9945, + "theoretical_loss": 3.755017596647499, + "tokens_seen": 746456064 + }, + { + "epoch": 9.01, + "learning_rate": 0.000390802407221665, + "loss": 2.8655, + "theoretical_loss": 3.7549845069480634, + "tokens_seen": 746521600 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003907923771313942, + "loss": 2.9111, + "theoretical_loss": 3.7549514209666843, + "tokens_seen": 746587136 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003907823470411234, + "loss": 2.8761, + "theoretical_loss": 3.754918338702617, + "tokens_seen": 746652672 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039077231695085256, + "loss": 3.1001, + "theoretical_loss": 3.7548852601551186, + "tokens_seen": 746718208 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039076228686058174, + "loss": 2.9421, + "theoretical_loss": 3.7548521853234442, + "tokens_seen": 746783744 + }, + { + "epoch": 9.01, + "learning_rate": 0.000390752256770311, + "loss": 2.9352, + "theoretical_loss": 3.754819114206851, + "tokens_seen": 746849280 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003907422266800401, + "loss": 2.9328, + "theoretical_loss": 3.7547860468045955, + "tokens_seen": 746914816 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039073219658976934, + "loss": 3.0489, + "theoretical_loss": 3.7547529831159348, + "tokens_seen": 746980352 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1792930, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.825350046157837, + "objective/train/theoretical_loss": 3.754719923140126, + "objective/train/tokens_used": 767505888, + "theoretical_loss": 3.754719923140126, + "tokens_seen": 747045888 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039072216649949846, + "loss": 2.9826, + "theoretical_loss": 3.754719923140126, + "tokens_seen": 747045888 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003907121364092277, + "loss": 2.8827, + "theoretical_loss": 3.754686866876427, + "tokens_seen": 747111424 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003907021063189569, + "loss": 2.9697, + "theoretical_loss": 3.7546538143240946, + "tokens_seen": 747176960 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039069207622868606, + "loss": 3.041, + "theoretical_loss": 3.7546207654823878, + "tokens_seen": 747242496 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039068204613841524, + "loss": 2.7803, + "theoretical_loss": 3.754587720350563, + "tokens_seen": 747308032 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003906720160481445, + "loss": 2.9936, + "theoretical_loss": 3.75455467892788, + "tokens_seen": 747373568 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003906619859578736, + "loss": 3.0277, + "theoretical_loss": 3.754521641213597, + "tokens_seen": 747439104 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039065195586760284, + "loss": 2.9266, + "theoretical_loss": 3.754488607206972, + "tokens_seen": 747504640 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039064192577733197, + "loss": 2.9601, + "theoretical_loss": 3.754455576907264, + "tokens_seen": 747570176 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003906318956870612, + "loss": 2.8738, + "theoretical_loss": 3.7544225503137323, + "tokens_seen": 747635712 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003906218655967904, + "loss": 2.955, + "theoretical_loss": 3.7543895274256363, + "tokens_seen": 747701248 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039061183550651957, + "loss": 3.0214, + "theoretical_loss": 3.7543565082422363, + "tokens_seen": 747766784 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039060180541624875, + "loss": 2.8788, + "theoretical_loss": 3.7543234927627904, + "tokens_seen": 747832320 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039059177532597793, + "loss": 2.9173, + "theoretical_loss": 3.75429048098656, + "tokens_seen": 747897856 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003905817452357071, + "loss": 3.0397, + "theoretical_loss": 3.7542574729128044, + "tokens_seen": 747963392 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039057171514543634, + "loss": 2.9399, + "theoretical_loss": 3.7542244685407846, + "tokens_seen": 748028928 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039056168505516547, + "loss": 3.0072, + "theoretical_loss": 3.7541914678697603, + "tokens_seen": 748094464 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003905516549648947, + "loss": 2.8826, + "theoretical_loss": 3.7541584708989935, + "tokens_seen": 748160000 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039054162487462383, + "loss": 3.0876, + "theoretical_loss": 3.7541254776277446, + "tokens_seen": 748225536 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039053159478435307, + "loss": 2.9765, + "theoretical_loss": 3.754092488055275, + "tokens_seen": 748291072 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039052156469408225, + "loss": 2.9039, + "theoretical_loss": 3.754059502180846, + "tokens_seen": 748356608 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039051153460381143, + "loss": 2.8101, + "theoretical_loss": 3.7540265200037193, + "tokens_seen": 748422144 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003905015045135406, + "loss": 2.9762, + "theoretical_loss": 3.7539935415231565, + "tokens_seen": 748487680 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039049147442326985, + "loss": 2.9966, + "theoretical_loss": 3.7539605667384204, + "tokens_seen": 748553216 + }, + { + "epoch": 9.01, + "learning_rate": 0.000390481444332999, + "loss": 3.0344, + "theoretical_loss": 3.753927595648773, + "tokens_seen": 748618752 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1795920, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9627983570098877, + "objective/train/theoretical_loss": 3.753894628253476, + "objective/train/tokens_used": 769144288, + "theoretical_loss": 3.753894628253476, + "tokens_seen": 748684288 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003904714142427282, + "loss": 2.9251, + "theoretical_loss": 3.753894628253476, + "tokens_seen": 748684288 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039046138415245734, + "loss": 2.9082, + "theoretical_loss": 3.7538616645517933, + "tokens_seen": 748749824 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039045135406218657, + "loss": 2.9505, + "theoretical_loss": 3.7538287045429874, + "tokens_seen": 748815360 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039044132397191575, + "loss": 3.0754, + "theoretical_loss": 3.753795748226321, + "tokens_seen": 748880896 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039043129388164493, + "loss": 2.9619, + "theoretical_loss": 3.753762795601059, + "tokens_seen": 748946432 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003904212637913741, + "loss": 2.9469, + "theoretical_loss": 3.753729846666463, + "tokens_seen": 749011968 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003904112337011033, + "loss": 2.9996, + "theoretical_loss": 3.7536969014217973, + "tokens_seen": 749077504 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003904012036108325, + "loss": 3.0957, + "theoretical_loss": 3.7536639598663264, + "tokens_seen": 749143040 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003903911735205617, + "loss": 3.0746, + "theoretical_loss": 3.7536310219993148, + "tokens_seen": 749208576 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003903811434302909, + "loss": 3.1147, + "theoretical_loss": 3.753598087820026, + "tokens_seen": 749274112 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003903711133400201, + "loss": 2.9659, + "theoretical_loss": 3.7535651573277256, + "tokens_seen": 749339648 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039036108324974926, + "loss": 3.0136, + "theoretical_loss": 3.753532230521677, + "tokens_seen": 749405184 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039035105315947844, + "loss": 2.9829, + "theoretical_loss": 3.753499307401147, + "tokens_seen": 749470720 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039034102306920767, + "loss": 3.025, + "theoretical_loss": 3.7534663879653998, + "tokens_seen": 749536256 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003903309929789368, + "loss": 2.94, + "theoretical_loss": 3.7534334722137013, + "tokens_seen": 749601792 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039032096288866603, + "loss": 3.045, + "theoretical_loss": 3.7534005601453164, + "tokens_seen": 749667328 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003903109327983952, + "loss": 2.9114, + "theoretical_loss": 3.753367651759512, + "tokens_seen": 749732864 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003903009027081244, + "loss": 2.9295, + "theoretical_loss": 3.753334747055553, + "tokens_seen": 749798400 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003902908726178536, + "loss": 2.9582, + "theoretical_loss": 3.7533018460327074, + "tokens_seen": 749863936 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039028084252758276, + "loss": 2.8631, + "theoretical_loss": 3.7532689486902404, + "tokens_seen": 749929472 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039027081243731194, + "loss": 2.937, + "theoretical_loss": 3.7532360550274193, + "tokens_seen": 749995008 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003902607823470412, + "loss": 3.0703, + "theoretical_loss": 3.7532031650435105, + "tokens_seen": 750060544 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003902507522567703, + "loss": 2.9681, + "theoretical_loss": 3.7531702787377816, + "tokens_seen": 750126080 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039024072216649954, + "loss": 2.9133, + "theoretical_loss": 3.7531373961095, + "tokens_seen": 750191616 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039023069207622866, + "loss": 3.028, + "theoretical_loss": 3.753104517157933, + "tokens_seen": 750257152 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1800822, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.080545425415039, + "objective/train/theoretical_loss": 3.753071641882349, + "objective/train/tokens_used": 770782688, + "theoretical_loss": 3.753071641882349, + "tokens_seen": 750322688 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003902206619859579, + "loss": 2.9642, + "theoretical_loss": 3.753071641882349, + "tokens_seen": 750322688 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003902106318956871, + "loss": 3.024, + "theoretical_loss": 3.753038770282015, + "tokens_seen": 750388224 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039020060180541626, + "loss": 2.9382, + "theoretical_loss": 3.7530059023562, + "tokens_seen": 750453760 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039019057171514544, + "loss": 3.0132, + "theoretical_loss": 3.7529730381041717, + "tokens_seen": 750519296 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003901805416248747, + "loss": 2.9909, + "theoretical_loss": 3.7529401775251996, + "tokens_seen": 750584832 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003901705115346038, + "loss": 3.0013, + "theoretical_loss": 3.7529073206185517, + "tokens_seen": 750650368 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039016048144433304, + "loss": 2.9731, + "theoretical_loss": 3.752874467383498, + "tokens_seen": 750715904 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039015045135406217, + "loss": 3.0497, + "theoretical_loss": 3.752841617819307, + "tokens_seen": 750781440 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003901404212637914, + "loss": 2.9081, + "theoretical_loss": 3.752808771925248, + "tokens_seen": 750846976 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003901303911735206, + "loss": 2.8392, + "theoretical_loss": 3.752775929700591, + "tokens_seen": 750912512 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039012036108324977, + "loss": 3.0258, + "theoretical_loss": 3.7527430911446062, + "tokens_seen": 750978048 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039011033099297895, + "loss": 2.9058, + "theoretical_loss": 3.752710256256563, + "tokens_seen": 751043584 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039010030090270813, + "loss": 3.0264, + "theoretical_loss": 3.752677425035732, + "tokens_seen": 751109120 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003900902708124373, + "loss": 2.96, + "theoretical_loss": 3.7526445974813845, + "tokens_seen": 751174656 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039008024072216654, + "loss": 2.9608, + "theoretical_loss": 3.7526117735927897, + "tokens_seen": 751240192 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039007021063189567, + "loss": 2.9682, + "theoretical_loss": 3.7525789533692198, + "tokens_seen": 751305728 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003900601805416249, + "loss": 3.0344, + "theoretical_loss": 3.7525461368099453, + "tokens_seen": 751371264 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039005015045135403, + "loss": 3.0125, + "theoretical_loss": 3.752513323914237, + "tokens_seen": 751436800 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039004012036108327, + "loss": 2.9894, + "theoretical_loss": 3.752480514681368, + "tokens_seen": 751502336 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039003009027081245, + "loss": 2.881, + "theoretical_loss": 3.7524477091106085, + "tokens_seen": 751567872 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039002006018054163, + "loss": 2.868, + "theoretical_loss": 3.752414907201232, + "tokens_seen": 751633408 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003900100300902708, + "loss": 3.0164, + "theoretical_loss": 3.7523821089525087, + "tokens_seen": 751698944 + }, + { + "epoch": 9.01, + "learning_rate": 0.00039000000000000005, + "loss": 2.779, + "theoretical_loss": 3.7523493143637126, + "tokens_seen": 751764480 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003899899699097292, + "loss": 2.956, + "theoretical_loss": 3.752316523434116, + "tokens_seen": 751830016 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003899799398194584, + "loss": 2.8779, + "theoretical_loss": 3.752283736162991, + "tokens_seen": 751895552 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1803717, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.166200876235962, + "objective/train/theoretical_loss": 3.7522509525496113, + "objective/train/tokens_used": 772421088, + "theoretical_loss": 3.7522509525496113, + "tokens_seen": 751961088 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038996990972918754, + "loss": 2.9979, + "theoretical_loss": 3.7522509525496113, + "tokens_seen": 751961088 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038995987963891677, + "loss": 2.9747, + "theoretical_loss": 3.75221817259325, + "tokens_seen": 752026624 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038994984954864595, + "loss": 2.9722, + "theoretical_loss": 3.75218539629318, + "tokens_seen": 752092160 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038993981945837513, + "loss": 2.9627, + "theoretical_loss": 3.7521526236486755, + "tokens_seen": 752157696 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003899297893681043, + "loss": 3.0334, + "theoretical_loss": 3.7521198546590098, + "tokens_seen": 752223232 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003899197592778335, + "loss": 2.9058, + "theoretical_loss": 3.7520870893234575, + "tokens_seen": 752288768 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003899097291875627, + "loss": 2.9612, + "theoretical_loss": 3.752054327641293, + "tokens_seen": 752354304 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003898996990972919, + "loss": 3.0061, + "theoretical_loss": 3.75202156961179, + "tokens_seen": 752419840 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038988966900702104, + "loss": 3.0225, + "theoretical_loss": 3.751988815234224, + "tokens_seen": 752485376 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003898796389167503, + "loss": 2.9556, + "theoretical_loss": 3.751956064507869, + "tokens_seen": 752550912 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003898696088264794, + "loss": 3.0158, + "theoretical_loss": 3.7519233174320004, + "tokens_seen": 752616448 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038985957873620864, + "loss": 2.9971, + "theoretical_loss": 3.751890574005894, + "tokens_seen": 752681984 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003898495486459378, + "loss": 2.9871, + "theoretical_loss": 3.751857834228824, + "tokens_seen": 752747520 + }, + { + "epoch": 9.01, + "learning_rate": 0.000389839518555667, + "loss": 3.0008, + "theoretical_loss": 3.751825098100068, + "tokens_seen": 752813056 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003898294884653962, + "loss": 2.8059, + "theoretical_loss": 3.7517923656189005, + "tokens_seen": 752878592 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003898194583751254, + "loss": 2.9713, + "theoretical_loss": 3.751759636784598, + "tokens_seen": 752944128 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038980942828485454, + "loss": 2.9091, + "theoretical_loss": 3.7517269115964362, + "tokens_seen": 753009664 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003897993981945838, + "loss": 2.7537, + "theoretical_loss": 3.7516941900536933, + "tokens_seen": 753075200 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003897893681043129, + "loss": 3.0246, + "theoretical_loss": 3.7516614721556443, + "tokens_seen": 753140736 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038977933801404214, + "loss": 3.021, + "theoretical_loss": 3.7516287579015666, + "tokens_seen": 753206272 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003897693079237713, + "loss": 2.9628, + "theoretical_loss": 3.751596047290738, + "tokens_seen": 753271808 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003897592778335005, + "loss": 3.0304, + "theoretical_loss": 3.751563340322435, + "tokens_seen": 753337344 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003897492477432297, + "loss": 2.9474, + "theoretical_loss": 3.7515306369959363, + "tokens_seen": 753402880 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038973921765295886, + "loss": 2.9062, + "theoretical_loss": 3.7514979373105186, + "tokens_seen": 753468416 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038972918756268805, + "loss": 3.0625, + "theoretical_loss": 3.7514652412654597, + "tokens_seen": 753533952 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1807508, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9669129848480225, + "objective/train/theoretical_loss": 3.7514325488600386, + "objective/train/tokens_used": 774059488, + "theoretical_loss": 3.7514325488600386, + "tokens_seen": 753599488 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003897191574724173, + "loss": 2.9545, + "theoretical_loss": 3.7514325488600386, + "tokens_seen": 753599488 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003897091273821464, + "loss": 2.9298, + "theoretical_loss": 3.7513998600935334, + "tokens_seen": 753665024 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038969909729187564, + "loss": 2.9524, + "theoretical_loss": 3.7513671749652224, + "tokens_seen": 753730560 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038968906720160477, + "loss": 2.8779, + "theoretical_loss": 3.751334493474385, + "tokens_seen": 753796096 + }, + { + "epoch": 9.01, + "learning_rate": 0.000389679037111334, + "loss": 3.0066, + "theoretical_loss": 3.7513018156202995, + "tokens_seen": 753861632 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003896690070210632, + "loss": 3.0092, + "theoretical_loss": 3.751269141402245, + "tokens_seen": 753927168 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038965897693079237, + "loss": 2.9108, + "theoretical_loss": 3.751236470819502, + "tokens_seen": 753992704 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038964894684052155, + "loss": 3.0354, + "theoretical_loss": 3.7512038038713493, + "tokens_seen": 754058240 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003896389167502508, + "loss": 2.9906, + "theoretical_loss": 3.751171140557067, + "tokens_seen": 754123776 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038962888665997997, + "loss": 2.9459, + "theoretical_loss": 3.7511384808759347, + "tokens_seen": 754189312 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038961885656970915, + "loss": 2.8596, + "theoretical_loss": 3.751105824827233, + "tokens_seen": 754254848 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038960882647943833, + "loss": 2.9557, + "theoretical_loss": 3.7510731724102424, + "tokens_seen": 754320384 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003895987963891675, + "loss": 2.8882, + "theoretical_loss": 3.7510405236242432, + "tokens_seen": 754385920 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038958876629889674, + "loss": 3.0407, + "theoretical_loss": 3.7510078784685166, + "tokens_seen": 754451456 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038957873620862587, + "loss": 3.0212, + "theoretical_loss": 3.7509752369423435, + "tokens_seen": 754516992 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003895687061183551, + "loss": 2.9768, + "theoretical_loss": 3.750942599045005, + "tokens_seen": 754582528 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038955867602808423, + "loss": 3.0084, + "theoretical_loss": 3.7509099647757824, + "tokens_seen": 754648064 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038954864593781347, + "loss": 2.898, + "theoretical_loss": 3.750877334133958, + "tokens_seen": 754713600 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038953861584754265, + "loss": 3.0083, + "theoretical_loss": 3.7508447071188136, + "tokens_seen": 754779136 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038952858575727183, + "loss": 2.9985, + "theoretical_loss": 3.7508120837296306, + "tokens_seen": 754844672 + }, + { + "epoch": 9.01, + "learning_rate": 0.000389518555667001, + "loss": 3.0173, + "theoretical_loss": 3.750779463965692, + "tokens_seen": 754910208 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038950852557673025, + "loss": 3.0231, + "theoretical_loss": 3.7507468478262798, + "tokens_seen": 754975744 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003894984954864594, + "loss": 2.8989, + "theoretical_loss": 3.7507142353106766, + "tokens_seen": 755041280 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003894884653961886, + "loss": 2.9787, + "theoretical_loss": 3.750681626418166, + "tokens_seen": 755106816 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038947843530591774, + "loss": 2.9609, + "theoretical_loss": 3.7506490211480306, + "tokens_seen": 755172352 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1812427, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8931655883789062, + "objective/train/theoretical_loss": 3.7506164194995533, + "objective/train/tokens_used": 775697888, + "theoretical_loss": 3.7506164194995533, + "tokens_seen": 755237888 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038946840521564697, + "loss": 3.024, + "theoretical_loss": 3.7506164194995533, + "tokens_seen": 755237888 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038945837512537615, + "loss": 2.8896, + "theoretical_loss": 3.750583821472018, + "tokens_seen": 755303424 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038944834503510533, + "loss": 3.0182, + "theoretical_loss": 3.750551227064709, + "tokens_seen": 755368960 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003894383149448345, + "loss": 3.0459, + "theoretical_loss": 3.75051863627691, + "tokens_seen": 755434496 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003894282848545637, + "loss": 3.0763, + "theoretical_loss": 3.750486049107904, + "tokens_seen": 755500032 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003894182547642929, + "loss": 3.0415, + "theoretical_loss": 3.7504534655569763, + "tokens_seen": 755565568 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003894082246740221, + "loss": 2.8773, + "theoretical_loss": 3.750420885623411, + "tokens_seen": 755631104 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038939819458375124, + "loss": 3.0132, + "theoretical_loss": 3.7503883093064934, + "tokens_seen": 755696640 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003893881644934805, + "loss": 3.002, + "theoretical_loss": 3.750355736605508, + "tokens_seen": 755762176 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003893781344032096, + "loss": 2.9659, + "theoretical_loss": 3.75032316751974, + "tokens_seen": 755827712 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038936810431293884, + "loss": 2.9938, + "theoretical_loss": 3.7502906020484748, + "tokens_seen": 755893248 + }, + { + "epoch": 9.01, + "learning_rate": 0.000389358074222668, + "loss": 2.9941, + "theoretical_loss": 3.7502580401909977, + "tokens_seen": 755958784 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003893480441323972, + "loss": 3.0517, + "theoretical_loss": 3.7502254819465946, + "tokens_seen": 756024320 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003893380140421264, + "loss": 2.9907, + "theoretical_loss": 3.7501929273145516, + "tokens_seen": 756089856 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003893279839518556, + "loss": 3.0444, + "theoretical_loss": 3.750160376294154, + "tokens_seen": 756155392 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038931795386158474, + "loss": 2.903, + "theoretical_loss": 3.7501278288846898, + "tokens_seen": 756220928 + }, + { + "epoch": 9.01, + "learning_rate": 0.000389307923771314, + "loss": 2.9159, + "theoretical_loss": 3.7500952850854445, + "tokens_seen": 756286464 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003892978936810431, + "loss": 2.9588, + "theoretical_loss": 3.750062744895705, + "tokens_seen": 756352000 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038928786359077234, + "loss": 2.987, + "theoretical_loss": 3.750030208314758, + "tokens_seen": 756417536 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003892778335005015, + "loss": 2.9156, + "theoretical_loss": 3.7499976753418904, + "tokens_seen": 756483072 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003892678034102307, + "loss": 2.9766, + "theoretical_loss": 3.749965145976391, + "tokens_seen": 756548608 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003892577733199599, + "loss": 3.0643, + "theoretical_loss": 3.749932620217546, + "tokens_seen": 756614144 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038924774322968906, + "loss": 3.0185, + "theoretical_loss": 3.7499000980646437, + "tokens_seen": 756679680 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038923771313941825, + "loss": 3.0096, + "theoretical_loss": 3.749867579516972, + "tokens_seen": 756745216 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003892276830491475, + "loss": 2.9931, + "theoretical_loss": 3.7498350645738183, + "tokens_seen": 756810752 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1815390, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1163766384124756, + "objective/train/theoretical_loss": 3.7498025532344723, + "objective/train/tokens_used": 777336288, + "theoretical_loss": 3.7498025532344723, + "tokens_seen": 756876288 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003892176529588766, + "loss": 3.0522, + "theoretical_loss": 3.7498025532344723, + "tokens_seen": 756876288 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038920762286860584, + "loss": 3.0443, + "theoretical_loss": 3.749770045498222, + "tokens_seen": 756941824 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038919759277833497, + "loss": 3.0077, + "theoretical_loss": 3.7497375413643566, + "tokens_seen": 757007360 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003891875626880642, + "loss": 2.9182, + "theoretical_loss": 3.749705040832164, + "tokens_seen": 757072896 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003891775325977934, + "loss": 2.9541, + "theoretical_loss": 3.7496725439009344, + "tokens_seen": 757138432 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038916750250752257, + "loss": 2.9728, + "theoretical_loss": 3.7496400505699565, + "tokens_seen": 757203968 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038915747241725175, + "loss": 3.0194, + "theoretical_loss": 3.74960756083852, + "tokens_seen": 757269504 + }, + { + "epoch": 9.01, + "learning_rate": 0.000389147442326981, + "loss": 2.9472, + "theoretical_loss": 3.7495750747059153, + "tokens_seen": 757335040 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003891374122367101, + "loss": 3.0528, + "theoretical_loss": 3.7495425921714323, + "tokens_seen": 757400576 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038912738214643935, + "loss": 2.9682, + "theoretical_loss": 3.7495101132343605, + "tokens_seen": 757466112 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003891173520561685, + "loss": 2.8813, + "theoretical_loss": 3.7494776378939907, + "tokens_seen": 757531648 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003891073219658977, + "loss": 2.9668, + "theoretical_loss": 3.7494451661496138, + "tokens_seen": 757597184 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003890972918756269, + "loss": 2.9091, + "theoretical_loss": 3.7494126980005196, + "tokens_seen": 757662720 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038908726178535607, + "loss": 3.0508, + "theoretical_loss": 3.749380233446, + "tokens_seen": 757728256 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038907723169508525, + "loss": 2.9655, + "theoretical_loss": 3.749347772485346, + "tokens_seen": 757793792 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038906720160481443, + "loss": 3.0035, + "theoretical_loss": 3.7493153151178493, + "tokens_seen": 757859328 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003890571715145436, + "loss": 3.0403, + "theoretical_loss": 3.7492828613428006, + "tokens_seen": 757924864 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038904714142427285, + "loss": 2.9928, + "theoretical_loss": 3.7492504111594926, + "tokens_seen": 757990400 + }, + { + "epoch": 9.01, + "learning_rate": 0.000389037111334002, + "loss": 3.1312, + "theoretical_loss": 3.749217964567217, + "tokens_seen": 758055936 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003890270812437312, + "loss": 2.9707, + "theoretical_loss": 3.749185521565266, + "tokens_seen": 758121472 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003890170511534604, + "loss": 2.9855, + "theoretical_loss": 3.7491530821529317, + "tokens_seen": 758187008 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003890070210631896, + "loss": 3.0472, + "theoretical_loss": 3.7491206463295073, + "tokens_seen": 758252544 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038899699097291876, + "loss": 3.0077, + "theoretical_loss": 3.749088214094285, + "tokens_seen": 758318080 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038898696088264794, + "loss": 2.9471, + "theoretical_loss": 3.749055785446558, + "tokens_seen": 758383616 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003889769307923771, + "loss": 3.0547, + "theoretical_loss": 3.7490233603856202, + "tokens_seen": 758449152 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1820190, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0227887630462646, + "objective/train/theoretical_loss": 3.7489909389107634, + "objective/train/tokens_used": 778974688, + "theoretical_loss": 3.7489909389107634, + "tokens_seen": 758514688 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038896690070210635, + "loss": 2.8398, + "theoretical_loss": 3.7489909389107634, + "tokens_seen": 758514688 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003889568706118355, + "loss": 3.0367, + "theoretical_loss": 3.748958521021283, + "tokens_seen": 758580224 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003889468405215647, + "loss": 2.913, + "theoretical_loss": 3.748926106716472, + "tokens_seen": 758645760 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038893681043129384, + "loss": 3.0489, + "theoretical_loss": 3.7488936959956236, + "tokens_seen": 758711296 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003889267803410231, + "loss": 3.0486, + "theoretical_loss": 3.7488612888580333, + "tokens_seen": 758776832 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038891675025075226, + "loss": 2.8655, + "theoretical_loss": 3.7488288853029954, + "tokens_seen": 758842368 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038890672016048144, + "loss": 3.0861, + "theoretical_loss": 3.748796485329804, + "tokens_seen": 758907904 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003888966900702106, + "loss": 3.0319, + "theoretical_loss": 3.7487640889377536, + "tokens_seen": 758973440 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003888866599799398, + "loss": 2.9951, + "theoretical_loss": 3.74873169612614, + "tokens_seen": 759038976 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038887662988966904, + "loss": 2.9344, + "theoretical_loss": 3.7486993068942582, + "tokens_seen": 759104512 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003888665997993982, + "loss": 2.957, + "theoretical_loss": 3.748666921241403, + "tokens_seen": 759170048 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003888565697091274, + "loss": 2.8983, + "theoretical_loss": 3.748634539166871, + "tokens_seen": 759235584 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003888465396188566, + "loss": 2.9459, + "theoretical_loss": 3.7486021606699573, + "tokens_seen": 759301120 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003888365095285858, + "loss": 3.0184, + "theoretical_loss": 3.7485697857499574, + "tokens_seen": 759366656 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038882647943831494, + "loss": 2.8992, + "theoretical_loss": 3.7485374144061687, + "tokens_seen": 759432192 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003888164493480442, + "loss": 3.0417, + "theoretical_loss": 3.748505046637887, + "tokens_seen": 759497728 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003888064192577733, + "loss": 2.9944, + "theoretical_loss": 3.7484726824444095, + "tokens_seen": 759563264 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038879638916750254, + "loss": 2.9593, + "theoretical_loss": 3.748440321825032, + "tokens_seen": 759628800 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003887863590772317, + "loss": 2.9791, + "theoretical_loss": 3.7484079647790516, + "tokens_seen": 759694336 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003887763289869609, + "loss": 2.9409, + "theoretical_loss": 3.7483756113057662, + "tokens_seen": 759759872 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003887662988966901, + "loss": 3.0018, + "theoretical_loss": 3.748343261404473, + "tokens_seen": 759825408 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038875626880641927, + "loss": 3.0365, + "theoretical_loss": 3.748310915074469, + "tokens_seen": 759890944 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038874623871614845, + "loss": 3.0277, + "theoretical_loss": 3.748278572315053, + "tokens_seen": 759956480 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003887362086258777, + "loss": 3.0058, + "theoretical_loss": 3.7482462331255224, + "tokens_seen": 760022016 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003887261785356068, + "loss": 3.0575, + "theoretical_loss": 3.7482138975051753, + "tokens_seen": 760087552 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1823260, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.080761194229126, + "objective/train/theoretical_loss": 3.74818156545331, + "objective/train/tokens_used": 780613088, + "theoretical_loss": 3.74818156545331, + "tokens_seen": 760153088 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038871614844533604, + "loss": 3.0899, + "theoretical_loss": 3.74818156545331, + "tokens_seen": 760153088 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038870611835506517, + "loss": 2.8949, + "theoretical_loss": 3.7481492369692253, + "tokens_seen": 760218624 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003886960882647944, + "loss": 2.9447, + "theoretical_loss": 3.7481169120522204, + "tokens_seen": 760284160 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003886860581745236, + "loss": 2.9634, + "theoretical_loss": 3.748084590701594, + "tokens_seen": 760349696 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038867602808425277, + "loss": 3.0254, + "theoretical_loss": 3.7480522729166443, + "tokens_seen": 760415232 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038866599799398195, + "loss": 2.9729, + "theoretical_loss": 3.748019958696672, + "tokens_seen": 760480768 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003886559679037112, + "loss": 2.8541, + "theoretical_loss": 3.7479876480409766, + "tokens_seen": 760546304 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003886459378134403, + "loss": 3.0084, + "theoretical_loss": 3.747955340948857, + "tokens_seen": 760611840 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038863590772316955, + "loss": 3.0129, + "theoretical_loss": 3.747923037419614, + "tokens_seen": 760677376 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003886258776328987, + "loss": 2.9278, + "theoretical_loss": 3.7478907374525474, + "tokens_seen": 760742912 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003886158475426279, + "loss": 2.9533, + "theoretical_loss": 3.747858441046957, + "tokens_seen": 760808448 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003886058174523571, + "loss": 2.951, + "theoretical_loss": 3.7478261482021447, + "tokens_seen": 760873984 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038859578736208627, + "loss": 3.0356, + "theoretical_loss": 3.7477938589174107, + "tokens_seen": 760939520 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038858575727181545, + "loss": 3.0796, + "theoretical_loss": 3.7477615731920553, + "tokens_seen": 761005056 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038857572718154463, + "loss": 3.061, + "theoretical_loss": 3.7477292910253803, + "tokens_seen": 761070592 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003885656970912738, + "loss": 3.0106, + "theoretical_loss": 3.747697012416687, + "tokens_seen": 761136128 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038855566700100305, + "loss": 2.8942, + "theoretical_loss": 3.747664737365277, + "tokens_seen": 761201664 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003885456369107322, + "loss": 2.9613, + "theoretical_loss": 3.747632465870452, + "tokens_seen": 761267200 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003885356068204614, + "loss": 2.8877, + "theoretical_loss": 3.7476001979315137, + "tokens_seen": 761332736 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003885255767301906, + "loss": 3.0403, + "theoretical_loss": 3.7475679335477645, + "tokens_seen": 761398272 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003885155466399198, + "loss": 3.1311, + "theoretical_loss": 3.7475356727185067, + "tokens_seen": 761463808 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038850551654964896, + "loss": 3.0188, + "theoretical_loss": 3.7475034154430427, + "tokens_seen": 761529344 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038849548645937814, + "loss": 3.0342, + "theoretical_loss": 3.747471161720675, + "tokens_seen": 761594880 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003884854563691073, + "loss": 2.9424, + "theoretical_loss": 3.7474389115507076, + "tokens_seen": 761660416 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038847542627883655, + "loss": 2.9341, + "theoretical_loss": 3.7474066649324422, + "tokens_seen": 761725952 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1827180, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.102508068084717, + "objective/train/theoretical_loss": 3.7473744218651834, + "objective/train/tokens_used": 782251488, + "theoretical_loss": 3.7473744218651834, + "tokens_seen": 761791488 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003884653961885657, + "loss": 3.0578, + "theoretical_loss": 3.7473744218651834, + "tokens_seen": 761791488 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003884553660982949, + "loss": 2.9268, + "theoretical_loss": 3.747342182348234, + "tokens_seen": 761857024 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038844533600802404, + "loss": 2.9444, + "theoretical_loss": 3.747309946380898, + "tokens_seen": 761922560 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003884353059177533, + "loss": 2.9513, + "theoretical_loss": 3.7472777139624793, + "tokens_seen": 761988096 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038842527582748246, + "loss": 2.998, + "theoretical_loss": 3.7472454850922814, + "tokens_seen": 762053632 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038841524573721164, + "loss": 3.1333, + "theoretical_loss": 3.7472132597696097, + "tokens_seen": 762119168 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003884052156469408, + "loss": 2.958, + "theoretical_loss": 3.7471810379937676, + "tokens_seen": 762184704 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038839518555667, + "loss": 3.005, + "theoretical_loss": 3.7471488197640612, + "tokens_seen": 762250240 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003883851554663992, + "loss": 3.0844, + "theoretical_loss": 3.7471166050797935, + "tokens_seen": 762315776 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003883751253761284, + "loss": 3.0185, + "theoretical_loss": 3.7470843939402716, + "tokens_seen": 762381312 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038836509528585755, + "loss": 2.9414, + "theoretical_loss": 3.747052186344799, + "tokens_seen": 762446848 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003883550651955868, + "loss": 2.9208, + "theoretical_loss": 3.7470199822926826, + "tokens_seen": 762512384 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038834503510531596, + "loss": 3.0169, + "theoretical_loss": 3.746987781783227, + "tokens_seen": 762577920 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038833500501504514, + "loss": 2.9999, + "theoretical_loss": 3.7469555848157388, + "tokens_seen": 762643456 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003883249749247743, + "loss": 3.04, + "theoretical_loss": 3.746923391389524, + "tokens_seen": 762708992 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003883149448345035, + "loss": 2.9748, + "theoretical_loss": 3.7468912015038884, + "tokens_seen": 762774528 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003883049147442327, + "loss": 2.8756, + "theoretical_loss": 3.746859015158139, + "tokens_seen": 762840064 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003882948846539619, + "loss": 2.9902, + "theoretical_loss": 3.746826832351582, + "tokens_seen": 762905600 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038828485456369105, + "loss": 3.0148, + "theoretical_loss": 3.7467946530835246, + "tokens_seen": 762971136 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003882748244734203, + "loss": 2.8972, + "theoretical_loss": 3.7467624773532737, + "tokens_seen": 763036672 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003882647943831494, + "loss": 2.9471, + "theoretical_loss": 3.746730305160137, + "tokens_seen": 763102208 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038825476429287865, + "loss": 2.9923, + "theoretical_loss": 3.746698136503421, + "tokens_seen": 763167744 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038824473420260783, + "loss": 3.074, + "theoretical_loss": 3.746665971382434, + "tokens_seen": 763233280 + }, + { + "epoch": 9.01, + "learning_rate": 0.000388234704112337, + "loss": 2.9498, + "theoretical_loss": 3.7466338097964837, + "tokens_seen": 763298816 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003882246740220662, + "loss": 2.8434, + "theoretical_loss": 3.746601651744878, + "tokens_seen": 763364352 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1831663, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.152209758758545, + "objective/train/theoretical_loss": 3.7465694972269254, + "objective/train/tokens_used": 783889888, + "theoretical_loss": 3.7465694972269254, + "tokens_seen": 763429888 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038821464393179537, + "loss": 3.0181, + "theoretical_loss": 3.7465694972269254, + "tokens_seen": 763429888 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038820461384152455, + "loss": 2.9738, + "theoretical_loss": 3.7465373462419342, + "tokens_seen": 763495424 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003881945837512538, + "loss": 3.0079, + "theoretical_loss": 3.7465051987892126, + "tokens_seen": 763560960 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003881845536609829, + "loss": 2.9381, + "theoretical_loss": 3.74647305486807, + "tokens_seen": 763626496 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038817452357071215, + "loss": 3.0244, + "theoretical_loss": 3.7464409144778155, + "tokens_seen": 763692032 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038816449348044133, + "loss": 3.0016, + "theoretical_loss": 3.7464087776177575, + "tokens_seen": 763757568 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003881544633901705, + "loss": 2.9647, + "theoretical_loss": 3.7463766442872064, + "tokens_seen": 763823104 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003881444332998997, + "loss": 2.9711, + "theoretical_loss": 3.7463445144854703, + "tokens_seen": 763888640 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003881344032096289, + "loss": 2.9088, + "theoretical_loss": 3.7463123882118605, + "tokens_seen": 763954176 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003881243731193581, + "loss": 3.0005, + "theoretical_loss": 3.7462802654656864, + "tokens_seen": 764019712 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003881143430290873, + "loss": 2.9962, + "theoretical_loss": 3.7462481462462582, + "tokens_seen": 764085248 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038810431293881647, + "loss": 3.0128, + "theoretical_loss": 3.746216030552886, + "tokens_seen": 764150784 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038809428284854565, + "loss": 2.9817, + "theoretical_loss": 3.7461839183848804, + "tokens_seen": 764216320 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038808425275827483, + "loss": 3.0395, + "theoretical_loss": 3.7461518097415523, + "tokens_seen": 764281856 + }, + { + "epoch": 9.01, + "learning_rate": 0.000388074222668004, + "loss": 2.9363, + "theoretical_loss": 3.7461197046222128, + "tokens_seen": 764347392 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038806419257773325, + "loss": 3.012, + "theoretical_loss": 3.7460876030261723, + "tokens_seen": 764412928 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003880541624874624, + "loss": 3.036, + "theoretical_loss": 3.7460555049527438, + "tokens_seen": 764478464 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003880441323971916, + "loss": 2.896, + "theoretical_loss": 3.7460234104012367, + "tokens_seen": 764544000 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003880341023069208, + "loss": 2.9665, + "theoretical_loss": 3.7459913193709644, + "tokens_seen": 764609536 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038802407221665, + "loss": 3.0524, + "theoretical_loss": 3.745959231861238, + "tokens_seen": 764675072 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038801404212637916, + "loss": 3.0212, + "theoretical_loss": 3.7459271478713694, + "tokens_seen": 764740608 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038800401203610834, + "loss": 3.0306, + "theoretical_loss": 3.7458950674006717, + "tokens_seen": 764806144 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003879939819458375, + "loss": 2.916, + "theoretical_loss": 3.7458629904484564, + "tokens_seen": 764871680 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038798395185556675, + "loss": 3.0619, + "theoretical_loss": 3.745830917014037, + "tokens_seen": 764937216 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003879739217652959, + "loss": 3.038, + "theoretical_loss": 3.7457988470967263, + "tokens_seen": 765002752 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1835071, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9196348190307617, + "objective/train/theoretical_loss": 3.7457667806958366, + "objective/train/tokens_used": 785528288, + "theoretical_loss": 3.7457667806958366, + "tokens_seen": 765068288 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003879638916750251, + "loss": 3.052, + "theoretical_loss": 3.7457667806958366, + "tokens_seen": 765068288 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038795386158475424, + "loss": 2.9419, + "theoretical_loss": 3.7457347178106826, + "tokens_seen": 765133824 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003879438314944835, + "loss": 2.9954, + "theoretical_loss": 3.745702658440576, + "tokens_seen": 765199360 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038793380140421266, + "loss": 3.0267, + "theoretical_loss": 3.7456706025848314, + "tokens_seen": 765264896 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038792377131394184, + "loss": 2.8677, + "theoretical_loss": 3.745638550242763, + "tokens_seen": 765330432 + }, + { + "epoch": 9.01, + "learning_rate": 0.000387913741223671, + "loss": 2.9657, + "theoretical_loss": 3.7456065014136843, + "tokens_seen": 765395968 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003879037111334002, + "loss": 3.0074, + "theoretical_loss": 3.745574456096909, + "tokens_seen": 765461504 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003878936810431294, + "loss": 2.8997, + "theoretical_loss": 3.745542414291753, + "tokens_seen": 765527040 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003878836509528586, + "loss": 3.0328, + "theoretical_loss": 3.7455103759975295, + "tokens_seen": 765592576 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038787362086258775, + "loss": 3.0471, + "theoretical_loss": 3.7454783412135537, + "tokens_seen": 765658112 + }, + { + "epoch": 9.01, + "learning_rate": 0.000387863590772317, + "loss": 2.883, + "theoretical_loss": 3.7454463099391413, + "tokens_seen": 765723648 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038785356068204616, + "loss": 2.8909, + "theoretical_loss": 3.7454142821736065, + "tokens_seen": 765789184 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038784353059177534, + "loss": 3.0555, + "theoretical_loss": 3.7453822579162654, + "tokens_seen": 765854720 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003878335005015045, + "loss": 2.9862, + "theoretical_loss": 3.745350237166433, + "tokens_seen": 765920256 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003878234704112337, + "loss": 2.9388, + "theoretical_loss": 3.7453182199234254, + "tokens_seen": 765985792 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003878134403209629, + "loss": 2.9769, + "theoretical_loss": 3.7452862061865586, + "tokens_seen": 766051328 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003878034102306921, + "loss": 2.9256, + "theoretical_loss": 3.745254195955148, + "tokens_seen": 766116864 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038779338014042125, + "loss": 3.0897, + "theoretical_loss": 3.7452221892285116, + "tokens_seen": 766182400 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003877833500501505, + "loss": 3.0586, + "theoretical_loss": 3.7451901860059644, + "tokens_seen": 766247936 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003877733199598796, + "loss": 2.8859, + "theoretical_loss": 3.7451581862868233, + "tokens_seen": 766313472 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038776328986960885, + "loss": 2.954, + "theoretical_loss": 3.745126190070406, + "tokens_seen": 766379008 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038775325977933803, + "loss": 2.917, + "theoretical_loss": 3.745094197356029, + "tokens_seen": 766444544 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003877432296890672, + "loss": 2.9771, + "theoretical_loss": 3.7450622081430103, + "tokens_seen": 766510080 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003877331995987964, + "loss": 3.0389, + "theoretical_loss": 3.7450302224306657, + "tokens_seen": 766575616 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038772316950852557, + "loss": 2.9653, + "theoretical_loss": 3.744998240218315, + "tokens_seen": 766641152 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1839781, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0761451721191406, + "objective/train/theoretical_loss": 3.744966261505275, + "objective/train/tokens_used": 787166688, + "theoretical_loss": 3.744966261505275, + "tokens_seen": 766706688 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038771313941825475, + "loss": 3.0535, + "theoretical_loss": 3.744966261505275, + "tokens_seen": 766706688 + }, + { + "epoch": 9.01, + "learning_rate": 0.000387703109327984, + "loss": 2.8291, + "theoretical_loss": 3.7449342862908632, + "tokens_seen": 766772224 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003876930792377131, + "loss": 3.0839, + "theoretical_loss": 3.744902314574399, + "tokens_seen": 766837760 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038768304914744235, + "loss": 2.8346, + "theoretical_loss": 3.7448703463552, + "tokens_seen": 766903296 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038767301905717153, + "loss": 2.9842, + "theoretical_loss": 3.7448383816325856, + "tokens_seen": 766968832 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003876629889669007, + "loss": 3.02, + "theoretical_loss": 3.744806420405874, + "tokens_seen": 767034368 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003876529588766299, + "loss": 3.0079, + "theoretical_loss": 3.7447744626743846, + "tokens_seen": 767099904 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003876429287863591, + "loss": 2.8929, + "theoretical_loss": 3.7447425084374366, + "tokens_seen": 767165440 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038763289869608826, + "loss": 2.9905, + "theoretical_loss": 3.744710557694349, + "tokens_seen": 767230976 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003876228686058175, + "loss": 3.0043, + "theoretical_loss": 3.744678610444441, + "tokens_seen": 767296512 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003876128385155466, + "loss": 2.9889, + "theoretical_loss": 3.744646666687034, + "tokens_seen": 767362048 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038760280842527585, + "loss": 3.0454, + "theoretical_loss": 3.7446147264214464, + "tokens_seen": 767427584 + }, + { + "epoch": 9.01, + "learning_rate": 0.000387592778335005, + "loss": 3.0541, + "theoretical_loss": 3.744582789646999, + "tokens_seen": 767493120 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003875827482447342, + "loss": 3.0331, + "theoretical_loss": 3.7445508563630123, + "tokens_seen": 767558656 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003875727181544634, + "loss": 3.0124, + "theoretical_loss": 3.744518926568807, + "tokens_seen": 767624192 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003875626880641926, + "loss": 3.0425, + "theoretical_loss": 3.7444870002637027, + "tokens_seen": 767689728 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038755265797392176, + "loss": 2.9709, + "theoretical_loss": 3.7444550774470216, + "tokens_seen": 767755264 + }, + { + "epoch": 9.01, + "learning_rate": 0.000387542627883651, + "loss": 3.0352, + "theoretical_loss": 3.7444231581180842, + "tokens_seen": 767820800 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003875325977933801, + "loss": 2.9622, + "theoretical_loss": 3.744391242276212, + "tokens_seen": 767886336 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038752256770310936, + "loss": 2.9768, + "theoretical_loss": 3.744359329920726, + "tokens_seen": 767951872 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003875125376128385, + "loss": 2.9043, + "theoretical_loss": 3.7443274210509485, + "tokens_seen": 768017408 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003875025075225677, + "loss": 3.0213, + "theoretical_loss": 3.7442955156662014, + "tokens_seen": 768082944 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003874924774322969, + "loss": 2.955, + "theoretical_loss": 3.7442636137658063, + "tokens_seen": 768148480 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003874824473420261, + "loss": 3.0405, + "theoretical_loss": 3.744231715349086, + "tokens_seen": 768214016 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038747241725175526, + "loss": 3.0371, + "theoretical_loss": 3.7441998204153624, + "tokens_seen": 768279552 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1842721, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.951047658920288, + "objective/train/theoretical_loss": 3.7441679289639582, + "objective/train/tokens_used": 788805088, + "theoretical_loss": 3.7441679289639582, + "tokens_seen": 768345088 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038746238716148444, + "loss": 3.036, + "theoretical_loss": 3.7441679289639582, + "tokens_seen": 768345088 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003874523570712136, + "loss": 3.0228, + "theoretical_loss": 3.7441360409941966, + "tokens_seen": 768410624 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038744232698094286, + "loss": 2.9712, + "theoretical_loss": 3.7441041565054, + "tokens_seen": 768476160 + }, + { + "epoch": 9.01, + "learning_rate": 0.000387432296890672, + "loss": 3.0585, + "theoretical_loss": 3.7440722754968925, + "tokens_seen": 768541696 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003874222668004012, + "loss": 3.0741, + "theoretical_loss": 3.7440403979679964, + "tokens_seen": 768607232 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038741223671013035, + "loss": 2.9493, + "theoretical_loss": 3.744008523918036, + "tokens_seen": 768672768 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003874022066198596, + "loss": 3.0203, + "theoretical_loss": 3.7439766533463352, + "tokens_seen": 768738304 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038739217652958876, + "loss": 3.0, + "theoretical_loss": 3.743944786252218, + "tokens_seen": 768803840 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038738214643931795, + "loss": 3.0091, + "theoretical_loss": 3.7439129226350074, + "tokens_seen": 768869376 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003873721163490472, + "loss": 2.9739, + "theoretical_loss": 3.743881062494029, + "tokens_seen": 768934912 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038736208625877636, + "loss": 2.885, + "theoretical_loss": 3.7438492058286066, + "tokens_seen": 769000448 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038735205616850554, + "loss": 2.9554, + "theoretical_loss": 3.743817352638066, + "tokens_seen": 769065984 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003873420260782347, + "loss": 3.0497, + "theoretical_loss": 3.7437855029217304, + "tokens_seen": 769131520 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003873319959879639, + "loss": 2.965, + "theoretical_loss": 3.7437536566789262, + "tokens_seen": 769197056 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003873219658976931, + "loss": 3.0564, + "theoretical_loss": 3.7437218139089783, + "tokens_seen": 769262592 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003873119358074223, + "loss": 2.9783, + "theoretical_loss": 3.743689974611212, + "tokens_seen": 769328128 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038730190571715145, + "loss": 3.02, + "theoretical_loss": 3.7436581387849532, + "tokens_seen": 769393664 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003872918756268807, + "loss": 3.0725, + "theoretical_loss": 3.7436263064295274, + "tokens_seen": 769459200 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003872818455366098, + "loss": 3.0702, + "theoretical_loss": 3.743594477544261, + "tokens_seen": 769524736 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038727181544633905, + "loss": 2.9681, + "theoretical_loss": 3.74356265212848, + "tokens_seen": 769590272 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038726178535606823, + "loss": 2.9472, + "theoretical_loss": 3.743530830181511, + "tokens_seen": 769655808 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003872517552657974, + "loss": 3.0266, + "theoretical_loss": 3.7434990117026805, + "tokens_seen": 769721344 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003872417251755266, + "loss": 3.0555, + "theoretical_loss": 3.7434671966913147, + "tokens_seen": 769786880 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038723169508525577, + "loss": 2.8259, + "theoretical_loss": 3.743435385146742, + "tokens_seen": 769852416 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038722166499498495, + "loss": 3.0387, + "theoretical_loss": 3.743403577068288, + "tokens_seen": 769917952 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1846354, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.918738842010498, + "objective/train/theoretical_loss": 3.7433717724552813, + "objective/train/tokens_used": 790443488, + "theoretical_loss": 3.7433717724552813, + "tokens_seen": 769983488 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003872116349047142, + "loss": 3.0115, + "theoretical_loss": 3.7433717724552813, + "tokens_seen": 769983488 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003872016048144433, + "loss": 2.9337, + "theoretical_loss": 3.7433399713070488, + "tokens_seen": 770049024 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038719157472417255, + "loss": 3.0513, + "theoretical_loss": 3.7433081736229177, + "tokens_seen": 770114560 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038718154463390173, + "loss": 2.9301, + "theoretical_loss": 3.7432763794022175, + "tokens_seen": 770180096 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003871715145436309, + "loss": 3.0901, + "theoretical_loss": 3.7432445886442745, + "tokens_seen": 770245632 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003871614844533601, + "loss": 3.0372, + "theoretical_loss": 3.7432128013484185, + "tokens_seen": 770311168 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003871514543630893, + "loss": 3.0135, + "theoretical_loss": 3.743181017513977, + "tokens_seen": 770376704 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038714142427281846, + "loss": 3.0327, + "theoretical_loss": 3.7431492371402784, + "tokens_seen": 770442240 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003871313941825477, + "loss": 2.9841, + "theoretical_loss": 3.7431174602266526, + "tokens_seen": 770507776 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003871213640922768, + "loss": 3.0376, + "theoretical_loss": 3.743085686772428, + "tokens_seen": 770573312 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038711133400200605, + "loss": 2.9691, + "theoretical_loss": 3.743053916776934, + "tokens_seen": 770638848 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003871013039117352, + "loss": 2.9887, + "theoretical_loss": 3.7430221502395, + "tokens_seen": 770704384 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003870912738214644, + "loss": 2.9725, + "theoretical_loss": 3.7429903871594554, + "tokens_seen": 770769920 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003870812437311936, + "loss": 2.9724, + "theoretical_loss": 3.7429586275361304, + "tokens_seen": 770835456 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003870712136409228, + "loss": 3.0257, + "theoretical_loss": 3.7429268713688546, + "tokens_seen": 770900992 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038706118355065196, + "loss": 2.9301, + "theoretical_loss": 3.7428951186569583, + "tokens_seen": 770966528 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003870511534603812, + "loss": 3.0294, + "theoretical_loss": 3.742863369399772, + "tokens_seen": 771032064 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003870411233701103, + "loss": 3.0281, + "theoretical_loss": 3.7428316235966257, + "tokens_seen": 771097600 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038703109327983956, + "loss": 3.0374, + "theoretical_loss": 3.7427998812468504, + "tokens_seen": 771163136 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003870210631895687, + "loss": 2.9314, + "theoretical_loss": 3.742768142349777, + "tokens_seen": 771228672 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003870110330992979, + "loss": 2.9512, + "theoretical_loss": 3.7427364069047373, + "tokens_seen": 771294208 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003870010030090271, + "loss": 2.9905, + "theoretical_loss": 3.742704674911062, + "tokens_seen": 771359744 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003869909729187563, + "loss": 2.9095, + "theoretical_loss": 3.742672946368082, + "tokens_seen": 771425280 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038698094282848546, + "loss": 3.0031, + "theoretical_loss": 3.742641221275129, + "tokens_seen": 771490816 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038697091273821464, + "loss": 3.052, + "theoretical_loss": 3.742609499631536, + "tokens_seen": 771556352 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1849397, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9848885536193848, + "objective/train/theoretical_loss": 3.7425777814366343, + "objective/train/tokens_used": 792081888, + "theoretical_loss": 3.7425777814366343, + "tokens_seen": 771621888 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003869608826479438, + "loss": 2.9013, + "theoretical_loss": 3.7425777814366343, + "tokens_seen": 771621888 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038695085255767306, + "loss": 2.9378, + "theoretical_loss": 3.742546066689756, + "tokens_seen": 771687424 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003869408224674022, + "loss": 3.0145, + "theoretical_loss": 3.7425143553902336, + "tokens_seen": 771752960 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003869307923771314, + "loss": 3.0384, + "theoretical_loss": 3.7424826475373996, + "tokens_seen": 771818496 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038692076228686055, + "loss": 2.8998, + "theoretical_loss": 3.742450943130587, + "tokens_seen": 771884032 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003869107321965898, + "loss": 2.9986, + "theoretical_loss": 3.7424192421691282, + "tokens_seen": 771949568 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038690070210631896, + "loss": 2.8513, + "theoretical_loss": 3.742387544652357, + "tokens_seen": 772015104 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038689067201604815, + "loss": 3.0121, + "theoretical_loss": 3.7423558505796066, + "tokens_seen": 772080640 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038688064192577733, + "loss": 2.978, + "theoretical_loss": 3.7423241599502104, + "tokens_seen": 772146176 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038687061183550656, + "loss": 3.0095, + "theoretical_loss": 3.742292472763502, + "tokens_seen": 772211712 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003868605817452357, + "loss": 2.9678, + "theoretical_loss": 3.7422607890188155, + "tokens_seen": 772277248 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003868505516549649, + "loss": 2.9931, + "theoretical_loss": 3.7422291087154846, + "tokens_seen": 772342784 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038684052156469405, + "loss": 2.9841, + "theoretical_loss": 3.7421974318528437, + "tokens_seen": 772408320 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003868304914744233, + "loss": 3.0192, + "theoretical_loss": 3.7421657584302275, + "tokens_seen": 772473856 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038682046138415247, + "loss": 3.0719, + "theoretical_loss": 3.7421340884469703, + "tokens_seen": 772539392 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038681043129388165, + "loss": 3.0633, + "theoretical_loss": 3.7421024219024073, + "tokens_seen": 772604928 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038680040120361083, + "loss": 3.0349, + "theoretical_loss": 3.7420707587958733, + "tokens_seen": 772670464 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038679037111334, + "loss": 3.018, + "theoretical_loss": 3.7420390991267025, + "tokens_seen": 772736000 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003867803410230692, + "loss": 3.0037, + "theoretical_loss": 3.742007442894232, + "tokens_seen": 772801536 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038677031093279843, + "loss": 3.0266, + "theoretical_loss": 3.741975790097796, + "tokens_seen": 772867072 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038676028084252755, + "loss": 2.9903, + "theoretical_loss": 3.7419441407367318, + "tokens_seen": 772932608 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003867502507522568, + "loss": 3.0518, + "theoretical_loss": 3.7419124948103732, + "tokens_seen": 772998144 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003867402206619859, + "loss": 3.0327, + "theoretical_loss": 3.7418808523180576, + "tokens_seen": 773063680 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038673019057171515, + "loss": 2.9635, + "theoretical_loss": 3.741849213259121, + "tokens_seen": 773129216 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038672016048144433, + "loss": 2.9867, + "theoretical_loss": 3.7418175776329, + "tokens_seen": 773194752 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1854348, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.864654779434204, + "objective/train/theoretical_loss": 3.741785945438731, + "objective/train/tokens_used": 793720288, + "theoretical_loss": 3.741785945438731, + "tokens_seen": 773260288 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003867101303911735, + "loss": 3.0138, + "theoretical_loss": 3.741785945438731, + "tokens_seen": 773260288 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003867001003009027, + "loss": 2.928, + "theoretical_loss": 3.7417543166759515, + "tokens_seen": 773325824 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038669007021063193, + "loss": 2.9055, + "theoretical_loss": 3.741722691343898, + "tokens_seen": 773391360 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038668004012036106, + "loss": 3.0223, + "theoretical_loss": 3.741691069441907, + "tokens_seen": 773456896 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003866700100300903, + "loss": 2.9613, + "theoretical_loss": 3.7416594509693173, + "tokens_seen": 773522432 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003866599799398194, + "loss": 3.0026, + "theoretical_loss": 3.7416278359254656, + "tokens_seen": 773587968 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038664994984954866, + "loss": 2.9514, + "theoretical_loss": 3.74159622430969, + "tokens_seen": 773653504 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038663991975927784, + "loss": 3.0005, + "theoretical_loss": 3.741564616121328, + "tokens_seen": 773719040 + }, + { + "epoch": 9.01, + "learning_rate": 0.000386629889669007, + "loss": 2.8527, + "theoretical_loss": 3.741533011359718, + "tokens_seen": 773784576 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038661985957873625, + "loss": 2.9626, + "theoretical_loss": 3.741501410024199, + "tokens_seen": 773850112 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003866098294884654, + "loss": 3.0697, + "theoretical_loss": 3.741469812114108, + "tokens_seen": 773915648 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003865997993981946, + "loss": 2.9938, + "theoretical_loss": 3.741438217628785, + "tokens_seen": 773981184 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003865897693079238, + "loss": 3.0242, + "theoretical_loss": 3.741406626567568, + "tokens_seen": 774046720 + }, + { + "epoch": 9.01, + "learning_rate": 0.000386579739217653, + "loss": 3.0364, + "theoretical_loss": 3.741375038929797, + "tokens_seen": 774112256 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038656970912738216, + "loss": 3.0004, + "theoretical_loss": 3.7413434547148103, + "tokens_seen": 774177792 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003865596790371114, + "loss": 3.0615, + "theoretical_loss": 3.7413118739219478, + "tokens_seen": 774243328 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003865496489468405, + "loss": 2.8713, + "theoretical_loss": 3.741280296550549, + "tokens_seen": 774308864 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038653961885656976, + "loss": 3.0325, + "theoretical_loss": 3.7412487225999533, + "tokens_seen": 774374400 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003865295887662989, + "loss": 2.9344, + "theoretical_loss": 3.7412171520695012, + "tokens_seen": 774439936 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003865195586760281, + "loss": 2.9957, + "theoretical_loss": 3.741185584958533, + "tokens_seen": 774505472 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003865095285857573, + "loss": 2.9479, + "theoretical_loss": 3.741154021266388, + "tokens_seen": 774571008 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003864994984954865, + "loss": 2.9189, + "theoretical_loss": 3.7411224609924076, + "tokens_seen": 774636544 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038648946840521566, + "loss": 3.0073, + "theoretical_loss": 3.7410909041359326, + "tokens_seen": 774702080 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038647943831494484, + "loss": 2.922, + "theoretical_loss": 3.7410593506963035, + "tokens_seen": 774767616 + }, + { + "epoch": 9.01, + "learning_rate": 0.000386469408224674, + "loss": 3.0059, + "theoretical_loss": 3.7410278006728612, + "tokens_seen": 774833152 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1857292, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0774757862091064, + "objective/train/theoretical_loss": 3.7409962540649473, + "objective/train/tokens_used": 795358688, + "theoretical_loss": 3.7409962540649473, + "tokens_seen": 774898688 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038645937813440326, + "loss": 3.0073, + "theoretical_loss": 3.7409962540649473, + "tokens_seen": 774898688 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003864493480441324, + "loss": 3.0147, + "theoretical_loss": 3.740964710871903, + "tokens_seen": 774964224 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003864393179538616, + "loss": 2.9089, + "theoretical_loss": 3.7409331710930704, + "tokens_seen": 775029760 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038642928786359075, + "loss": 2.946, + "theoretical_loss": 3.7409016347277904, + "tokens_seen": 775095296 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038641925777332, + "loss": 3.0132, + "theoretical_loss": 3.7408701017754056, + "tokens_seen": 775160832 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038640922768304917, + "loss": 3.0059, + "theoretical_loss": 3.7408385722352584, + "tokens_seen": 775226368 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038639919759277835, + "loss": 2.9131, + "theoretical_loss": 3.7408070461066902, + "tokens_seen": 775291904 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038638916750250753, + "loss": 3.0148, + "theoretical_loss": 3.7407755233890443, + "tokens_seen": 775357440 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038637913741223676, + "loss": 2.8796, + "theoretical_loss": 3.7407440040816633, + "tokens_seen": 775422976 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003863691073219659, + "loss": 2.9548, + "theoretical_loss": 3.74071248818389, + "tokens_seen": 775488512 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003863590772316951, + "loss": 3.0169, + "theoretical_loss": 3.740680975695067, + "tokens_seen": 775554048 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038634904714142425, + "loss": 2.9907, + "theoretical_loss": 3.7406494666145385, + "tokens_seen": 775619584 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003863390170511535, + "loss": 3.072, + "theoretical_loss": 3.7406179609416474, + "tokens_seen": 775685120 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038632898696088267, + "loss": 2.9787, + "theoretical_loss": 3.7405864586757374, + "tokens_seen": 775750656 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038631895687061185, + "loss": 2.9943, + "theoretical_loss": 3.7405549598161523, + "tokens_seen": 775816192 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038630892678034103, + "loss": 2.9929, + "theoretical_loss": 3.7405234643622354, + "tokens_seen": 775881728 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003862988966900702, + "loss": 3.0627, + "theoretical_loss": 3.740491972313332, + "tokens_seen": 775947264 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003862888665997994, + "loss": 2.9415, + "theoretical_loss": 3.7404604836687856, + "tokens_seen": 776012800 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038627883650952863, + "loss": 2.916, + "theoretical_loss": 3.7404289984279417, + "tokens_seen": 776078336 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038626880641925775, + "loss": 2.9829, + "theoretical_loss": 3.7403975165901437, + "tokens_seen": 776143872 + }, + { + "epoch": 9.01, + "learning_rate": 0.000386258776328987, + "loss": 3.0392, + "theoretical_loss": 3.740366038154737, + "tokens_seen": 776209408 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003862487462387161, + "loss": 3.026, + "theoretical_loss": 3.7403345631210674, + "tokens_seen": 776274944 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038623871614844535, + "loss": 2.975, + "theoretical_loss": 3.740303091488479, + "tokens_seen": 776340480 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038622868605817453, + "loss": 2.9988, + "theoretical_loss": 3.7402716232563185, + "tokens_seen": 776406016 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003862186559679037, + "loss": 2.9105, + "theoretical_loss": 3.74024015842393, + "tokens_seen": 776471552 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1860986, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0180916786193848, + "objective/train/theoretical_loss": 3.7402086969906607, + "objective/train/tokens_used": 796997088, + "theoretical_loss": 3.7402086969906607, + "tokens_seen": 776537088 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003862086258776329, + "loss": 2.9674, + "theoretical_loss": 3.7402086969906607, + "tokens_seen": 776537088 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038619859578736213, + "loss": 3.0119, + "theoretical_loss": 3.740177238955855, + "tokens_seen": 776602624 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038618856569709126, + "loss": 2.9986, + "theoretical_loss": 3.740145784318861, + "tokens_seen": 776668160 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003861785356068205, + "loss": 2.9723, + "theoretical_loss": 3.7401143330790236, + "tokens_seen": 776733696 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003861685055165496, + "loss": 2.9914, + "theoretical_loss": 3.74008288523569, + "tokens_seen": 776799232 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038615847542627886, + "loss": 2.9298, + "theoretical_loss": 3.7400514407882066, + "tokens_seen": 776864768 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038614844533600804, + "loss": 3.0189, + "theoretical_loss": 3.7400199997359205, + "tokens_seen": 776930304 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003861384152457372, + "loss": 3.0588, + "theoretical_loss": 3.739988562078178, + "tokens_seen": 776995840 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003861283851554664, + "loss": 2.9091, + "theoretical_loss": 3.7399571278143275, + "tokens_seen": 777061376 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003861183550651956, + "loss": 2.9463, + "theoretical_loss": 3.7399256969437156, + "tokens_seen": 777126912 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038610832497492476, + "loss": 2.987, + "theoretical_loss": 3.7398942694656903, + "tokens_seen": 777192448 + }, + { + "epoch": 9.01, + "learning_rate": 0.000386098294884654, + "loss": 2.8874, + "theoretical_loss": 3.739862845379599, + "tokens_seen": 777257984 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003860882647943831, + "loss": 2.9441, + "theoretical_loss": 3.73983142468479, + "tokens_seen": 777323520 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038607823470411236, + "loss": 2.9661, + "theoretical_loss": 3.7398000073806115, + "tokens_seen": 777389056 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003860682046138415, + "loss": 3.083, + "theoretical_loss": 3.7397685934664118, + "tokens_seen": 777454592 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003860581745235707, + "loss": 2.9547, + "theoretical_loss": 3.7397371829415387, + "tokens_seen": 777520128 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003860481444332999, + "loss": 2.9638, + "theoretical_loss": 3.7397057758053416, + "tokens_seen": 777585664 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003860381143430291, + "loss": 2.9823, + "theoretical_loss": 3.7396743720571695, + "tokens_seen": 777651200 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038602808425275826, + "loss": 3.0468, + "theoretical_loss": 3.739642971696371, + "tokens_seen": 777716736 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003860180541624875, + "loss": 3.0195, + "theoretical_loss": 3.7396115747222956, + "tokens_seen": 777782272 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003860080240722166, + "loss": 3.0009, + "theoretical_loss": 3.739580181134292, + "tokens_seen": 777847808 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038599799398194586, + "loss": 3.0786, + "theoretical_loss": 3.739548790931711, + "tokens_seen": 777913344 + }, + { + "epoch": 9.01, + "learning_rate": 0.000385987963891675, + "loss": 3.02, + "theoretical_loss": 3.739517404113901, + "tokens_seen": 777978880 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003859779338014042, + "loss": 3.0324, + "theoretical_loss": 3.7394860206802134, + "tokens_seen": 778044416 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003859679037111334, + "loss": 3.0677, + "theoretical_loss": 3.7394546406299973, + "tokens_seen": 778109952 + }, + { + "epoch": 9.01, + "objective/train/docs_used": 1865974, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.745924711227417, + "objective/train/theoretical_loss": 3.7394232639626033, + "objective/train/tokens_used": 798635488, + "theoretical_loss": 3.7394232639626033, + "tokens_seen": 778175488 + }, + { + "epoch": 9.01, + "learning_rate": 0.0003859578736208626, + "loss": 2.9607, + "theoretical_loss": 3.7394232639626033, + "tokens_seen": 778175488 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038594784353059177, + "loss": 2.9969, + "theoretical_loss": 3.739391890677382, + "tokens_seen": 778241024 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038593781344032095, + "loss": 2.9203, + "theoretical_loss": 3.7393605207736833, + "tokens_seen": 778306560 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038592778335005013, + "loss": 3.0129, + "theoretical_loss": 3.7393291542508593, + "tokens_seen": 778372096 + }, + { + "epoch": 9.01, + "learning_rate": 0.00038591775325977937, + "loss": 2.9936, + "theoretical_loss": 3.7392977911082603, + "tokens_seen": 778437632 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003859077231695085, + "loss": 3.0576, + "theoretical_loss": 3.739266431345237, + "tokens_seen": 778503168 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038589769307923773, + "loss": 2.8634, + "theoretical_loss": 3.739235074961142, + "tokens_seen": 778568704 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038588766298896685, + "loss": 3.0768, + "theoretical_loss": 3.7392037219553256, + "tokens_seen": 778634240 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003858776328986961, + "loss": 2.9171, + "theoretical_loss": 3.7391723723271406, + "tokens_seen": 778699776 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003858676028084253, + "loss": 2.9597, + "theoretical_loss": 3.7391410260759383, + "tokens_seen": 778765312 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038585757271815445, + "loss": 3.0272, + "theoretical_loss": 3.739109683201071, + "tokens_seen": 778830848 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003858475426278837, + "loss": 2.9632, + "theoretical_loss": 3.7390783437018906, + "tokens_seen": 778896384 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038583751253761287, + "loss": 2.9386, + "theoretical_loss": 3.73904700757775, + "tokens_seen": 778961920 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038582748244734205, + "loss": 3.0235, + "theoretical_loss": 3.7390156748280017, + "tokens_seen": 779027456 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038581745235707123, + "loss": 3.0535, + "theoretical_loss": 3.738984345451999, + "tokens_seen": 779092992 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003858074222668004, + "loss": 3.0664, + "theoretical_loss": 3.7389530194490934, + "tokens_seen": 779158528 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003857973921765296, + "loss": 3.1024, + "theoretical_loss": 3.7389216968186396, + "tokens_seen": 779224064 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038578736208625883, + "loss": 2.999, + "theoretical_loss": 3.738890377559991, + "tokens_seen": 779289600 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038577733199598796, + "loss": 3.0912, + "theoretical_loss": 3.7388590616724997, + "tokens_seen": 779355136 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003857673019057172, + "loss": 2.9852, + "theoretical_loss": 3.73882774915552, + "tokens_seen": 779420672 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003857572718154463, + "loss": 2.9964, + "theoretical_loss": 3.7387964400084064, + "tokens_seen": 779486208 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038574724172517555, + "loss": 3.0065, + "theoretical_loss": 3.738765134230512, + "tokens_seen": 779551744 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038573721163490473, + "loss": 2.9949, + "theoretical_loss": 3.7387338318211922, + "tokens_seen": 779617280 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003857271815446339, + "loss": 2.9449, + "theoretical_loss": 3.7387025327798007, + "tokens_seen": 779682816 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003857171514543631, + "loss": 2.8778, + "theoretical_loss": 3.7386712371056916, + "tokens_seen": 779748352 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1868730, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9139211177825928, + "objective/train/theoretical_loss": 3.7386399447982206, + "objective/train/tokens_used": 800273888, + "theoretical_loss": 3.7386399447982206, + "tokens_seen": 779813888 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038570712136409233, + "loss": 2.8248, + "theoretical_loss": 3.7386399447982206, + "tokens_seen": 779813888 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038569709127382146, + "loss": 3.0, + "theoretical_loss": 3.7386086558567424, + "tokens_seen": 779879424 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003856870611835507, + "loss": 2.9497, + "theoretical_loss": 3.7385773702806118, + "tokens_seen": 779944960 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003856770310932798, + "loss": 3.0164, + "theoretical_loss": 3.7385460880691843, + "tokens_seen": 780010496 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038566700100300906, + "loss": 2.9515, + "theoretical_loss": 3.7385148092218152, + "tokens_seen": 780076032 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038565697091273824, + "loss": 3.0093, + "theoretical_loss": 3.7384835337378606, + "tokens_seen": 780141568 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003856469408224674, + "loss": 2.9619, + "theoretical_loss": 3.738452261616676, + "tokens_seen": 780207104 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003856369107321966, + "loss": 2.9768, + "theoretical_loss": 3.7384209928576175, + "tokens_seen": 780272640 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003856268806419258, + "loss": 2.9781, + "theoretical_loss": 3.7383897274600413, + "tokens_seen": 780338176 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038561685055165496, + "loss": 2.9376, + "theoretical_loss": 3.7383584654233033, + "tokens_seen": 780403712 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003856068204613842, + "loss": 3.0112, + "theoretical_loss": 3.738327206746761, + "tokens_seen": 780469248 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003855967903711133, + "loss": 3.0082, + "theoretical_loss": 3.7382959514297704, + "tokens_seen": 780534784 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038558676028084256, + "loss": 3.0549, + "theoretical_loss": 3.7382646994716886, + "tokens_seen": 780600320 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003855767301905717, + "loss": 3.036, + "theoretical_loss": 3.7382334508718724, + "tokens_seen": 780665856 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003855667001003009, + "loss": 2.9219, + "theoretical_loss": 3.7382022056296798, + "tokens_seen": 780731392 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003855566700100301, + "loss": 3.0022, + "theoretical_loss": 3.7381709637444676, + "tokens_seen": 780796928 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003855466399197593, + "loss": 2.9884, + "theoretical_loss": 3.7381397252155932, + "tokens_seen": 780862464 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038553660982948846, + "loss": 2.9618, + "theoretical_loss": 3.738108490042415, + "tokens_seen": 780928000 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003855265797392177, + "loss": 3.0481, + "theoretical_loss": 3.7380772582242905, + "tokens_seen": 780993536 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003855165496489468, + "loss": 2.9406, + "theoretical_loss": 3.7380460297605786, + "tokens_seen": 781059072 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038550651955867606, + "loss": 2.9712, + "theoretical_loss": 3.7380148046506365, + "tokens_seen": 781124608 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003854964894684052, + "loss": 2.8934, + "theoretical_loss": 3.737983582893823, + "tokens_seen": 781190144 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003854864593781344, + "loss": 2.9578, + "theoretical_loss": 3.737952364489497, + "tokens_seen": 781255680 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003854764292878636, + "loss": 3.05, + "theoretical_loss": 3.737921149437018, + "tokens_seen": 781321216 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003854663991975928, + "loss": 2.9855, + "theoretical_loss": 3.737889937735744, + "tokens_seen": 781386752 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1873751, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.873283863067627, + "objective/train/theoretical_loss": 3.7378587293850343, + "objective/train/tokens_used": 801912288, + "theoretical_loss": 3.7378587293850343, + "tokens_seen": 781452288 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038545636910732197, + "loss": 2.9703, + "theoretical_loss": 3.7378587293850343, + "tokens_seen": 781452288 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038544633901705115, + "loss": 3.0221, + "theoretical_loss": 3.7378275243842487, + "tokens_seen": 781517824 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038543630892678033, + "loss": 2.985, + "theoretical_loss": 3.7377963227327466, + "tokens_seen": 781583360 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038542627883650957, + "loss": 3.0741, + "theoretical_loss": 3.7377651244298873, + "tokens_seen": 781648896 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003854162487462387, + "loss": 2.9646, + "theoretical_loss": 3.7377339294750316, + "tokens_seen": 781714432 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038540621865596793, + "loss": 3.0365, + "theoretical_loss": 3.7377027378675387, + "tokens_seen": 781779968 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038539618856569705, + "loss": 2.9959, + "theoretical_loss": 3.7376715496067696, + "tokens_seen": 781845504 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003853861584754263, + "loss": 2.9739, + "theoretical_loss": 3.7376403646920835, + "tokens_seen": 781911040 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038537612838515547, + "loss": 2.9895, + "theoretical_loss": 3.7376091831228426, + "tokens_seen": 781976576 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038536609829488465, + "loss": 3.0377, + "theoretical_loss": 3.737578004898407, + "tokens_seen": 782042112 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038535606820461383, + "loss": 2.9485, + "theoretical_loss": 3.737546830018137, + "tokens_seen": 782107648 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038534603811434307, + "loss": 2.7354, + "theoretical_loss": 3.737515658481395, + "tokens_seen": 782173184 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003853360080240722, + "loss": 2.8816, + "theoretical_loss": 3.7374844902875406, + "tokens_seen": 782238720 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038532597793380143, + "loss": 2.9987, + "theoretical_loss": 3.7374533254359372, + "tokens_seen": 782304256 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038531594784353056, + "loss": 2.9744, + "theoretical_loss": 3.737422163925945, + "tokens_seen": 782369792 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003853059177532598, + "loss": 3.0554, + "theoretical_loss": 3.737391005756926, + "tokens_seen": 782435328 + }, + { + "epoch": 9.02, + "learning_rate": 0.000385295887662989, + "loss": 2.8893, + "theoretical_loss": 3.7373598509282435, + "tokens_seen": 782500864 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038528585757271816, + "loss": 2.967, + "theoretical_loss": 3.7373286994392583, + "tokens_seen": 782566400 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038527582748244734, + "loss": 3.0253, + "theoretical_loss": 3.737297551289333, + "tokens_seen": 782631936 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003852657973921765, + "loss": 3.0551, + "theoretical_loss": 3.737266406477831, + "tokens_seen": 782697472 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003852557673019057, + "loss": 2.9717, + "theoretical_loss": 3.737235265004114, + "tokens_seen": 782763008 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038524573721163493, + "loss": 3.0098, + "theoretical_loss": 3.737204126867545, + "tokens_seen": 782828544 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038523570712136406, + "loss": 2.9982, + "theoretical_loss": 3.7371729920674874, + "tokens_seen": 782894080 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003852256770310933, + "loss": 2.9359, + "theoretical_loss": 3.7371418606033044, + "tokens_seen": 782959616 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003852156469408225, + "loss": 3.0479, + "theoretical_loss": 3.7371107324743593, + "tokens_seen": 783025152 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1876674, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0206878185272217, + "objective/train/theoretical_loss": 3.7370796076800152, + "objective/train/tokens_used": 803550688, + "theoretical_loss": 3.7370796076800152, + "tokens_seen": 783090688 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038520561685055166, + "loss": 2.9348, + "theoretical_loss": 3.7370796076800152, + "tokens_seen": 783090688 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038519558676028084, + "loss": 2.9805, + "theoretical_loss": 3.7370484862196376, + "tokens_seen": 783156224 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038518555667001, + "loss": 2.9692, + "theoretical_loss": 3.737017368092588, + "tokens_seen": 783221760 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003851755265797392, + "loss": 2.9361, + "theoretical_loss": 3.7369862532982325, + "tokens_seen": 783287296 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038516549648946844, + "loss": 2.9885, + "theoretical_loss": 3.736955141835934, + "tokens_seen": 783352832 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038515546639919756, + "loss": 2.9964, + "theoretical_loss": 3.736924033705058, + "tokens_seen": 783418368 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003851454363089268, + "loss": 2.9608, + "theoretical_loss": 3.736892928904968, + "tokens_seen": 783483904 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003851354062186559, + "loss": 2.8442, + "theoretical_loss": 3.73686182743503, + "tokens_seen": 783549440 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038512537612838516, + "loss": 2.8974, + "theoretical_loss": 3.7368307292946086, + "tokens_seen": 783614976 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003851153460381144, + "loss": 2.8091, + "theoretical_loss": 3.7367996344830687, + "tokens_seen": 783680512 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003851053159478435, + "loss": 2.9295, + "theoretical_loss": 3.7367685429997755, + "tokens_seen": 783746048 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038509528585757276, + "loss": 3.001, + "theoretical_loss": 3.7367374548440946, + "tokens_seen": 783811584 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003850852557673019, + "loss": 3.0158, + "theoretical_loss": 3.7367063700153924, + "tokens_seen": 783877120 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003850752256770311, + "loss": 3.0025, + "theoretical_loss": 3.736675288513034, + "tokens_seen": 783942656 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003850651955867603, + "loss": 2.9554, + "theoretical_loss": 3.736644210336385, + "tokens_seen": 784008192 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003850551654964895, + "loss": 2.9991, + "theoretical_loss": 3.7366131354848124, + "tokens_seen": 784073728 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038504513540621866, + "loss": 3.0025, + "theoretical_loss": 3.7365820639576826, + "tokens_seen": 784139264 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003850351053159479, + "loss": 3.0371, + "theoretical_loss": 3.736550995754362, + "tokens_seen": 784204800 + }, + { + "epoch": 9.02, + "learning_rate": 0.000385025075225677, + "loss": 3.1073, + "theoretical_loss": 3.7365199308742167, + "tokens_seen": 784270336 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038501504513540626, + "loss": 2.9857, + "theoretical_loss": 3.7364888693166143, + "tokens_seen": 784335872 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003850050150451354, + "loss": 2.9662, + "theoretical_loss": 3.7364578110809212, + "tokens_seen": 784401408 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003849949849548646, + "loss": 3.0404, + "theoretical_loss": 3.736426756166506, + "tokens_seen": 784466944 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003849849548645938, + "loss": 2.9405, + "theoretical_loss": 3.7363957045727343, + "tokens_seen": 784532480 + }, + { + "epoch": 9.02, + "learning_rate": 0.000384974924774323, + "loss": 2.9943, + "theoretical_loss": 3.736364656298975, + "tokens_seen": 784598016 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038496489468405217, + "loss": 2.958, + "theoretical_loss": 3.7363336113445955, + "tokens_seen": 784663552 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1880534, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.77963924407959, + "objective/train/theoretical_loss": 3.736302569708964, + "objective/train/tokens_used": 805189088, + "theoretical_loss": 3.736302569708964, + "tokens_seen": 784729088 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038495486459378135, + "loss": 2.9268, + "theoretical_loss": 3.736302569708964, + "tokens_seen": 784729088 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038494483450351053, + "loss": 2.9844, + "theoretical_loss": 3.7362715313914476, + "tokens_seen": 784794624 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038493480441323977, + "loss": 2.964, + "theoretical_loss": 3.7362404963914155, + "tokens_seen": 784860160 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003849247743229689, + "loss": 2.8855, + "theoretical_loss": 3.736209464708236, + "tokens_seen": 784925696 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038491474423269813, + "loss": 2.9017, + "theoretical_loss": 3.7361784363412776, + "tokens_seen": 784991232 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038490471414242725, + "loss": 3.0441, + "theoretical_loss": 3.7361474112899087, + "tokens_seen": 785056768 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003848946840521565, + "loss": 2.9499, + "theoretical_loss": 3.736116389553499, + "tokens_seen": 785122304 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038488465396188567, + "loss": 2.9969, + "theoretical_loss": 3.7360853711314173, + "tokens_seen": 785187840 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038487462387161485, + "loss": 2.9558, + "theoretical_loss": 3.736054356023033, + "tokens_seen": 785253376 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038486459378134403, + "loss": 2.9064, + "theoretical_loss": 3.7360233442277155, + "tokens_seen": 785318912 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038485456369107327, + "loss": 2.9979, + "theoretical_loss": 3.7359923357448346, + "tokens_seen": 785384448 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003848445336008024, + "loss": 3.0188, + "theoretical_loss": 3.73596133057376, + "tokens_seen": 785449984 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038483450351053163, + "loss": 2.9597, + "theoretical_loss": 3.735930328713861, + "tokens_seen": 785515520 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038482447342026076, + "loss": 2.8763, + "theoretical_loss": 3.73589933016451, + "tokens_seen": 785581056 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038481444332999, + "loss": 2.9445, + "theoretical_loss": 3.7358683349250743, + "tokens_seen": 785646592 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003848044132397192, + "loss": 2.9753, + "theoretical_loss": 3.7358373429949268, + "tokens_seen": 785712128 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038479438314944836, + "loss": 2.9569, + "theoretical_loss": 3.735806354373437, + "tokens_seen": 785777664 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038478435305917754, + "loss": 2.9778, + "theoretical_loss": 3.735775369059976, + "tokens_seen": 785843200 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003847743229689067, + "loss": 3.023, + "theoretical_loss": 3.7357443870539155, + "tokens_seen": 785908736 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003847642928786359, + "loss": 3.0271, + "theoretical_loss": 3.7357134083546257, + "tokens_seen": 785974272 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038475426278836513, + "loss": 2.7941, + "theoretical_loss": 3.735682432961479, + "tokens_seen": 786039808 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038474423269809426, + "loss": 2.9026, + "theoretical_loss": 3.7356514608738456, + "tokens_seen": 786105344 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003847342026078235, + "loss": 2.8681, + "theoretical_loss": 3.7356204920910985, + "tokens_seen": 786170880 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003847241725175527, + "loss": 2.923, + "theoretical_loss": 3.7355895266126087, + "tokens_seen": 786236416 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038471414242728186, + "loss": 2.988, + "theoretical_loss": 3.735558564437749, + "tokens_seen": 786301952 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1885189, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.056809663772583, + "objective/train/theoretical_loss": 3.735527605565892, + "objective/train/tokens_used": 806827488, + "theoretical_loss": 3.735527605565892, + "tokens_seen": 786367488 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038470411233701104, + "loss": 2.973, + "theoretical_loss": 3.735527605565892, + "tokens_seen": 786367488 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003846940822467402, + "loss": 3.0214, + "theoretical_loss": 3.735496649996408, + "tokens_seen": 786433024 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003846840521564694, + "loss": 2.9763, + "theoretical_loss": 3.735465697728672, + "tokens_seen": 786498560 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038467402206619864, + "loss": 3.001, + "theoretical_loss": 3.7354347487620556, + "tokens_seen": 786564096 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038466399197592776, + "loss": 3.1185, + "theoretical_loss": 3.7354038030959322, + "tokens_seen": 786629632 + }, + { + "epoch": 9.02, + "learning_rate": 0.000384653961885657, + "loss": 3.0545, + "theoretical_loss": 3.735372860729674, + "tokens_seen": 786695168 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003846439317953861, + "loss": 3.0503, + "theoretical_loss": 3.7353419216626556, + "tokens_seen": 786760704 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038463390170511536, + "loss": 2.9479, + "theoretical_loss": 3.735310985894249, + "tokens_seen": 786826240 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038462387161484454, + "loss": 2.9968, + "theoretical_loss": 3.735280053423829, + "tokens_seen": 786891776 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003846138415245737, + "loss": 2.9531, + "theoretical_loss": 3.7352491242507684, + "tokens_seen": 786957312 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003846038114343029, + "loss": 3.0184, + "theoretical_loss": 3.735218198374442, + "tokens_seen": 787022848 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003845937813440321, + "loss": 3.0917, + "theoretical_loss": 3.7351872757942237, + "tokens_seen": 787088384 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038458375125376127, + "loss": 2.842, + "theoretical_loss": 3.7351563565094876, + "tokens_seen": 787153920 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003845737211634905, + "loss": 3.0213, + "theoretical_loss": 3.735125440519608, + "tokens_seen": 787219456 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038456369107321963, + "loss": 2.9816, + "theoretical_loss": 3.73509452782396, + "tokens_seen": 787284992 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038455366098294886, + "loss": 2.9514, + "theoretical_loss": 3.735063618421918, + "tokens_seen": 787350528 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038454363089267805, + "loss": 3.0643, + "theoretical_loss": 3.735032712312857, + "tokens_seen": 787416064 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038453360080240723, + "loss": 3.0224, + "theoretical_loss": 3.735001809496153, + "tokens_seen": 787481600 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003845235707121364, + "loss": 2.855, + "theoretical_loss": 3.73497090997118, + "tokens_seen": 787547136 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003845135406218656, + "loss": 2.9884, + "theoretical_loss": 3.7349400137373143, + "tokens_seen": 787612672 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038450351053159477, + "loss": 2.9611, + "theoretical_loss": 3.7349091207939313, + "tokens_seen": 787678208 + }, + { + "epoch": 9.02, + "learning_rate": 0.000384493480441324, + "loss": 2.9761, + "theoretical_loss": 3.7348782311404074, + "tokens_seen": 787743744 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038448345035105313, + "loss": 2.9614, + "theoretical_loss": 3.7348473447761172, + "tokens_seen": 787809280 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038447342026078237, + "loss": 3.0272, + "theoretical_loss": 3.7348164617004387, + "tokens_seen": 787874816 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003844633901705115, + "loss": 2.9187, + "theoretical_loss": 3.734785581912747, + "tokens_seen": 787940352 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1887955, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9412989616394043, + "objective/train/theoretical_loss": 3.734754705412419, + "objective/train/tokens_used": 808465888, + "theoretical_loss": 3.734754705412419, + "tokens_seen": 788005888 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038445336008024073, + "loss": 2.9338, + "theoretical_loss": 3.734754705412419, + "tokens_seen": 788005888 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003844433299899699, + "loss": 2.9364, + "theoretical_loss": 3.734723832198831, + "tokens_seen": 788071424 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003844332998996991, + "loss": 2.9998, + "theoretical_loss": 3.7346929622713603, + "tokens_seen": 788136960 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003844232698094283, + "loss": 3.0301, + "theoretical_loss": 3.7346620956293837, + "tokens_seen": 788202496 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038441323971915745, + "loss": 3.0872, + "theoretical_loss": 3.734631232272279, + "tokens_seen": 788268032 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038440320962888664, + "loss": 2.9947, + "theoretical_loss": 3.7346003721994228, + "tokens_seen": 788333568 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038439317953861587, + "loss": 2.9939, + "theoretical_loss": 3.7345695154101923, + "tokens_seen": 788399104 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038438314944834505, + "loss": 2.9429, + "theoretical_loss": 3.7345386619039664, + "tokens_seen": 788464640 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038437311935807423, + "loss": 2.9932, + "theoretical_loss": 3.7345078116801225, + "tokens_seen": 788530176 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038436308926780347, + "loss": 2.9373, + "theoretical_loss": 3.734476964738038, + "tokens_seen": 788595712 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003843530591775326, + "loss": 2.9963, + "theoretical_loss": 3.7344461210770916, + "tokens_seen": 788661248 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038434302908726183, + "loss": 2.9596, + "theoretical_loss": 3.7344152806966617, + "tokens_seen": 788726784 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038433299899699096, + "loss": 3.0454, + "theoretical_loss": 3.7343844435961273, + "tokens_seen": 788792320 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003843229689067202, + "loss": 2.9582, + "theoretical_loss": 3.734353609774866, + "tokens_seen": 788857856 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003843129388164494, + "loss": 3.0314, + "theoretical_loss": 3.734322779232258, + "tokens_seen": 788923392 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038430290872617856, + "loss": 2.9775, + "theoretical_loss": 3.7342919519676814, + "tokens_seen": 788988928 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038429287863590774, + "loss": 2.9225, + "theoretical_loss": 3.734261127980515, + "tokens_seen": 789054464 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003842828485456369, + "loss": 3.0295, + "theoretical_loss": 3.73423030727014, + "tokens_seen": 789120000 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003842728184553661, + "loss": 3.0244, + "theoretical_loss": 3.7341994898359343, + "tokens_seen": 789185536 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038426278836509533, + "loss": 2.9922, + "theoretical_loss": 3.734168675677278, + "tokens_seen": 789251072 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038425275827482446, + "loss": 2.9131, + "theoretical_loss": 3.734137864793552, + "tokens_seen": 789316608 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003842427281845537, + "loss": 3.0041, + "theoretical_loss": 3.734107057184135, + "tokens_seen": 789382144 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003842326980942829, + "loss": 2.9791, + "theoretical_loss": 3.734076252848408, + "tokens_seen": 789447680 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038422266800401206, + "loss": 2.9249, + "theoretical_loss": 3.734045451785751, + "tokens_seen": 789513216 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038421263791374124, + "loss": 3.0369, + "theoretical_loss": 3.734014653995545, + "tokens_seen": 789578752 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0576727390289307, + "objective/train/theoretical_loss": 3.7339838594771706, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.7339838594771706, + "tokens_seen": 789644288 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003842026078234704, + "loss": 3.0132, + "theoretical_loss": 3.7339838594771706, + "tokens_seen": 789644288 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003841925777331996, + "loss": 3.0309, + "theoretical_loss": 3.733953068230008, + "tokens_seen": 789709824 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038418254764292884, + "loss": 3.046, + "theoretical_loss": 3.7339222802534398, + "tokens_seen": 789775360 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038417251755265796, + "loss": 2.9128, + "theoretical_loss": 3.733891495546846, + "tokens_seen": 789840896 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003841624874623872, + "loss": 3.048, + "theoretical_loss": 3.7338607141096083, + "tokens_seen": 789906432 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003841524573721163, + "loss": 2.9285, + "theoretical_loss": 3.7338299359411087, + "tokens_seen": 789971968 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038414242728184556, + "loss": 3.0041, + "theoretical_loss": 3.7337991610407286, + "tokens_seen": 790037504 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038413239719157474, + "loss": 2.9521, + "theoretical_loss": 3.7337683894078495, + "tokens_seen": 790103040 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003841223671013039, + "loss": 3.0415, + "theoretical_loss": 3.733737621041854, + "tokens_seen": 790168576 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003841123370110331, + "loss": 3.011, + "theoretical_loss": 3.733706855942125, + "tokens_seen": 790234112 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003841023069207623, + "loss": 2.8192, + "theoretical_loss": 3.733676094108044, + "tokens_seen": 790299648 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038409227683049147, + "loss": 3.0344, + "theoretical_loss": 3.7336453355389936, + "tokens_seen": 790365184 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003840822467402207, + "loss": 2.9773, + "theoretical_loss": 3.7336145802343568, + "tokens_seen": 790430720 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038407221664994983, + "loss": 2.9841, + "theoretical_loss": 3.7335838281935168, + "tokens_seen": 790496256 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038406218655967907, + "loss": 3.0843, + "theoretical_loss": 3.7335530794158567, + "tokens_seen": 790561792 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038405215646940825, + "loss": 3.0519, + "theoretical_loss": 3.733522333900759, + "tokens_seen": 790627328 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038404212637913743, + "loss": 3.0126, + "theoretical_loss": 3.733491591647608, + "tokens_seen": 790692864 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003840320962888666, + "loss": 3.0357, + "theoretical_loss": 3.733460852655787, + "tokens_seen": 790758400 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003840220661985958, + "loss": 2.961, + "theoretical_loss": 3.733430116924679, + "tokens_seen": 790823936 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038401203610832497, + "loss": 3.1031, + "theoretical_loss": 3.7333993844536693, + "tokens_seen": 790889472 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003840020060180542, + "loss": 2.9435, + "theoretical_loss": 3.733368655242141, + "tokens_seen": 790955008 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038399197592778333, + "loss": 3.0006, + "theoretical_loss": 3.7333379292894793, + "tokens_seen": 791020544 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038398194583751257, + "loss": 2.9654, + "theoretical_loss": 3.7333072065950677, + "tokens_seen": 791086080 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003839719157472417, + "loss": 2.9914, + "theoretical_loss": 3.7332764871582906, + "tokens_seen": 791151616 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038396188565697093, + "loss": 3.0228, + "theoretical_loss": 3.7332457709785345, + "tokens_seen": 791217152 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.988478660583496, + "objective/train/theoretical_loss": 3.7332150580551824, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.7332150580551824, + "tokens_seen": 791282688 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003839518555667001, + "loss": 3.0166, + "theoretical_loss": 3.7332150580551824, + "tokens_seen": 791282688 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003839418254764293, + "loss": 3.008, + "theoretical_loss": 3.7331843483876206, + "tokens_seen": 791348224 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003839317953861585, + "loss": 2.9542, + "theoretical_loss": 3.7331536419752336, + "tokens_seen": 791413760 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038392176529588765, + "loss": 2.969, + "theoretical_loss": 3.7331229388174076, + "tokens_seen": 791479296 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038391173520561684, + "loss": 3.0131, + "theoretical_loss": 3.733092238913528, + "tokens_seen": 791544832 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038390170511534607, + "loss": 3.0748, + "theoretical_loss": 3.73306154226298, + "tokens_seen": 791610368 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003838916750250752, + "loss": 3.0496, + "theoretical_loss": 3.73303084886515, + "tokens_seen": 791675904 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038388164493480443, + "loss": 3.0307, + "theoretical_loss": 3.7330001587194244, + "tokens_seen": 791741440 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003838716148445336, + "loss": 2.8484, + "theoretical_loss": 3.732969471825189, + "tokens_seen": 791806976 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003838615847542628, + "loss": 3.0296, + "theoretical_loss": 3.73293878818183, + "tokens_seen": 791872512 + }, + { + "epoch": 9.02, + "learning_rate": 0.000383851554663992, + "loss": 2.9665, + "theoretical_loss": 3.7329081077887345, + "tokens_seen": 791938048 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038384152457372116, + "loss": 3.0157, + "theoretical_loss": 3.7328774306452894, + "tokens_seen": 792003584 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038383149448345034, + "loss": 2.9835, + "theoretical_loss": 3.732846756750882, + "tokens_seen": 792069120 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003838214643931796, + "loss": 2.9687, + "theoretical_loss": 3.7328160861048976, + "tokens_seen": 792134656 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003838114343029087, + "loss": 3.0298, + "theoretical_loss": 3.732785418706725, + "tokens_seen": 792200192 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038380140421263794, + "loss": 2.9458, + "theoretical_loss": 3.732754754555752, + "tokens_seen": 792265728 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038379137412236706, + "loss": 3.0124, + "theoretical_loss": 3.732724093651365, + "tokens_seen": 792331264 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003837813440320963, + "loss": 2.9351, + "theoretical_loss": 3.7326934359929527, + "tokens_seen": 792396800 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003837713139418255, + "loss": 2.9198, + "theoretical_loss": 3.7326627815799025, + "tokens_seen": 792462336 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038376128385155466, + "loss": 2.9231, + "theoretical_loss": 3.732632130411602, + "tokens_seen": 792527872 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038375125376128384, + "loss": 2.8346, + "theoretical_loss": 3.732601482487441, + "tokens_seen": 792593408 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003837412236710131, + "loss": 3.0067, + "theoretical_loss": 3.732570837806807, + "tokens_seen": 792658944 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003837311935807422, + "loss": 2.9477, + "theoretical_loss": 3.7325401963690883, + "tokens_seen": 792724480 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038372116349047144, + "loss": 3.063, + "theoretical_loss": 3.7325095581736742, + "tokens_seen": 792790016 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038371113340020057, + "loss": 2.9893, + "theoretical_loss": 3.7324789232199533, + "tokens_seen": 792855552 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.793391227722168, + "objective/train/theoretical_loss": 3.7324482915073154, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.7324482915073154, + "tokens_seen": 792921088 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003837011033099298, + "loss": 2.981, + "theoretical_loss": 3.7324482915073154, + "tokens_seen": 792921088 + }, + { + "epoch": 9.02, + "learning_rate": 0.000383691073219659, + "loss": 3.0861, + "theoretical_loss": 3.7324176630351493, + "tokens_seen": 792986624 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038368104312938816, + "loss": 2.9411, + "theoretical_loss": 3.7323870378028436, + "tokens_seen": 793052160 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038367101303911735, + "loss": 3.0245, + "theoretical_loss": 3.7323564158097895, + "tokens_seen": 793117696 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003836609829488465, + "loss": 2.9175, + "theoretical_loss": 3.732325797055375, + "tokens_seen": 793183232 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003836509528585757, + "loss": 3.0353, + "theoretical_loss": 3.7322951815389915, + "tokens_seen": 793248768 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038364092276830494, + "loss": 2.9975, + "theoretical_loss": 3.7322645692600287, + "tokens_seen": 793314304 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003836308926780341, + "loss": 3.0403, + "theoretical_loss": 3.7322339602178767, + "tokens_seen": 793379840 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003836208625877633, + "loss": 2.9304, + "theoretical_loss": 3.7322033544119257, + "tokens_seen": 793445376 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003836108324974925, + "loss": 3.0639, + "theoretical_loss": 3.7321727518415666, + "tokens_seen": 793510912 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038360080240722167, + "loss": 2.9736, + "theoretical_loss": 3.7321421525061904, + "tokens_seen": 793576448 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003835907723169509, + "loss": 2.9217, + "theoretical_loss": 3.7321115564051874, + "tokens_seen": 793641984 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038358074222668003, + "loss": 2.9547, + "theoretical_loss": 3.732080963537949, + "tokens_seen": 793707520 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038357071213640927, + "loss": 2.9801, + "theoretical_loss": 3.7320503739038666, + "tokens_seen": 793773056 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038356068204613845, + "loss": 3.0481, + "theoretical_loss": 3.732019787502331, + "tokens_seen": 793838592 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038355065195586763, + "loss": 3.0085, + "theoretical_loss": 3.7319892043327347, + "tokens_seen": 793904128 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003835406218655968, + "loss": 2.9306, + "theoretical_loss": 3.731958624394469, + "tokens_seen": 793969664 + }, + { + "epoch": 9.02, + "learning_rate": 0.000383530591775326, + "loss": 2.9756, + "theoretical_loss": 3.7319280476869254, + "tokens_seen": 794035200 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038352056168505517, + "loss": 2.9995, + "theoretical_loss": 3.7318974742094966, + "tokens_seen": 794100736 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003835105315947844, + "loss": 3.0837, + "theoretical_loss": 3.7318669039615746, + "tokens_seen": 794166272 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038350050150451353, + "loss": 3.0727, + "theoretical_loss": 3.7318363369425516, + "tokens_seen": 794231808 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038349047141424277, + "loss": 2.9521, + "theoretical_loss": 3.731805773151821, + "tokens_seen": 794297344 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003834804413239719, + "loss": 3.0401, + "theoretical_loss": 3.731775212588774, + "tokens_seen": 794362880 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038347041123370113, + "loss": 2.9684, + "theoretical_loss": 3.731744655252805, + "tokens_seen": 794428416 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003834603811434303, + "loss": 2.9977, + "theoretical_loss": 3.7317141011433064, + "tokens_seen": 794493952 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8127682209014893, + "objective/train/theoretical_loss": 3.731683550259671, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.731683550259671, + "tokens_seen": 794559488 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003834503510531595, + "loss": 2.9525, + "theoretical_loss": 3.731683550259671, + "tokens_seen": 794559488 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003834403209628887, + "loss": 3.026, + "theoretical_loss": 3.731653002601293, + "tokens_seen": 794625024 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038343029087261786, + "loss": 2.9021, + "theoretical_loss": 3.731622458167566, + "tokens_seen": 794690560 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038342026078234704, + "loss": 3.0285, + "theoretical_loss": 3.731591916957883, + "tokens_seen": 794756096 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038341023069207627, + "loss": 3.034, + "theoretical_loss": 3.7315613789716373, + "tokens_seen": 794821632 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003834002006018054, + "loss": 3.0239, + "theoretical_loss": 3.731530844208225, + "tokens_seen": 794887168 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038339017051153463, + "loss": 3.044, + "theoretical_loss": 3.7315003126670385, + "tokens_seen": 794952704 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003833801404212638, + "loss": 2.9991, + "theoretical_loss": 3.731469784347473, + "tokens_seen": 795018240 + }, + { + "epoch": 9.02, + "learning_rate": 0.000383370110330993, + "loss": 2.9188, + "theoretical_loss": 3.731439259248923, + "tokens_seen": 795083776 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003833600802407222, + "loss": 2.955, + "theoretical_loss": 3.731408737370783, + "tokens_seen": 795149312 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038335005015045136, + "loss": 2.9735, + "theoretical_loss": 3.7313782187124485, + "tokens_seen": 795214848 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038334002006018054, + "loss": 2.8808, + "theoretical_loss": 3.731347703273313, + "tokens_seen": 795280384 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003833299899699098, + "loss": 3.0358, + "theoretical_loss": 3.731317191052773, + "tokens_seen": 795345920 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003833199598796389, + "loss": 3.0118, + "theoretical_loss": 3.7312866820502233, + "tokens_seen": 795411456 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038330992978936814, + "loss": 3.0093, + "theoretical_loss": 3.73125617626506, + "tokens_seen": 795476992 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038329989969909726, + "loss": 3.0302, + "theoretical_loss": 3.7312256736966782, + "tokens_seen": 795542528 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003832898696088265, + "loss": 2.8186, + "theoretical_loss": 3.731195174344474, + "tokens_seen": 795608064 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003832798395185557, + "loss": 2.9207, + "theoretical_loss": 3.7311646782078434, + "tokens_seen": 795673600 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038326980942828486, + "loss": 3.0145, + "theoretical_loss": 3.731134185286182, + "tokens_seen": 795739136 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038325977933801404, + "loss": 2.9139, + "theoretical_loss": 3.731103695578887, + "tokens_seen": 795804672 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003832497492477433, + "loss": 3.0653, + "theoretical_loss": 3.7310732090853547, + "tokens_seen": 795870208 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003832397191574724, + "loss": 2.9341, + "theoretical_loss": 3.731042725804981, + "tokens_seen": 795935744 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038322968906720164, + "loss": 2.9885, + "theoretical_loss": 3.7310122457371637, + "tokens_seen": 796001280 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038321965897693077, + "loss": 3.0651, + "theoretical_loss": 3.7309817688813, + "tokens_seen": 796066816 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038320962888666, + "loss": 3.0922, + "theoretical_loss": 3.7309512952367854, + "tokens_seen": 796132352 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.72015118598938, + "objective/train/theoretical_loss": 3.7309208248030186, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.7309208248030186, + "tokens_seen": 796197888 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003831995987963892, + "loss": 2.8475, + "theoretical_loss": 3.7309208248030186, + "tokens_seen": 796197888 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038318956870611836, + "loss": 2.9936, + "theoretical_loss": 3.730890357579397, + "tokens_seen": 796263424 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038317953861584755, + "loss": 3.0054, + "theoretical_loss": 3.7308598935653174, + "tokens_seen": 796328960 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003831695085255767, + "loss": 3.0661, + "theoretical_loss": 3.7308294327601788, + "tokens_seen": 796394496 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003831594784353059, + "loss": 2.957, + "theoretical_loss": 3.730798975163378, + "tokens_seen": 796460032 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038314944834503514, + "loss": 2.9535, + "theoretical_loss": 3.7307685207743138, + "tokens_seen": 796525568 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038313941825476427, + "loss": 2.9538, + "theoretical_loss": 3.7307380695923844, + "tokens_seen": 796591104 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003831293881644935, + "loss": 3.02, + "theoretical_loss": 3.730707621616988, + "tokens_seen": 796656640 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038311935807422263, + "loss": 2.9911, + "theoretical_loss": 3.7306771768475233, + "tokens_seen": 796722176 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038310932798395187, + "loss": 2.959, + "theoretical_loss": 3.73064673528339, + "tokens_seen": 796787712 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038309929789368105, + "loss": 2.9169, + "theoretical_loss": 3.7306162969239853, + "tokens_seen": 796853248 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038308926780341023, + "loss": 2.9908, + "theoretical_loss": 3.7305858617687093, + "tokens_seen": 796918784 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003830792377131394, + "loss": 2.9944, + "theoretical_loss": 3.730555429816961, + "tokens_seen": 796984320 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038306920762286865, + "loss": 3.0943, + "theoretical_loss": 3.7305250010681403, + "tokens_seen": 797049856 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003830591775325978, + "loss": 2.8916, + "theoretical_loss": 3.7304945755216465, + "tokens_seen": 797115392 + }, + { + "epoch": 9.02, + "learning_rate": 0.000383049147442327, + "loss": 2.9363, + "theoretical_loss": 3.7304641531768787, + "tokens_seen": 797180928 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038303911735205614, + "loss": 2.9396, + "theoretical_loss": 3.730433734033238, + "tokens_seen": 797246464 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038302908726178537, + "loss": 3.0638, + "theoretical_loss": 3.730403318090124, + "tokens_seen": 797312000 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038301905717151455, + "loss": 3.0307, + "theoretical_loss": 3.730372905346936, + "tokens_seen": 797377536 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038300902708124373, + "loss": 2.969, + "theoretical_loss": 3.7303424958030753, + "tokens_seen": 797443072 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003829989969909729, + "loss": 2.9648, + "theoretical_loss": 3.730312089457943, + "tokens_seen": 797508608 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003829889669007021, + "loss": 2.989, + "theoretical_loss": 3.7302816863109385, + "tokens_seen": 797574144 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003829789368104313, + "loss": 2.9425, + "theoretical_loss": 3.7302512863614634, + "tokens_seen": 797639680 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003829689067201605, + "loss": 3.0896, + "theoretical_loss": 3.730220889608919, + "tokens_seen": 797705216 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038295887662988964, + "loss": 3.0522, + "theoretical_loss": 3.7301904960527064, + "tokens_seen": 797770752 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.05021071434021, + "objective/train/theoretical_loss": 3.7301601056922262, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.7301601056922262, + "tokens_seen": 797836288 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003829488465396189, + "loss": 3.0304, + "theoretical_loss": 3.7301601056922262, + "tokens_seen": 797836288 + }, + { + "epoch": 9.02, + "learning_rate": 0.000382938816449348, + "loss": 3.0362, + "theoretical_loss": 3.7301297185268805, + "tokens_seen": 797901824 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038292878635907724, + "loss": 3.0433, + "theoretical_loss": 3.730099334556071, + "tokens_seen": 797967360 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003829187562688064, + "loss": 3.0182, + "theoretical_loss": 3.7300689537792, + "tokens_seen": 798032896 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003829087261785356, + "loss": 3.0536, + "theoretical_loss": 3.7300385761956676, + "tokens_seen": 798098432 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003828986960882648, + "loss": 2.9868, + "theoretical_loss": 3.7300082018048784, + "tokens_seen": 798163968 + }, + { + "epoch": 9.02, + "learning_rate": 0.000382888665997994, + "loss": 2.8842, + "theoretical_loss": 3.7299778306062334, + "tokens_seen": 798229504 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003828786359077232, + "loss": 3.0316, + "theoretical_loss": 3.729947462599135, + "tokens_seen": 798295040 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003828686058174524, + "loss": 3.0983, + "theoretical_loss": 3.7299170977829865, + "tokens_seen": 798360576 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038285857572718156, + "loss": 3.0775, + "theoretical_loss": 3.729886736157191, + "tokens_seen": 798426112 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038284854563691074, + "loss": 2.9222, + "theoretical_loss": 3.72985637772115, + "tokens_seen": 798491648 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038283851554664, + "loss": 3.0037, + "theoretical_loss": 3.729826022474268, + "tokens_seen": 798557184 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003828284854563691, + "loss": 3.0265, + "theoretical_loss": 3.729795670415948, + "tokens_seen": 798622720 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038281845536609834, + "loss": 3.0466, + "theoretical_loss": 3.7297653215455924, + "tokens_seen": 798688256 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038280842527582746, + "loss": 3.0021, + "theoretical_loss": 3.7297349758626064, + "tokens_seen": 798753792 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003827983951855567, + "loss": 3.0493, + "theoretical_loss": 3.7297046333663926, + "tokens_seen": 798819328 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003827883650952859, + "loss": 2.9641, + "theoretical_loss": 3.7296742940563554, + "tokens_seen": 798884864 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038277833500501506, + "loss": 3.041, + "theoretical_loss": 3.7296439579318994, + "tokens_seen": 798950400 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038276830491474424, + "loss": 3.064, + "theoretical_loss": 3.729613624992428, + "tokens_seen": 799015936 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003827582748244735, + "loss": 2.9812, + "theoretical_loss": 3.7295832952373456, + "tokens_seen": 799081472 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003827482447342026, + "loss": 3.0268, + "theoretical_loss": 3.7295529686660576, + "tokens_seen": 799147008 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038273821464393184, + "loss": 2.9944, + "theoretical_loss": 3.729522645277968, + "tokens_seen": 799212544 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038272818455366097, + "loss": 2.9508, + "theoretical_loss": 3.7294923250724823, + "tokens_seen": 799278080 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003827181544633902, + "loss": 3.0041, + "theoretical_loss": 3.7294620080490044, + "tokens_seen": 799343616 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003827081243731194, + "loss": 2.989, + "theoretical_loss": 3.729431694206941, + "tokens_seen": 799409152 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8454091548919678, + "objective/train/theoretical_loss": 3.7294013835456967, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.7294013835456967, + "tokens_seen": 799474688 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038269809428284856, + "loss": 2.9844, + "theoretical_loss": 3.7294013835456967, + "tokens_seen": 799474688 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038268806419257775, + "loss": 2.9613, + "theoretical_loss": 3.7293710760646768, + "tokens_seen": 799540224 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003826780341023069, + "loss": 3.0056, + "theoretical_loss": 3.7293407717632876, + "tokens_seen": 799605760 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003826680040120361, + "loss": 2.9801, + "theoretical_loss": 3.729310470640935, + "tokens_seen": 799671296 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038265797392176534, + "loss": 3.0924, + "theoretical_loss": 3.729280172697024, + "tokens_seen": 799736832 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038264794383149447, + "loss": 3.0354, + "theoretical_loss": 3.729249877930962, + "tokens_seen": 799802368 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003826379137412237, + "loss": 2.9701, + "theoretical_loss": 3.729219586342154, + "tokens_seen": 799867904 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038262788365095283, + "loss": 3.0524, + "theoretical_loss": 3.7291892979300085, + "tokens_seen": 799933440 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038261785356068207, + "loss": 2.942, + "theoretical_loss": 3.72915901269393, + "tokens_seen": 799998976 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038260782347041125, + "loss": 2.9817, + "theoretical_loss": 3.7291287306333265, + "tokens_seen": 800064512 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038259779338014043, + "loss": 3.0839, + "theoretical_loss": 3.7290984517476047, + "tokens_seen": 800130048 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003825877632898696, + "loss": 3.0487, + "theoretical_loss": 3.729068176036172, + "tokens_seen": 800195584 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038257773319959885, + "loss": 2.9373, + "theoretical_loss": 3.729037903498435, + "tokens_seen": 800261120 + }, + { + "epoch": 9.02, + "learning_rate": 0.000382567703109328, + "loss": 2.9955, + "theoretical_loss": 3.729007634133802, + "tokens_seen": 800326656 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003825576730190572, + "loss": 2.971, + "theoretical_loss": 3.72897736794168, + "tokens_seen": 800392192 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038254764292878634, + "loss": 2.9619, + "theoretical_loss": 3.728947104921477, + "tokens_seen": 800457728 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038253761283851557, + "loss": 3.0363, + "theoretical_loss": 3.728916845072601, + "tokens_seen": 800523264 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038252758274824475, + "loss": 3.0381, + "theoretical_loss": 3.72888658839446, + "tokens_seen": 800588800 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038251755265797393, + "loss": 3.0404, + "theoretical_loss": 3.728856334886462, + "tokens_seen": 800654336 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003825075225677031, + "loss": 2.9301, + "theoretical_loss": 3.728826084548016, + "tokens_seen": 800719872 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003824974924774323, + "loss": 3.0817, + "theoretical_loss": 3.7287958373785295, + "tokens_seen": 800785408 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003824874623871615, + "loss": 2.9595, + "theoretical_loss": 3.728765593377412, + "tokens_seen": 800850944 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003824774322968907, + "loss": 3.0164, + "theoretical_loss": 3.7287353525440734, + "tokens_seen": 800916480 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038246740220661984, + "loss": 2.9674, + "theoretical_loss": 3.7287051148779207, + "tokens_seen": 800982016 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003824573721163491, + "loss": 2.9382, + "theoretical_loss": 3.728674880378364, + "tokens_seen": 801047552 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.827241897583008, + "objective/train/theoretical_loss": 3.7286446490448126, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.7286446490448126, + "tokens_seen": 801113088 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003824473420260782, + "loss": 2.9293, + "theoretical_loss": 3.7286446490448126, + "tokens_seen": 801113088 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038243731193580744, + "loss": 3.0108, + "theoretical_loss": 3.728614420876677, + "tokens_seen": 801178624 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003824272818455366, + "loss": 2.9606, + "theoretical_loss": 3.728584195873365, + "tokens_seen": 801244160 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003824172517552658, + "loss": 3.1105, + "theoretical_loss": 3.7285539740342877, + "tokens_seen": 801309696 + }, + { + "epoch": 9.02, + "learning_rate": 0.000382407221664995, + "loss": 3.056, + "theoretical_loss": 3.7285237553588546, + "tokens_seen": 801375232 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003823971915747242, + "loss": 2.8428, + "theoretical_loss": 3.7284935398464762, + "tokens_seen": 801440768 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038238716148445334, + "loss": 3.0616, + "theoretical_loss": 3.728463327496563, + "tokens_seen": 801506304 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003823771313941826, + "loss": 2.9909, + "theoretical_loss": 3.7284331183085246, + "tokens_seen": 801571840 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003823671013039117, + "loss": 2.8635, + "theoretical_loss": 3.728402912281773, + "tokens_seen": 801637376 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038235707121364094, + "loss": 2.8642, + "theoretical_loss": 3.7283727094157166, + "tokens_seen": 801702912 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003823470411233701, + "loss": 2.9917, + "theoretical_loss": 3.728342509709769, + "tokens_seen": 801768448 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003823370110330993, + "loss": 2.921, + "theoretical_loss": 3.7283123131633396, + "tokens_seen": 801833984 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003823269809428285, + "loss": 3.0654, + "theoretical_loss": 3.7282821197758405, + "tokens_seen": 801899520 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038231695085255766, + "loss": 2.9673, + "theoretical_loss": 3.7282519295466825, + "tokens_seen": 801965056 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038230692076228685, + "loss": 2.9852, + "theoretical_loss": 3.7282217424752777, + "tokens_seen": 802030592 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003822968906720161, + "loss": 3.0183, + "theoretical_loss": 3.7281915585610372, + "tokens_seen": 802096128 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003822868605817452, + "loss": 2.8243, + "theoretical_loss": 3.728161377803374, + "tokens_seen": 802161664 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038227683049147444, + "loss": 2.9392, + "theoretical_loss": 3.728131200201699, + "tokens_seen": 802227200 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038226680040120357, + "loss": 3.0277, + "theoretical_loss": 3.728101025755425, + "tokens_seen": 802292736 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003822567703109328, + "loss": 2.914, + "theoretical_loss": 3.728070854463964, + "tokens_seen": 802358272 + }, + { + "epoch": 9.02, + "learning_rate": 0.000382246740220662, + "loss": 3.0573, + "theoretical_loss": 3.7280406863267284, + "tokens_seen": 802423808 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038223671013039117, + "loss": 2.9201, + "theoretical_loss": 3.7280105213431316, + "tokens_seen": 802489344 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038222668004012035, + "loss": 2.9312, + "theoretical_loss": 3.7279803595125856, + "tokens_seen": 802554880 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003822166499498496, + "loss": 3.1375, + "theoretical_loss": 3.7279502008345045, + "tokens_seen": 802620416 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003822066198595787, + "loss": 3.0446, + "theoretical_loss": 3.7279200453083, + "tokens_seen": 802685952 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0889148712158203, + "objective/train/theoretical_loss": 3.727889892933386, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.727889892933386, + "tokens_seen": 802751488 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038219658976930795, + "loss": 2.9979, + "theoretical_loss": 3.727889892933386, + "tokens_seen": 802751488 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003821865596790371, + "loss": 2.9547, + "theoretical_loss": 3.7278597437091765, + "tokens_seen": 802817024 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003821765295887663, + "loss": 3.0697, + "theoretical_loss": 3.727829597635085, + "tokens_seen": 802882560 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003821664994984955, + "loss": 3.1241, + "theoretical_loss": 3.7277994547105244, + "tokens_seen": 802948096 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038215646940822467, + "loss": 3.0571, + "theoretical_loss": 3.7277693149349096, + "tokens_seen": 803013632 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038214643931795385, + "loss": 2.9157, + "theoretical_loss": 3.7277391783076537, + "tokens_seen": 803079168 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038213640922768303, + "loss": 2.9966, + "theoretical_loss": 3.7277090448281722, + "tokens_seen": 803144704 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038212637913741227, + "loss": 2.9052, + "theoretical_loss": 3.727678914495878, + "tokens_seen": 803210240 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038211634904714145, + "loss": 2.9864, + "theoretical_loss": 3.727648787310187, + "tokens_seen": 803275776 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038210631895687063, + "loss": 3.0633, + "theoretical_loss": 3.7276186632705137, + "tokens_seen": 803341312 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003820962888665998, + "loss": 3.1055, + "theoretical_loss": 3.7275885423762727, + "tokens_seen": 803406848 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038208625877632905, + "loss": 3.0712, + "theoretical_loss": 3.7275584246268783, + "tokens_seen": 803472384 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003820762286860582, + "loss": 3.0089, + "theoretical_loss": 3.7275283100217464, + "tokens_seen": 803537920 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003820661985957874, + "loss": 3.0125, + "theoretical_loss": 3.727498198560293, + "tokens_seen": 803603456 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038205616850551654, + "loss": 2.9618, + "theoretical_loss": 3.727468090241932, + "tokens_seen": 803668992 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038204613841524577, + "loss": 2.956, + "theoretical_loss": 3.7274379850660804, + "tokens_seen": 803734528 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038203610832497495, + "loss": 3.0737, + "theoretical_loss": 3.7274078830321535, + "tokens_seen": 803800064 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038202607823470413, + "loss": 3.0991, + "theoretical_loss": 3.7273777841395677, + "tokens_seen": 803865600 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003820160481444333, + "loss": 2.9861, + "theoretical_loss": 3.7273476883877383, + "tokens_seen": 803931136 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003820060180541625, + "loss": 3.0088, + "theoretical_loss": 3.727317595776082, + "tokens_seen": 803996672 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003819959879638917, + "loss": 2.9496, + "theoretical_loss": 3.7272875063040156, + "tokens_seen": 804062208 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003819859578736209, + "loss": 3.029, + "theoretical_loss": 3.727257419970955, + "tokens_seen": 804127744 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038197592778335004, + "loss": 2.9476, + "theoretical_loss": 3.7272273367763176, + "tokens_seen": 804193280 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003819658976930793, + "loss": 3.0863, + "theoretical_loss": 3.7271972567195197, + "tokens_seen": 804258816 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003819558676028084, + "loss": 3.029, + "theoretical_loss": 3.7271671797999786, + "tokens_seen": 804324352 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.968827247619629, + "objective/train/theoretical_loss": 3.727137106017112, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.727137106017112, + "tokens_seen": 804389888 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038194583751253764, + "loss": 3.0005, + "theoretical_loss": 3.727137106017112, + "tokens_seen": 804389888 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003819358074222668, + "loss": 2.8265, + "theoretical_loss": 3.727107035370336, + "tokens_seen": 804455424 + }, + { + "epoch": 9.02, + "learning_rate": 0.000381925777331996, + "loss": 3.1011, + "theoretical_loss": 3.7270769678590696, + "tokens_seen": 804520960 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003819157472417252, + "loss": 2.9807, + "theoretical_loss": 3.7270469034827296, + "tokens_seen": 804586496 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003819057171514544, + "loss": 2.939, + "theoretical_loss": 3.727016842240734, + "tokens_seen": 804652032 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038189568706118354, + "loss": 2.909, + "theoretical_loss": 3.7269867841325013, + "tokens_seen": 804717568 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003818856569709128, + "loss": 2.9494, + "theoretical_loss": 3.7269567291574486, + "tokens_seen": 804783104 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003818756268806419, + "loss": 2.9757, + "theoretical_loss": 3.726926677314995, + "tokens_seen": 804848640 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038186559679037114, + "loss": 2.9661, + "theoretical_loss": 3.7268966286045586, + "tokens_seen": 804914176 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003818555667001003, + "loss": 3.0875, + "theoretical_loss": 3.7268665830255583, + "tokens_seen": 804979712 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003818455366098295, + "loss": 2.8479, + "theoretical_loss": 3.7268365405774126, + "tokens_seen": 805045248 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003818355065195587, + "loss": 3.0492, + "theoretical_loss": 3.7268065012595404, + "tokens_seen": 805110784 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038182547642928786, + "loss": 2.8896, + "theoretical_loss": 3.7267764650713615, + "tokens_seen": 805176320 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038181544633901705, + "loss": 2.9786, + "theoretical_loss": 3.7267464320122943, + "tokens_seen": 805241856 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003818054162487463, + "loss": 3.0585, + "theoretical_loss": 3.726716402081758, + "tokens_seen": 805307392 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003817953861584754, + "loss": 2.9779, + "theoretical_loss": 3.726686375279173, + "tokens_seen": 805372928 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038178535606820464, + "loss": 3.0431, + "theoretical_loss": 3.7266563516039586, + "tokens_seen": 805438464 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038177532597793377, + "loss": 2.859, + "theoretical_loss": 3.7266263310555345, + "tokens_seen": 805504000 + }, + { + "epoch": 9.02, + "learning_rate": 0.000381765295887663, + "loss": 2.992, + "theoretical_loss": 3.726596313633321, + "tokens_seen": 805569536 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003817552657973922, + "loss": 3.0343, + "theoretical_loss": 3.7265662993367377, + "tokens_seen": 805635072 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038174523570712137, + "loss": 2.9509, + "theoretical_loss": 3.7265362881652058, + "tokens_seen": 805700608 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038173520561685055, + "loss": 2.8991, + "theoretical_loss": 3.7265062801181448, + "tokens_seen": 805766144 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003817251755265798, + "loss": 3.0428, + "theoretical_loss": 3.726476275194976, + "tokens_seen": 805831680 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003817151454363089, + "loss": 3.0676, + "theoretical_loss": 3.72644627339512, + "tokens_seen": 805897216 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038170511534603815, + "loss": 3.0285, + "theoretical_loss": 3.726416274717997, + "tokens_seen": 805962752 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7334651947021484, + "objective/train/theoretical_loss": 3.7263862791630302, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.7263862791630302, + "tokens_seen": 806028288 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003816950852557673, + "loss": 2.939, + "theoretical_loss": 3.7263862791630302, + "tokens_seen": 806028288 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003816850551654965, + "loss": 2.9018, + "theoretical_loss": 3.7263562867296387, + "tokens_seen": 806093824 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003816750250752257, + "loss": 3.0028, + "theoretical_loss": 3.726326297417245, + "tokens_seen": 806159360 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038166499498495487, + "loss": 3.005, + "theoretical_loss": 3.7262963112252696, + "tokens_seen": 806224896 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038165496489468405, + "loss": 3.0508, + "theoretical_loss": 3.726266328153135, + "tokens_seen": 806290432 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038164493480441323, + "loss": 2.8531, + "theoretical_loss": 3.7262363482002634, + "tokens_seen": 806355968 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003816349047141424, + "loss": 2.9798, + "theoretical_loss": 3.7262063713660765, + "tokens_seen": 806421504 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038162487462387165, + "loss": 2.9694, + "theoretical_loss": 3.7261763976499958, + "tokens_seen": 806487040 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003816148445336008, + "loss": 2.967, + "theoretical_loss": 3.726146427051445, + "tokens_seen": 806552576 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038160481444333, + "loss": 3.0359, + "theoretical_loss": 3.7261164595698446, + "tokens_seen": 806618112 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038159478435305914, + "loss": 3.0474, + "theoretical_loss": 3.726086495204619, + "tokens_seen": 806683648 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003815847542627884, + "loss": 3.0282, + "theoretical_loss": 3.7260565339551905, + "tokens_seen": 806749184 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038157472417251756, + "loss": 2.9546, + "theoretical_loss": 3.7260265758209816, + "tokens_seen": 806814720 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038156469408224674, + "loss": 3.0505, + "theoretical_loss": 3.725996620801416, + "tokens_seen": 806880256 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003815546639919759, + "loss": 2.9627, + "theoretical_loss": 3.725966668895916, + "tokens_seen": 806945792 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038154463390170515, + "loss": 2.9863, + "theoretical_loss": 3.725936720103906, + "tokens_seen": 807011328 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003815346038114343, + "loss": 3.0382, + "theoretical_loss": 3.7259067744248098, + "tokens_seen": 807076864 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003815245737211635, + "loss": 2.9905, + "theoretical_loss": 3.7258768318580495, + "tokens_seen": 807142400 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038151454363089264, + "loss": 2.9448, + "theoretical_loss": 3.725846892403051, + "tokens_seen": 807207936 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003815045135406219, + "loss": 3.0579, + "theoretical_loss": 3.7258169560592362, + "tokens_seen": 807273472 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038149448345035106, + "loss": 2.9861, + "theoretical_loss": 3.7257870228260304, + "tokens_seen": 807339008 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038148445336008024, + "loss": 2.9147, + "theoretical_loss": 3.7257570927028585, + "tokens_seen": 807404544 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003814744232698094, + "loss": 2.894, + "theoretical_loss": 3.725727165689144, + "tokens_seen": 807470080 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003814643931795386, + "loss": 3.0522, + "theoretical_loss": 3.725697241784312, + "tokens_seen": 807535616 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003814543630892678, + "loss": 2.9171, + "theoretical_loss": 3.7256673209877866, + "tokens_seen": 807601152 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.930077075958252, + "objective/train/theoretical_loss": 3.725637403298994, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.725637403298994, + "tokens_seen": 807666688 + }, + { + "epoch": 9.02, + "learning_rate": 0.000381444332998997, + "loss": 2.969, + "theoretical_loss": 3.725637403298994, + "tokens_seen": 807666688 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038143430290872614, + "loss": 3.0228, + "theoretical_loss": 3.725607488717358, + "tokens_seen": 807732224 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003814242728184554, + "loss": 2.9633, + "theoretical_loss": 3.725577577242304, + "tokens_seen": 807797760 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038141424272818456, + "loss": 2.9861, + "theoretical_loss": 3.7255476688732587, + "tokens_seen": 807863296 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038140421263791374, + "loss": 2.9914, + "theoretical_loss": 3.7255177636096466, + "tokens_seen": 807928832 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003813941825476429, + "loss": 2.9294, + "theoretical_loss": 3.725487861450893, + "tokens_seen": 807994368 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003813841524573721, + "loss": 3.0266, + "theoretical_loss": 3.7254579623964243, + "tokens_seen": 808059904 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038137412236710134, + "loss": 3.0324, + "theoretical_loss": 3.7254280664456667, + "tokens_seen": 808125440 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003813640922768305, + "loss": 3.0376, + "theoretical_loss": 3.725398173598046, + "tokens_seen": 808190976 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003813540621865597, + "loss": 3.0233, + "theoretical_loss": 3.7253682838529887, + "tokens_seen": 808256512 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003813440320962889, + "loss": 3.0339, + "theoretical_loss": 3.7253383972099208, + "tokens_seen": 808322048 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038133400200601806, + "loss": 2.9563, + "theoretical_loss": 3.7253085136682698, + "tokens_seen": 808387584 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038132397191574725, + "loss": 3.0509, + "theoretical_loss": 3.725278633227462, + "tokens_seen": 808453120 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003813139418254765, + "loss": 3.0001, + "theoretical_loss": 3.725248755886924, + "tokens_seen": 808518656 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003813039117352056, + "loss": 3.0863, + "theoretical_loss": 3.725218881646083, + "tokens_seen": 808584192 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038129388164493484, + "loss": 3.0474, + "theoretical_loss": 3.7251890105043666, + "tokens_seen": 808649728 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038128385155466397, + "loss": 3.0381, + "theoretical_loss": 3.725159142461202, + "tokens_seen": 808715264 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003812738214643932, + "loss": 2.9555, + "theoretical_loss": 3.7251292775160167, + "tokens_seen": 808780800 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003812637913741224, + "loss": 2.919, + "theoretical_loss": 3.725099415668238, + "tokens_seen": 808846336 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038125376128385157, + "loss": 3.0668, + "theoretical_loss": 3.725069556917295, + "tokens_seen": 808911872 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038124373119358075, + "loss": 2.9651, + "theoretical_loss": 3.7250397012626135, + "tokens_seen": 808977408 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038123370110331, + "loss": 2.9691, + "theoretical_loss": 3.725009848703624, + "tokens_seen": 809042944 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003812236710130391, + "loss": 2.9876, + "theoretical_loss": 3.7249799992397525, + "tokens_seen": 809108480 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038121364092276835, + "loss": 3.0012, + "theoretical_loss": 3.7249501528704294, + "tokens_seen": 809174016 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003812036108324975, + "loss": 2.9571, + "theoretical_loss": 3.7249203095950825, + "tokens_seen": 809239552 + }, + { + "epoch": 9.02, + "objective/train/docs_used": 1890880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2441139221191406, + "objective/train/theoretical_loss": 3.7248904694131406, + "objective/train/tokens_used": 809933280, + "theoretical_loss": 3.7248904694131406, + "tokens_seen": 809305088 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003811935807422267, + "loss": 3.108, + "theoretical_loss": 3.7248904694131406, + "tokens_seen": 809305088 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003811835506519559, + "loss": 3.0993, + "theoretical_loss": 3.724860632324032, + "tokens_seen": 809370624 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038117352056168507, + "loss": 2.9347, + "theoretical_loss": 3.724830798327187, + "tokens_seen": 809436160 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038116349047141425, + "loss": 2.9901, + "theoretical_loss": 3.7248009674220337, + "tokens_seen": 809501696 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038115346038114343, + "loss": 3.0258, + "theoretical_loss": 3.724771139608002, + "tokens_seen": 809567232 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003811434302908726, + "loss": 3.0347, + "theoretical_loss": 3.724741314884521, + "tokens_seen": 809632768 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038113340020060185, + "loss": 3.0879, + "theoretical_loss": 3.7247114932510206, + "tokens_seen": 809698304 + }, + { + "epoch": 9.02, + "learning_rate": 0.000381123370110331, + "loss": 3.0652, + "theoretical_loss": 3.7246816747069307, + "tokens_seen": 809763840 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003811133400200602, + "loss": 2.9909, + "theoretical_loss": 3.724651859251681, + "tokens_seen": 809829376 + }, + { + "epoch": 9.02, + "learning_rate": 0.00038110330992978934, + "loss": 3.0586, + "theoretical_loss": 3.7246220468847024, + "tokens_seen": 809894912 + }, + { + "epoch": 9.02, + "learning_rate": 0.0003810932798395186, + "loss": 2.9325, + "theoretical_loss": 3.724595497844973, + "tokens_seen": 809953280 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038108324974924776, + "loss": 2.9146, + "theoretical_loss": 3.724565691315198, + "tokens_seen": 810018816 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038107321965897694, + "loss": 2.8685, + "theoretical_loss": 3.724535887872048, + "tokens_seen": 810084352 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003810631895687061, + "loss": 2.8517, + "theoretical_loss": 3.724506087514952, + "tokens_seen": 810149888 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038105315947843535, + "loss": 2.9024, + "theoretical_loss": 3.7244762902433415, + "tokens_seen": 810215424 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003810431293881645, + "loss": 2.9164, + "theoretical_loss": 3.7244464960566477, + "tokens_seen": 810280960 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003810330992978937, + "loss": 2.8937, + "theoretical_loss": 3.7244167049543013, + "tokens_seen": 810346496 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038102306920762284, + "loss": 2.8814, + "theoretical_loss": 3.7243869169357344, + "tokens_seen": 810412032 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003810130391173521, + "loss": 2.8817, + "theoretical_loss": 3.724357132000377, + "tokens_seen": 810477568 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038100300902708126, + "loss": 2.9188, + "theoretical_loss": 3.724327350147662, + "tokens_seen": 810543104 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038099297893681044, + "loss": 2.8469, + "theoretical_loss": 3.724297571377021, + "tokens_seen": 810608640 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003809829488465396, + "loss": 2.7633, + "theoretical_loss": 3.7242677956878856, + "tokens_seen": 810674176 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003809729187562688, + "loss": 2.8657, + "theoretical_loss": 3.7242380230796877, + "tokens_seen": 810739712 + }, + { + "epoch": 10.0, + "learning_rate": 0.000380962888665998, + "loss": 2.9164, + "theoretical_loss": 3.7242082535518595, + "tokens_seen": 810805248 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003809528585757272, + "loss": 2.8697, + "theoretical_loss": 3.724178487103834, + "tokens_seen": 810870784 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 1941324, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8185172080993652, + "objective/train/theoretical_loss": 3.724148723735043, + "objective/train/tokens_used": 831396320, + "theoretical_loss": 3.724148723735043, + "tokens_seen": 810936320 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038094282848545635, + "loss": 2.8982, + "theoretical_loss": 3.724148723735043, + "tokens_seen": 810936320 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003809327983951856, + "loss": 2.8525, + "theoretical_loss": 3.724118963444919, + "tokens_seen": 811001856 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038092276830491476, + "loss": 2.9378, + "theoretical_loss": 3.724089206232896, + "tokens_seen": 811067392 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038091273821464394, + "loss": 2.9549, + "theoretical_loss": 3.7240594520984054, + "tokens_seen": 811132928 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003809027081243731, + "loss": 2.9578, + "theoretical_loss": 3.724029701040881, + "tokens_seen": 811198464 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003808926780341023, + "loss": 3.0358, + "theoretical_loss": 3.7239999530597574, + "tokens_seen": 811264000 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003808826479438315, + "loss": 2.9395, + "theoretical_loss": 3.7239702081544657, + "tokens_seen": 811329536 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003808726178535607, + "loss": 3.0134, + "theoretical_loss": 3.72394046632444, + "tokens_seen": 811395072 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038086258776328985, + "loss": 2.79, + "theoretical_loss": 3.7239107275691152, + "tokens_seen": 811460608 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003808525576730191, + "loss": 2.9165, + "theoretical_loss": 3.7238809918879245, + "tokens_seen": 811526144 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003808425275827482, + "loss": 2.867, + "theoretical_loss": 3.723851259280302, + "tokens_seen": 811591680 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038083249749247745, + "loss": 3.015, + "theoretical_loss": 3.723821529745681, + "tokens_seen": 811657216 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003808224674022066, + "loss": 2.8917, + "theoretical_loss": 3.723791803283497, + "tokens_seen": 811722752 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003808124373119358, + "loss": 2.8397, + "theoretical_loss": 3.7237620798931834, + "tokens_seen": 811788288 + }, + { + "epoch": 10.0, + "learning_rate": 0.000380802407221665, + "loss": 2.9552, + "theoretical_loss": 3.723732359574176, + "tokens_seen": 811853824 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038079237713139417, + "loss": 3.0342, + "theoretical_loss": 3.7237026423259083, + "tokens_seen": 811919360 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038078234704112335, + "loss": 2.717, + "theoretical_loss": 3.7236729281478165, + "tokens_seen": 811984896 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003807723169508526, + "loss": 2.8858, + "theoretical_loss": 3.723643217039335, + "tokens_seen": 812050432 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003807622868605817, + "loss": 3.0298, + "theoretical_loss": 3.7236135089998985, + "tokens_seen": 812115968 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038075225677031095, + "loss": 2.9802, + "theoretical_loss": 3.7235838040289426, + "tokens_seen": 812181504 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038074222668004013, + "loss": 2.8056, + "theoretical_loss": 3.7235541021259033, + "tokens_seen": 812247040 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003807321965897693, + "loss": 2.9528, + "theoretical_loss": 3.723524403290216, + "tokens_seen": 812312576 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003807221664994985, + "loss": 2.8559, + "theoretical_loss": 3.7234947075213167, + "tokens_seen": 812378112 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003807121364092277, + "loss": 2.768, + "theoretical_loss": 3.723465014818641, + "tokens_seen": 812443648 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038070210631895685, + "loss": 3.0062, + "theoretical_loss": 3.723435325181625, + "tokens_seen": 812509184 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 1946289, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.95773983001709, + "objective/train/theoretical_loss": 3.723405638609705, + "objective/train/tokens_used": 833034720, + "theoretical_loss": 3.723405638609705, + "tokens_seen": 812574720 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003806920762286861, + "loss": 2.9874, + "theoretical_loss": 3.723405638609705, + "tokens_seen": 812574720 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003806820461384152, + "loss": 2.7675, + "theoretical_loss": 3.7233759551023176, + "tokens_seen": 812640256 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038067201604814445, + "loss": 2.9225, + "theoretical_loss": 3.7233462746588994, + "tokens_seen": 812705792 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003806619859578736, + "loss": 2.9601, + "theoretical_loss": 3.7233165972788864, + "tokens_seen": 812771328 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003806519558676028, + "loss": 2.8883, + "theoretical_loss": 3.7232869229617167, + "tokens_seen": 812836864 + }, + { + "epoch": 10.0, + "learning_rate": 0.000380641925777332, + "loss": 2.9107, + "theoretical_loss": 3.723257251706826, + "tokens_seen": 812902400 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003806318956870612, + "loss": 2.8092, + "theoretical_loss": 3.723227583513652, + "tokens_seen": 812967936 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003806218655967904, + "loss": 2.8383, + "theoretical_loss": 3.7231979183816324, + "tokens_seen": 813033472 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038061183550651954, + "loss": 2.9964, + "theoretical_loss": 3.7231682563102035, + "tokens_seen": 813099008 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003806018054162488, + "loss": 2.9682, + "theoretical_loss": 3.7231385972988047, + "tokens_seen": 813164544 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038059177532597796, + "loss": 2.9024, + "theoretical_loss": 3.7231089413468714, + "tokens_seen": 813230080 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038058174523570714, + "loss": 2.8532, + "theoretical_loss": 3.7230792884538433, + "tokens_seen": 813295616 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003805717151454363, + "loss": 2.9293, + "theoretical_loss": 3.723049638619158, + "tokens_seen": 813361152 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038056168505516555, + "loss": 2.9046, + "theoretical_loss": 3.7230199918422535, + "tokens_seen": 813426688 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003805516549648947, + "loss": 2.7905, + "theoretical_loss": 3.7229903481225683, + "tokens_seen": 813492224 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003805416248746239, + "loss": 2.9046, + "theoretical_loss": 3.7229607074595403, + "tokens_seen": 813557760 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038053159478435304, + "loss": 2.9023, + "theoretical_loss": 3.722931069852609, + "tokens_seen": 813623296 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003805215646940823, + "loss": 2.8797, + "theoretical_loss": 3.7229014353012126, + "tokens_seen": 813688832 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038051153460381146, + "loss": 2.9892, + "theoretical_loss": 3.7228718038047903, + "tokens_seen": 813754368 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038050150451354064, + "loss": 2.9816, + "theoretical_loss": 3.7228421753627807, + "tokens_seen": 813819904 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003804914744232698, + "loss": 2.955, + "theoretical_loss": 3.722812549974624, + "tokens_seen": 813885440 + }, + { + "epoch": 10.0, + "learning_rate": 0.000380481444332999, + "loss": 2.9223, + "theoretical_loss": 3.7227829276397584, + "tokens_seen": 813950976 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003804714142427282, + "loss": 3.0507, + "theoretical_loss": 3.7227533083576243, + "tokens_seen": 814016512 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003804613841524574, + "loss": 2.923, + "theoretical_loss": 3.7227236921276607, + "tokens_seen": 814082048 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038045135406218655, + "loss": 2.9874, + "theoretical_loss": 3.722694078949308, + "tokens_seen": 814147584 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 1949407, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9553043842315674, + "objective/train/theoretical_loss": 3.722664468822006, + "objective/train/tokens_used": 834673120, + "theoretical_loss": 3.722664468822006, + "tokens_seen": 814213120 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003804413239719158, + "loss": 2.8593, + "theoretical_loss": 3.722664468822006, + "tokens_seen": 814213120 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038043129388164496, + "loss": 2.8498, + "theoretical_loss": 3.7226348617451945, + "tokens_seen": 814278656 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038042126379137414, + "loss": 2.9527, + "theoretical_loss": 3.722605257718314, + "tokens_seen": 814344192 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003804112337011033, + "loss": 2.8688, + "theoretical_loss": 3.722575656740805, + "tokens_seen": 814409728 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003804012036108325, + "loss": 2.9324, + "theoretical_loss": 3.7225460588121075, + "tokens_seen": 814475264 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003803911735205617, + "loss": 2.9617, + "theoretical_loss": 3.7225164639316635, + "tokens_seen": 814540800 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003803811434302909, + "loss": 2.8587, + "theoretical_loss": 3.722486872098912, + "tokens_seen": 814606336 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038037111334002005, + "loss": 2.8039, + "theoretical_loss": 3.722457283313296, + "tokens_seen": 814671872 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003803610832497493, + "loss": 2.9205, + "theoretical_loss": 3.722427697574255, + "tokens_seen": 814737408 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003803510531594784, + "loss": 2.9376, + "theoretical_loss": 3.722398114881231, + "tokens_seen": 814802944 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038034102306920765, + "loss": 2.9692, + "theoretical_loss": 3.722368535233665, + "tokens_seen": 814868480 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038033099297893683, + "loss": 2.8707, + "theoretical_loss": 3.722338958630999, + "tokens_seen": 814934016 + }, + { + "epoch": 10.0, + "learning_rate": 0.000380320962888666, + "loss": 3.0107, + "theoretical_loss": 3.722309385072675, + "tokens_seen": 814999552 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003803109327983952, + "loss": 2.7753, + "theoretical_loss": 3.722279814558134, + "tokens_seen": 815065088 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038030090270812437, + "loss": 2.8488, + "theoretical_loss": 3.7222502470868193, + "tokens_seen": 815130624 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038029087261785355, + "loss": 2.916, + "theoretical_loss": 3.722220682658172, + "tokens_seen": 815196160 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003802808425275828, + "loss": 2.8435, + "theoretical_loss": 3.722191121271635, + "tokens_seen": 815261696 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003802708124373119, + "loss": 2.911, + "theoretical_loss": 3.72216156292665, + "tokens_seen": 815327232 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038026078234704115, + "loss": 2.855, + "theoretical_loss": 3.72213200762266, + "tokens_seen": 815392768 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038025075225677033, + "loss": 2.8763, + "theoretical_loss": 3.7221024553591087, + "tokens_seen": 815458304 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003802407221664995, + "loss": 2.9117, + "theoretical_loss": 3.7220729061354376, + "tokens_seen": 815523840 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003802306920762287, + "loss": 2.9248, + "theoretical_loss": 3.7220433599510905, + "tokens_seen": 815589376 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003802206619859579, + "loss": 3.0251, + "theoretical_loss": 3.722013816805511, + "tokens_seen": 815654912 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038021063189568705, + "loss": 2.8624, + "theoretical_loss": 3.721984276698141, + "tokens_seen": 815720448 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003802006018054163, + "loss": 2.9379, + "theoretical_loss": 3.7219547396284254, + "tokens_seen": 815785984 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 1953290, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.942821502685547, + "objective/train/theoretical_loss": 3.721925205595807, + "objective/train/tokens_used": 836311520, + "theoretical_loss": 3.721925205595807, + "tokens_seen": 815851520 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003801905717151454, + "loss": 2.9458, + "theoretical_loss": 3.721925205595807, + "tokens_seen": 815851520 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038018054162487465, + "loss": 2.8726, + "theoretical_loss": 3.7218956745997307, + "tokens_seen": 815917056 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003801705115346038, + "loss": 3.0397, + "theoretical_loss": 3.721866146639639, + "tokens_seen": 815982592 + }, + { + "epoch": 10.0, + "learning_rate": 0.000380160481444333, + "loss": 2.9867, + "theoretical_loss": 3.721836621714977, + "tokens_seen": 816048128 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003801504513540622, + "loss": 2.8494, + "theoretical_loss": 3.721807099825188, + "tokens_seen": 816113664 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003801404212637914, + "loss": 2.9858, + "theoretical_loss": 3.7217775809697176, + "tokens_seen": 816179200 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038013039117352056, + "loss": 2.8869, + "theoretical_loss": 3.7217480651480095, + "tokens_seen": 816244736 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038012036108324974, + "loss": 2.8371, + "theoretical_loss": 3.7217185523595084, + "tokens_seen": 816310272 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003801103309929789, + "loss": 2.9782, + "theoretical_loss": 3.7216890426036593, + "tokens_seen": 816375808 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038010030090270816, + "loss": 2.9644, + "theoretical_loss": 3.7216595358799074, + "tokens_seen": 816441344 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003800902708124373, + "loss": 2.8373, + "theoretical_loss": 3.721630032187697, + "tokens_seen": 816506880 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003800802407221665, + "loss": 2.7971, + "theoretical_loss": 3.721600531526474, + "tokens_seen": 816572416 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003800702106318957, + "loss": 2.8851, + "theoretical_loss": 3.7215710338956836, + "tokens_seen": 816637952 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003800601805416249, + "loss": 2.9862, + "theoretical_loss": 3.7215415392947717, + "tokens_seen": 816703488 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038005015045135406, + "loss": 2.9855, + "theoretical_loss": 3.721512047723184, + "tokens_seen": 816769024 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038004012036108324, + "loss": 2.8875, + "theoretical_loss": 3.721482559180365, + "tokens_seen": 816834560 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003800300902708124, + "loss": 2.9514, + "theoretical_loss": 3.7214530736657627, + "tokens_seen": 816900096 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038002006018054166, + "loss": 2.7315, + "theoretical_loss": 3.721423591178822, + "tokens_seen": 816965632 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003800100300902708, + "loss": 2.936, + "theoretical_loss": 3.7213941117189893, + "tokens_seen": 817031168 + }, + { + "epoch": 10.0, + "learning_rate": 0.00038, + "loss": 2.9737, + "theoretical_loss": 3.721364635285711, + "tokens_seen": 817096704 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037998996990972915, + "loss": 2.9436, + "theoretical_loss": 3.721335161878434, + "tokens_seen": 817162240 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003799799398194584, + "loss": 2.8987, + "theoretical_loss": 3.7213056914966045, + "tokens_seen": 817227776 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037996990972918756, + "loss": 2.893, + "theoretical_loss": 3.72127622413967, + "tokens_seen": 817293312 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037995987963891675, + "loss": 2.7805, + "theoretical_loss": 3.7212467598070775, + "tokens_seen": 817358848 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003799498495486459, + "loss": 2.8871, + "theoretical_loss": 3.721217298498273, + "tokens_seen": 817424384 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 1957863, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8557963371276855, + "objective/train/theoretical_loss": 3.721187840212705, + "objective/train/tokens_used": 837949920, + "theoretical_loss": 3.721187840212705, + "tokens_seen": 817489920 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037993981945837516, + "loss": 2.9338, + "theoretical_loss": 3.721187840212705, + "tokens_seen": 817489920 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003799297893681043, + "loss": 2.8148, + "theoretical_loss": 3.7211583849498204, + "tokens_seen": 817555456 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003799197592778335, + "loss": 3.0023, + "theoretical_loss": 3.721128932709067, + "tokens_seen": 817620992 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037990972918756265, + "loss": 2.7707, + "theoretical_loss": 3.7210994834898923, + "tokens_seen": 817686528 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003798996990972919, + "loss": 2.9511, + "theoretical_loss": 3.721070037291745, + "tokens_seen": 817752064 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037988966900702107, + "loss": 2.9202, + "theoretical_loss": 3.721040594114072, + "tokens_seen": 817817600 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037987963891675025, + "loss": 2.9429, + "theoretical_loss": 3.7210111539563213, + "tokens_seen": 817883136 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003798696088264795, + "loss": 2.9479, + "theoretical_loss": 3.7209817168179424, + "tokens_seen": 817948672 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003798595787362086, + "loss": 2.8843, + "theoretical_loss": 3.7209522826983834, + "tokens_seen": 818014208 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037984954864593785, + "loss": 2.9466, + "theoretical_loss": 3.7209228515970922, + "tokens_seen": 818079744 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037983951855566703, + "loss": 2.934, + "theoretical_loss": 3.720893423513518, + "tokens_seen": 818145280 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003798294884653962, + "loss": 2.9787, + "theoretical_loss": 3.7208639984471095, + "tokens_seen": 818210816 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003798194583751254, + "loss": 2.9337, + "theoretical_loss": 3.7208345763973165, + "tokens_seen": 818276352 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037980942828485457, + "loss": 2.9225, + "theoretical_loss": 3.7208051573635874, + "tokens_seen": 818341888 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037979939819458375, + "loss": 2.9188, + "theoretical_loss": 3.7207757413453715, + "tokens_seen": 818407424 + }, + { + "epoch": 10.0, + "learning_rate": 0.000379789368104313, + "loss": 2.8346, + "theoretical_loss": 3.720746328342119, + "tokens_seen": 818472960 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003797793380140421, + "loss": 2.8595, + "theoretical_loss": 3.7207169183532782, + "tokens_seen": 818538496 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037976930792377135, + "loss": 3.0474, + "theoretical_loss": 3.7206875113783, + "tokens_seen": 818604032 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037975927783350053, + "loss": 2.849, + "theoretical_loss": 3.720658107416634, + "tokens_seen": 818669568 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003797492477432297, + "loss": 3.0349, + "theoretical_loss": 3.72062870646773, + "tokens_seen": 818735104 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003797392176529589, + "loss": 2.8932, + "theoretical_loss": 3.720599308531039, + "tokens_seen": 818800640 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003797291875626881, + "loss": 2.8265, + "theoretical_loss": 3.7205699136060097, + "tokens_seen": 818866176 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037971915747241725, + "loss": 3.0049, + "theoretical_loss": 3.7205405216920946, + "tokens_seen": 818931712 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003797091273821465, + "loss": 2.849, + "theoretical_loss": 3.720511132788743, + "tokens_seen": 818997248 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003796990972918756, + "loss": 3.0231, + "theoretical_loss": 3.7204817468954055, + "tokens_seen": 819062784 + }, + { + "debugging/Self-BLEU-5": 0.6333650938744053, + "debugging/distinct-1-grams": 0.7377814713078502, + "debugging/distinct-2-grams": 0.9292810328182818, + "debugging/entropy-1-grams": 6.450187162549074, + "debugging/entropy-2-grams": 7.643537431547519, + "debugging/length": 524.53125, + "debugging/num_segments": 32, + "epoch": 10.0, + "objective/train/docs_used": 1961048, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8675432205200195, + "objective/train/theoretical_loss": 3.7204523640115337, + "objective/train/tokens_used": 839588320, + "theoretical_loss": 3.7204523640115337, + "tokens_seen": 819128320 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037968906720160485, + "loss": 2.8082, + "theoretical_loss": 3.7204523640115337, + "tokens_seen": 819128320 + }, + { + "epoch": 10.0, + "learning_rate": 0.000379679037111334, + "loss": 2.8967, + "theoretical_loss": 3.7204229841365786, + "tokens_seen": 819193856 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003796690070210632, + "loss": 2.9176, + "theoretical_loss": 3.7203936072699912, + "tokens_seen": 819259392 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003796589769307924, + "loss": 2.9992, + "theoretical_loss": 3.720364233411223, + "tokens_seen": 819324928 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003796489468405216, + "loss": 2.9852, + "theoretical_loss": 3.7203348625597252, + "tokens_seen": 819390464 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037963891675025076, + "loss": 2.8953, + "theoretical_loss": 3.7203054947149496, + "tokens_seen": 819456000 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037962888665997994, + "loss": 2.9868, + "theoretical_loss": 3.7202761298763476, + "tokens_seen": 819521536 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003796188565697091, + "loss": 2.9724, + "theoretical_loss": 3.720246768043372, + "tokens_seen": 819587072 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037960882647943836, + "loss": 2.9862, + "theoretical_loss": 3.720217409215474, + "tokens_seen": 819652608 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003795987963891675, + "loss": 2.9264, + "theoretical_loss": 3.720188053392106, + "tokens_seen": 819718144 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003795887662988967, + "loss": 2.9563, + "theoretical_loss": 3.7201587005727212, + "tokens_seen": 819783680 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003795787362086259, + "loss": 2.9759, + "theoretical_loss": 3.720129350756771, + "tokens_seen": 819849216 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003795687061183551, + "loss": 2.9085, + "theoretical_loss": 3.720100003943708, + "tokens_seen": 819914752 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037955867602808426, + "loss": 2.9808, + "theoretical_loss": 3.7200706601329863, + "tokens_seen": 819980288 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037954864593781344, + "loss": 2.9426, + "theoretical_loss": 3.720041319324057, + "tokens_seen": 820045824 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003795386158475426, + "loss": 2.9228, + "theoretical_loss": 3.7200119815163744, + "tokens_seen": 820111360 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037952858575727186, + "loss": 2.8785, + "theoretical_loss": 3.7199826467093917, + "tokens_seen": 820176896 + }, + { + "epoch": 10.0, + "learning_rate": 0.000379518555667001, + "loss": 2.8701, + "theoretical_loss": 3.719953314902562, + "tokens_seen": 820242432 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003795085255767302, + "loss": 2.8456, + "theoretical_loss": 3.7199239860953384, + "tokens_seen": 820307968 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037949849548645935, + "loss": 2.9623, + "theoretical_loss": 3.719894660287175, + "tokens_seen": 820373504 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003794884653961886, + "loss": 2.8899, + "theoretical_loss": 3.7198653374775255, + "tokens_seen": 820439040 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037947843530591776, + "loss": 2.991, + "theoretical_loss": 3.719836017665844, + "tokens_seen": 820504576 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037946840521564695, + "loss": 3.0386, + "theoretical_loss": 3.719806700851584, + "tokens_seen": 820570112 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003794583751253761, + "loss": 2.9025, + "theoretical_loss": 3.7197773870342, + "tokens_seen": 820635648 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037944834503510536, + "loss": 2.8247, + "theoretical_loss": 3.7197480762131465, + "tokens_seen": 820701184 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 1965739, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.804866075515747, + "objective/train/theoretical_loss": 3.719718768387878, + "objective/train/tokens_used": 841226720, + "theoretical_loss": 3.719718768387878, + "tokens_seen": 820766720 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003794383149448345, + "loss": 2.8588, + "theoretical_loss": 3.719718768387878, + "tokens_seen": 820766720 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003794282848545637, + "loss": 2.8861, + "theoretical_loss": 3.7196894635578492, + "tokens_seen": 820832256 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037941825476429285, + "loss": 2.919, + "theoretical_loss": 3.7196601617225147, + "tokens_seen": 820897792 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003794082246740221, + "loss": 2.9664, + "theoretical_loss": 3.719630862881329, + "tokens_seen": 820963328 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037939819458375127, + "loss": 2.9591, + "theoretical_loss": 3.719601567033748, + "tokens_seen": 821028864 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037938816449348045, + "loss": 2.8944, + "theoretical_loss": 3.7195722741792263, + "tokens_seen": 821094400 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037937813440320963, + "loss": 2.9245, + "theoretical_loss": 3.7195429843172203, + "tokens_seen": 821159936 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003793681043129388, + "loss": 2.904, + "theoretical_loss": 3.7195136974471836, + "tokens_seen": 821225472 + }, + { + "epoch": 10.0, + "learning_rate": 0.000379358074222668, + "loss": 2.9131, + "theoretical_loss": 3.7194844135685736, + "tokens_seen": 821291008 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037934804413239723, + "loss": 2.9079, + "theoretical_loss": 3.7194551326808454, + "tokens_seen": 821356544 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037933801404212635, + "loss": 2.9548, + "theoretical_loss": 3.719425854783455, + "tokens_seen": 821422080 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003793279839518556, + "loss": 2.8786, + "theoretical_loss": 3.719396579875858, + "tokens_seen": 821487616 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003793179538615847, + "loss": 2.9272, + "theoretical_loss": 3.719367307957511, + "tokens_seen": 821553152 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037930792377131395, + "loss": 2.9498, + "theoretical_loss": 3.719338039027871, + "tokens_seen": 821618688 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037929789368104313, + "loss": 2.9592, + "theoretical_loss": 3.719308773086393, + "tokens_seen": 821684224 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003792878635907723, + "loss": 3.0151, + "theoretical_loss": 3.719279510132535, + "tokens_seen": 821749760 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003792778335005015, + "loss": 2.9885, + "theoretical_loss": 3.7192502501657536, + "tokens_seen": 821815296 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037926780341023073, + "loss": 2.9893, + "theoretical_loss": 3.7192209931855045, + "tokens_seen": 821880832 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037925777331995986, + "loss": 2.9826, + "theoretical_loss": 3.7191917391912463, + "tokens_seen": 821946368 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003792477432296891, + "loss": 2.8321, + "theoretical_loss": 3.7191624881824357, + "tokens_seen": 822011904 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003792377131394182, + "loss": 2.9092, + "theoretical_loss": 3.7191332401585293, + "tokens_seen": 822077440 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037922768304914746, + "loss": 2.9339, + "theoretical_loss": 3.7191039951189855, + "tokens_seen": 822142976 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037921765295887664, + "loss": 3.0253, + "theoretical_loss": 3.7190747530632615, + "tokens_seen": 822208512 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003792076228686058, + "loss": 2.9528, + "theoretical_loss": 3.719045513990815, + "tokens_seen": 822274048 + }, + { + "epoch": 10.0, + "learning_rate": 0.000379197592778335, + "loss": 2.9254, + "theoretical_loss": 3.7190162779011047, + "tokens_seen": 822339584 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 1968695, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9721128940582275, + "objective/train/theoretical_loss": 3.7189870447935873, + "objective/train/tokens_used": 842865120, + "theoretical_loss": 3.7189870447935873, + "tokens_seen": 822405120 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003791875626880642, + "loss": 2.9901, + "theoretical_loss": 3.7189870447935873, + "tokens_seen": 822405120 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037917753259779336, + "loss": 2.9685, + "theoretical_loss": 3.7189578146677222, + "tokens_seen": 822470656 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003791675025075226, + "loss": 2.8924, + "theoretical_loss": 3.718928587522967, + "tokens_seen": 822536192 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003791574724172517, + "loss": 2.9515, + "theoretical_loss": 3.718899363358781, + "tokens_seen": 822601728 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037914744232698096, + "loss": 2.832, + "theoretical_loss": 3.7188701421746213, + "tokens_seen": 822667264 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003791374122367101, + "loss": 2.9969, + "theoretical_loss": 3.7188409239699487, + "tokens_seen": 822732800 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003791273821464393, + "loss": 2.9688, + "theoretical_loss": 3.7188117087442203, + "tokens_seen": 822798336 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037911735205616856, + "loss": 2.9791, + "theoretical_loss": 3.718782496496896, + "tokens_seen": 822863872 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003791073219658977, + "loss": 2.9627, + "theoretical_loss": 3.718753287227435, + "tokens_seen": 822929408 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003790972918756269, + "loss": 2.8469, + "theoretical_loss": 3.7187240809352966, + "tokens_seen": 822994944 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003790872617853561, + "loss": 2.9157, + "theoretical_loss": 3.7186948776199404, + "tokens_seen": 823060480 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003790772316950853, + "loss": 2.945, + "theoretical_loss": 3.7186656772808258, + "tokens_seen": 823126016 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037906720160481446, + "loss": 2.9791, + "theoretical_loss": 3.7186364799174116, + "tokens_seen": 823191552 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037905717151454364, + "loss": 2.8641, + "theoretical_loss": 3.7186072855291594, + "tokens_seen": 823257088 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003790471414242728, + "loss": 2.9948, + "theoretical_loss": 3.7185780941155286, + "tokens_seen": 823322624 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037903711133400206, + "loss": 3.04, + "theoretical_loss": 3.718548905675979, + "tokens_seen": 823388160 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003790270812437312, + "loss": 2.9498, + "theoretical_loss": 3.7185197202099705, + "tokens_seen": 823453696 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003790170511534604, + "loss": 2.9814, + "theoretical_loss": 3.718490537716965, + "tokens_seen": 823519232 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037900702106318955, + "loss": 2.9381, + "theoretical_loss": 3.7184613581964214, + "tokens_seen": 823584768 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003789969909729188, + "loss": 2.9569, + "theoretical_loss": 3.718432181647802, + "tokens_seen": 823650304 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037898696088264796, + "loss": 2.9597, + "theoretical_loss": 3.718403008070567, + "tokens_seen": 823715840 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037897693079237715, + "loss": 3.0373, + "theoretical_loss": 3.7183738374641777, + "tokens_seen": 823781376 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003789669007021063, + "loss": 2.9705, + "theoretical_loss": 3.7183446698280944, + "tokens_seen": 823846912 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037895687061183556, + "loss": 2.9627, + "theoretical_loss": 3.718315505161779, + "tokens_seen": 823912448 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003789468405215647, + "loss": 2.9073, + "theoretical_loss": 3.718286343464693, + "tokens_seen": 823977984 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 1972398, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.069864273071289, + "objective/train/theoretical_loss": 3.718257184736298, + "objective/train/tokens_used": 844503520, + "theoretical_loss": 3.718257184736298, + "tokens_seen": 824043520 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003789368104312939, + "loss": 2.938, + "theoretical_loss": 3.718257184736298, + "tokens_seen": 824043520 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037892678034102305, + "loss": 2.9886, + "theoretical_loss": 3.7182280289760556, + "tokens_seen": 824109056 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003789167502507523, + "loss": 2.9129, + "theoretical_loss": 3.7181988761834277, + "tokens_seen": 824174592 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037890672016048147, + "loss": 2.8301, + "theoretical_loss": 3.7181697263578766, + "tokens_seen": 824240128 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037889669007021065, + "loss": 2.9549, + "theoretical_loss": 3.7181405794988636, + "tokens_seen": 824305664 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037888665997993983, + "loss": 2.9942, + "theoretical_loss": 3.7181114356058518, + "tokens_seen": 824371200 + }, + { + "epoch": 10.0, + "learning_rate": 0.000378876629889669, + "loss": 2.8782, + "theoretical_loss": 3.718082294678303, + "tokens_seen": 824436736 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003788665997993982, + "loss": 2.9339, + "theoretical_loss": 3.7180531567156807, + "tokens_seen": 824502272 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037885656970912743, + "loss": 2.8915, + "theoretical_loss": 3.7180240217174463, + "tokens_seen": 824567808 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037884653961885655, + "loss": 2.914, + "theoretical_loss": 3.7179948896830637, + "tokens_seen": 824633344 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003788365095285858, + "loss": 2.8798, + "theoretical_loss": 3.7179657606119956, + "tokens_seen": 824698880 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003788264794383149, + "loss": 2.9901, + "theoretical_loss": 3.7179366345037046, + "tokens_seen": 824764416 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037881644934804415, + "loss": 2.8267, + "theoretical_loss": 3.7179075113576547, + "tokens_seen": 824829952 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037880641925777333, + "loss": 2.9059, + "theoretical_loss": 3.717878391173309, + "tokens_seen": 824895488 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003787963891675025, + "loss": 2.9932, + "theoretical_loss": 3.717849273950131, + "tokens_seen": 824961024 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003787863590772317, + "loss": 2.9535, + "theoretical_loss": 3.7178201596875846, + "tokens_seen": 825026560 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037877632898696093, + "loss": 3.094, + "theoretical_loss": 3.7177910483851333, + "tokens_seen": 825092096 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037876629889669006, + "loss": 2.8476, + "theoretical_loss": 3.7177619400422417, + "tokens_seen": 825157632 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003787562688064193, + "loss": 2.8958, + "theoretical_loss": 3.7177328346583725, + "tokens_seen": 825223168 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003787462387161484, + "loss": 3.022, + "theoretical_loss": 3.7177037322329918, + "tokens_seen": 825288704 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037873620862587766, + "loss": 2.939, + "theoretical_loss": 3.7176746327655623, + "tokens_seen": 825354240 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037872617853560684, + "loss": 2.8888, + "theoretical_loss": 3.7176455362555494, + "tokens_seen": 825419776 + }, + { + "epoch": 10.0, + "learning_rate": 0.000378716148445336, + "loss": 2.9531, + "theoretical_loss": 3.717616442702418, + "tokens_seen": 825485312 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003787061183550652, + "loss": 2.893, + "theoretical_loss": 3.717587352105632, + "tokens_seen": 825550848 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003786960882647944, + "loss": 3.0368, + "theoretical_loss": 3.717558264464657, + "tokens_seen": 825616384 + }, + { + "epoch": 10.0, + "objective/train/docs_used": 1975302, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8653371334075928, + "objective/train/theoretical_loss": 3.7175291797789582, + "objective/train/tokens_used": 846141920, + "theoretical_loss": 3.7175291797789582, + "tokens_seen": 825681920 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037868605817452356, + "loss": 2.8911, + "theoretical_loss": 3.7175291797789582, + "tokens_seen": 825681920 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003786760280842528, + "loss": 3.0318, + "theoretical_loss": 3.717500098048, + "tokens_seen": 825747456 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003786659979939819, + "loss": 2.9032, + "theoretical_loss": 3.7174710192712492, + "tokens_seen": 825812992 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037865596790371116, + "loss": 2.8375, + "theoretical_loss": 3.71744194344817, + "tokens_seen": 825878528 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003786459378134403, + "loss": 2.8382, + "theoretical_loss": 3.717412870578228, + "tokens_seen": 825944064 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003786359077231695, + "loss": 2.8101, + "theoretical_loss": 3.7173838006608895, + "tokens_seen": 826009600 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003786258776328987, + "loss": 2.9326, + "theoretical_loss": 3.717354733695621, + "tokens_seen": 826075136 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003786158475426279, + "loss": 2.9158, + "theoretical_loss": 3.7173256696818875, + "tokens_seen": 826140672 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037860581745235706, + "loss": 2.8256, + "theoretical_loss": 3.7172966086191552, + "tokens_seen": 826206208 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003785957873620863, + "loss": 2.9353, + "theoretical_loss": 3.7172675505068913, + "tokens_seen": 826271744 + }, + { + "epoch": 10.0, + "learning_rate": 0.0003785857572718154, + "loss": 2.9763, + "theoretical_loss": 3.717238495344561, + "tokens_seen": 826337280 + }, + { + "epoch": 10.0, + "learning_rate": 0.00037857572718154466, + "loss": 2.9205, + "theoretical_loss": 3.7172094431316323, + "tokens_seen": 826402816 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003785656970912738, + "loss": 2.9731, + "theoretical_loss": 3.7171803938675714, + "tokens_seen": 826468352 + }, + { + "epoch": 10.01, + "learning_rate": 0.000378555667001003, + "loss": 3.0111, + "theoretical_loss": 3.7171513475518445, + "tokens_seen": 826533888 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003785456369107322, + "loss": 3.0024, + "theoretical_loss": 3.7171223041839196, + "tokens_seen": 826599424 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003785356068204614, + "loss": 2.9834, + "theoretical_loss": 3.717093263763264, + "tokens_seen": 826664960 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037852557673019057, + "loss": 2.8756, + "theoretical_loss": 3.7170642262893434, + "tokens_seen": 826730496 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037851554663991975, + "loss": 2.9981, + "theoretical_loss": 3.717035191761627, + "tokens_seen": 826796032 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037850551654964893, + "loss": 2.9272, + "theoretical_loss": 3.7170061601795816, + "tokens_seen": 826861568 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037849548645937816, + "loss": 2.9568, + "theoretical_loss": 3.7169771315426745, + "tokens_seen": 826927104 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003784854563691073, + "loss": 2.8886, + "theoretical_loss": 3.7169481058503746, + "tokens_seen": 826992640 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003784754262788365, + "loss": 2.8248, + "theoretical_loss": 3.716919083102149, + "tokens_seen": 827058176 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037846539618856565, + "loss": 2.85, + "theoretical_loss": 3.7168900632974666, + "tokens_seen": 827123712 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003784553660982949, + "loss": 2.9418, + "theoretical_loss": 3.7168610464357954, + "tokens_seen": 827189248 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037844533600802407, + "loss": 2.8914, + "theoretical_loss": 3.716832032516603, + "tokens_seen": 827254784 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 1980177, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.640199661254883, + "objective/train/theoretical_loss": 3.7168030215393593, + "objective/train/tokens_used": 847780320, + "theoretical_loss": 3.7168030215393593, + "tokens_seen": 827320320 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037843530591775325, + "loss": 2.831, + "theoretical_loss": 3.7168030215393593, + "tokens_seen": 827320320 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037842527582748243, + "loss": 2.9034, + "theoretical_loss": 3.7167740135035316, + "tokens_seen": 827385856 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037841524573721167, + "loss": 2.9452, + "theoretical_loss": 3.71674500840859, + "tokens_seen": 827451392 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003784052156469408, + "loss": 2.9881, + "theoretical_loss": 3.716716006254002, + "tokens_seen": 827516928 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037839518555667003, + "loss": 2.7197, + "theoretical_loss": 3.716687007039238, + "tokens_seen": 827582464 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037838515546639916, + "loss": 2.8503, + "theoretical_loss": 3.716658010763767, + "tokens_seen": 827648000 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003783751253761284, + "loss": 2.8545, + "theoretical_loss": 3.716629017427058, + "tokens_seen": 827713536 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037836509528585763, + "loss": 2.9451, + "theoretical_loss": 3.716600027028581, + "tokens_seen": 827779072 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037835506519558675, + "loss": 2.9073, + "theoretical_loss": 3.7165710395678047, + "tokens_seen": 827844608 + }, + { + "epoch": 10.01, + "learning_rate": 0.000378345035105316, + "loss": 2.868, + "theoretical_loss": 3.7165420550442, + "tokens_seen": 827910144 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003783350050150451, + "loss": 2.8891, + "theoretical_loss": 3.716513073457236, + "tokens_seen": 827975680 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037832497492477435, + "loss": 2.9675, + "theoretical_loss": 3.7164840948063835, + "tokens_seen": 828041216 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037831494483450353, + "loss": 2.9515, + "theoretical_loss": 3.716455119091112, + "tokens_seen": 828106752 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003783049147442327, + "loss": 2.9378, + "theoretical_loss": 3.7164261463108916, + "tokens_seen": 828172288 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003782948846539619, + "loss": 2.7694, + "theoretical_loss": 3.716397176465194, + "tokens_seen": 828237824 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037828485456369113, + "loss": 2.9526, + "theoretical_loss": 3.7163682095534893, + "tokens_seen": 828303360 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037827482447342026, + "loss": 2.9831, + "theoretical_loss": 3.7163392455752478, + "tokens_seen": 828368896 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003782647943831495, + "loss": 3.0281, + "theoretical_loss": 3.71631028452994, + "tokens_seen": 828434432 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003782547642928786, + "loss": 2.938, + "theoretical_loss": 3.7162813264170382, + "tokens_seen": 828499968 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037824473420260786, + "loss": 3.007, + "theoretical_loss": 3.7162523712360134, + "tokens_seen": 828565504 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037823470411233704, + "loss": 2.7916, + "theoretical_loss": 3.7162234189863357, + "tokens_seen": 828631040 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003782246740220662, + "loss": 2.9097, + "theoretical_loss": 3.716194469667477, + "tokens_seen": 828696576 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003782146439317954, + "loss": 2.9192, + "theoretical_loss": 3.7161655232789097, + "tokens_seen": 828762112 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003782046138415246, + "loss": 2.7993, + "theoretical_loss": 3.716136579820105, + "tokens_seen": 828827648 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037819458375125376, + "loss": 2.859, + "theoretical_loss": 3.7161076392905343, + "tokens_seen": 828893184 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 1983117, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.842670202255249, + "objective/train/theoretical_loss": 3.7160787016896704, + "objective/train/tokens_used": 849418720, + "theoretical_loss": 3.7160787016896704, + "tokens_seen": 828958720 + }, + { + "epoch": 10.01, + "learning_rate": 0.000378184553660983, + "loss": 2.8895, + "theoretical_loss": 3.7160787016896704, + "tokens_seen": 828958720 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003781745235707121, + "loss": 2.8951, + "theoretical_loss": 3.716049767016984, + "tokens_seen": 829024256 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037816449348044136, + "loss": 2.9436, + "theoretical_loss": 3.7160208352719497, + "tokens_seen": 829089792 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003781544633901705, + "loss": 2.9415, + "theoretical_loss": 3.7159919064540374, + "tokens_seen": 829155328 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003781444332998997, + "loss": 2.9158, + "theoretical_loss": 3.715962980562721, + "tokens_seen": 829220864 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003781344032096289, + "loss": 2.9215, + "theoretical_loss": 3.7159340575974733, + "tokens_seen": 829286400 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003781243731193581, + "loss": 3.0254, + "theoretical_loss": 3.715905137557767, + "tokens_seen": 829351936 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037811434302908726, + "loss": 2.9498, + "theoretical_loss": 3.715876220443074, + "tokens_seen": 829417472 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003781043129388165, + "loss": 2.9099, + "theoretical_loss": 3.7158473062528685, + "tokens_seen": 829483008 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003780942828485456, + "loss": 3.0028, + "theoretical_loss": 3.7158183949866235, + "tokens_seen": 829548544 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037808425275827486, + "loss": 2.8584, + "theoretical_loss": 3.715789486643812, + "tokens_seen": 829614080 + }, + { + "epoch": 10.01, + "learning_rate": 0.000378074222668004, + "loss": 2.9706, + "theoretical_loss": 3.7157605812239076, + "tokens_seen": 829679616 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003780641925777332, + "loss": 2.9984, + "theoretical_loss": 3.7157316787263843, + "tokens_seen": 829745152 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003780541624874624, + "loss": 2.9201, + "theoretical_loss": 3.715702779150716, + "tokens_seen": 829810688 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003780441323971916, + "loss": 2.9577, + "theoretical_loss": 3.7156738824963753, + "tokens_seen": 829876224 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037803410230692077, + "loss": 3.0202, + "theoretical_loss": 3.715644988762837, + "tokens_seen": 829941760 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037802407221664995, + "loss": 2.9625, + "theoretical_loss": 3.7156160979495763, + "tokens_seen": 830007296 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037801404212637913, + "loss": 2.9674, + "theoretical_loss": 3.715587210056066, + "tokens_seen": 830072832 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037800401203610836, + "loss": 2.9298, + "theoretical_loss": 3.715558325081781, + "tokens_seen": 830138368 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003779939819458375, + "loss": 2.8798, + "theoretical_loss": 3.7155294430261963, + "tokens_seen": 830203904 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037798395185556673, + "loss": 2.8864, + "theoretical_loss": 3.715500563888786, + "tokens_seen": 830269440 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037798395185556673, + "loss": 2.8333, + "theoretical_loss": 3.7154716876690257, + "tokens_seen": 830334976 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037797392176529585, + "loss": 2.9773, + "theoretical_loss": 3.71544281436639, + "tokens_seen": 830400512 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003779638916750251, + "loss": 2.8392, + "theoretical_loss": 3.7154139439803533, + "tokens_seen": 830466048 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037795386158475427, + "loss": 2.9393, + "theoretical_loss": 3.7153850765103917, + "tokens_seen": 830531584 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 1987997, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1138222217559814, + "objective/train/theoretical_loss": 3.7153562119559806, + "objective/train/tokens_used": 851057120, + "theoretical_loss": 3.7153562119559806, + "tokens_seen": 830597120 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037794383149448345, + "loss": 3.0622, + "theoretical_loss": 3.7153562119559806, + "tokens_seen": 830597120 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037793380140421263, + "loss": 2.9696, + "theoretical_loss": 3.7153273503165956, + "tokens_seen": 830662656 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037792377131394187, + "loss": 2.9149, + "theoretical_loss": 3.7152984915917115, + "tokens_seen": 830728192 + }, + { + "epoch": 10.01, + "learning_rate": 0.000377913741223671, + "loss": 2.9363, + "theoretical_loss": 3.715269635780805, + "tokens_seen": 830793728 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037790371113340023, + "loss": 2.9478, + "theoretical_loss": 3.7152407828833516, + "tokens_seen": 830859264 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037789368104312936, + "loss": 2.9596, + "theoretical_loss": 3.7152119328988276, + "tokens_seen": 830924800 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003778836509528586, + "loss": 2.9223, + "theoretical_loss": 3.715183085826709, + "tokens_seen": 830990336 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003778736208625878, + "loss": 3.0061, + "theoretical_loss": 3.7151542416664722, + "tokens_seen": 831055872 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037786359077231695, + "loss": 2.9872, + "theoretical_loss": 3.7151254004175938, + "tokens_seen": 831121408 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037785356068204614, + "loss": 2.9254, + "theoretical_loss": 3.7150965620795504, + "tokens_seen": 831186944 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003778435305917753, + "loss": 2.9842, + "theoretical_loss": 3.7150677266518186, + "tokens_seen": 831252480 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003778335005015045, + "loss": 2.8936, + "theoretical_loss": 3.7150388941338752, + "tokens_seen": 831318016 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037782347041123373, + "loss": 2.97, + "theoretical_loss": 3.715010064525198, + "tokens_seen": 831383552 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037781344032096286, + "loss": 2.8546, + "theoretical_loss": 3.714981237825263, + "tokens_seen": 831449088 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003778034102306921, + "loss": 3.0234, + "theoretical_loss": 3.714952414033548, + "tokens_seen": 831514624 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003777933801404213, + "loss": 2.9654, + "theoretical_loss": 3.7149235931495306, + "tokens_seen": 831580160 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037778335005015046, + "loss": 2.7486, + "theoretical_loss": 3.714894775172688, + "tokens_seen": 831645696 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037777331995987964, + "loss": 3.0078, + "theoretical_loss": 3.7148659601024985, + "tokens_seen": 831711232 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003777632898696088, + "loss": 2.9772, + "theoretical_loss": 3.714837147938439, + "tokens_seen": 831776768 + }, + { + "epoch": 10.01, + "learning_rate": 0.000377753259779338, + "loss": 2.9382, + "theoretical_loss": 3.714808338679988, + "tokens_seen": 831842304 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037774322968906724, + "loss": 2.9576, + "theoretical_loss": 3.714779532326624, + "tokens_seen": 831907840 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037773319959879636, + "loss": 2.9001, + "theoretical_loss": 3.7147507288778243, + "tokens_seen": 831973376 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003777231695085256, + "loss": 2.9336, + "theoretical_loss": 3.714721928333068, + "tokens_seen": 832038912 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003777131394182547, + "loss": 2.8393, + "theoretical_loss": 3.714693130691834, + "tokens_seen": 832104448 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037770310932798396, + "loss": 2.9987, + "theoretical_loss": 3.7146643359536, + "tokens_seen": 832169984 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 1991843, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.031172513961792, + "objective/train/theoretical_loss": 3.7146355441178445, + "objective/train/tokens_used": 852695520, + "theoretical_loss": 3.7146355441178445, + "tokens_seen": 832235520 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037769307923771314, + "loss": 2.9333, + "theoretical_loss": 3.7146355441178445, + "tokens_seen": 832235520 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003776830491474423, + "loss": 2.9543, + "theoretical_loss": 3.7146067551840476, + "tokens_seen": 832301056 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003776730190571715, + "loss": 2.9674, + "theoretical_loss": 3.714577969151687, + "tokens_seen": 832366592 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003776629889669007, + "loss": 2.9484, + "theoretical_loss": 3.7145491860202435, + "tokens_seen": 832432128 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037765295887662987, + "loss": 2.9925, + "theoretical_loss": 3.7145204057891954, + "tokens_seen": 832497664 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003776429287863591, + "loss": 2.9804, + "theoretical_loss": 3.7144916284580214, + "tokens_seen": 832563200 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037763289869608823, + "loss": 2.9119, + "theoretical_loss": 3.714462854026203, + "tokens_seen": 832628736 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037762286860581746, + "loss": 2.8909, + "theoretical_loss": 3.7144340824932183, + "tokens_seen": 832694272 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003776128385155467, + "loss": 2.9574, + "theoretical_loss": 3.714405313858548, + "tokens_seen": 832759808 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003776028084252758, + "loss": 3.0046, + "theoretical_loss": 3.7143765481216713, + "tokens_seen": 832825344 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037759277833500506, + "loss": 2.9728, + "theoretical_loss": 3.7143477852820697, + "tokens_seen": 832890880 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003775827482447342, + "loss": 2.973, + "theoretical_loss": 3.7143190253392215, + "tokens_seen": 832956416 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003775727181544634, + "loss": 2.9425, + "theoretical_loss": 3.7142902682926087, + "tokens_seen": 833021952 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003775626880641926, + "loss": 2.8855, + "theoretical_loss": 3.714261514141711, + "tokens_seen": 833087488 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003775526579739218, + "loss": 3.0186, + "theoretical_loss": 3.71423276288601, + "tokens_seen": 833153024 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037754262788365097, + "loss": 2.8839, + "theoretical_loss": 3.714204014524985, + "tokens_seen": 833218560 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037753259779338015, + "loss": 3.0461, + "theoretical_loss": 3.714175269058118, + "tokens_seen": 833284096 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037752256770310933, + "loss": 3.0273, + "theoretical_loss": 3.71414652648489, + "tokens_seen": 833349632 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037751253761283857, + "loss": 2.9931, + "theoretical_loss": 3.7141177868047817, + "tokens_seen": 833415168 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003775025075225677, + "loss": 2.8505, + "theoretical_loss": 3.7140890500172743, + "tokens_seen": 833480704 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037749247743229693, + "loss": 2.9457, + "theoretical_loss": 3.7140603161218504, + "tokens_seen": 833546240 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037748244734202605, + "loss": 2.8545, + "theoretical_loss": 3.71403158511799, + "tokens_seen": 833611776 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003774724172517553, + "loss": 2.8158, + "theoretical_loss": 3.7140028570051764, + "tokens_seen": 833677312 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037746238716148447, + "loss": 2.809, + "theoretical_loss": 3.7139741317828907, + "tokens_seen": 833742848 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037745235707121365, + "loss": 3.0005, + "theoretical_loss": 3.7139454094506146, + "tokens_seen": 833808384 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 1996596, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.963710308074951, + "objective/train/theoretical_loss": 3.71391669000783, + "objective/train/tokens_used": 854333920, + "theoretical_loss": 3.71391669000783, + "tokens_seen": 833873920 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037744232698094283, + "loss": 2.9245, + "theoretical_loss": 3.71391669000783, + "tokens_seen": 833873920 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037743229689067207, + "loss": 2.9219, + "theoretical_loss": 3.7138879734540207, + "tokens_seen": 833939456 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003774222668004012, + "loss": 2.9821, + "theoretical_loss": 3.7138592597886673, + "tokens_seen": 834004992 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037741223671013043, + "loss": 2.931, + "theoretical_loss": 3.7138305490112535, + "tokens_seen": 834070528 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037740220661985956, + "loss": 3.0226, + "theoretical_loss": 3.7138018411212617, + "tokens_seen": 834136064 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003773921765295888, + "loss": 3.0087, + "theoretical_loss": 3.7137731361181743, + "tokens_seen": 834201600 + }, + { + "epoch": 10.01, + "learning_rate": 0.000377382146439318, + "loss": 2.9546, + "theoretical_loss": 3.713744434001474, + "tokens_seen": 834267136 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037737211634904715, + "loss": 3.0155, + "theoretical_loss": 3.713715734770645, + "tokens_seen": 834332672 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037736208625877634, + "loss": 2.999, + "theoretical_loss": 3.7136870384251695, + "tokens_seen": 834398208 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003773520561685055, + "loss": 2.9002, + "theoretical_loss": 3.7136583449645313, + "tokens_seen": 834463744 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003773420260782347, + "loss": 2.9174, + "theoretical_loss": 3.7136296543882135, + "tokens_seen": 834529280 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037733199598796393, + "loss": 3.0008, + "theoretical_loss": 3.7136009666957, + "tokens_seen": 834594816 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037732196589769306, + "loss": 2.9741, + "theoretical_loss": 3.7135722818864743, + "tokens_seen": 834660352 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003773119358074223, + "loss": 3.0114, + "theoretical_loss": 3.7135435999600204, + "tokens_seen": 834725888 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003773019057171515, + "loss": 2.9034, + "theoretical_loss": 3.7135149209158222, + "tokens_seen": 834791424 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037729187562688066, + "loss": 2.9573, + "theoretical_loss": 3.713486244753364, + "tokens_seen": 834856960 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037728184553660984, + "loss": 2.8732, + "theoretical_loss": 3.71345757147213, + "tokens_seen": 834922496 + }, + { + "epoch": 10.01, + "learning_rate": 0.000377271815446339, + "loss": 2.8861, + "theoretical_loss": 3.7134289010716044, + "tokens_seen": 834988032 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003772617853560682, + "loss": 2.9144, + "theoretical_loss": 3.7134002335512717, + "tokens_seen": 835053568 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037725175526579744, + "loss": 2.9762, + "theoretical_loss": 3.713371568910617, + "tokens_seen": 835119104 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037724172517552656, + "loss": 2.8789, + "theoretical_loss": 3.713342907149124, + "tokens_seen": 835184640 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003772316950852558, + "loss": 3.0015, + "theoretical_loss": 3.713314248266279, + "tokens_seen": 835250176 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003772216649949849, + "loss": 2.9069, + "theoretical_loss": 3.713285592261567, + "tokens_seen": 835315712 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037721163490471416, + "loss": 2.9162, + "theoretical_loss": 3.713256939134472, + "tokens_seen": 835381248 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037720160481444334, + "loss": 2.8955, + "theoretical_loss": 3.7132282888844794, + "tokens_seen": 835446784 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 1999676, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9058220386505127, + "objective/train/theoretical_loss": 3.7131996415110757, + "objective/train/tokens_used": 855972320, + "theoretical_loss": 3.7131996415110757, + "tokens_seen": 835512320 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003771915747241725, + "loss": 2.9223, + "theoretical_loss": 3.7131996415110757, + "tokens_seen": 835512320 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003771815446339017, + "loss": 2.8843, + "theoretical_loss": 3.713170997013746, + "tokens_seen": 835577856 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003771715145436309, + "loss": 2.9346, + "theoretical_loss": 3.713142355391976, + "tokens_seen": 835643392 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037716148445336007, + "loss": 2.8981, + "theoretical_loss": 3.7131137166452515, + "tokens_seen": 835708928 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003771514543630893, + "loss": 2.9923, + "theoretical_loss": 3.713085080773059, + "tokens_seen": 835774464 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037714142427281843, + "loss": 2.9105, + "theoretical_loss": 3.713056447774883, + "tokens_seen": 835840000 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037713139418254766, + "loss": 2.8979, + "theoretical_loss": 3.7130278176502114, + "tokens_seen": 835905536 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037712136409227685, + "loss": 3.0656, + "theoretical_loss": 3.71299919039853, + "tokens_seen": 835971072 + }, + { + "epoch": 10.01, + "learning_rate": 0.000377111334002006, + "loss": 2.9582, + "theoretical_loss": 3.7129705660193255, + "tokens_seen": 836036608 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003771013039117352, + "loss": 2.9851, + "theoretical_loss": 3.7129419445120844, + "tokens_seen": 836102144 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003770912738214644, + "loss": 2.9655, + "theoretical_loss": 3.7129133258762934, + "tokens_seen": 836167680 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037708124373119357, + "loss": 2.9774, + "theoretical_loss": 3.712884710111439, + "tokens_seen": 836233216 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003770712136409228, + "loss": 2.884, + "theoretical_loss": 3.712856097217009, + "tokens_seen": 836298752 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037706118355065193, + "loss": 3.0418, + "theoretical_loss": 3.71282748719249, + "tokens_seen": 836364288 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037705115346038117, + "loss": 2.9346, + "theoretical_loss": 3.71279888003737, + "tokens_seen": 836429824 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003770411233701103, + "loss": 2.9543, + "theoretical_loss": 3.7127702757511356, + "tokens_seen": 836495360 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037703109327983953, + "loss": 2.9056, + "theoretical_loss": 3.712741674333275, + "tokens_seen": 836560896 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003770210631895687, + "loss": 2.8875, + "theoretical_loss": 3.7127130757832747, + "tokens_seen": 836626432 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003770110330992979, + "loss": 2.8656, + "theoretical_loss": 3.7126844801006245, + "tokens_seen": 836691968 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003770010030090271, + "loss": 3.0058, + "theoretical_loss": 3.7126558872848108, + "tokens_seen": 836757504 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037699097291875625, + "loss": 2.9512, + "theoretical_loss": 3.712627297335322, + "tokens_seen": 836823040 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037698094282848544, + "loss": 3.0212, + "theoretical_loss": 3.712598710251647, + "tokens_seen": 836888576 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037697091273821467, + "loss": 2.957, + "theoretical_loss": 3.7125701260332735, + "tokens_seen": 836954112 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003769608826479438, + "loss": 2.9608, + "theoretical_loss": 3.71254154467969, + "tokens_seen": 837019648 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037695085255767303, + "loss": 2.9129, + "theoretical_loss": 3.7125129661903853, + "tokens_seen": 837085184 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2002562, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8506319522857666, + "objective/train/theoretical_loss": 3.712484390564848, + "objective/train/tokens_used": 857610720, + "theoretical_loss": 3.712484390564848, + "tokens_seen": 837150720 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003769408224674022, + "loss": 2.9962, + "theoretical_loss": 3.712484390564848, + "tokens_seen": 837150720 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003769307923771314, + "loss": 3.0438, + "theoretical_loss": 3.7124558178025673, + "tokens_seen": 837216256 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003769207622868606, + "loss": 3.0124, + "theoretical_loss": 3.7124272479030314, + "tokens_seen": 837281792 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037691073219658976, + "loss": 2.9184, + "theoretical_loss": 3.7123986808657303, + "tokens_seen": 837347328 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037690070210631894, + "loss": 3.037, + "theoretical_loss": 3.712370116690153, + "tokens_seen": 837412864 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003768906720160482, + "loss": 2.9429, + "theoretical_loss": 3.7123415553757892, + "tokens_seen": 837478400 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003768806419257773, + "loss": 2.925, + "theoretical_loss": 3.712312996922128, + "tokens_seen": 837543936 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037687061183550654, + "loss": 2.9072, + "theoretical_loss": 3.7122844413286584, + "tokens_seen": 837609472 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003768605817452357, + "loss": 2.9144, + "theoretical_loss": 3.7122558885948718, + "tokens_seen": 837675008 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003768505516549649, + "loss": 2.8512, + "theoretical_loss": 3.7122273387202567, + "tokens_seen": 837740544 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037684052156469413, + "loss": 2.9696, + "theoretical_loss": 3.7121987917043042, + "tokens_seen": 837806080 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037683049147442326, + "loss": 2.9592, + "theoretical_loss": 3.7121702475465037, + "tokens_seen": 837871616 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003768204613841525, + "loss": 3.0547, + "theoretical_loss": 3.7121417062463458, + "tokens_seen": 837937152 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003768104312938817, + "loss": 2.9062, + "theoretical_loss": 3.7121131678033215, + "tokens_seen": 838002688 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037680040120361086, + "loss": 2.9267, + "theoretical_loss": 3.7120846322169205, + "tokens_seen": 838068224 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037679037111334004, + "loss": 2.8634, + "theoretical_loss": 3.7120560994866336, + "tokens_seen": 838133760 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003767803410230692, + "loss": 2.8848, + "theoretical_loss": 3.7120275696119522, + "tokens_seen": 838199296 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003767703109327984, + "loss": 2.9114, + "theoretical_loss": 3.711999042592367, + "tokens_seen": 838264832 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037676028084252764, + "loss": 2.9582, + "theoretical_loss": 3.7119705184273686, + "tokens_seen": 838330368 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037675025075225676, + "loss": 2.9212, + "theoretical_loss": 3.7119419971164493, + "tokens_seen": 838395904 + }, + { + "epoch": 10.01, + "learning_rate": 0.000376740220661986, + "loss": 2.9257, + "theoretical_loss": 3.7119134786591, + "tokens_seen": 838461440 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003767301905717151, + "loss": 2.9829, + "theoretical_loss": 3.7118849630548114, + "tokens_seen": 838526976 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037672016048144436, + "loss": 2.9494, + "theoretical_loss": 3.711856450303076, + "tokens_seen": 838592512 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037671013039117354, + "loss": 2.8135, + "theoretical_loss": 3.711827940403386, + "tokens_seen": 838658048 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003767001003009027, + "loss": 2.9829, + "theoretical_loss": 3.7117994333552318, + "tokens_seen": 838723584 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2006310, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1135005950927734, + "objective/train/theoretical_loss": 3.7117709291581065, + "objective/train/tokens_used": 859249120, + "theoretical_loss": 3.7117709291581065, + "tokens_seen": 838789120 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003766900702106319, + "loss": 3.006, + "theoretical_loss": 3.7117709291581065, + "tokens_seen": 838789120 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003766800401203611, + "loss": 3.031, + "theoretical_loss": 3.7117424278115014, + "tokens_seen": 838854656 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037667001003009027, + "loss": 2.9147, + "theoretical_loss": 3.71171392931491, + "tokens_seen": 838920192 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003766599799398195, + "loss": 2.9866, + "theoretical_loss": 3.7116854336678236, + "tokens_seen": 838985728 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037664994984954863, + "loss": 2.9564, + "theoretical_loss": 3.7116569408697355, + "tokens_seen": 839051264 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037663991975927786, + "loss": 3.0547, + "theoretical_loss": 3.711628450920138, + "tokens_seen": 839116800 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037662988966900705, + "loss": 2.9546, + "theoretical_loss": 3.711599963818524, + "tokens_seen": 839182336 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003766198595787362, + "loss": 2.9011, + "theoretical_loss": 3.7115714795643866, + "tokens_seen": 839247872 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003766098294884654, + "loss": 2.8034, + "theoretical_loss": 3.711542998157218, + "tokens_seen": 839313408 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003765997993981946, + "loss": 2.9823, + "theoretical_loss": 3.7115145195965122, + "tokens_seen": 839378944 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037658976930792377, + "loss": 3.0312, + "theoretical_loss": 3.711486043881762, + "tokens_seen": 839444480 + }, + { + "epoch": 10.01, + "learning_rate": 0.000376579739217653, + "loss": 3.0329, + "theoretical_loss": 3.7114575710124615, + "tokens_seen": 839510016 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037656970912738213, + "loss": 2.8856, + "theoretical_loss": 3.711429100988104, + "tokens_seen": 839575552 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037655967903711137, + "loss": 3.0577, + "theoretical_loss": 3.711400633808183, + "tokens_seen": 839641088 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003765496489468405, + "loss": 2.935, + "theoretical_loss": 3.7113721694721926, + "tokens_seen": 839706624 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037653961885656973, + "loss": 2.9016, + "theoretical_loss": 3.7113437079796263, + "tokens_seen": 839772160 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003765295887662989, + "loss": 3.1074, + "theoretical_loss": 3.711315249329979, + "tokens_seen": 839837696 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003765195586760281, + "loss": 2.9748, + "theoretical_loss": 3.711286793522744, + "tokens_seen": 839903232 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003765095285857573, + "loss": 2.9765, + "theoretical_loss": 3.711258340557416, + "tokens_seen": 839968768 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037649949849548645, + "loss": 2.9959, + "theoretical_loss": 3.71122989043349, + "tokens_seen": 840034304 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037648946840521564, + "loss": 2.9041, + "theoretical_loss": 3.7112014431504594, + "tokens_seen": 840099840 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037647943831494487, + "loss": 2.9828, + "theoretical_loss": 3.71117299870782, + "tokens_seen": 840165376 + }, + { + "epoch": 10.01, + "learning_rate": 0.000376469408224674, + "loss": 2.9282, + "theoretical_loss": 3.7111445571050665, + "tokens_seen": 840230912 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037645937813440323, + "loss": 2.9056, + "theoretical_loss": 3.7111161183416934, + "tokens_seen": 840296448 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003764493480441324, + "loss": 3.0576, + "theoretical_loss": 3.7110876824171966, + "tokens_seen": 840361984 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2011120, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9873898029327393, + "objective/train/theoretical_loss": 3.711059249331071, + "objective/train/tokens_used": 860887520, + "theoretical_loss": 3.711059249331071, + "tokens_seen": 840427520 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003764393179538616, + "loss": 2.957, + "theoretical_loss": 3.711059249331071, + "tokens_seen": 840427520 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003764292878635908, + "loss": 2.9473, + "theoretical_loss": 3.7110308190828114, + "tokens_seen": 840493056 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037641925777331996, + "loss": 2.9759, + "theoretical_loss": 3.711002391671914, + "tokens_seen": 840558592 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037640922768304914, + "loss": 2.9139, + "theoretical_loss": 3.7109739670978747, + "tokens_seen": 840624128 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003763991975927784, + "loss": 2.9159, + "theoretical_loss": 3.7109455453601887, + "tokens_seen": 840689664 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003763891675025075, + "loss": 3.0273, + "theoretical_loss": 3.7109171264583516, + "tokens_seen": 840755200 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037637913741223674, + "loss": 2.9697, + "theoretical_loss": 3.7108887103918597, + "tokens_seen": 840820736 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037636910732196586, + "loss": 2.9727, + "theoretical_loss": 3.71086029716021, + "tokens_seen": 840886272 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003763590772316951, + "loss": 2.9397, + "theoretical_loss": 3.7108318867628975, + "tokens_seen": 840951808 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003763490471414243, + "loss": 2.86, + "theoretical_loss": 3.71080347919942, + "tokens_seen": 841017344 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037633901705115346, + "loss": 2.9744, + "theoretical_loss": 3.7107750744692725, + "tokens_seen": 841082880 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037632898696088264, + "loss": 3.0089, + "theoretical_loss": 3.7107466725719527, + "tokens_seen": 841148416 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003763189568706119, + "loss": 3.0116, + "theoretical_loss": 3.710718273506957, + "tokens_seen": 841213952 + }, + { + "epoch": 10.01, + "learning_rate": 0.000376308926780341, + "loss": 2.9518, + "theoretical_loss": 3.710689877273783, + "tokens_seen": 841279488 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037629889669007024, + "loss": 2.9529, + "theoretical_loss": 3.7106614838719265, + "tokens_seen": 841345024 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037628886659979937, + "loss": 2.9976, + "theoretical_loss": 3.7106330933008858, + "tokens_seen": 841410560 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003762788365095286, + "loss": 2.8948, + "theoretical_loss": 3.7106047055601574, + "tokens_seen": 841476096 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003762688064192578, + "loss": 2.9401, + "theoretical_loss": 3.7105763206492397, + "tokens_seen": 841541632 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037625877632898696, + "loss": 2.9146, + "theoretical_loss": 3.7105479385676294, + "tokens_seen": 841607168 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037624874623871615, + "loss": 2.9613, + "theoretical_loss": 3.7105195593148244, + "tokens_seen": 841672704 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003762387161484453, + "loss": 2.8868, + "theoretical_loss": 3.7104911828903226, + "tokens_seen": 841738240 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003762286860581745, + "loss": 2.9814, + "theoretical_loss": 3.7104628092936225, + "tokens_seen": 841803776 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037621865596790374, + "loss": 2.9443, + "theoretical_loss": 3.7104344385242216, + "tokens_seen": 841869312 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037620862587763287, + "loss": 2.9778, + "theoretical_loss": 3.7104060705816178, + "tokens_seen": 841934848 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003761985957873621, + "loss": 2.9906, + "theoretical_loss": 3.7103777054653104, + "tokens_seen": 842000384 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2014324, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.929621934890747, + "objective/train/theoretical_loss": 3.710349343174797, + "objective/train/tokens_used": 862525920, + "theoretical_loss": 3.710349343174797, + "tokens_seen": 842065920 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037618856569709123, + "loss": 2.8882, + "theoretical_loss": 3.710349343174797, + "tokens_seen": 842065920 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037617853560682047, + "loss": 3.0099, + "theoretical_loss": 3.7103209837095763, + "tokens_seen": 842131456 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037616850551654965, + "loss": 2.9755, + "theoretical_loss": 3.7102926270691476, + "tokens_seen": 842196992 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037615847542627883, + "loss": 2.8378, + "theoretical_loss": 3.710264273253009, + "tokens_seen": 842262528 + }, + { + "epoch": 10.01, + "learning_rate": 0.000376148445336008, + "loss": 2.9667, + "theoretical_loss": 3.7102359222606602, + "tokens_seen": 842328064 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037613841524573725, + "loss": 2.9316, + "theoretical_loss": 3.7102075740915996, + "tokens_seen": 842393600 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037612838515546637, + "loss": 2.9704, + "theoretical_loss": 3.7101792287453272, + "tokens_seen": 842459136 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003761183550651956, + "loss": 2.9972, + "theoretical_loss": 3.7101508862213417, + "tokens_seen": 842524672 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003761083249749248, + "loss": 2.9005, + "theoretical_loss": 3.710122546519143, + "tokens_seen": 842590208 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037609829488465397, + "loss": 2.9941, + "theoretical_loss": 3.710094209638231, + "tokens_seen": 842655744 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003760882647943832, + "loss": 3.0373, + "theoretical_loss": 3.7100658755781044, + "tokens_seen": 842721280 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037607823470411233, + "loss": 3.0442, + "theoretical_loss": 3.710037544338264, + "tokens_seen": 842786816 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037606820461384157, + "loss": 2.9214, + "theoretical_loss": 3.710009215918209, + "tokens_seen": 842852352 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003760581745235707, + "loss": 2.9063, + "theoretical_loss": 3.7099808903174405, + "tokens_seen": 842917888 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037604814443329993, + "loss": 2.922, + "theoretical_loss": 3.709952567535458, + "tokens_seen": 842983424 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003760381143430291, + "loss": 2.907, + "theoretical_loss": 3.709924247571763, + "tokens_seen": 843048960 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003760280842527583, + "loss": 2.8455, + "theoretical_loss": 3.709895930425854, + "tokens_seen": 843114496 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003760180541624875, + "loss": 2.9139, + "theoretical_loss": 3.7098676160972337, + "tokens_seen": 843180032 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037600802407221665, + "loss": 2.9396, + "theoretical_loss": 3.709839304585402, + "tokens_seen": 843245568 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037599799398194584, + "loss": 2.8891, + "theoretical_loss": 3.709810995889859, + "tokens_seen": 843311104 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037598796389167507, + "loss": 2.8977, + "theoretical_loss": 3.7097826900101074, + "tokens_seen": 843376640 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003759779338014042, + "loss": 2.8727, + "theoretical_loss": 3.709754386945647, + "tokens_seen": 843442176 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037596790371113343, + "loss": 2.9287, + "theoretical_loss": 3.709726086695979, + "tokens_seen": 843507712 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003759578736208626, + "loss": 2.8255, + "theoretical_loss": 3.7096977892606064, + "tokens_seen": 843573248 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003759478435305918, + "loss": 2.9631, + "theoretical_loss": 3.709669494639029, + "tokens_seen": 843638784 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2018953, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7592225074768066, + "objective/train/theoretical_loss": 3.709641202830749, + "objective/train/tokens_used": 864164320, + "theoretical_loss": 3.709641202830749, + "tokens_seen": 843704320 + }, + { + "epoch": 10.01, + "learning_rate": 0.000375937813440321, + "loss": 2.7992, + "theoretical_loss": 3.709641202830749, + "tokens_seen": 843704320 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037592778335005016, + "loss": 2.8914, + "theoretical_loss": 3.7096129138352687, + "tokens_seen": 843769856 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037591775325977934, + "loss": 2.8599, + "theoretical_loss": 3.70958462765209, + "tokens_seen": 843835392 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003759077231695086, + "loss": 2.8739, + "theoretical_loss": 3.7095563442807133, + "tokens_seen": 843900928 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003758976930792377, + "loss": 2.8847, + "theoretical_loss": 3.7095280637206427, + "tokens_seen": 843966464 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037588766298896694, + "loss": 2.9881, + "theoretical_loss": 3.70949978597138, + "tokens_seen": 844032000 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037587763289869606, + "loss": 2.9533, + "theoretical_loss": 3.7094715110324272, + "tokens_seen": 844097536 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003758676028084253, + "loss": 3.0105, + "theoretical_loss": 3.709443238903287, + "tokens_seen": 844163072 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003758575727181545, + "loss": 2.9511, + "theoretical_loss": 3.709414969583462, + "tokens_seen": 844228608 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037584754262788366, + "loss": 3.0205, + "theoretical_loss": 3.7093867030724557, + "tokens_seen": 844294144 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037583751253761284, + "loss": 3.0327, + "theoretical_loss": 3.70935843936977, + "tokens_seen": 844359680 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003758274824473421, + "loss": 2.9329, + "theoretical_loss": 3.7093301784749086, + "tokens_seen": 844425216 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003758174523570712, + "loss": 3.0396, + "theoretical_loss": 3.7093019203873743, + "tokens_seen": 844490752 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037580742226680044, + "loss": 2.8999, + "theoretical_loss": 3.7092736651066707, + "tokens_seen": 844556288 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037579739217652957, + "loss": 2.941, + "theoretical_loss": 3.709245412632301, + "tokens_seen": 844621824 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003757873620862588, + "loss": 2.9431, + "theoretical_loss": 3.709217162963769, + "tokens_seen": 844687360 + }, + { + "epoch": 10.01, + "learning_rate": 0.000375777331995988, + "loss": 2.9223, + "theoretical_loss": 3.709188916100578, + "tokens_seen": 844752896 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037576730190571716, + "loss": 2.9158, + "theoretical_loss": 3.7091606720422323, + "tokens_seen": 844818432 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037575727181544635, + "loss": 2.8646, + "theoretical_loss": 3.7091324307882356, + "tokens_seen": 844883968 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003757472417251755, + "loss": 2.9882, + "theoretical_loss": 3.709104192338092, + "tokens_seen": 844949504 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003757372116349047, + "loss": 2.9806, + "theoretical_loss": 3.709075956691305, + "tokens_seen": 845015040 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037572718154463394, + "loss": 3.0185, + "theoretical_loss": 3.7090477238473802, + "tokens_seen": 845080576 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037571715145436307, + "loss": 2.7961, + "theoretical_loss": 3.709019493805821, + "tokens_seen": 845146112 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003757071213640923, + "loss": 2.9858, + "theoretical_loss": 3.708991266566133, + "tokens_seen": 845211648 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037569709127382143, + "loss": 3.0275, + "theoretical_loss": 3.70896304212782, + "tokens_seen": 845277184 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2022219, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.76697039604187, + "objective/train/theoretical_loss": 3.708934820490387, + "objective/train/tokens_used": 865802720, + "theoretical_loss": 3.708934820490387, + "tokens_seen": 845342720 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037568706118355067, + "loss": 2.7722, + "theoretical_loss": 3.708934820490387, + "tokens_seen": 845342720 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037567703109327985, + "loss": 2.9027, + "theoretical_loss": 3.7089066016533385, + "tokens_seen": 845408256 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037566700100300903, + "loss": 2.9187, + "theoretical_loss": 3.70887838561618, + "tokens_seen": 845473792 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003756569709127382, + "loss": 2.9096, + "theoretical_loss": 3.7088501723784177, + "tokens_seen": 845539328 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037564694082246745, + "loss": 2.916, + "theoretical_loss": 3.708821961939556, + "tokens_seen": 845604864 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003756369107321966, + "loss": 2.9604, + "theoretical_loss": 3.7087937542990996, + "tokens_seen": 845670400 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003756268806419258, + "loss": 2.9366, + "theoretical_loss": 3.708765549456555, + "tokens_seen": 845735936 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037561685055165494, + "loss": 2.9282, + "theoretical_loss": 3.708737347411428, + "tokens_seen": 845801472 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037560682046138417, + "loss": 2.9752, + "theoretical_loss": 3.7087091481632237, + "tokens_seen": 845867008 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037559679037111335, + "loss": 2.9433, + "theoretical_loss": 3.708680951711449, + "tokens_seen": 845932544 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037558676028084253, + "loss": 2.9429, + "theoretical_loss": 3.7086527580556092, + "tokens_seen": 845998080 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003755767301905717, + "loss": 2.8931, + "theoretical_loss": 3.7086245671952107, + "tokens_seen": 846063616 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003755667001003009, + "loss": 2.9687, + "theoretical_loss": 3.70859637912976, + "tokens_seen": 846129152 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003755566700100301, + "loss": 2.8801, + "theoretical_loss": 3.7085681938587634, + "tokens_seen": 846194688 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003755466399197593, + "loss": 2.9738, + "theoretical_loss": 3.7085400113817277, + "tokens_seen": 846260224 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037553660982948844, + "loss": 2.9373, + "theoretical_loss": 3.708511831698159, + "tokens_seen": 846325760 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003755265797392177, + "loss": 2.9342, + "theoretical_loss": 3.708483654807565, + "tokens_seen": 846391296 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003755165496489468, + "loss": 2.8738, + "theoretical_loss": 3.7084554807094516, + "tokens_seen": 846456832 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037550651955867604, + "loss": 2.9955, + "theoretical_loss": 3.708427309403327, + "tokens_seen": 846522368 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003754964894684052, + "loss": 3.0104, + "theoretical_loss": 3.708399140888697, + "tokens_seen": 846587904 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003754864593781344, + "loss": 2.9517, + "theoretical_loss": 3.708370975165071, + "tokens_seen": 846653440 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003754764292878636, + "loss": 2.9822, + "theoretical_loss": 3.7083428122319546, + "tokens_seen": 846718976 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003754663991975928, + "loss": 3.0286, + "theoretical_loss": 3.7083146520888564, + "tokens_seen": 846784512 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037545636910732194, + "loss": 2.9468, + "theoretical_loss": 3.708286494735283, + "tokens_seen": 846850048 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003754463390170512, + "loss": 2.908, + "theoretical_loss": 3.708258340170744, + "tokens_seen": 846915584 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2026006, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.064587354660034, + "objective/train/theoretical_loss": 3.7082301883947455, + "objective/train/tokens_used": 867441120, + "theoretical_loss": 3.7082301883947455, + "tokens_seen": 846981120 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003754363089267803, + "loss": 3.0122, + "theoretical_loss": 3.7082301883947455, + "tokens_seen": 846981120 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037542627883650954, + "loss": 3.0479, + "theoretical_loss": 3.708202039406797, + "tokens_seen": 847046656 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003754162487462387, + "loss": 2.9218, + "theoretical_loss": 3.7081738932064052, + "tokens_seen": 847112192 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003754062186559679, + "loss": 3.0279, + "theoretical_loss": 3.70814574979308, + "tokens_seen": 847177728 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003753961885656971, + "loss": 3.0796, + "theoretical_loss": 3.7081176091663295, + "tokens_seen": 847243264 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037538615847542626, + "loss": 2.8046, + "theoretical_loss": 3.7080894713256614, + "tokens_seen": 847308800 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037537612838515544, + "loss": 2.9192, + "theoretical_loss": 3.708061336270585, + "tokens_seen": 847374336 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003753660982948847, + "loss": 2.8685, + "theoretical_loss": 3.7080332040006088, + "tokens_seen": 847439872 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037535606820461386, + "loss": 2.9923, + "theoretical_loss": 3.708005074515243, + "tokens_seen": 847505408 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037534603811434304, + "loss": 2.9104, + "theoretical_loss": 3.707976947813995, + "tokens_seen": 847570944 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003753360080240723, + "loss": 2.9858, + "theoretical_loss": 3.7079488238963743, + "tokens_seen": 847636480 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003753259779338014, + "loss": 2.9808, + "theoretical_loss": 3.707920702761891, + "tokens_seen": 847702016 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037531594784353064, + "loss": 3.0093, + "theoretical_loss": 3.7078925844100548, + "tokens_seen": 847767552 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037530591775325977, + "loss": 2.8794, + "theoretical_loss": 3.7078644688403735, + "tokens_seen": 847833088 + }, + { + "epoch": 10.01, + "learning_rate": 0.000375295887662989, + "loss": 3.0307, + "theoretical_loss": 3.7078363560523586, + "tokens_seen": 847898624 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003752858575727182, + "loss": 2.9555, + "theoretical_loss": 3.707808246045519, + "tokens_seen": 847964160 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037527582748244736, + "loss": 2.9517, + "theoretical_loss": 3.7077801388193654, + "tokens_seen": 848029696 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037526579739217655, + "loss": 2.9952, + "theoretical_loss": 3.7077520343734074, + "tokens_seen": 848095232 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003752557673019057, + "loss": 2.9111, + "theoretical_loss": 3.7077239327071547, + "tokens_seen": 848160768 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003752457372116349, + "loss": 3.0083, + "theoretical_loss": 3.707695833820118, + "tokens_seen": 848226304 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037523570712136414, + "loss": 2.8858, + "theoretical_loss": 3.707667737711809, + "tokens_seen": 848291840 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037522567703109327, + "loss": 2.987, + "theoretical_loss": 3.7076396443817354, + "tokens_seen": 848357376 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003752156469408225, + "loss": 2.9573, + "theoretical_loss": 3.7076115538294108, + "tokens_seen": 848422912 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037520561685055163, + "loss": 3.0364, + "theoretical_loss": 3.707583466054344, + "tokens_seen": 848488448 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037519558676028087, + "loss": 2.9735, + "theoretical_loss": 3.7075553810560473, + "tokens_seen": 848553984 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2030509, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.012279987335205, + "objective/train/theoretical_loss": 3.707527298834031, + "objective/train/tokens_used": 869079520, + "theoretical_loss": 3.707527298834031, + "tokens_seen": 848619520 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037518555667001005, + "loss": 3.0274, + "theoretical_loss": 3.707527298834031, + "tokens_seen": 848619520 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037517552657973923, + "loss": 2.8778, + "theoretical_loss": 3.707499219387807, + "tokens_seen": 848685056 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003751654964894684, + "loss": 2.9576, + "theoretical_loss": 3.7074711427168854, + "tokens_seen": 848750592 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037515546639919765, + "loss": 2.9075, + "theoretical_loss": 3.7074430688207785, + "tokens_seen": 848816128 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003751454363089268, + "loss": 2.9458, + "theoretical_loss": 3.707414997698998, + "tokens_seen": 848881664 + }, + { + "epoch": 10.01, + "learning_rate": 0.000375135406218656, + "loss": 2.976, + "theoretical_loss": 3.7073869293510553, + "tokens_seen": 848947200 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037512537612838514, + "loss": 2.9821, + "theoretical_loss": 3.7073588637764616, + "tokens_seen": 849012736 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037511534603811437, + "loss": 2.9082, + "theoretical_loss": 3.70733080097473, + "tokens_seen": 849078272 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037510531594784355, + "loss": 2.9871, + "theoretical_loss": 3.707302740945371, + "tokens_seen": 849143808 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037509528585757273, + "loss": 2.9466, + "theoretical_loss": 3.707274683687899, + "tokens_seen": 849209344 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003750852557673019, + "loss": 3.0097, + "theoretical_loss": 3.7072466292018245, + "tokens_seen": 849274880 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003750752256770311, + "loss": 2.9551, + "theoretical_loss": 3.7072185774866604, + "tokens_seen": 849340416 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003750651955867603, + "loss": 2.9968, + "theoretical_loss": 3.7071905285419193, + "tokens_seen": 849405952 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003750551654964895, + "loss": 2.9302, + "theoretical_loss": 3.707162482367114, + "tokens_seen": 849471488 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037504513540621864, + "loss": 2.8763, + "theoretical_loss": 3.707134438961757, + "tokens_seen": 849537024 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003750351053159479, + "loss": 2.9278, + "theoretical_loss": 3.7071063983253616, + "tokens_seen": 849602560 + }, + { + "epoch": 10.01, + "learning_rate": 0.000375025075225677, + "loss": 2.986, + "theoretical_loss": 3.70707836045744, + "tokens_seen": 849668096 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037501504513540624, + "loss": 2.9134, + "theoretical_loss": 3.707050325357507, + "tokens_seen": 849733632 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003750050150451354, + "loss": 2.8506, + "theoretical_loss": 3.7070222930250742, + "tokens_seen": 849799168 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003749949849548646, + "loss": 2.9756, + "theoretical_loss": 3.7069942634596558, + "tokens_seen": 849864704 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003749849548645938, + "loss": 2.9454, + "theoretical_loss": 3.706966236660765, + "tokens_seen": 849930240 + }, + { + "epoch": 10.01, + "learning_rate": 0.000374974924774323, + "loss": 2.9898, + "theoretical_loss": 3.7069382126279162, + "tokens_seen": 849995776 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037496489468405214, + "loss": 2.9564, + "theoretical_loss": 3.706910191360622, + "tokens_seen": 850061312 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003749548645937814, + "loss": 2.86, + "theoretical_loss": 3.706882172858397, + "tokens_seen": 850126848 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003749448345035105, + "loss": 2.9481, + "theoretical_loss": 3.7068541571207554, + "tokens_seen": 850192384 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2033644, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0224108695983887, + "objective/train/theoretical_loss": 3.706826144147211, + "objective/train/tokens_used": 870717920, + "theoretical_loss": 3.706826144147211, + "tokens_seen": 850257920 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037493480441323974, + "loss": 2.907, + "theoretical_loss": 3.706826144147211, + "tokens_seen": 850257920 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003749247743229689, + "loss": 3.0052, + "theoretical_loss": 3.7067981339372786, + "tokens_seen": 850323456 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003749147442326981, + "loss": 3.0194, + "theoretical_loss": 3.706770126490472, + "tokens_seen": 850388992 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003749047141424273, + "loss": 2.9134, + "theoretical_loss": 3.7067421218063057, + "tokens_seen": 850454528 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037489468405215646, + "loss": 2.9574, + "theoretical_loss": 3.7067141198842943, + "tokens_seen": 850520064 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037488465396188564, + "loss": 2.7906, + "theoretical_loss": 3.706686120723953, + "tokens_seen": 850585600 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003748746238716149, + "loss": 2.9339, + "theoretical_loss": 3.7066581243247962, + "tokens_seen": 850651136 + }, + { + "epoch": 10.01, + "learning_rate": 0.000374864593781344, + "loss": 2.8741, + "theoretical_loss": 3.7066301306863396, + "tokens_seen": 850716672 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037485456369107324, + "loss": 2.9682, + "theoretical_loss": 3.7066021398080977, + "tokens_seen": 850782208 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037484453360080237, + "loss": 2.9964, + "theoretical_loss": 3.706574151689586, + "tokens_seen": 850847744 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003748345035105316, + "loss": 2.9968, + "theoretical_loss": 3.7065461663303196, + "tokens_seen": 850913280 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003748244734202608, + "loss": 3.0097, + "theoretical_loss": 3.7065181837298145, + "tokens_seen": 850978816 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037481444332998997, + "loss": 2.9075, + "theoretical_loss": 3.7064902038875855, + "tokens_seen": 851044352 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037480441323971915, + "loss": 3.0648, + "theoretical_loss": 3.7064622268031497, + "tokens_seen": 851109888 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003747943831494484, + "loss": 3.0466, + "theoretical_loss": 3.7064342524760217, + "tokens_seen": 851175424 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003747843530591775, + "loss": 2.9873, + "theoretical_loss": 3.7064062809057177, + "tokens_seen": 851240960 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037477432296890675, + "loss": 2.9681, + "theoretical_loss": 3.706378312091754, + "tokens_seen": 851306496 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037476429287863587, + "loss": 2.9152, + "theoretical_loss": 3.706350346033647, + "tokens_seen": 851372032 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003747542627883651, + "loss": 2.9722, + "theoretical_loss": 3.706322382730913, + "tokens_seen": 851437568 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003747442326980943, + "loss": 2.8689, + "theoretical_loss": 3.7062944221830683, + "tokens_seen": 851503104 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037473420260782347, + "loss": 3.0496, + "theoretical_loss": 3.7062664643896297, + "tokens_seen": 851568640 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037472417251755265, + "loss": 2.9238, + "theoretical_loss": 3.7062385093501136, + "tokens_seen": 851634176 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037471414242728183, + "loss": 2.9453, + "theoretical_loss": 3.706210557064037, + "tokens_seen": 851699712 + }, + { + "epoch": 10.01, + "learning_rate": 0.000374704112337011, + "loss": 2.9322, + "theoretical_loss": 3.706182607530917, + "tokens_seen": 851765248 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037469408224674025, + "loss": 2.9705, + "theoretical_loss": 3.7061546607502702, + "tokens_seen": 851830784 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2036651, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8395614624023438, + "objective/train/theoretical_loss": 3.7061267167216148, + "objective/train/tokens_used": 872356320, + "theoretical_loss": 3.7061267167216148, + "tokens_seen": 851896320 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003746840521564694, + "loss": 2.9652, + "theoretical_loss": 3.7061267167216148, + "tokens_seen": 851896320 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003746740220661986, + "loss": 3.0383, + "theoretical_loss": 3.706098775444467, + "tokens_seen": 851961856 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037466399197592774, + "loss": 2.8918, + "theoretical_loss": 3.706070836918345, + "tokens_seen": 852027392 + }, + { + "epoch": 10.01, + "learning_rate": 0.000374653961885657, + "loss": 2.9296, + "theoretical_loss": 3.706042901142766, + "tokens_seen": 852092928 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037464393179538615, + "loss": 2.9091, + "theoretical_loss": 3.706014968117248, + "tokens_seen": 852158464 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037463390170511534, + "loss": 2.9693, + "theoretical_loss": 3.7059870378413082, + "tokens_seen": 852224000 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037462387161484457, + "loss": 2.917, + "theoretical_loss": 3.705959110314465, + "tokens_seen": 852289536 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037461384152457375, + "loss": 2.9013, + "theoretical_loss": 3.7059311855362367, + "tokens_seen": 852355072 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037460381143430293, + "loss": 2.9547, + "theoretical_loss": 3.705903263506141, + "tokens_seen": 852420608 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003745937813440321, + "loss": 2.9534, + "theoretical_loss": 3.7058753442236965, + "tokens_seen": 852486144 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003745837512537613, + "loss": 2.9343, + "theoretical_loss": 3.705847427688422, + "tokens_seen": 852551680 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003745737211634905, + "loss": 2.9364, + "theoretical_loss": 3.7058195138998347, + "tokens_seen": 852617216 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003745636910732197, + "loss": 3.0251, + "theoretical_loss": 3.7057916028574542, + "tokens_seen": 852682752 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037455366098294884, + "loss": 2.8299, + "theoretical_loss": 3.7057636945607997, + "tokens_seen": 852748288 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003745436308926781, + "loss": 2.9112, + "theoretical_loss": 3.705735789009389, + "tokens_seen": 852813824 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003745336008024072, + "loss": 2.8657, + "theoretical_loss": 3.7057078862027426, + "tokens_seen": 852879360 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037452357071213644, + "loss": 2.9909, + "theoretical_loss": 3.7056799861403777, + "tokens_seen": 852944896 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003745135406218656, + "loss": 2.9898, + "theoretical_loss": 3.7056520888218154, + "tokens_seen": 853010432 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003745035105315948, + "loss": 2.8375, + "theoretical_loss": 3.7056241942465737, + "tokens_seen": 853075968 + }, + { + "epoch": 10.01, + "learning_rate": 0.000374493480441324, + "loss": 2.8918, + "theoretical_loss": 3.7055963024141736, + "tokens_seen": 853141504 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003744834503510532, + "loss": 2.9636, + "theoretical_loss": 3.7055684133241336, + "tokens_seen": 853207040 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037447342026078234, + "loss": 2.8894, + "theoretical_loss": 3.705540526975973, + "tokens_seen": 853272576 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003744633901705116, + "loss": 2.988, + "theoretical_loss": 3.705512643369213, + "tokens_seen": 853338112 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003744533600802407, + "loss": 2.8647, + "theoretical_loss": 3.7054847625033736, + "tokens_seen": 853403648 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037444332998996994, + "loss": 2.9473, + "theoretical_loss": 3.7054568843779734, + "tokens_seen": 853469184 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2040382, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0007150173187256, + "objective/train/theoretical_loss": 3.705429008992534, + "objective/train/tokens_used": 873994720, + "theoretical_loss": 3.705429008992534, + "tokens_seen": 853534720 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003744332998996991, + "loss": 2.8994, + "theoretical_loss": 3.705429008992534, + "tokens_seen": 853534720 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003744232698094283, + "loss": 3.035, + "theoretical_loss": 3.705401136346575, + "tokens_seen": 853600256 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003744132397191575, + "loss": 2.911, + "theoretical_loss": 3.7053732664396177, + "tokens_seen": 853665792 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037440320962888666, + "loss": 2.8931, + "theoretical_loss": 3.705345399271182, + "tokens_seen": 853731328 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037439317953861585, + "loss": 2.9868, + "theoretical_loss": 3.7053175348407885, + "tokens_seen": 853796864 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003743831494483451, + "loss": 3.0088, + "theoretical_loss": 3.7052896731479588, + "tokens_seen": 853862400 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003743731193580742, + "loss": 2.9852, + "theoretical_loss": 3.7052618141922133, + "tokens_seen": 853927936 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037436308926780344, + "loss": 2.9972, + "theoretical_loss": 3.705233957973073, + "tokens_seen": 853993472 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037435305917753257, + "loss": 2.9614, + "theoretical_loss": 3.7052061044900597, + "tokens_seen": 854059008 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003743430290872618, + "loss": 2.8654, + "theoretical_loss": 3.705178253742694, + "tokens_seen": 854124544 + }, + { + "epoch": 10.01, + "learning_rate": 0.000374332998996991, + "loss": 2.9517, + "theoretical_loss": 3.705150405730498, + "tokens_seen": 854190080 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037432296890672017, + "loss": 2.9395, + "theoretical_loss": 3.7051225604529927, + "tokens_seen": 854255616 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037431293881644935, + "loss": 2.8131, + "theoretical_loss": 3.7050947179096996, + "tokens_seen": 854321152 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003743029087261786, + "loss": 2.9571, + "theoretical_loss": 3.7050668781001415, + "tokens_seen": 854386688 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003742928786359077, + "loss": 2.9983, + "theoretical_loss": 3.7050390410238396, + "tokens_seen": 854452224 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037428284854563695, + "loss": 3.0276, + "theoretical_loss": 3.7050112066803162, + "tokens_seen": 854517760 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037427281845536607, + "loss": 2.8835, + "theoretical_loss": 3.7049833750690935, + "tokens_seen": 854583296 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003742627883650953, + "loss": 2.9625, + "theoretical_loss": 3.7049555461896935, + "tokens_seen": 854648832 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003742527582748245, + "loss": 2.8801, + "theoretical_loss": 3.7049277200416393, + "tokens_seen": 854714368 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037424272818455367, + "loss": 2.9281, + "theoretical_loss": 3.704899896624452, + "tokens_seen": 854779904 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037423269809428285, + "loss": 3.0185, + "theoretical_loss": 3.7048720759376557, + "tokens_seen": 854845440 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037422266800401203, + "loss": 3.0099, + "theoretical_loss": 3.7048442579807723, + "tokens_seen": 854910976 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003742126379137412, + "loss": 2.9982, + "theoretical_loss": 3.7048164427533252, + "tokens_seen": 854976512 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037420260782347045, + "loss": 3.0273, + "theoretical_loss": 3.7047886302548374, + "tokens_seen": 855042048 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003741925777331996, + "loss": 2.9293, + "theoretical_loss": 3.7047608204848315, + "tokens_seen": 855107584 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2045380, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9381117820739746, + "objective/train/theoretical_loss": 3.704733013442831, + "objective/train/tokens_used": 875633120, + "theoretical_loss": 3.704733013442831, + "tokens_seen": 855173120 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003741825476429288, + "loss": 2.9711, + "theoretical_loss": 3.704733013442831, + "tokens_seen": 855173120 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037417251755265794, + "loss": 2.9064, + "theoretical_loss": 3.7047052091283597, + "tokens_seen": 855238656 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003741624874623872, + "loss": 3.0263, + "theoretical_loss": 3.7046774075409403, + "tokens_seen": 855304192 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037415245737211635, + "loss": 3.0513, + "theoretical_loss": 3.7046496086800973, + "tokens_seen": 855369728 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037414242728184554, + "loss": 2.9423, + "theoretical_loss": 3.7046218125453536, + "tokens_seen": 855435264 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003741323971915747, + "loss": 2.8864, + "theoretical_loss": 3.704594019136233, + "tokens_seen": 855500800 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037412236710130395, + "loss": 2.926, + "theoretical_loss": 3.704566228452261, + "tokens_seen": 855566336 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003741123370110331, + "loss": 2.9647, + "theoretical_loss": 3.7045384404929598, + "tokens_seen": 855631872 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003741023069207623, + "loss": 2.9111, + "theoretical_loss": 3.7045106552578546, + "tokens_seen": 855697408 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037409227683049144, + "loss": 2.9355, + "theoretical_loss": 3.7044828727464694, + "tokens_seen": 855762944 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003740822467402207, + "loss": 2.8782, + "theoretical_loss": 3.704455092958329, + "tokens_seen": 855828480 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037407221664994986, + "loss": 2.9205, + "theoretical_loss": 3.704427315892957, + "tokens_seen": 855894016 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037406218655967904, + "loss": 2.9001, + "theoretical_loss": 3.704399541549879, + "tokens_seen": 855959552 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003740521564694082, + "loss": 2.9582, + "theoretical_loss": 3.7043717699286205, + "tokens_seen": 856025088 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003740421263791374, + "loss": 2.9235, + "theoretical_loss": 3.704344001028704, + "tokens_seen": 856090624 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003740320962888666, + "loss": 3.0032, + "theoretical_loss": 3.7043162348496574, + "tokens_seen": 856156160 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003740220661985958, + "loss": 2.9808, + "theoretical_loss": 3.7042884713910036, + "tokens_seen": 856221696 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037401203610832494, + "loss": 2.9794, + "theoretical_loss": 3.704260710652269, + "tokens_seen": 856287232 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003740020060180542, + "loss": 2.9484, + "theoretical_loss": 3.704232952632979, + "tokens_seen": 856352768 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037399197592778336, + "loss": 2.9659, + "theoretical_loss": 3.7042051973326586, + "tokens_seen": 856418304 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037398194583751254, + "loss": 2.958, + "theoretical_loss": 3.704177444750834, + "tokens_seen": 856483840 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003739719157472417, + "loss": 3.0386, + "theoretical_loss": 3.7041496948870303, + "tokens_seen": 856549376 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003739618856569709, + "loss": 2.9717, + "theoretical_loss": 3.704121947740774, + "tokens_seen": 856614912 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003739518555667001, + "loss": 2.9086, + "theoretical_loss": 3.7040942033115907, + "tokens_seen": 856680448 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003739418254764293, + "loss": 2.9237, + "theoretical_loss": 3.704066461599007, + "tokens_seen": 856745984 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2048204, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9901444911956787, + "objective/train/theoretical_loss": 3.704038722602548, + "objective/train/tokens_used": 877271520, + "theoretical_loss": 3.704038722602548, + "tokens_seen": 856811520 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037393179538615845, + "loss": 2.9767, + "theoretical_loss": 3.704038722602548, + "tokens_seen": 856811520 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003739217652958877, + "loss": 2.9684, + "theoretical_loss": 3.704010986321742, + "tokens_seen": 856877056 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003739117352056168, + "loss": 3.0409, + "theoretical_loss": 3.7039832527561134, + "tokens_seen": 856942592 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037390170511534605, + "loss": 2.9778, + "theoretical_loss": 3.70395552190519, + "tokens_seen": 857008128 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003738916750250752, + "loss": 2.8656, + "theoretical_loss": 3.7039277937684982, + "tokens_seen": 857073664 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003738816449348044, + "loss": 2.8834, + "theoretical_loss": 3.7039000683455647, + "tokens_seen": 857139200 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037387161484453364, + "loss": 2.8998, + "theoretical_loss": 3.703872345635917, + "tokens_seen": 857204736 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037386158475426277, + "loss": 2.9037, + "theoretical_loss": 3.703844625639081, + "tokens_seen": 857270272 + }, + { + "epoch": 10.01, + "learning_rate": 0.000373851554663992, + "loss": 2.8957, + "theoretical_loss": 3.703816908354585, + "tokens_seen": 857335808 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003738415245737212, + "loss": 2.928, + "theoretical_loss": 3.703789193781956, + "tokens_seen": 857401344 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037383149448345037, + "loss": 3.0074, + "theoretical_loss": 3.703761481920721, + "tokens_seen": 857466880 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037382146439317955, + "loss": 2.927, + "theoretical_loss": 3.7037337727704083, + "tokens_seen": 857532416 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003738114343029088, + "loss": 2.9494, + "theoretical_loss": 3.7037060663305446, + "tokens_seen": 857597952 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003738014042126379, + "loss": 2.9787, + "theoretical_loss": 3.703678362600659, + "tokens_seen": 857663488 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037379137412236715, + "loss": 2.9757, + "theoretical_loss": 3.7036506615802773, + "tokens_seen": 857729024 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037378134403209627, + "loss": 2.9332, + "theoretical_loss": 3.7036229632689297, + "tokens_seen": 857794560 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003737713139418255, + "loss": 2.9331, + "theoretical_loss": 3.7035952676661434, + "tokens_seen": 857860096 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003737612838515547, + "loss": 2.9024, + "theoretical_loss": 3.703567574771446, + "tokens_seen": 857925632 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037375125376128387, + "loss": 2.8537, + "theoretical_loss": 3.7035398845843672, + "tokens_seen": 857991168 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037374122367101305, + "loss": 2.9535, + "theoretical_loss": 3.703512197104435, + "tokens_seen": 858056704 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037373119358074223, + "loss": 2.8753, + "theoretical_loss": 3.703484512331177, + "tokens_seen": 858122240 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003737211634904714, + "loss": 2.8975, + "theoretical_loss": 3.7034568302641233, + "tokens_seen": 858187776 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037371113340020065, + "loss": 2.8985, + "theoretical_loss": 3.7034291509028017, + "tokens_seen": 858253312 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003737011033099298, + "loss": 3.0629, + "theoretical_loss": 3.703401474246742, + "tokens_seen": 858318848 + }, + { + "epoch": 10.01, + "learning_rate": 0.000373691073219659, + "loss": 2.9698, + "theoretical_loss": 3.7033738002954726, + "tokens_seen": 858384384 + }, + { + "epoch": 10.01, + "objective/train/docs_used": 2053110, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.994478702545166, + "objective/train/theoretical_loss": 3.7033461290485232, + "objective/train/tokens_used": 878909920, + "theoretical_loss": 3.7033461290485232, + "tokens_seen": 858449920 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037368104312938814, + "loss": 2.9846, + "theoretical_loss": 3.7033461290485232, + "tokens_seen": 858449920 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003736710130391174, + "loss": 2.9621, + "theoretical_loss": 3.7033184605054226, + "tokens_seen": 858515456 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037366098294884655, + "loss": 3.0198, + "theoretical_loss": 3.703290794665701, + "tokens_seen": 858580992 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037365095285857574, + "loss": 2.8666, + "theoretical_loss": 3.703263131528887, + "tokens_seen": 858646528 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003736409227683049, + "loss": 3.0482, + "theoretical_loss": 3.7032354710945112, + "tokens_seen": 858712064 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037363089267803415, + "loss": 2.9311, + "theoretical_loss": 3.7032078133621025, + "tokens_seen": 858777600 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003736208625877633, + "loss": 2.904, + "theoretical_loss": 3.7031801583311914, + "tokens_seen": 858843136 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003736108324974925, + "loss": 3.0355, + "theoretical_loss": 3.7031525060013077, + "tokens_seen": 858908672 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037360080240722164, + "loss": 2.9179, + "theoretical_loss": 3.703124856371981, + "tokens_seen": 858974208 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003735907723169509, + "loss": 2.8666, + "theoretical_loss": 3.703097209442743, + "tokens_seen": 859039744 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037358074222668006, + "loss": 2.9319, + "theoretical_loss": 3.7030695652131222, + "tokens_seen": 859105280 + }, + { + "epoch": 10.01, + "learning_rate": 0.00037357071213640924, + "loss": 3.0692, + "theoretical_loss": 3.7030419236826506, + "tokens_seen": 859170816 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003735606820461384, + "loss": 2.9514, + "theoretical_loss": 3.703014284850858, + "tokens_seen": 859236352 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003735506519558676, + "loss": 2.9165, + "theoretical_loss": 3.702986648717276, + "tokens_seen": 859301888 + }, + { + "epoch": 10.01, + "learning_rate": 0.0003735406218655968, + "loss": 2.8282, + "theoretical_loss": 3.7029590152814342, + "tokens_seen": 859367424 + }, + { + "epoch": 10.01, + "learning_rate": 0.000373530591775326, + "loss": 2.9177, + "theoretical_loss": 3.702931384542864, + "tokens_seen": 859432960 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037352056168505514, + "loss": 2.939, + "theoretical_loss": 3.7029037565010974, + "tokens_seen": 859498496 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003735105315947844, + "loss": 2.939, + "theoretical_loss": 3.702876131155664, + "tokens_seen": 859564032 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037350050150451356, + "loss": 2.95, + "theoretical_loss": 3.7028485085060963, + "tokens_seen": 859629568 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037349047141424274, + "loss": 2.9724, + "theoretical_loss": 3.7028208885519254, + "tokens_seen": 859695104 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003734804413239719, + "loss": 2.9587, + "theoretical_loss": 3.7027932712926828, + "tokens_seen": 859760640 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003734704112337011, + "loss": 2.9346, + "theoretical_loss": 3.7027656567279, + "tokens_seen": 859826176 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003734603811434303, + "loss": 2.7983, + "theoretical_loss": 3.702738044857109, + "tokens_seen": 859891712 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003734503510531595, + "loss": 2.8924, + "theoretical_loss": 3.7027104356798413, + "tokens_seen": 859957248 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037344032096288865, + "loss": 2.9718, + "theoretical_loss": 3.7026828291956297, + "tokens_seen": 860022784 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2055936, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9745545387268066, + "objective/train/theoretical_loss": 3.7026552254040053, + "objective/train/tokens_used": 880548320, + "theoretical_loss": 3.7026552254040053, + "tokens_seen": 860088320 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003734302908726179, + "loss": 2.9827, + "theoretical_loss": 3.7026552254040053, + "tokens_seen": 860088320 + }, + { + "epoch": 10.02, + "learning_rate": 0.000373420260782347, + "loss": 2.9184, + "theoretical_loss": 3.702627624304501, + "tokens_seen": 860153856 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037341023069207625, + "loss": 2.9236, + "theoretical_loss": 3.7026000258966496, + "tokens_seen": 860219392 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003734002006018054, + "loss": 2.9832, + "theoretical_loss": 3.7025724301799827, + "tokens_seen": 860284928 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003733901705115346, + "loss": 2.893, + "theoretical_loss": 3.702544837154033, + "tokens_seen": 860350464 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003733801404212638, + "loss": 3.0479, + "theoretical_loss": 3.702517246818334, + "tokens_seen": 860416000 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037337011033099297, + "loss": 2.97, + "theoretical_loss": 3.7024896591724175, + "tokens_seen": 860481536 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037336008024072215, + "loss": 2.9335, + "theoretical_loss": 3.702462074215817, + "tokens_seen": 860547072 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003733500501504514, + "loss": 2.9937, + "theoretical_loss": 3.702434491948065, + "tokens_seen": 860612608 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003733400200601805, + "loss": 3.0287, + "theoretical_loss": 3.702406912368696, + "tokens_seen": 860678144 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037332998996990975, + "loss": 2.9279, + "theoretical_loss": 3.702379335477242, + "tokens_seen": 860743680 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037331995987963893, + "loss": 2.8686, + "theoretical_loss": 3.7023517612732366, + "tokens_seen": 860809216 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003733099297893681, + "loss": 2.915, + "theoretical_loss": 3.7023241897562142, + "tokens_seen": 860874752 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003732998996990973, + "loss": 2.9069, + "theoretical_loss": 3.7022966209257073, + "tokens_seen": 860940288 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003732898696088265, + "loss": 2.8943, + "theoretical_loss": 3.70226905478125, + "tokens_seen": 861005824 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037327983951855565, + "loss": 2.9047, + "theoretical_loss": 3.702241491322377, + "tokens_seen": 861071360 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003732698094282849, + "loss": 3.0031, + "theoretical_loss": 3.7022139305486212, + "tokens_seen": 861136896 + }, + { + "epoch": 10.02, + "learning_rate": 0.000373259779338014, + "loss": 2.8483, + "theoretical_loss": 3.7021863724595168, + "tokens_seen": 861202432 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037324974924774325, + "loss": 3.05, + "theoretical_loss": 3.702158817054599, + "tokens_seen": 861267968 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003732397191574724, + "loss": 2.9738, + "theoretical_loss": 3.7021312643334015, + "tokens_seen": 861333504 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003732296890672016, + "loss": 2.9577, + "theoretical_loss": 3.7021037142954585, + "tokens_seen": 861399040 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003732196589769308, + "loss": 2.9569, + "theoretical_loss": 3.702076166940305, + "tokens_seen": 861464576 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037320962888666, + "loss": 3.0237, + "theoretical_loss": 3.7020486222674753, + "tokens_seen": 861530112 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037319959879638916, + "loss": 3.0011, + "theoretical_loss": 3.702021080276505, + "tokens_seen": 861595648 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037318956870611834, + "loss": 2.9517, + "theoretical_loss": 3.701993540966928, + "tokens_seen": 861661184 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2059571, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.951868772506714, + "objective/train/theoretical_loss": 3.7019660043382796, + "objective/train/tokens_used": 882186720, + "theoretical_loss": 3.7019660043382796, + "tokens_seen": 861726720 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003731795386158475, + "loss": 2.9325, + "theoretical_loss": 3.7019660043382796, + "tokens_seen": 861726720 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037316950852557675, + "loss": 2.9507, + "theoretical_loss": 3.7019384703900955, + "tokens_seen": 861792256 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003731594784353059, + "loss": 2.9159, + "theoretical_loss": 3.7019109391219107, + "tokens_seen": 861857792 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003731494483450351, + "loss": 2.944, + "theoretical_loss": 3.7018834105332603, + "tokens_seen": 861923328 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003731394182547643, + "loss": 2.9304, + "theoretical_loss": 3.7018558846236798, + "tokens_seen": 861988864 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003731293881644935, + "loss": 2.9761, + "theoretical_loss": 3.701828361392705, + "tokens_seen": 862054400 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003731193580742227, + "loss": 2.9491, + "theoretical_loss": 3.701800840839872, + "tokens_seen": 862119936 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037310932798395184, + "loss": 2.9566, + "theoretical_loss": 3.7017733229647165, + "tokens_seen": 862185472 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003730992978936811, + "loss": 2.863, + "theoretical_loss": 3.701745807766774, + "tokens_seen": 862251008 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037308926780341026, + "loss": 3.0039, + "theoretical_loss": 3.7017182952455805, + "tokens_seen": 862316544 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037307923771313944, + "loss": 2.8469, + "theoretical_loss": 3.701690785400673, + "tokens_seen": 862382080 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003730692076228686, + "loss": 2.9131, + "theoretical_loss": 3.7016632782315875, + "tokens_seen": 862447616 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003730591775325978, + "loss": 2.8829, + "theoretical_loss": 3.70163577373786, + "tokens_seen": 862513152 + }, + { + "epoch": 10.02, + "learning_rate": 0.000373049147442327, + "loss": 2.8943, + "theoretical_loss": 3.7016082719190275, + "tokens_seen": 862578688 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003730391173520562, + "loss": 2.8281, + "theoretical_loss": 3.7015807727746264, + "tokens_seen": 862644224 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037302908726178534, + "loss": 2.8916, + "theoretical_loss": 3.7015532763041934, + "tokens_seen": 862709760 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003730190571715146, + "loss": 2.9403, + "theoretical_loss": 3.7015257825072654, + "tokens_seen": 862775296 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037300902708124376, + "loss": 3.0267, + "theoretical_loss": 3.70149829138338, + "tokens_seen": 862840832 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037299899699097294, + "loss": 2.9534, + "theoretical_loss": 3.7014708029320733, + "tokens_seen": 862906368 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003729889669007021, + "loss": 2.9319, + "theoretical_loss": 3.7014433171528838, + "tokens_seen": 862971904 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003729789368104313, + "loss": 2.9848, + "theoretical_loss": 3.701415834045348, + "tokens_seen": 863037440 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003729689067201605, + "loss": 3.0089, + "theoretical_loss": 3.701388353609003, + "tokens_seen": 863102976 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003729588766298897, + "loss": 2.8391, + "theoretical_loss": 3.701360875843388, + "tokens_seen": 863168512 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037294884653961885, + "loss": 3.0157, + "theoretical_loss": 3.7013334007480383, + "tokens_seen": 863234048 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003729388164493481, + "loss": 2.9894, + "theoretical_loss": 3.701305928322494, + "tokens_seen": 863299584 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2064827, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1282567977905273, + "objective/train/theoretical_loss": 3.7012784585662915, + "objective/train/tokens_used": 883825120, + "theoretical_loss": 3.7012784585662915, + "tokens_seen": 863365120 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003729287863590772, + "loss": 3.0145, + "theoretical_loss": 3.7012784585662915, + "tokens_seen": 863365120 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037291875626880645, + "loss": 2.8688, + "theoretical_loss": 3.701250991478969, + "tokens_seen": 863430656 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003729087261785356, + "loss": 2.8826, + "theoretical_loss": 3.701223527060066, + "tokens_seen": 863496192 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003728986960882648, + "loss": 2.7833, + "theoretical_loss": 3.7011960653091194, + "tokens_seen": 863561728 + }, + { + "epoch": 10.02, + "learning_rate": 0.000372888665997994, + "loss": 2.8705, + "theoretical_loss": 3.701168606225668, + "tokens_seen": 863627264 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037287863590772317, + "loss": 3.0349, + "theoretical_loss": 3.70114114980925, + "tokens_seen": 863692800 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037286860581745235, + "loss": 2.8483, + "theoretical_loss": 3.7011136960594047, + "tokens_seen": 863758336 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003728585757271816, + "loss": 2.94, + "theoretical_loss": 3.70108624497567, + "tokens_seen": 863823872 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003728485456369107, + "loss": 2.967, + "theoretical_loss": 3.701058796557586, + "tokens_seen": 863889408 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037283851554663995, + "loss": 2.9803, + "theoretical_loss": 3.7010313508046897, + "tokens_seen": 863954944 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037282848545636913, + "loss": 2.9806, + "theoretical_loss": 3.7010039077165224, + "tokens_seen": 864020480 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003728184553660983, + "loss": 2.9172, + "theoretical_loss": 3.7009764672926213, + "tokens_seen": 864086016 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003728084252758275, + "loss": 3.0163, + "theoretical_loss": 3.700949029532527, + "tokens_seen": 864151552 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003727983951855567, + "loss": 2.9792, + "theoretical_loss": 3.700921594435779, + "tokens_seen": 864217088 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037278836509528585, + "loss": 2.9289, + "theoretical_loss": 3.700894162001916, + "tokens_seen": 864282624 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003727783350050151, + "loss": 3.0239, + "theoretical_loss": 3.7008667322304776, + "tokens_seen": 864348160 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003727683049147442, + "loss": 2.9906, + "theoretical_loss": 3.7008393051210042, + "tokens_seen": 864413696 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037275827482447345, + "loss": 2.9008, + "theoretical_loss": 3.7008118806730357, + "tokens_seen": 864479232 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003727482447342026, + "loss": 2.8855, + "theoretical_loss": 3.700784458886112, + "tokens_seen": 864544768 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003727382146439318, + "loss": 2.9864, + "theoretical_loss": 3.7007570397597727, + "tokens_seen": 864610304 + }, + { + "epoch": 10.02, + "learning_rate": 0.000372728184553661, + "loss": 2.9497, + "theoretical_loss": 3.700729623293558, + "tokens_seen": 864675840 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003727181544633902, + "loss": 2.8572, + "theoretical_loss": 3.700702209487009, + "tokens_seen": 864741376 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037270812437311936, + "loss": 3.0053, + "theoretical_loss": 3.7006747983396657, + "tokens_seen": 864806912 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037269809428284854, + "loss": 2.9006, + "theoretical_loss": 3.700647389851069, + "tokens_seen": 864872448 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003726880641925777, + "loss": 2.9426, + "theoretical_loss": 3.7006199840207588, + "tokens_seen": 864937984 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2067625, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0255439281463623, + "objective/train/theoretical_loss": 3.7005925808482765, + "objective/train/tokens_used": 885463520, + "theoretical_loss": 3.7005925808482765, + "tokens_seen": 865003520 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037267803410230696, + "loss": 2.9215, + "theoretical_loss": 3.7005925808482765, + "tokens_seen": 865003520 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003726680040120361, + "loss": 2.9471, + "theoretical_loss": 3.7005651803331636, + "tokens_seen": 865069056 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003726579739217653, + "loss": 2.881, + "theoretical_loss": 3.7005377824749592, + "tokens_seen": 865134592 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003726479438314945, + "loss": 3.0072, + "theoretical_loss": 3.7005103872732064, + "tokens_seen": 865200128 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003726379137412237, + "loss": 2.9905, + "theoretical_loss": 3.7004829947274454, + "tokens_seen": 865265664 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037262788365095286, + "loss": 2.9903, + "theoretical_loss": 3.7004556048372184, + "tokens_seen": 865331200 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037261785356068204, + "loss": 3.0351, + "theoretical_loss": 3.7004282176020658, + "tokens_seen": 865396736 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003726078234704112, + "loss": 2.9778, + "theoretical_loss": 3.7004008330215292, + "tokens_seen": 865462272 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037259779338014046, + "loss": 2.9341, + "theoretical_loss": 3.7003734510951514, + "tokens_seen": 865527808 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003725877632898696, + "loss": 2.9317, + "theoretical_loss": 3.700346071822474, + "tokens_seen": 865593344 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003725777331995988, + "loss": 3.0338, + "theoretical_loss": 3.700318695203038, + "tokens_seen": 865658880 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037256770310932795, + "loss": 2.8608, + "theoretical_loss": 3.700291321236386, + "tokens_seen": 865724416 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003725576730190572, + "loss": 2.9755, + "theoretical_loss": 3.70026394992206, + "tokens_seen": 865789952 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037254764292878636, + "loss": 2.8636, + "theoretical_loss": 3.7002365812596034, + "tokens_seen": 865855488 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037253761283851554, + "loss": 2.9993, + "theoretical_loss": 3.700209215248557, + "tokens_seen": 865921024 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003725275827482447, + "loss": 2.9224, + "theoretical_loss": 3.7001818518884635, + "tokens_seen": 865986560 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037251755265797396, + "loss": 3.0155, + "theoretical_loss": 3.700154491178866, + "tokens_seen": 866052096 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003725075225677031, + "loss": 3.0355, + "theoretical_loss": 3.7001271331193077, + "tokens_seen": 866117632 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003724974924774323, + "loss": 2.9667, + "theoretical_loss": 3.7000997777093305, + "tokens_seen": 866183168 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037248746238716145, + "loss": 2.9867, + "theoretical_loss": 3.7000724249484778, + "tokens_seen": 866248704 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003724774322968907, + "loss": 2.937, + "theoretical_loss": 3.700045074836293, + "tokens_seen": 866314240 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037246740220661987, + "loss": 2.9106, + "theoretical_loss": 3.7000177273723183, + "tokens_seen": 866379776 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037245737211634905, + "loss": 2.9462, + "theoretical_loss": 3.6999903825560976, + "tokens_seen": 866445312 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037244734202607823, + "loss": 2.8937, + "theoretical_loss": 3.699963040387175, + "tokens_seen": 866510848 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003724373119358074, + "loss": 3.0104, + "theoretical_loss": 3.6999357008650926, + "tokens_seen": 866576384 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2072421, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8958187103271484, + "objective/train/theoretical_loss": 3.699908363989395, + "objective/train/tokens_used": 887101920, + "theoretical_loss": 3.699908363989395, + "tokens_seen": 866641920 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003724272818455366, + "loss": 2.9047, + "theoretical_loss": 3.699908363989395, + "tokens_seen": 866641920 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003724172517552658, + "loss": 2.9283, + "theoretical_loss": 3.6998810297596254, + "tokens_seen": 866707456 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037240722166499495, + "loss": 2.9429, + "theoretical_loss": 3.6998536981753283, + "tokens_seen": 866772992 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003723971915747242, + "loss": 2.8791, + "theoretical_loss": 3.699826369236047, + "tokens_seen": 866838528 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003723871614844533, + "loss": 2.9789, + "theoretical_loss": 3.699799042941326, + "tokens_seen": 866904064 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037237713139418255, + "loss": 3.002, + "theoretical_loss": 3.69977171929071, + "tokens_seen": 866969600 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003723671013039118, + "loss": 3.0012, + "theoretical_loss": 3.699744398283742, + "tokens_seen": 867035136 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003723570712136409, + "loss": 2.8777, + "theoretical_loss": 3.6997170799199672, + "tokens_seen": 867100672 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037234704112337015, + "loss": 2.9156, + "theoretical_loss": 3.69968976419893, + "tokens_seen": 867166208 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037233701103309933, + "loss": 2.9392, + "theoretical_loss": 3.699662451120175, + "tokens_seen": 867231744 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003723269809428285, + "loss": 3.0717, + "theoretical_loss": 3.699635140683248, + "tokens_seen": 867297280 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003723169508525577, + "loss": 3.0539, + "theoretical_loss": 3.699607832887692, + "tokens_seen": 867362816 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003723069207622869, + "loss": 2.857, + "theoretical_loss": 3.699580527733053, + "tokens_seen": 867428352 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037229689067201605, + "loss": 2.9619, + "theoretical_loss": 3.6995532252188763, + "tokens_seen": 867493888 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003722868605817453, + "loss": 2.9865, + "theoretical_loss": 3.699525925344707, + "tokens_seen": 867559424 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003722768304914744, + "loss": 2.9813, + "theoretical_loss": 3.69949862811009, + "tokens_seen": 867624960 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037226680040120365, + "loss": 3.039, + "theoretical_loss": 3.6994713335145715, + "tokens_seen": 867690496 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003722567703109328, + "loss": 2.9642, + "theoretical_loss": 3.6994440415576957, + "tokens_seen": 867756032 + }, + { + "epoch": 10.02, + "learning_rate": 0.000372246740220662, + "loss": 2.9862, + "theoretical_loss": 3.69941675223901, + "tokens_seen": 867821568 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003722367101303912, + "loss": 2.9665, + "theoretical_loss": 3.699389465558059, + "tokens_seen": 867887104 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003722266800401204, + "loss": 2.8735, + "theoretical_loss": 3.6993621815143887, + "tokens_seen": 867952640 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037221664994984956, + "loss": 2.9659, + "theoretical_loss": 3.6993349001075453, + "tokens_seen": 868018176 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037220661985957874, + "loss": 2.9634, + "theoretical_loss": 3.6993076213370752, + "tokens_seen": 868083712 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003721965897693079, + "loss": 2.9381, + "theoretical_loss": 3.6992803452025242, + "tokens_seen": 868149248 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037218655967903716, + "loss": 2.9554, + "theoretical_loss": 3.6992530717034393, + "tokens_seen": 868214784 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2075237, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.99349045753479, + "objective/train/theoretical_loss": 3.6992258008393657, + "objective/train/tokens_used": 888740320, + "theoretical_loss": 3.6992258008393657, + "tokens_seen": 868280320 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003721765295887663, + "loss": 2.9509, + "theoretical_loss": 3.6992258008393657, + "tokens_seen": 868280320 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003721664994984955, + "loss": 2.8598, + "theoretical_loss": 3.6991985326098513, + "tokens_seen": 868345856 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003721564694082247, + "loss": 2.8869, + "theoretical_loss": 3.6991712670144414, + "tokens_seen": 868411392 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003721464393179539, + "loss": 2.8936, + "theoretical_loss": 3.699144004052684, + "tokens_seen": 868476928 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037213640922768306, + "loss": 2.9892, + "theoretical_loss": 3.699116743724126, + "tokens_seen": 868542464 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037212637913741224, + "loss": 2.9512, + "theoretical_loss": 3.6990894860283134, + "tokens_seen": 868608000 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003721163490471414, + "loss": 2.8452, + "theoretical_loss": 3.699062230964794, + "tokens_seen": 868673536 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037210631895687066, + "loss": 2.9417, + "theoretical_loss": 3.699034978533115, + "tokens_seen": 868739072 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003720962888665998, + "loss": 2.9291, + "theoretical_loss": 3.6990077287328242, + "tokens_seen": 868804608 + }, + { + "epoch": 10.02, + "learning_rate": 0.000372086258776329, + "loss": 2.9454, + "theoretical_loss": 3.698980481563468, + "tokens_seen": 868870144 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037207622868605815, + "loss": 3.0452, + "theoretical_loss": 3.698953237024594, + "tokens_seen": 868935680 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003720661985957874, + "loss": 2.8538, + "theoretical_loss": 3.698925995115751, + "tokens_seen": 869001216 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037205616850551656, + "loss": 2.9064, + "theoretical_loss": 3.6988987558364865, + "tokens_seen": 869066752 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037204613841524575, + "loss": 2.954, + "theoretical_loss": 3.6988715191863477, + "tokens_seen": 869132288 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003720361083249749, + "loss": 2.8965, + "theoretical_loss": 3.698844285164883, + "tokens_seen": 869197824 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037202607823470416, + "loss": 2.8885, + "theoretical_loss": 3.6988170537716405, + "tokens_seen": 869263360 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003720160481444333, + "loss": 2.9184, + "theoretical_loss": 3.698789825006169, + "tokens_seen": 869328896 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003720060180541625, + "loss": 2.9241, + "theoretical_loss": 3.698762598868016, + "tokens_seen": 869394432 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037199598796389165, + "loss": 2.8417, + "theoretical_loss": 3.6987353753567307, + "tokens_seen": 869459968 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003719859578736209, + "loss": 2.9654, + "theoretical_loss": 3.6987081544718605, + "tokens_seen": 869525504 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037197592778335007, + "loss": 2.973, + "theoretical_loss": 3.6986809362129556, + "tokens_seen": 869591040 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037196589769307925, + "loss": 2.9576, + "theoretical_loss": 3.6986537205795638, + "tokens_seen": 869656576 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037195586760280843, + "loss": 2.9474, + "theoretical_loss": 3.6986265075712343, + "tokens_seen": 869722112 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003719458375125376, + "loss": 3.0029, + "theoretical_loss": 3.698599297187516, + "tokens_seen": 869787648 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003719358074222668, + "loss": 2.9837, + "theoretical_loss": 3.698572089427959, + "tokens_seen": 869853184 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079013, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9944217205047607, + "objective/train/theoretical_loss": 3.6985448842921107, + "objective/train/tokens_used": 890378720, + "theoretical_loss": 3.6985448842921107, + "tokens_seen": 869918720 + }, + { + "epoch": 10.02, + "learning_rate": 0.000371925777331996, + "loss": 2.9709, + "theoretical_loss": 3.6985448842921107, + "tokens_seen": 869918720 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037191574724172515, + "loss": 3.0511, + "theoretical_loss": 3.6985176817795216, + "tokens_seen": 869984256 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003719057171514544, + "loss": 2.898, + "theoretical_loss": 3.6984904818897415, + "tokens_seen": 870049792 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003718956870611835, + "loss": 2.8891, + "theoretical_loss": 3.6984632846223193, + "tokens_seen": 870115328 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037188565697091275, + "loss": 2.7878, + "theoretical_loss": 3.6984360899768047, + "tokens_seen": 870180864 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037187562688064193, + "loss": 3.0116, + "theoretical_loss": 3.6984088979527483, + "tokens_seen": 870246400 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003718655967903711, + "loss": 2.9954, + "theoretical_loss": 3.6983817085496993, + "tokens_seen": 870311936 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003718555667001003, + "loss": 2.8604, + "theoretical_loss": 3.6983545217672074, + "tokens_seen": 870377472 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037184553660982953, + "loss": 3.0181, + "theoretical_loss": 3.698327337604823, + "tokens_seen": 870443008 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037183550651955866, + "loss": 2.9228, + "theoretical_loss": 3.6983001560620976, + "tokens_seen": 870508544 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003718254764292879, + "loss": 3.0243, + "theoretical_loss": 3.69827297713858, + "tokens_seen": 870574080 + }, + { + "epoch": 10.02, + "learning_rate": 0.000371815446339017, + "loss": 3.0077, + "theoretical_loss": 3.698245800833821, + "tokens_seen": 870639616 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037180541624874625, + "loss": 2.9667, + "theoretical_loss": 3.698218627147371, + "tokens_seen": 870705152 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037179538615847544, + "loss": 2.9941, + "theoretical_loss": 3.6981914560787823, + "tokens_seen": 870770688 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003717853560682046, + "loss": 3.054, + "theoretical_loss": 3.6981642876276033, + "tokens_seen": 870836224 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003717753259779338, + "loss": 2.983, + "theoretical_loss": 3.6981371217933865, + "tokens_seen": 870901760 + }, + { + "epoch": 10.02, + "learning_rate": 0.000371765295887663, + "loss": 2.963, + "theoretical_loss": 3.6981099585756825, + "tokens_seen": 870967296 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037175526579739216, + "loss": 3.0042, + "theoretical_loss": 3.6980827979740423, + "tokens_seen": 871032832 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003717452357071214, + "loss": 2.9751, + "theoretical_loss": 3.6980556399880173, + "tokens_seen": 871098368 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003717352056168505, + "loss": 2.8888, + "theoretical_loss": 3.6980284846171587, + "tokens_seen": 871163904 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037172517552657976, + "loss": 2.9479, + "theoretical_loss": 3.6980013318610183, + "tokens_seen": 871229440 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003717151454363089, + "loss": 2.947, + "theoretical_loss": 3.6979741817191476, + "tokens_seen": 871294976 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003717051153460381, + "loss": 2.9163, + "theoretical_loss": 3.697947034191098, + "tokens_seen": 871360512 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003716950852557673, + "loss": 2.9591, + "theoretical_loss": 3.697919889276422, + "tokens_seen": 871426048 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003716850551654965, + "loss": 2.9077, + "theoretical_loss": 3.69789274697467, + "tokens_seen": 871491584 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1423146724700928, + "objective/train/theoretical_loss": 3.6978656072853955, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6978656072853955, + "tokens_seen": 871557120 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037167502507522566, + "loss": 2.9435, + "theoretical_loss": 3.6978656072853955, + "tokens_seen": 871557120 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003716649949849549, + "loss": 2.9503, + "theoretical_loss": 3.6978384702081506, + "tokens_seen": 871622656 + }, + { + "epoch": 10.02, + "learning_rate": 0.000371654964894684, + "loss": 2.9884, + "theoretical_loss": 3.6978113357424864, + "tokens_seen": 871688192 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037164493480441326, + "loss": 2.9358, + "theoretical_loss": 3.6977842038879563, + "tokens_seen": 871753728 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003716349047141424, + "loss": 2.9844, + "theoretical_loss": 3.6977570746441124, + "tokens_seen": 871819264 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003716248746238716, + "loss": 3.0051, + "theoretical_loss": 3.6977299480105073, + "tokens_seen": 871884800 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037161484453360086, + "loss": 3.0025, + "theoretical_loss": 3.697702823986693, + "tokens_seen": 871950336 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037160481444333, + "loss": 2.9175, + "theoretical_loss": 3.6976757025722238, + "tokens_seen": 872015872 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003715947843530592, + "loss": 2.8622, + "theoretical_loss": 3.697648583766651, + "tokens_seen": 872081408 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037158475426278835, + "loss": 3.0208, + "theoretical_loss": 3.6976214675695287, + "tokens_seen": 872146944 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003715747241725176, + "loss": 2.7458, + "theoretical_loss": 3.69759435398041, + "tokens_seen": 872212480 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037156469408224676, + "loss": 2.9999, + "theoretical_loss": 3.697567242998847, + "tokens_seen": 872278016 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037155466399197595, + "loss": 2.9465, + "theoretical_loss": 3.6975401346243943, + "tokens_seen": 872343552 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003715446339017051, + "loss": 2.9905, + "theoretical_loss": 3.6975130288566054, + "tokens_seen": 872409088 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037153460381143436, + "loss": 3.0234, + "theoretical_loss": 3.697485925695033, + "tokens_seen": 872474624 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003715245737211635, + "loss": 3.0097, + "theoretical_loss": 3.697458825139231, + "tokens_seen": 872540160 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003715145436308927, + "loss": 2.9866, + "theoretical_loss": 3.6974317271887536, + "tokens_seen": 872605696 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037150451354062185, + "loss": 2.8587, + "theoretical_loss": 3.6974046318431544, + "tokens_seen": 872671232 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003714944834503511, + "loss": 2.946, + "theoretical_loss": 3.6973775391019874, + "tokens_seen": 872736768 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037148445336008027, + "loss": 3.0408, + "theoretical_loss": 3.697350448964807, + "tokens_seen": 872802304 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037147442326980945, + "loss": 3.0303, + "theoretical_loss": 3.6973233614311667, + "tokens_seen": 872867840 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037146439317953863, + "loss": 2.9474, + "theoretical_loss": 3.697296276500621, + "tokens_seen": 872933376 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003714543630892678, + "loss": 2.9523, + "theoretical_loss": 3.6972691941727254, + "tokens_seen": 872998912 + }, + { + "epoch": 10.02, + "learning_rate": 0.000371444332998997, + "loss": 2.9287, + "theoretical_loss": 3.6972421144470338, + "tokens_seen": 873064448 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037143430290872623, + "loss": 2.9033, + "theoretical_loss": 3.6972150373231, + "tokens_seen": 873129984 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9914374351501465, + "objective/train/theoretical_loss": 3.6971879628004802, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6971879628004802, + "tokens_seen": 873195520 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037142427281845535, + "loss": 2.9171, + "theoretical_loss": 3.6971879628004802, + "tokens_seen": 873195520 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003714142427281846, + "loss": 2.813, + "theoretical_loss": 3.6971608908787283, + "tokens_seen": 873261056 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003714042126379137, + "loss": 3.0563, + "theoretical_loss": 3.6971338215574, + "tokens_seen": 873326592 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037139418254764295, + "loss": 2.9762, + "theoretical_loss": 3.69710675483605, + "tokens_seen": 873392128 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037138415245737213, + "loss": 2.8978, + "theoretical_loss": 3.697079690714233, + "tokens_seen": 873457664 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003713741223671013, + "loss": 2.9521, + "theoretical_loss": 3.6970526291915053, + "tokens_seen": 873523200 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003713640922768305, + "loss": 3.0226, + "theoretical_loss": 3.6970255702674217, + "tokens_seen": 873588736 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037135406218655973, + "loss": 2.9639, + "theoretical_loss": 3.696998513941538, + "tokens_seen": 873654272 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037134403209628886, + "loss": 3.0317, + "theoretical_loss": 3.6969714602134096, + "tokens_seen": 873719808 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003713340020060181, + "loss": 2.975, + "theoretical_loss": 3.6969444090825925, + "tokens_seen": 873785344 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003713239719157472, + "loss": 3.0192, + "theoretical_loss": 3.6969173605486425, + "tokens_seen": 873850880 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037131394182547645, + "loss": 2.8769, + "theoretical_loss": 3.6968903146111156, + "tokens_seen": 873916416 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037130391173520564, + "loss": 2.9681, + "theoretical_loss": 3.696863271269568, + "tokens_seen": 873981952 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003712938816449348, + "loss": 2.9115, + "theoretical_loss": 3.6968362305235556, + "tokens_seen": 874047488 + }, + { + "epoch": 10.02, + "learning_rate": 0.000371283851554664, + "loss": 3.0624, + "theoretical_loss": 3.6968091923726347, + "tokens_seen": 874113024 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003712738214643932, + "loss": 2.8881, + "theoretical_loss": 3.6967821568163624, + "tokens_seen": 874178560 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037126379137412236, + "loss": 2.9548, + "theoretical_loss": 3.696755123854294, + "tokens_seen": 874244096 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003712537612838516, + "loss": 2.9103, + "theoretical_loss": 3.696728093485987, + "tokens_seen": 874309632 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003712437311935807, + "loss": 3.0267, + "theoretical_loss": 3.6967010657109984, + "tokens_seen": 874375168 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037123370110330996, + "loss": 3.0355, + "theoretical_loss": 3.696674040528884, + "tokens_seen": 874440704 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003712236710130391, + "loss": 2.9434, + "theoretical_loss": 3.6966470179392017, + "tokens_seen": 874506240 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003712136409227683, + "loss": 2.9692, + "theoretical_loss": 3.696619997941508, + "tokens_seen": 874571776 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003712036108324975, + "loss": 3.0195, + "theoretical_loss": 3.696592980535361, + "tokens_seen": 874637312 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003711935807422267, + "loss": 2.9231, + "theoretical_loss": 3.6965659657203163, + "tokens_seen": 874702848 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037118355065195586, + "loss": 2.9473, + "theoretical_loss": 3.6965389534959328, + "tokens_seen": 874768384 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7804970741271973, + "objective/train/theoretical_loss": 3.6965119438617675, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6965119438617675, + "tokens_seen": 874833920 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003711735205616851, + "loss": 2.8745, + "theoretical_loss": 3.6965119438617675, + "tokens_seen": 874833920 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003711634904714142, + "loss": 3.0762, + "theoretical_loss": 3.6964849368173778, + "tokens_seen": 874899456 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037115346038114346, + "loss": 2.9233, + "theoretical_loss": 3.696457932362322, + "tokens_seen": 874964992 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003711434302908726, + "loss": 2.936, + "theoretical_loss": 3.696430930496157, + "tokens_seen": 875030528 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003711334002006018, + "loss": 2.918, + "theoretical_loss": 3.696403931218442, + "tokens_seen": 875096064 + }, + { + "epoch": 10.02, + "learning_rate": 0.000371123370110331, + "loss": 2.8032, + "theoretical_loss": 3.6963769345287334, + "tokens_seen": 875161600 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003711133400200602, + "loss": 3.019, + "theoretical_loss": 3.6963499404265914, + "tokens_seen": 875227136 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037110330992978937, + "loss": 2.9126, + "theoretical_loss": 3.696322948911572, + "tokens_seen": 875292672 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037109327983951855, + "loss": 2.853, + "theoretical_loss": 3.6962959599832352, + "tokens_seen": 875358208 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037108324974924773, + "loss": 2.7871, + "theoretical_loss": 3.696268973641139, + "tokens_seen": 875423744 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037107321965897696, + "loss": 2.9887, + "theoretical_loss": 3.696241989884842, + "tokens_seen": 875489280 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003710631895687061, + "loss": 2.9651, + "theoretical_loss": 3.696215008713903, + "tokens_seen": 875554816 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003710531594784353, + "loss": 2.9784, + "theoretical_loss": 3.696188030127881, + "tokens_seen": 875620352 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037104312938816445, + "loss": 2.9541, + "theoretical_loss": 3.6961610541263337, + "tokens_seen": 875685888 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003710330992978937, + "loss": 2.9889, + "theoretical_loss": 3.6961340807088217, + "tokens_seen": 875751424 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037102306920762287, + "loss": 3.0271, + "theoretical_loss": 3.696107109874903, + "tokens_seen": 875816960 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037101303911735205, + "loss": 2.8803, + "theoretical_loss": 3.6960801416241376, + "tokens_seen": 875882496 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037100300902708123, + "loss": 3.062, + "theoretical_loss": 3.6960531759560844, + "tokens_seen": 875948032 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037099297893681047, + "loss": 2.9437, + "theoretical_loss": 3.6960262128703025, + "tokens_seen": 876013568 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003709829488465396, + "loss": 3.0354, + "theoretical_loss": 3.6959992523663527, + "tokens_seen": 876079104 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037097291875626883, + "loss": 2.9011, + "theoretical_loss": 3.695972294443793, + "tokens_seen": 876144640 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037096288866599796, + "loss": 2.976, + "theoretical_loss": 3.6959453391021855, + "tokens_seen": 876210176 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003709528585757272, + "loss": 2.9, + "theoretical_loss": 3.695918386341088, + "tokens_seen": 876275712 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003709428284854564, + "loss": 2.9293, + "theoretical_loss": 3.6958914361600606, + "tokens_seen": 876341248 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037093279839518555, + "loss": 2.9797, + "theoretical_loss": 3.6958644885586645, + "tokens_seen": 876406784 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.002401828765869, + "objective/train/theoretical_loss": 3.6958375435364594, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6958375435364594, + "tokens_seen": 876472320 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037092276830491474, + "loss": 2.9051, + "theoretical_loss": 3.6958375435364594, + "tokens_seen": 876472320 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003709127382146439, + "loss": 2.9264, + "theoretical_loss": 3.6958106010930054, + "tokens_seen": 876537856 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003709027081243731, + "loss": 2.977, + "theoretical_loss": 3.695783661227863, + "tokens_seen": 876603392 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037089267803410233, + "loss": 3.034, + "theoretical_loss": 3.695756723940593, + "tokens_seen": 876668928 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037088264794383146, + "loss": 2.9427, + "theoretical_loss": 3.6957297892307563, + "tokens_seen": 876734464 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003708726178535607, + "loss": 3.0293, + "theoretical_loss": 3.6957028570979125, + "tokens_seen": 876800000 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037086258776328993, + "loss": 2.8779, + "theoretical_loss": 3.6956759275416236, + "tokens_seen": 876865536 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037085255767301906, + "loss": 2.9437, + "theoretical_loss": 3.69564900056145, + "tokens_seen": 876931072 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003708425275827483, + "loss": 2.9395, + "theoretical_loss": 3.6956220761569534, + "tokens_seen": 876996608 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003708324974924774, + "loss": 2.9504, + "theoretical_loss": 3.695595154327694, + "tokens_seen": 877062144 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037082246740220665, + "loss": 3.0112, + "theoretical_loss": 3.695568235073234, + "tokens_seen": 877127680 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037081243731193584, + "loss": 2.9568, + "theoretical_loss": 3.695541318393134, + "tokens_seen": 877193216 + }, + { + "epoch": 10.02, + "learning_rate": 0.000370802407221665, + "loss": 2.8871, + "theoretical_loss": 3.695514404286956, + "tokens_seen": 877258752 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003707923771313942, + "loss": 2.9116, + "theoretical_loss": 3.695487492754262, + "tokens_seen": 877324288 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003707823470411234, + "loss": 2.9483, + "theoretical_loss": 3.6954605837946124, + "tokens_seen": 877389824 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037077231695085256, + "loss": 2.7992, + "theoretical_loss": 3.6954336774075704, + "tokens_seen": 877455360 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003707622868605818, + "loss": 2.9468, + "theoretical_loss": 3.6954067735926968, + "tokens_seen": 877520896 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003707522567703109, + "loss": 2.9951, + "theoretical_loss": 3.6953798723495552, + "tokens_seen": 877586432 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037074222668004016, + "loss": 2.9995, + "theoretical_loss": 3.6953529736777058, + "tokens_seen": 877651968 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003707321965897693, + "loss": 2.8596, + "theoretical_loss": 3.6953260775767123, + "tokens_seen": 877717504 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003707221664994985, + "loss": 2.9544, + "theoretical_loss": 3.6952991840461364, + "tokens_seen": 877783040 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003707121364092277, + "loss": 2.9864, + "theoretical_loss": 3.695272293085541, + "tokens_seen": 877848576 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003707021063189569, + "loss": 2.9738, + "theoretical_loss": 3.695245404694488, + "tokens_seen": 877914112 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037069207622868606, + "loss": 2.9671, + "theoretical_loss": 3.695218518872541, + "tokens_seen": 877979648 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003706820461384153, + "loss": 2.9434, + "theoretical_loss": 3.6951916356192616, + "tokens_seen": 878045184 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.724334239959717, + "objective/train/theoretical_loss": 3.695164754934214, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.695164754934214, + "tokens_seen": 878110720 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003706720160481444, + "loss": 2.9286, + "theoretical_loss": 3.695164754934214, + "tokens_seen": 878110720 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037066198595787366, + "loss": 2.9041, + "theoretical_loss": 3.6951378768169603, + "tokens_seen": 878176256 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003706519558676028, + "loss": 2.956, + "theoretical_loss": 3.6951110012670636, + "tokens_seen": 878241792 + }, + { + "epoch": 10.02, + "learning_rate": 0.000370641925777332, + "loss": 2.9852, + "theoretical_loss": 3.695084128284088, + "tokens_seen": 878307328 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003706318956870612, + "loss": 2.9837, + "theoretical_loss": 3.695057257867596, + "tokens_seen": 878372864 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003706218655967904, + "loss": 3.0006, + "theoretical_loss": 3.695030390017151, + "tokens_seen": 878438400 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037061183550651957, + "loss": 2.9244, + "theoretical_loss": 3.695003524732317, + "tokens_seen": 878503936 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037060180541624875, + "loss": 2.8957, + "theoretical_loss": 3.6949766620126576, + "tokens_seen": 878569472 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037059177532597793, + "loss": 3.0803, + "theoretical_loss": 3.694949801857736, + "tokens_seen": 878635008 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037058174523570716, + "loss": 2.9991, + "theoretical_loss": 3.694922944267117, + "tokens_seen": 878700544 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003705717151454363, + "loss": 3.0072, + "theoretical_loss": 3.6948960892403635, + "tokens_seen": 878766080 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003705616850551655, + "loss": 2.9391, + "theoretical_loss": 3.6948692367770404, + "tokens_seen": 878831616 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037055165496489465, + "loss": 2.9803, + "theoretical_loss": 3.6948423868767115, + "tokens_seen": 878897152 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003705416248746239, + "loss": 2.9914, + "theoretical_loss": 3.6948155395389413, + "tokens_seen": 878962688 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037053159478435307, + "loss": 2.856, + "theoretical_loss": 3.694788694763294, + "tokens_seen": 879028224 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037052156469408225, + "loss": 2.8479, + "theoretical_loss": 3.694761852549334, + "tokens_seen": 879093760 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037051153460381143, + "loss": 2.9129, + "theoretical_loss": 3.6947350128966256, + "tokens_seen": 879159296 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037050150451354067, + "loss": 2.9844, + "theoretical_loss": 3.6947081758047347, + "tokens_seen": 879224832 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003704914744232698, + "loss": 3.0579, + "theoretical_loss": 3.694681341273225, + "tokens_seen": 879290368 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037048144433299903, + "loss": 2.9668, + "theoretical_loss": 3.694654509301662, + "tokens_seen": 879355904 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037047141424272816, + "loss": 3.0282, + "theoretical_loss": 3.69462767988961, + "tokens_seen": 879421440 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003704613841524574, + "loss": 2.9934, + "theoretical_loss": 3.694600853036635, + "tokens_seen": 879486976 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003704513540621866, + "loss": 2.9848, + "theoretical_loss": 3.694574028742302, + "tokens_seen": 879552512 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037044132397191575, + "loss": 2.8708, + "theoretical_loss": 3.694547207006176, + "tokens_seen": 879618048 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037043129388164494, + "loss": 2.9951, + "theoretical_loss": 3.694520387827823, + "tokens_seen": 879683584 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.861652374267578, + "objective/train/theoretical_loss": 3.6944935712068077, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6944935712068077, + "tokens_seen": 879749120 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003704212637913741, + "loss": 2.8067, + "theoretical_loss": 3.6944935712068077, + "tokens_seen": 879749120 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003704112337011033, + "loss": 2.9931, + "theoretical_loss": 3.694466757142697, + "tokens_seen": 879814656 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037040120361083253, + "loss": 2.9995, + "theoretical_loss": 3.6944399456350556, + "tokens_seen": 879880192 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037039117352056166, + "loss": 2.9561, + "theoretical_loss": 3.6944131366834494, + "tokens_seen": 879945728 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003703811434302909, + "loss": 3.0188, + "theoretical_loss": 3.6943863302874456, + "tokens_seen": 880011264 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003703711133400201, + "loss": 3.0521, + "theoretical_loss": 3.694359526446609, + "tokens_seen": 880076800 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037036108324974926, + "loss": 3.0205, + "theoretical_loss": 3.694332725160506, + "tokens_seen": 880142336 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037035105315947844, + "loss": 2.9181, + "theoretical_loss": 3.6943059264287035, + "tokens_seen": 880207872 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003703410230692076, + "loss": 2.9453, + "theoretical_loss": 3.6942791302507674, + "tokens_seen": 880273408 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003703309929789368, + "loss": 3.0005, + "theoretical_loss": 3.6942523366262647, + "tokens_seen": 880338944 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037032096288866604, + "loss": 2.9945, + "theoretical_loss": 3.6942255455547617, + "tokens_seen": 880404480 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037031093279839516, + "loss": 2.9264, + "theoretical_loss": 3.6941987570358243, + "tokens_seen": 880470016 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003703009027081244, + "loss": 2.9595, + "theoretical_loss": 3.694171971069021, + "tokens_seen": 880535552 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003702908726178535, + "loss": 3.0198, + "theoretical_loss": 3.6941451876539175, + "tokens_seen": 880601088 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037028084252758276, + "loss": 3.015, + "theoretical_loss": 3.694118406790081, + "tokens_seen": 880666624 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037027081243731194, + "loss": 2.9948, + "theoretical_loss": 3.6940916284770795, + "tokens_seen": 880732160 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003702607823470411, + "loss": 2.9831, + "theoretical_loss": 3.6940648527144795, + "tokens_seen": 880797696 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003702507522567703, + "loss": 2.8978, + "theoretical_loss": 3.694038079501848, + "tokens_seen": 880863232 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003702407221664995, + "loss": 3.0261, + "theoretical_loss": 3.6940113088387534, + "tokens_seen": 880928768 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037023069207622867, + "loss": 2.984, + "theoretical_loss": 3.6939845407247627, + "tokens_seen": 880994304 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003702206619859579, + "loss": 2.9029, + "theoretical_loss": 3.6939577751594435, + "tokens_seen": 881059840 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037021063189568703, + "loss": 2.9863, + "theoretical_loss": 3.693931012142364, + "tokens_seen": 881125376 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037020060180541626, + "loss": 2.9233, + "theoretical_loss": 3.693904251673092, + "tokens_seen": 881190912 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037019057171514544, + "loss": 2.906, + "theoretical_loss": 3.693877493751195, + "tokens_seen": 881256448 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003701805416248746, + "loss": 2.9639, + "theoretical_loss": 3.693850738376242, + "tokens_seen": 881321984 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.084153652191162, + "objective/train/theoretical_loss": 3.6938239855478, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6938239855478, + "tokens_seen": 881387520 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003701705115346038, + "loss": 2.9847, + "theoretical_loss": 3.6938239855478, + "tokens_seen": 881387520 + }, + { + "epoch": 10.02, + "learning_rate": 0.000370160481444333, + "loss": 3.0789, + "theoretical_loss": 3.6937972352654382, + "tokens_seen": 881453056 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037015045135406217, + "loss": 3.0415, + "theoretical_loss": 3.693770487528725, + "tokens_seen": 881518592 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003701404212637914, + "loss": 2.9678, + "theoretical_loss": 3.693743742337229, + "tokens_seen": 881584128 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037013039117352053, + "loss": 2.9483, + "theoretical_loss": 3.693716999690518, + "tokens_seen": 881649664 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037012036108324977, + "loss": 2.9888, + "theoretical_loss": 3.6936902595881618, + "tokens_seen": 881715200 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037011033099297895, + "loss": 3.0424, + "theoretical_loss": 3.693663522029728, + "tokens_seen": 881780736 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037010030090270813, + "loss": 3.0448, + "theoretical_loss": 3.6936367870147864, + "tokens_seen": 881846272 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037009027081243736, + "loss": 2.9802, + "theoretical_loss": 3.6936100545429067, + "tokens_seen": 881911808 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003700802407221665, + "loss": 2.9262, + "theoretical_loss": 3.6935833246136562, + "tokens_seen": 881977344 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003700702106318957, + "loss": 2.9909, + "theoretical_loss": 3.6935565972266056, + "tokens_seen": 882042880 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037006018054162485, + "loss": 2.9114, + "theoretical_loss": 3.693529872381324, + "tokens_seen": 882108416 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003700501504513541, + "loss": 2.9317, + "theoretical_loss": 3.6935031500773805, + "tokens_seen": 882173952 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037004012036108327, + "loss": 2.9361, + "theoretical_loss": 3.693476430314345, + "tokens_seen": 882239488 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037003009027081245, + "loss": 3.0167, + "theoretical_loss": 3.693449713091787, + "tokens_seen": 882305024 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037002006018054163, + "loss": 2.9549, + "theoretical_loss": 3.6934229984092766, + "tokens_seen": 882370560 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037001003009027087, + "loss": 2.9474, + "theoretical_loss": 3.693396286266383, + "tokens_seen": 882436096 + }, + { + "epoch": 10.02, + "learning_rate": 0.00037, + "loss": 2.9944, + "theoretical_loss": 3.693369576662677, + "tokens_seen": 882501632 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036998996990972923, + "loss": 2.9056, + "theoretical_loss": 3.693342869597728, + "tokens_seen": 882567168 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036997993981945836, + "loss": 2.8601, + "theoretical_loss": 3.6933161650711064, + "tokens_seen": 882632704 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003699699097291876, + "loss": 2.8797, + "theoretical_loss": 3.693289463082383, + "tokens_seen": 882698240 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003699598796389168, + "loss": 2.9499, + "theoretical_loss": 3.693262763631128, + "tokens_seen": 882763776 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036994984954864595, + "loss": 3.0004, + "theoretical_loss": 3.6932360667169113, + "tokens_seen": 882829312 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036993981945837514, + "loss": 3.0335, + "theoretical_loss": 3.693209372339304, + "tokens_seen": 882894848 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003699297893681043, + "loss": 2.9137, + "theoretical_loss": 3.693182680497877, + "tokens_seen": 882960384 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9991486072540283, + "objective/train/theoretical_loss": 3.6931559911922003, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6931559911922003, + "tokens_seen": 883025920 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003699197592778335, + "loss": 2.9182, + "theoretical_loss": 3.6931559911922003, + "tokens_seen": 883025920 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036990972918756273, + "loss": 2.884, + "theoretical_loss": 3.693129304421846, + "tokens_seen": 883091456 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036989969909729186, + "loss": 2.9168, + "theoretical_loss": 3.6931026201863846, + "tokens_seen": 883156992 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003698896690070211, + "loss": 2.961, + "theoretical_loss": 3.693075938485387, + "tokens_seen": 883222528 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003698796389167503, + "loss": 2.9524, + "theoretical_loss": 3.6930492593184248, + "tokens_seen": 883288064 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036986960882647946, + "loss": 2.9849, + "theoretical_loss": 3.693022582685069, + "tokens_seen": 883353600 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036985957873620864, + "loss": 2.9884, + "theoretical_loss": 3.6929959085848916, + "tokens_seen": 883419136 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003698495486459378, + "loss": 2.9342, + "theoretical_loss": 3.6929692370174636, + "tokens_seen": 883484672 + }, + { + "epoch": 10.02, + "learning_rate": 0.000369839518555667, + "loss": 2.9285, + "theoretical_loss": 3.6929425679823566, + "tokens_seen": 883550208 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036982948846539624, + "loss": 2.978, + "theoretical_loss": 3.692915901479143, + "tokens_seen": 883615744 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036981945837512536, + "loss": 2.8754, + "theoretical_loss": 3.6928892375073943, + "tokens_seen": 883681280 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003698094282848546, + "loss": 2.9299, + "theoretical_loss": 3.6928625760666822, + "tokens_seen": 883746816 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003697993981945837, + "loss": 2.8598, + "theoretical_loss": 3.6928359171565797, + "tokens_seen": 883812352 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036978936810431296, + "loss": 2.9047, + "theoretical_loss": 3.6928092607766576, + "tokens_seen": 883877888 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036977933801404214, + "loss": 2.8881, + "theoretical_loss": 3.6927826069264893, + "tokens_seen": 883943424 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003697693079237713, + "loss": 2.888, + "theoretical_loss": 3.6927559556056466, + "tokens_seen": 884008960 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003697592778335005, + "loss": 2.9406, + "theoretical_loss": 3.6927293068137024, + "tokens_seen": 884074496 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003697492477432297, + "loss": 2.8621, + "theoretical_loss": 3.692702660550229, + "tokens_seen": 884140032 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036973921765295887, + "loss": 2.9481, + "theoretical_loss": 3.692676016814799, + "tokens_seen": 884205568 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003697291875626881, + "loss": 2.9249, + "theoretical_loss": 3.692649375606986, + "tokens_seen": 884271104 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036971915747241723, + "loss": 3.0461, + "theoretical_loss": 3.692622736926362, + "tokens_seen": 884336640 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036970912738214646, + "loss": 2.929, + "theoretical_loss": 3.6925961007725, + "tokens_seen": 884402176 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036969909729187565, + "loss": 2.9986, + "theoretical_loss": 3.692569467144974, + "tokens_seen": 884467712 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003696890672016048, + "loss": 2.9977, + "theoretical_loss": 3.6925428360433563, + "tokens_seen": 884533248 + }, + { + "epoch": 10.02, + "learning_rate": 0.000369679037111334, + "loss": 3.0121, + "theoretical_loss": 3.6925162074672206, + "tokens_seen": 884598784 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.662740707397461, + "objective/train/theoretical_loss": 3.6924895814161407, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6924895814161407, + "tokens_seen": 884664320 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003696690070210632, + "loss": 2.8016, + "theoretical_loss": 3.6924895814161407, + "tokens_seen": 884664320 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036965897693079237, + "loss": 2.9118, + "theoretical_loss": 3.6924629578896893, + "tokens_seen": 884729856 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003696489468405216, + "loss": 3.0079, + "theoretical_loss": 3.6924363368874404, + "tokens_seen": 884795392 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036963891675025073, + "loss": 2.8877, + "theoretical_loss": 3.6924097184089684, + "tokens_seen": 884860928 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036962888665997997, + "loss": 2.968, + "theoretical_loss": 3.6923831024538467, + "tokens_seen": 884926464 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003696188565697091, + "loss": 3.0479, + "theoretical_loss": 3.6923564890216487, + "tokens_seen": 884992000 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036960882647943833, + "loss": 3.0304, + "theoretical_loss": 3.6923298781119485, + "tokens_seen": 885057536 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003695987963891675, + "loss": 2.8758, + "theoretical_loss": 3.6923032697243214, + "tokens_seen": 885123072 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003695887662988967, + "loss": 3.0149, + "theoretical_loss": 3.6922766638583404, + "tokens_seen": 885188608 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036957873620862587, + "loss": 2.9398, + "theoretical_loss": 3.6922500605135804, + "tokens_seen": 885254144 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036956870611835505, + "loss": 2.98, + "theoretical_loss": 3.692223459689616, + "tokens_seen": 885319680 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036955867602808424, + "loss": 2.9017, + "theoretical_loss": 3.6921968613860217, + "tokens_seen": 885385216 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036954864593781347, + "loss": 2.9632, + "theoretical_loss": 3.6921702656023716, + "tokens_seen": 885450752 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003695386158475426, + "loss": 2.9807, + "theoretical_loss": 3.6921436723382413, + "tokens_seen": 885516288 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036952858575727183, + "loss": 2.939, + "theoretical_loss": 3.692117081593205, + "tokens_seen": 885581824 + }, + { + "epoch": 10.02, + "learning_rate": 0.000369518555667001, + "loss": 2.9971, + "theoretical_loss": 3.6920904933668384, + "tokens_seen": 885647360 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003695085255767302, + "loss": 3.0668, + "theoretical_loss": 3.6920639076587154, + "tokens_seen": 885712896 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003694984954864594, + "loss": 3.0136, + "theoretical_loss": 3.6920373244684126, + "tokens_seen": 885778432 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036948846539618856, + "loss": 2.972, + "theoretical_loss": 3.692010743795504, + "tokens_seen": 885843968 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036947843530591774, + "loss": 2.8538, + "theoretical_loss": 3.691984165639566, + "tokens_seen": 885909504 + }, + { + "epoch": 10.02, + "learning_rate": 0.000369468405215647, + "loss": 3.0085, + "theoretical_loss": 3.691957590000174, + "tokens_seen": 885975040 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003694583751253761, + "loss": 2.9007, + "theoretical_loss": 3.6919310168769033, + "tokens_seen": 886040576 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036944834503510534, + "loss": 2.9635, + "theoretical_loss": 3.691904446269329, + "tokens_seen": 886106112 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036943831494483446, + "loss": 3.0053, + "theoretical_loss": 3.691877878177028, + "tokens_seen": 886171648 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003694282848545637, + "loss": 3.0415, + "theoretical_loss": 3.6918513125995753, + "tokens_seen": 886237184 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9452788829803467, + "objective/train/theoretical_loss": 3.691824749536548, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.691824749536548, + "tokens_seen": 886302720 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003694182547642929, + "loss": 2.9785, + "theoretical_loss": 3.691824749536548, + "tokens_seen": 886302720 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036940822467402206, + "loss": 3.1249, + "theoretical_loss": 3.691798188987521, + "tokens_seen": 886368256 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036939819458375124, + "loss": 3.0224, + "theoretical_loss": 3.691771630952071, + "tokens_seen": 886433792 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003693881644934805, + "loss": 2.9514, + "theoretical_loss": 3.6917450754297754, + "tokens_seen": 886499328 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003693781344032096, + "loss": 2.9831, + "theoretical_loss": 3.6917185224202083, + "tokens_seen": 886564864 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036936810431293884, + "loss": 2.8924, + "theoretical_loss": 3.691691971922948, + "tokens_seen": 886630400 + }, + { + "epoch": 10.02, + "learning_rate": 0.000369358074222668, + "loss": 3.0332, + "theoretical_loss": 3.691665423937571, + "tokens_seen": 886695936 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003693480441323972, + "loss": 2.8429, + "theoretical_loss": 3.691638878463653, + "tokens_seen": 886761472 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036933801404212644, + "loss": 3.0055, + "theoretical_loss": 3.691612335500772, + "tokens_seen": 886827008 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036932798395185556, + "loss": 2.9961, + "theoretical_loss": 3.6915857950485043, + "tokens_seen": 886892544 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003693179538615848, + "loss": 3.0412, + "theoretical_loss": 3.691559257106427, + "tokens_seen": 886958080 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003693079237713139, + "loss": 3.0032, + "theoretical_loss": 3.6915327216741174, + "tokens_seen": 887023616 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036929789368104316, + "loss": 2.9859, + "theoretical_loss": 3.691506188751153, + "tokens_seen": 887089152 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036928786359077234, + "loss": 2.9817, + "theoretical_loss": 3.6914796583371103, + "tokens_seen": 887154688 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003692778335005015, + "loss": 2.8604, + "theoretical_loss": 3.691453130431567, + "tokens_seen": 887220224 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003692678034102307, + "loss": 2.9997, + "theoretical_loss": 3.6914266050341014, + "tokens_seen": 887285760 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003692577733199599, + "loss": 2.9321, + "theoretical_loss": 3.6914000821442907, + "tokens_seen": 887351296 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036924774322968907, + "loss": 2.97, + "theoretical_loss": 3.6913735617617123, + "tokens_seen": 887416832 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003692377131394183, + "loss": 2.9014, + "theoretical_loss": 3.6913470438859446, + "tokens_seen": 887482368 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036922768304914743, + "loss": 2.9821, + "theoretical_loss": 3.6913205285165653, + "tokens_seen": 887547904 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036921765295887666, + "loss": 2.9669, + "theoretical_loss": 3.6912940156531526, + "tokens_seen": 887613440 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036920762286860585, + "loss": 2.9731, + "theoretical_loss": 3.691267505295284, + "tokens_seen": 887678976 + }, + { + "epoch": 10.02, + "learning_rate": 0.000369197592778335, + "loss": 2.9218, + "theoretical_loss": 3.691240997442539, + "tokens_seen": 887744512 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003691875626880642, + "loss": 3.0029, + "theoretical_loss": 3.6912144920944945, + "tokens_seen": 887810048 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003691775325977934, + "loss": 2.969, + "theoretical_loss": 3.6911879892507304, + "tokens_seen": 887875584 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1112446784973145, + "objective/train/theoretical_loss": 3.6911614889108244, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6911614889108244, + "tokens_seen": 887941120 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036916750250752257, + "loss": 3.0407, + "theoretical_loss": 3.6911614889108244, + "tokens_seen": 887941120 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003691574724172518, + "loss": 2.9956, + "theoretical_loss": 3.691134991074355, + "tokens_seen": 888006656 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036914744232698093, + "loss": 2.873, + "theoretical_loss": 3.691108495740902, + "tokens_seen": 888072192 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036913741223671017, + "loss": 2.9218, + "theoretical_loss": 3.691082002910043, + "tokens_seen": 888137728 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003691273821464393, + "loss": 3.0157, + "theoretical_loss": 3.6910555125813582, + "tokens_seen": 888203264 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036911735205616853, + "loss": 2.9636, + "theoretical_loss": 3.6910290247544255, + "tokens_seen": 888268800 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003691073219658977, + "loss": 2.9495, + "theoretical_loss": 3.691002539428825, + "tokens_seen": 888334336 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003690972918756269, + "loss": 2.9159, + "theoretical_loss": 3.6909760566041356, + "tokens_seen": 888399872 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003690872617853561, + "loss": 2.9395, + "theoretical_loss": 3.6909495762799365, + "tokens_seen": 888465408 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036907723169508525, + "loss": 2.9439, + "theoretical_loss": 3.6909230984558077, + "tokens_seen": 888530944 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036906720160481444, + "loss": 2.9717, + "theoretical_loss": 3.6908966231313283, + "tokens_seen": 888596480 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036905717151454367, + "loss": 2.982, + "theoretical_loss": 3.690870150306079, + "tokens_seen": 888662016 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003690471414242728, + "loss": 2.9075, + "theoretical_loss": 3.690843679979638, + "tokens_seen": 888727552 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036903711133400203, + "loss": 3.0183, + "theoretical_loss": 3.690817212151586, + "tokens_seen": 888793088 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003690270812437312, + "loss": 3.0433, + "theoretical_loss": 3.690790746821503, + "tokens_seen": 888858624 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003690170511534604, + "loss": 3.0307, + "theoretical_loss": 3.69076428398897, + "tokens_seen": 888924160 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003690070210631896, + "loss": 3.0032, + "theoretical_loss": 3.6907378236535653, + "tokens_seen": 888989696 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036899699097291876, + "loss": 2.8939, + "theoretical_loss": 3.6907113658148702, + "tokens_seen": 889055232 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036898696088264794, + "loss": 2.9293, + "theoretical_loss": 3.6906849104724655, + "tokens_seen": 889120768 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003689769307923772, + "loss": 2.826, + "theoretical_loss": 3.690658457625931, + "tokens_seen": 889186304 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003689669007021063, + "loss": 2.996, + "theoretical_loss": 3.6906320072748477, + "tokens_seen": 889251840 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036895687061183554, + "loss": 2.9324, + "theoretical_loss": 3.6906055594187963, + "tokens_seen": 889317376 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036894684052156466, + "loss": 3.0575, + "theoretical_loss": 3.690579114057357, + "tokens_seen": 889382912 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003689368104312939, + "loss": 2.9315, + "theoretical_loss": 3.6905526711901118, + "tokens_seen": 889448448 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003689267803410231, + "loss": 2.8162, + "theoretical_loss": 3.6905262308166407, + "tokens_seen": 889513984 + }, + { + "epoch": 10.02, + "objective/train/docs_used": 2079968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7734861373901367, + "objective/train/theoretical_loss": 3.6904997929365253, + "objective/train/tokens_used": 890928608, + "theoretical_loss": 3.6904997929365253, + "tokens_seen": 889579520 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036891675025075226, + "loss": 2.9668, + "theoretical_loss": 3.6904997929365253, + "tokens_seen": 889579520 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036890672016048144, + "loss": 3.0725, + "theoretical_loss": 3.6904733575493474, + "tokens_seen": 889645056 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003688966900702107, + "loss": 2.9418, + "theoretical_loss": 3.690446924654687, + "tokens_seen": 889710592 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003688866599799398, + "loss": 2.9253, + "theoretical_loss": 3.6904204942521264, + "tokens_seen": 889776128 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036887662988966904, + "loss": 2.9807, + "theoretical_loss": 3.6903940663412467, + "tokens_seen": 889841664 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036886659979939817, + "loss": 2.8783, + "theoretical_loss": 3.69036764092163, + "tokens_seen": 889907200 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003688565697091274, + "loss": 2.9795, + "theoretical_loss": 3.6903412179928576, + "tokens_seen": 889972736 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003688465396188566, + "loss": 2.9391, + "theoretical_loss": 3.690314797554511, + "tokens_seen": 890038272 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036883650952858576, + "loss": 2.8726, + "theoretical_loss": 3.6902883796061734, + "tokens_seen": 890103808 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036882647943831494, + "loss": 2.9405, + "theoretical_loss": 3.690261964147426, + "tokens_seen": 890169344 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003688164493480441, + "loss": 2.948, + "theoretical_loss": 3.6902355511778504, + "tokens_seen": 890234880 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003688064192577733, + "loss": 2.9493, + "theoretical_loss": 3.6902091406970294, + "tokens_seen": 890300416 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036879638916750254, + "loss": 2.9255, + "theoretical_loss": 3.690182732704545, + "tokens_seen": 890365952 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036878635907723167, + "loss": 2.969, + "theoretical_loss": 3.690156327199981, + "tokens_seen": 890431488 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003687763289869609, + "loss": 3.004, + "theoretical_loss": 3.6901299241829175, + "tokens_seen": 890497024 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036876629889669003, + "loss": 2.949, + "theoretical_loss": 3.690103523652939, + "tokens_seen": 890562560 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036875626880641927, + "loss": 3.0586, + "theoretical_loss": 3.6900771256096276, + "tokens_seen": 890628096 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036874623871614845, + "loss": 2.9456, + "theoretical_loss": 3.6900507300525662, + "tokens_seen": 890693632 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036873620862587763, + "loss": 2.8928, + "theoretical_loss": 3.6900243369813377, + "tokens_seen": 890759168 + }, + { + "epoch": 10.02, + "learning_rate": 0.0003687261785356068, + "loss": 2.9583, + "theoretical_loss": 3.689997946395525, + "tokens_seen": 890824704 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036871614844533605, + "loss": 2.8602, + "theoretical_loss": 3.6899715582947117, + "tokens_seen": 890890240 + }, + { + "epoch": 10.02, + "learning_rate": 0.00036870611835506517, + "loss": 3.0031, + "theoretical_loss": 3.689948058484254, + "tokens_seen": 890948608 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003686960882647944, + "loss": 2.8745, + "theoretical_loss": 3.6899216750805035, + "tokens_seen": 891014144 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036868605817452353, + "loss": 2.804, + "theoretical_loss": 3.6898952941605474, + "tokens_seen": 891079680 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036867602808425277, + "loss": 2.7633, + "theoretical_loss": 3.6898689157239692, + "tokens_seen": 891145216 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2128643, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8281288146972656, + "objective/train/theoretical_loss": 3.689842539770353, + "objective/train/tokens_used": 911670752, + "theoretical_loss": 3.689842539770353, + "tokens_seen": 891210752 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036866599799398195, + "loss": 2.8489, + "theoretical_loss": 3.689842539770353, + "tokens_seen": 891210752 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036865596790371113, + "loss": 2.8953, + "theoretical_loss": 3.6898161662992823, + "tokens_seen": 891276288 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003686459378134403, + "loss": 2.7546, + "theoretical_loss": 3.6897897953103405, + "tokens_seen": 891341824 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003686359077231695, + "loss": 2.8818, + "theoretical_loss": 3.689763426803112, + "tokens_seen": 891407360 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003686258776328987, + "loss": 2.8204, + "theoretical_loss": 3.6897370607771816, + "tokens_seen": 891472896 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003686158475426279, + "loss": 2.7197, + "theoretical_loss": 3.6897106972321314, + "tokens_seen": 891538432 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003686058174523571, + "loss": 2.8189, + "theoretical_loss": 3.6896843361675478, + "tokens_seen": 891603968 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003685957873620863, + "loss": 2.8388, + "theoretical_loss": 3.689657977583013, + "tokens_seen": 891669504 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036858575727181545, + "loss": 2.8643, + "theoretical_loss": 3.6896316214781137, + "tokens_seen": 891735040 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036857572718154464, + "loss": 2.9334, + "theoretical_loss": 3.689605267852433, + "tokens_seen": 891800576 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036856569709127387, + "loss": 2.6991, + "theoretical_loss": 3.689578916705555, + "tokens_seen": 891866112 + }, + { + "epoch": 11.0, + "learning_rate": 0.000368555667001003, + "loss": 2.9198, + "theoretical_loss": 3.689552568037066, + "tokens_seen": 891931648 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036854563691073223, + "loss": 2.9485, + "theoretical_loss": 3.68952622184655, + "tokens_seen": 891997184 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003685356068204614, + "loss": 2.8777, + "theoretical_loss": 3.6894998781335917, + "tokens_seen": 892062720 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003685255767301906, + "loss": 2.8363, + "theoretical_loss": 3.6894735368977765, + "tokens_seen": 892128256 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003685155466399198, + "loss": 2.8079, + "theoretical_loss": 3.68944719813869, + "tokens_seen": 892193792 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036850551654964896, + "loss": 2.7151, + "theoretical_loss": 3.6894208618559157, + "tokens_seen": 892259328 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036849548645937814, + "loss": 2.8722, + "theoretical_loss": 3.689394528049041, + "tokens_seen": 892324864 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003684854563691074, + "loss": 2.9009, + "theoretical_loss": 3.68936819671765, + "tokens_seen": 892390400 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003684754262788365, + "loss": 2.8291, + "theoretical_loss": 3.6893418678613283, + "tokens_seen": 892455936 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036846539618856574, + "loss": 2.8759, + "theoretical_loss": 3.6893155414796626, + "tokens_seen": 892521472 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036845536609829486, + "loss": 2.9518, + "theoretical_loss": 3.689289217572237, + "tokens_seen": 892587008 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003684453360080241, + "loss": 2.8736, + "theoretical_loss": 3.689262896138639, + "tokens_seen": 892652544 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003684353059177533, + "loss": 2.9258, + "theoretical_loss": 3.6892365771784528, + "tokens_seen": 892718080 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036842527582748246, + "loss": 2.873, + "theoretical_loss": 3.689210260691266, + "tokens_seen": 892783616 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2132298, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9010894298553467, + "objective/train/theoretical_loss": 3.689183946676664, + "objective/train/tokens_used": 913309152, + "theoretical_loss": 3.689183946676664, + "tokens_seen": 892849152 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036841524573721164, + "loss": 2.8456, + "theoretical_loss": 3.689183946676664, + "tokens_seen": 892849152 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003684052156469409, + "loss": 2.8278, + "theoretical_loss": 3.689157635134233, + "tokens_seen": 892914688 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036839518555667, + "loss": 2.7277, + "theoretical_loss": 3.689131326063559, + "tokens_seen": 892980224 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036838515546639924, + "loss": 2.8628, + "theoretical_loss": 3.68910501946423, + "tokens_seen": 893045760 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036837512537612837, + "loss": 2.8697, + "theoretical_loss": 3.68907871533583, + "tokens_seen": 893111296 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003683650952858576, + "loss": 2.8865, + "theoretical_loss": 3.6890524136779477, + "tokens_seen": 893176832 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003683550651955868, + "loss": 2.9081, + "theoretical_loss": 3.6890261144901686, + "tokens_seen": 893242368 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036834503510531596, + "loss": 2.8675, + "theoretical_loss": 3.68899981777208, + "tokens_seen": 893307904 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036833500501504514, + "loss": 2.916, + "theoretical_loss": 3.688973523523269, + "tokens_seen": 893373440 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003683249749247743, + "loss": 2.8676, + "theoretical_loss": 3.688947231743323, + "tokens_seen": 893438976 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003683149448345035, + "loss": 2.9134, + "theoretical_loss": 3.688920942431828, + "tokens_seen": 893504512 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036830491474423274, + "loss": 2.7271, + "theoretical_loss": 3.6888946555883715, + "tokens_seen": 893570048 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036829488465396187, + "loss": 2.8641, + "theoretical_loss": 3.688868371212542, + "tokens_seen": 893635584 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003682848545636911, + "loss": 2.8563, + "theoretical_loss": 3.688842089303925, + "tokens_seen": 893701120 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036827482447342023, + "loss": 2.8213, + "theoretical_loss": 3.6888158098621093, + "tokens_seen": 893766656 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036826479438314947, + "loss": 2.8069, + "theoretical_loss": 3.688789532886682, + "tokens_seen": 893832192 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036825476429287865, + "loss": 2.7729, + "theoretical_loss": 3.6887632583772314, + "tokens_seen": 893897728 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036824473420260783, + "loss": 2.8693, + "theoretical_loss": 3.6887369863333452, + "tokens_seen": 893963264 + }, + { + "epoch": 11.0, + "learning_rate": 0.000368234704112337, + "loss": 2.8718, + "theoretical_loss": 3.6887107167546107, + "tokens_seen": 894028800 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036822467402206625, + "loss": 2.9357, + "theoretical_loss": 3.6886844496406166, + "tokens_seen": 894094336 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036821464393179537, + "loss": 2.7001, + "theoretical_loss": 3.68865818499095, + "tokens_seen": 894159872 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003682046138415246, + "loss": 2.911, + "theoretical_loss": 3.6886319228052002, + "tokens_seen": 894225408 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036819458375125373, + "loss": 2.9417, + "theoretical_loss": 3.688605663082955, + "tokens_seen": 894290944 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036818455366098297, + "loss": 2.8982, + "theoretical_loss": 3.688579405823803, + "tokens_seen": 894356480 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036817452357071215, + "loss": 3.0045, + "theoretical_loss": 3.6885531510273326, + "tokens_seen": 894422016 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2137231, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9741861820220947, + "objective/train/theoretical_loss": 3.6885268986931323, + "objective/train/tokens_used": 914947552, + "theoretical_loss": 3.6885268986931323, + "tokens_seen": 894487552 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036816449348044133, + "loss": 2.9228, + "theoretical_loss": 3.6885268986931323, + "tokens_seen": 894487552 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003681544633901705, + "loss": 2.7609, + "theoretical_loss": 3.688500648820791, + "tokens_seen": 894553088 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003681444332998997, + "loss": 2.8882, + "theoretical_loss": 3.6884744014098976, + "tokens_seen": 894618624 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003681344032096289, + "loss": 2.7981, + "theoretical_loss": 3.6884481564600407, + "tokens_seen": 894684160 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003681243731193581, + "loss": 3.0128, + "theoretical_loss": 3.688421913970809, + "tokens_seen": 894749696 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036811434302908724, + "loss": 2.8451, + "theoretical_loss": 3.6883956739417925, + "tokens_seen": 894815232 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003681043129388165, + "loss": 2.8506, + "theoretical_loss": 3.6883694363725796, + "tokens_seen": 894880768 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003680942828485456, + "loss": 2.8498, + "theoretical_loss": 3.6883432012627604, + "tokens_seen": 894946304 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036808425275827484, + "loss": 2.8455, + "theoretical_loss": 3.6883169686119235, + "tokens_seen": 895011840 + }, + { + "epoch": 11.0, + "learning_rate": 0.000368074222668004, + "loss": 2.8707, + "theoretical_loss": 3.688290738419658, + "tokens_seen": 895077376 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003680641925777332, + "loss": 2.8198, + "theoretical_loss": 3.6882645106855554, + "tokens_seen": 895142912 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003680541624874624, + "loss": 2.9075, + "theoretical_loss": 3.6882382854092035, + "tokens_seen": 895208448 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003680441323971916, + "loss": 2.8543, + "theoretical_loss": 3.6882120625901926, + "tokens_seen": 895273984 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036803410230692074, + "loss": 2.9797, + "theoretical_loss": 3.6881858422281133, + "tokens_seen": 895339520 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036802407221665, + "loss": 2.8736, + "theoretical_loss": 3.6881596243225543, + "tokens_seen": 895405056 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003680140421263791, + "loss": 3.0017, + "theoretical_loss": 3.688133408873107, + "tokens_seen": 895470592 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036800401203610834, + "loss": 2.9928, + "theoretical_loss": 3.688107195879361, + "tokens_seen": 895536128 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003679939819458375, + "loss": 2.8704, + "theoretical_loss": 3.688080985340906, + "tokens_seen": 895601664 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003679839518555667, + "loss": 2.8487, + "theoretical_loss": 3.6880547772573338, + "tokens_seen": 895667200 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003679739217652959, + "loss": 2.8627, + "theoretical_loss": 3.6880285716282337, + "tokens_seen": 895732736 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036796389167502506, + "loss": 2.8867, + "theoretical_loss": 3.6880023684531964, + "tokens_seen": 895798272 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036795386158475424, + "loss": 2.8753, + "theoretical_loss": 3.687976167731813, + "tokens_seen": 895863808 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003679438314944835, + "loss": 2.9194, + "theoretical_loss": 3.6879499694636735, + "tokens_seen": 895929344 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003679338014042126, + "loss": 2.8422, + "theoretical_loss": 3.68792377364837, + "tokens_seen": 895994880 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036792377131394184, + "loss": 2.7851, + "theoretical_loss": 3.687897580285492, + "tokens_seen": 896060416 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2140415, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8628313541412354, + "objective/train/theoretical_loss": 3.687871389374632, + "objective/train/tokens_used": 916585952, + "theoretical_loss": 3.687871389374632, + "tokens_seen": 896125952 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036791374122367097, + "loss": 2.9348, + "theoretical_loss": 3.687871389374632, + "tokens_seen": 896125952 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003679037111334002, + "loss": 2.7911, + "theoretical_loss": 3.6878452009153806, + "tokens_seen": 896191488 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003678936810431294, + "loss": 2.7885, + "theoretical_loss": 3.687819014907329, + "tokens_seen": 896257024 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036788365095285857, + "loss": 2.7148, + "theoretical_loss": 3.687792831350068, + "tokens_seen": 896322560 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036787362086258775, + "loss": 2.9449, + "theoretical_loss": 3.68776665024319, + "tokens_seen": 896388096 + }, + { + "epoch": 11.0, + "learning_rate": 0.000367863590772317, + "loss": 2.9009, + "theoretical_loss": 3.6877404715862863, + "tokens_seen": 896453632 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036785356068204616, + "loss": 2.7588, + "theoretical_loss": 3.6877142953789477, + "tokens_seen": 896519168 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036784353059177535, + "loss": 2.7413, + "theoretical_loss": 3.6876881216207673, + "tokens_seen": 896584704 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003678335005015045, + "loss": 2.7557, + "theoretical_loss": 3.687661950311336, + "tokens_seen": 896650240 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003678234704112337, + "loss": 2.9435, + "theoretical_loss": 3.6876357814502465, + "tokens_seen": 896715776 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036781344032096294, + "loss": 2.8355, + "theoretical_loss": 3.68760961503709, + "tokens_seen": 896781312 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036780341023069207, + "loss": 2.9283, + "theoretical_loss": 3.6875834510714594, + "tokens_seen": 896846848 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003677933801404213, + "loss": 2.8957, + "theoretical_loss": 3.687557289552946, + "tokens_seen": 896912384 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036778335005015043, + "loss": 2.7845, + "theoretical_loss": 3.687531130481143, + "tokens_seen": 896977920 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036777331995987967, + "loss": 2.8564, + "theoretical_loss": 3.687504973855643, + "tokens_seen": 897043456 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036776328986960885, + "loss": 2.8254, + "theoretical_loss": 3.6874788196760377, + "tokens_seen": 897108992 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036775325977933803, + "loss": 2.9158, + "theoretical_loss": 3.68745266794192, + "tokens_seen": 897174528 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003677432296890672, + "loss": 2.8931, + "theoretical_loss": 3.6874265186528827, + "tokens_seen": 897240064 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036773319959879645, + "loss": 2.841, + "theoretical_loss": 3.6874003718085193, + "tokens_seen": 897305600 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036772316950852557, + "loss": 2.794, + "theoretical_loss": 3.687374227408422, + "tokens_seen": 897371136 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003677131394182548, + "loss": 2.9108, + "theoretical_loss": 3.687348085452183, + "tokens_seen": 897436672 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036770310932798393, + "loss": 2.893, + "theoretical_loss": 3.6873219459393973, + "tokens_seen": 897502208 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036769307923771317, + "loss": 2.9328, + "theoretical_loss": 3.6872958088696572, + "tokens_seen": 897567744 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036768304914744235, + "loss": 2.8459, + "theoretical_loss": 3.6872696742425553, + "tokens_seen": 897633280 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036767301905717153, + "loss": 2.9162, + "theoretical_loss": 3.6872435420576863, + "tokens_seen": 897698816 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2145075, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9295902252197266, + "objective/train/theoretical_loss": 3.687217412314643, + "objective/train/tokens_used": 918224352, + "theoretical_loss": 3.687217412314643, + "tokens_seen": 897764352 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003676629889669007, + "loss": 3.0537, + "theoretical_loss": 3.687217412314643, + "tokens_seen": 897764352 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003676529588766299, + "loss": 2.8277, + "theoretical_loss": 3.687191285013019, + "tokens_seen": 897829888 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003676429287863591, + "loss": 2.8514, + "theoretical_loss": 3.687165160152408, + "tokens_seen": 897895424 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003676328986960883, + "loss": 2.968, + "theoretical_loss": 3.687139037732404, + "tokens_seen": 897960960 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036762286860581744, + "loss": 2.9316, + "theoretical_loss": 3.6871129177526005, + "tokens_seen": 898026496 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003676128385155467, + "loss": 2.8908, + "theoretical_loss": 3.687086800212592, + "tokens_seen": 898092032 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003676028084252758, + "loss": 2.8461, + "theoretical_loss": 3.687060685111973, + "tokens_seen": 898157568 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036759277833500504, + "loss": 2.8644, + "theoretical_loss": 3.6870345724503366, + "tokens_seen": 898223104 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003675827482447342, + "loss": 2.8882, + "theoretical_loss": 3.6870084622272774, + "tokens_seen": 898288640 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003675727181544634, + "loss": 2.8911, + "theoretical_loss": 3.6869823544423905, + "tokens_seen": 898354176 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003675626880641926, + "loss": 2.9862, + "theoretical_loss": 3.686956249095269, + "tokens_seen": 898419712 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003675526579739218, + "loss": 2.859, + "theoretical_loss": 3.6869301461855093, + "tokens_seen": 898485248 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036754262788365094, + "loss": 2.8401, + "theoretical_loss": 3.6869040457127045, + "tokens_seen": 898550784 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003675325977933802, + "loss": 2.9726, + "theoretical_loss": 3.68687794767645, + "tokens_seen": 898616320 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003675225677031093, + "loss": 2.8878, + "theoretical_loss": 3.686851852076341, + "tokens_seen": 898681856 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036751253761283854, + "loss": 2.8571, + "theoretical_loss": 3.6868257589119717, + "tokens_seen": 898747392 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003675025075225677, + "loss": 2.9039, + "theoretical_loss": 3.686799668182938, + "tokens_seen": 898812928 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003674924774322969, + "loss": 2.7621, + "theoretical_loss": 3.6867735798888344, + "tokens_seen": 898878464 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003674824473420261, + "loss": 2.8822, + "theoretical_loss": 3.686747494029256, + "tokens_seen": 898944000 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036747241725175526, + "loss": 2.9574, + "theoretical_loss": 3.6867214106037993, + "tokens_seen": 899009536 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036746238716148444, + "loss": 2.8877, + "theoretical_loss": 3.686695329612058, + "tokens_seen": 899075072 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003674523570712137, + "loss": 2.819, + "theoretical_loss": 3.6866692510536296, + "tokens_seen": 899140608 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003674423269809428, + "loss": 2.7979, + "theoretical_loss": 3.686643174928108, + "tokens_seen": 899206144 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036743229689067204, + "loss": 2.8687, + "theoretical_loss": 3.6866171012350906, + "tokens_seen": 899271680 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036742226680040117, + "loss": 2.8781, + "theoretical_loss": 3.6865910299741715, + "tokens_seen": 899337216 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2148176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0189764499664307, + "objective/train/theoretical_loss": 3.686564961144948, + "objective/train/tokens_used": 919862752, + "theoretical_loss": 3.686564961144948, + "tokens_seen": 899402752 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003674122367101304, + "loss": 2.8658, + "theoretical_loss": 3.686564961144948, + "tokens_seen": 899402752 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003674022066198596, + "loss": 2.9111, + "theoretical_loss": 3.6865388947470152, + "tokens_seen": 899468288 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036739217652958877, + "loss": 2.7939, + "theoretical_loss": 3.6865128307799697, + "tokens_seen": 899533824 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036738214643931795, + "loss": 2.881, + "theoretical_loss": 3.686486769243408, + "tokens_seen": 899599360 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003673721163490472, + "loss": 2.8815, + "theoretical_loss": 3.686460710136926, + "tokens_seen": 899664896 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003673620862587763, + "loss": 2.8283, + "theoretical_loss": 3.68643465346012, + "tokens_seen": 899730432 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036735205616850555, + "loss": 2.8287, + "theoretical_loss": 3.686408599212587, + "tokens_seen": 899795968 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036734202607823467, + "loss": 2.8972, + "theoretical_loss": 3.686382547393923, + "tokens_seen": 899861504 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003673319959879639, + "loss": 2.8466, + "theoretical_loss": 3.6863564980037253, + "tokens_seen": 899927040 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003673219658976931, + "loss": 2.8457, + "theoretical_loss": 3.68633045104159, + "tokens_seen": 899992576 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036731193580742227, + "loss": 2.8723, + "theoretical_loss": 3.686304406507115, + "tokens_seen": 900058112 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036730190571715145, + "loss": 2.8616, + "theoretical_loss": 3.6862783643998966, + "tokens_seen": 900123648 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036729187562688063, + "loss": 2.9343, + "theoretical_loss": 3.686252324719532, + "tokens_seen": 900189184 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003672818455366098, + "loss": 2.9953, + "theoretical_loss": 3.686226287465618, + "tokens_seen": 900254720 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036727181544633905, + "loss": 2.9563, + "theoretical_loss": 3.6862002526377524, + "tokens_seen": 900320256 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003672617853560682, + "loss": 2.9492, + "theoretical_loss": 3.6861742202355328, + "tokens_seen": 900385792 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003672517552657974, + "loss": 2.8636, + "theoretical_loss": 3.686148190258556, + "tokens_seen": 900451328 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036724172517552654, + "loss": 2.8868, + "theoretical_loss": 3.68612216270642, + "tokens_seen": 900516864 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036723169508525577, + "loss": 2.876, + "theoretical_loss": 3.6860961375787222, + "tokens_seen": 900582400 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036722166499498495, + "loss": 2.9487, + "theoretical_loss": 3.6860701148750605, + "tokens_seen": 900647936 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036721163490471414, + "loss": 2.8679, + "theoretical_loss": 3.686044094595033, + "tokens_seen": 900713472 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003672016048144433, + "loss": 2.7794, + "theoretical_loss": 3.686018076738238, + "tokens_seen": 900779008 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036719157472417255, + "loss": 2.7586, + "theoretical_loss": 3.685992061304272, + "tokens_seen": 900844544 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003671815446339017, + "loss": 2.8196, + "theoretical_loss": 3.6859660482927348, + "tokens_seen": 900910080 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003671715145436309, + "loss": 2.8993, + "theoretical_loss": 3.685940037703223, + "tokens_seen": 900975616 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2152027, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8669023513793945, + "objective/train/theoretical_loss": 3.685914029535337, + "objective/train/tokens_used": 921501152, + "theoretical_loss": 3.685914029535337, + "tokens_seen": 901041152 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036716148445336004, + "loss": 2.8771, + "theoretical_loss": 3.685914029535337, + "tokens_seen": 901041152 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003671514543630893, + "loss": 2.9577, + "theoretical_loss": 3.685888023788674, + "tokens_seen": 901106688 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036714142427281846, + "loss": 2.8462, + "theoretical_loss": 3.6858620204628325, + "tokens_seen": 901172224 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036713139418254764, + "loss": 2.8013, + "theoretical_loss": 3.6858360195574114, + "tokens_seen": 901237760 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003671213640922768, + "loss": 2.8629, + "theoretical_loss": 3.685810021072009, + "tokens_seen": 901303296 + }, + { + "epoch": 11.0, + "learning_rate": 0.000367111334002006, + "loss": 2.8505, + "theoretical_loss": 3.6857840250062246, + "tokens_seen": 901368832 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036710130391173524, + "loss": 2.9186, + "theoretical_loss": 3.6857580313596574, + "tokens_seen": 901434368 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003670912738214644, + "loss": 3.0107, + "theoretical_loss": 3.6857320401319056, + "tokens_seen": 901499904 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003670812437311936, + "loss": 2.9683, + "theoretical_loss": 3.6857060513225686, + "tokens_seen": 901565440 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003670712136409228, + "loss": 2.8928, + "theoretical_loss": 3.685680064931246, + "tokens_seen": 901630976 + }, + { + "epoch": 11.0, + "learning_rate": 0.000367061183550652, + "loss": 3.0226, + "theoretical_loss": 3.685654080957536, + "tokens_seen": 901696512 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036705115346038114, + "loss": 2.8874, + "theoretical_loss": 3.6856280994010397, + "tokens_seen": 901762048 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003670411233701104, + "loss": 2.918, + "theoretical_loss": 3.685602120261355, + "tokens_seen": 901827584 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003670310932798395, + "loss": 2.9412, + "theoretical_loss": 3.685576143538082, + "tokens_seen": 901893120 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036702106318956874, + "loss": 2.9155, + "theoretical_loss": 3.6855501692308215, + "tokens_seen": 901958656 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003670110330992979, + "loss": 2.9911, + "theoretical_loss": 3.6855241973391712, + "tokens_seen": 902024192 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003670010030090271, + "loss": 2.7717, + "theoretical_loss": 3.685498227862733, + "tokens_seen": 902089728 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003669909729187563, + "loss": 3.0085, + "theoretical_loss": 3.685472260801105, + "tokens_seen": 902155264 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036698094282848546, + "loss": 2.8162, + "theoretical_loss": 3.6854462961538887, + "tokens_seen": 902220800 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036697091273821464, + "loss": 2.8464, + "theoretical_loss": 3.6854203339206837, + "tokens_seen": 902286336 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003669608826479439, + "loss": 2.9604, + "theoretical_loss": 3.68539437410109, + "tokens_seen": 902351872 + }, + { + "epoch": 11.0, + "learning_rate": 0.000366950852557673, + "loss": 2.869, + "theoretical_loss": 3.6853684166947085, + "tokens_seen": 902417408 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036694082246740224, + "loss": 2.7485, + "theoretical_loss": 3.685342461701139, + "tokens_seen": 902482944 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036693079237713137, + "loss": 2.8894, + "theoretical_loss": 3.6853165091199824, + "tokens_seen": 902548480 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003669207622868606, + "loss": 2.9099, + "theoretical_loss": 3.6852905589508396, + "tokens_seen": 902614016 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2156627, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7554752826690674, + "objective/train/theoretical_loss": 3.685264611193311, + "objective/train/tokens_used": 923139552, + "theoretical_loss": 3.685264611193311, + "tokens_seen": 902679552 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003669107321965898, + "loss": 2.822, + "theoretical_loss": 3.685264611193311, + "tokens_seen": 902679552 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036690070210631897, + "loss": 2.8704, + "theoretical_loss": 3.6852386658469976, + "tokens_seen": 902745088 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036689067201604815, + "loss": 2.8285, + "theoretical_loss": 3.6852127229115, + "tokens_seen": 902810624 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003668806419257774, + "loss": 2.7794, + "theoretical_loss": 3.6851867823864195, + "tokens_seen": 902876160 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003668706118355065, + "loss": 2.8469, + "theoretical_loss": 3.6851608442713566, + "tokens_seen": 902941696 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036686058174523575, + "loss": 2.8819, + "theoretical_loss": 3.6851349085659137, + "tokens_seen": 903007232 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036685055165496487, + "loss": 2.8647, + "theoretical_loss": 3.685108975269691, + "tokens_seen": 903072768 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003668405215646941, + "loss": 2.8735, + "theoretical_loss": 3.6850830443822904, + "tokens_seen": 903138304 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003668304914744233, + "loss": 3.0071, + "theoretical_loss": 3.685057115903313, + "tokens_seen": 903203840 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036682046138415247, + "loss": 2.7837, + "theoretical_loss": 3.685031189832361, + "tokens_seen": 903269376 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036681043129388165, + "loss": 2.9063, + "theoretical_loss": 3.6850052661690356, + "tokens_seen": 903334912 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036680040120361083, + "loss": 2.895, + "theoretical_loss": 3.6849793449129384, + "tokens_seen": 903400448 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036679037111334, + "loss": 2.8965, + "theoretical_loss": 3.6849534260636716, + "tokens_seen": 903465984 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036678034102306925, + "loss": 2.9949, + "theoretical_loss": 3.684927509620837, + "tokens_seen": 903531520 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003667703109327984, + "loss": 2.9569, + "theoretical_loss": 3.6849015955840376, + "tokens_seen": 903597056 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003667602808425276, + "loss": 2.9362, + "theoretical_loss": 3.684875683952874, + "tokens_seen": 903662592 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036675025075225674, + "loss": 2.8943, + "theoretical_loss": 3.684849774726949, + "tokens_seen": 903728128 + }, + { + "epoch": 11.0, + "learning_rate": 0.000366740220661986, + "loss": 2.9138, + "theoretical_loss": 3.6848238679058656, + "tokens_seen": 903793664 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036673019057171515, + "loss": 2.888, + "theoretical_loss": 3.684797963489226, + "tokens_seen": 903859200 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036672016048144434, + "loss": 2.9846, + "theoretical_loss": 3.6847720614766315, + "tokens_seen": 903924736 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003667101303911735, + "loss": 2.8632, + "theoretical_loss": 3.684746161867686, + "tokens_seen": 903990272 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036670010030090275, + "loss": 2.8693, + "theoretical_loss": 3.6847202646619923, + "tokens_seen": 904055808 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003666900702106319, + "loss": 2.8459, + "theoretical_loss": 3.6846943698591526, + "tokens_seen": 904121344 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003666800401203611, + "loss": 2.8885, + "theoretical_loss": 3.6846684774587697, + "tokens_seen": 904186880 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036667001003009024, + "loss": 2.84, + "theoretical_loss": 3.684642587460447, + "tokens_seen": 904252416 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2159749, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8529579639434814, + "objective/train/theoretical_loss": 3.6846166998637875, + "objective/train/tokens_used": 924777952, + "theoretical_loss": 3.6846166998637875, + "tokens_seen": 904317952 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003666599799398195, + "loss": 2.8097, + "theoretical_loss": 3.6846166998637875, + "tokens_seen": 904317952 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036664994984954866, + "loss": 2.8785, + "theoretical_loss": 3.684590814668394, + "tokens_seen": 904383488 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036663991975927784, + "loss": 3.0092, + "theoretical_loss": 3.6845649318738714, + "tokens_seen": 904449024 + }, + { + "epoch": 11.0, + "learning_rate": 0.000366629889669007, + "loss": 2.9291, + "theoretical_loss": 3.6845390514798204, + "tokens_seen": 904514560 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003666198595787362, + "loss": 2.9797, + "theoretical_loss": 3.684513173485847, + "tokens_seen": 904580096 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003666098294884654, + "loss": 2.8823, + "theoretical_loss": 3.684487297891553, + "tokens_seen": 904645632 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003665997993981946, + "loss": 3.0224, + "theoretical_loss": 3.684461424696543, + "tokens_seen": 904711168 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036658976930792374, + "loss": 3.0074, + "theoretical_loss": 3.6844355539004203, + "tokens_seen": 904776704 + }, + { + "epoch": 11.0, + "learning_rate": 0.000366579739217653, + "loss": 2.8946, + "theoretical_loss": 3.6844096855027892, + "tokens_seen": 904842240 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036656970912738216, + "loss": 2.9065, + "theoretical_loss": 3.684383819503253, + "tokens_seen": 904907776 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036655967903711134, + "loss": 2.9363, + "theoretical_loss": 3.684357955901416, + "tokens_seen": 904973312 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003665496489468405, + "loss": 2.8764, + "theoretical_loss": 3.6843320946968827, + "tokens_seen": 905038848 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003665396188565697, + "loss": 2.8473, + "theoretical_loss": 3.6843062358892578, + "tokens_seen": 905104384 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003665295887662989, + "loss": 2.9312, + "theoretical_loss": 3.6842803794781434, + "tokens_seen": 905169920 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003665195586760281, + "loss": 2.7456, + "theoretical_loss": 3.6842545254631465, + "tokens_seen": 905235456 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036650952858575725, + "loss": 2.9008, + "theoretical_loss": 3.68422867384387, + "tokens_seen": 905300992 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003664994984954865, + "loss": 2.8107, + "theoretical_loss": 3.684202824619919, + "tokens_seen": 905366528 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003664894684052156, + "loss": 2.8806, + "theoretical_loss": 3.6841769777908984, + "tokens_seen": 905432064 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036647943831494484, + "loss": 2.8143, + "theoretical_loss": 3.684151133356413, + "tokens_seen": 905497600 + }, + { + "epoch": 11.0, + "learning_rate": 0.000366469408224674, + "loss": 2.9667, + "theoretical_loss": 3.6841252913160667, + "tokens_seen": 905563136 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003664593781344032, + "loss": 2.8183, + "theoretical_loss": 3.6840994516694656, + "tokens_seen": 905628672 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003664493480441324, + "loss": 2.8579, + "theoretical_loss": 3.6840736144162145, + "tokens_seen": 905694208 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036643931795386157, + "loss": 2.8754, + "theoretical_loss": 3.684047779555918, + "tokens_seen": 905759744 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036642928786359075, + "loss": 2.9078, + "theoretical_loss": 3.6840219470881825, + "tokens_seen": 905825280 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036641925777332, + "loss": 2.9592, + "theoretical_loss": 3.683996117012612, + "tokens_seen": 905890816 + }, + { + "epoch": 11.0, + "objective/train/docs_used": 2164390, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9633588790893555, + "objective/train/theoretical_loss": 3.683970289328813, + "objective/train/tokens_used": 926416352, + "theoretical_loss": 3.683970289328813, + "tokens_seen": 905956352 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003664092276830491, + "loss": 2.9203, + "theoretical_loss": 3.683970289328813, + "tokens_seen": 905956352 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036639919759277835, + "loss": 2.8962, + "theoretical_loss": 3.6839444640363905, + "tokens_seen": 906021888 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036638916750250753, + "loss": 2.8354, + "theoretical_loss": 3.68391864113495, + "tokens_seen": 906087424 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003663791374122367, + "loss": 2.7702, + "theoretical_loss": 3.6838928206240977, + "tokens_seen": 906152960 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003663691073219659, + "loss": 2.9866, + "theoretical_loss": 3.6838670025034395, + "tokens_seen": 906218496 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036635907723169507, + "loss": 2.82, + "theoretical_loss": 3.6838411867725807, + "tokens_seen": 906284032 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003663490471414243, + "loss": 3.004, + "theoretical_loss": 3.683815373431128, + "tokens_seen": 906349568 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003663390170511535, + "loss": 2.8858, + "theoretical_loss": 3.6837895624786867, + "tokens_seen": 906415104 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036632898696088267, + "loss": 2.9804, + "theoretical_loss": 3.6837637539148633, + "tokens_seen": 906480640 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036631895687061185, + "loss": 2.7914, + "theoretical_loss": 3.6837379477392647, + "tokens_seen": 906546176 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036630892678034103, + "loss": 2.9081, + "theoretical_loss": 3.6837121439514964, + "tokens_seen": 906611712 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003662988966900702, + "loss": 2.8654, + "theoretical_loss": 3.683686342551165, + "tokens_seen": 906677248 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036628886659979945, + "loss": 2.8288, + "theoretical_loss": 3.6836605435378775, + "tokens_seen": 906742784 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003662788365095286, + "loss": 2.9297, + "theoretical_loss": 3.683634746911241, + "tokens_seen": 906808320 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003662688064192578, + "loss": 2.9156, + "theoretical_loss": 3.683608952670861, + "tokens_seen": 906873856 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036625877632898694, + "loss": 2.8838, + "theoretical_loss": 3.6835831608163447, + "tokens_seen": 906939392 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003662487462387162, + "loss": 2.9308, + "theoretical_loss": 3.6835573713473, + "tokens_seen": 907004928 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036623871614844535, + "loss": 2.8769, + "theoretical_loss": 3.6835315842633323, + "tokens_seen": 907070464 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036622868605817454, + "loss": 2.8989, + "theoretical_loss": 3.6835057995640503, + "tokens_seen": 907136000 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003662186559679037, + "loss": 2.7932, + "theoretical_loss": 3.6834800172490603, + "tokens_seen": 907201536 + }, + { + "epoch": 11.0, + "learning_rate": 0.00036620862587763295, + "loss": 2.9122, + "theoretical_loss": 3.68345423731797, + "tokens_seen": 907267072 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003661985957873621, + "loss": 2.8728, + "theoretical_loss": 3.683428459770387, + "tokens_seen": 907332608 + }, + { + "epoch": 11.0, + "learning_rate": 0.0003661885656970913, + "loss": 2.8644, + "theoretical_loss": 3.683402684605918, + "tokens_seen": 907398144 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036617853560682044, + "loss": 2.8251, + "theoretical_loss": 3.6833769118241713, + "tokens_seen": 907463680 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003661685055165497, + "loss": 2.9204, + "theoretical_loss": 3.683351141424754, + "tokens_seen": 907529216 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2167376, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9454479217529297, + "objective/train/theoretical_loss": 3.683325373407275, + "objective/train/tokens_used": 928054752, + "theoretical_loss": 3.683325373407275, + "tokens_seen": 907594752 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036615847542627886, + "loss": 2.9198, + "theoretical_loss": 3.683325373407275, + "tokens_seen": 907594752 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036614844533600804, + "loss": 2.8822, + "theoretical_loss": 3.6832996077713407, + "tokens_seen": 907660288 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003661384152457372, + "loss": 2.8482, + "theoretical_loss": 3.6832738445165596, + "tokens_seen": 907725824 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003661283851554664, + "loss": 2.9182, + "theoretical_loss": 3.68324808364254, + "tokens_seen": 907791360 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003661183550651956, + "loss": 2.9798, + "theoretical_loss": 3.6832223251488903, + "tokens_seen": 907856896 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003661083249749248, + "loss": 2.9831, + "theoretical_loss": 3.6831965690352177, + "tokens_seen": 907922432 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036609829488465394, + "loss": 2.9682, + "theoretical_loss": 3.683170815301132, + "tokens_seen": 907987968 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003660882647943832, + "loss": 2.9321, + "theoretical_loss": 3.683145063946241, + "tokens_seen": 908053504 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036607823470411236, + "loss": 2.9081, + "theoretical_loss": 3.683119314970152, + "tokens_seen": 908119040 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036606820461384154, + "loss": 2.8812, + "theoretical_loss": 3.6830935683724753, + "tokens_seen": 908184576 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003660581745235707, + "loss": 2.968, + "theoretical_loss": 3.683067824152819, + "tokens_seen": 908250112 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003660481444332999, + "loss": 2.8847, + "theoretical_loss": 3.6830420823107914, + "tokens_seen": 908315648 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003660381143430291, + "loss": 2.8584, + "theoretical_loss": 3.6830163428460025, + "tokens_seen": 908381184 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003660280842527583, + "loss": 2.834, + "theoretical_loss": 3.6829906057580604, + "tokens_seen": 908446720 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036601805416248745, + "loss": 3.0175, + "theoretical_loss": 3.682964871046574, + "tokens_seen": 908512256 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003660080240722167, + "loss": 2.838, + "theoretical_loss": 3.6829391387111534, + "tokens_seen": 908577792 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003659979939819458, + "loss": 2.9066, + "theoretical_loss": 3.682913408751407, + "tokens_seen": 908643328 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036598796389167504, + "loss": 2.8804, + "theoretical_loss": 3.682887681166944, + "tokens_seen": 908708864 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003659779338014042, + "loss": 2.9555, + "theoretical_loss": 3.6828619559573754, + "tokens_seen": 908774400 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003659679037111334, + "loss": 2.8565, + "theoretical_loss": 3.682836233122309, + "tokens_seen": 908839936 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003659578736208626, + "loss": 2.9475, + "theoretical_loss": 3.682810512661355, + "tokens_seen": 908905472 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036594784353059177, + "loss": 2.8012, + "theoretical_loss": 3.6827847945741237, + "tokens_seen": 908971008 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036593781344032095, + "loss": 2.8556, + "theoretical_loss": 3.682759078860224, + "tokens_seen": 909036544 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003659277833500502, + "loss": 2.9742, + "theoretical_loss": 3.6827333655192662, + "tokens_seen": 909102080 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003659177532597793, + "loss": 2.9698, + "theoretical_loss": 3.6827076545508604, + "tokens_seen": 909167616 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2171106, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9373836517333984, + "objective/train/theoretical_loss": 3.6826819459546165, + "objective/train/tokens_used": 929693152, + "theoretical_loss": 3.6826819459546165, + "tokens_seen": 909233152 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036590772316950855, + "loss": 2.9209, + "theoretical_loss": 3.6826819459546165, + "tokens_seen": 909233152 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036589769307923773, + "loss": 2.8743, + "theoretical_loss": 3.6826562397301448, + "tokens_seen": 909298688 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003658876629889669, + "loss": 2.9045, + "theoretical_loss": 3.6826305358770552, + "tokens_seen": 909364224 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003658776328986961, + "loss": 2.8775, + "theoretical_loss": 3.682604834394959, + "tokens_seen": 909429760 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036586760280842527, + "loss": 2.8978, + "theoretical_loss": 3.682579135283466, + "tokens_seen": 909495296 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036585757271815445, + "loss": 2.8503, + "theoretical_loss": 3.682553438542186, + "tokens_seen": 909560832 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003658475426278837, + "loss": 2.8149, + "theoretical_loss": 3.6825277441707316, + "tokens_seen": 909626368 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003658375125376128, + "loss": 2.9099, + "theoretical_loss": 3.6825020521687115, + "tokens_seen": 909691904 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036582748244734205, + "loss": 2.943, + "theoretical_loss": 3.6824763625357377, + "tokens_seen": 909757440 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003658174523570712, + "loss": 2.9025, + "theoretical_loss": 3.6824506752714212, + "tokens_seen": 909822976 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003658074222668004, + "loss": 2.8886, + "theoretical_loss": 3.682424990375372, + "tokens_seen": 909888512 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003657973921765296, + "loss": 2.9189, + "theoretical_loss": 3.682399307847202, + "tokens_seen": 909954048 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003657873620862588, + "loss": 2.9543, + "theoretical_loss": 3.6823736276865224, + "tokens_seen": 910019584 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036577733199598796, + "loss": 2.9441, + "theoretical_loss": 3.682347949892945, + "tokens_seen": 910085120 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036576730190571714, + "loss": 2.9044, + "theoretical_loss": 3.6823222744660793, + "tokens_seen": 910150656 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003657572718154463, + "loss": 2.8649, + "theoretical_loss": 3.6822966014055387, + "tokens_seen": 910216192 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036574724172517555, + "loss": 2.8988, + "theoretical_loss": 3.682270930710933, + "tokens_seen": 910281728 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003657372116349047, + "loss": 2.9759, + "theoretical_loss": 3.682245262381876, + "tokens_seen": 910347264 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003657271815446339, + "loss": 2.8502, + "theoretical_loss": 3.682219596417978, + "tokens_seen": 910412800 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003657171514543631, + "loss": 2.9663, + "theoretical_loss": 3.6821939328188513, + "tokens_seen": 910478336 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003657071213640923, + "loss": 2.8362, + "theoretical_loss": 3.682168271584107, + "tokens_seen": 910543872 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036569709127382146, + "loss": 2.9118, + "theoretical_loss": 3.682142612713358, + "tokens_seen": 910609408 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036568706118355064, + "loss": 2.8794, + "theoretical_loss": 3.6821169562062166, + "tokens_seen": 910674944 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003656770310932798, + "loss": 2.7627, + "theoretical_loss": 3.682091302062294, + "tokens_seen": 910740480 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036566700100300906, + "loss": 2.9654, + "theoretical_loss": 3.6820656502812037, + "tokens_seen": 910806016 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2176016, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.864053249359131, + "objective/train/theoretical_loss": 3.682040000862558, + "objective/train/tokens_used": 931331552, + "theoretical_loss": 3.682040000862558, + "tokens_seen": 910871552 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003656569709127382, + "loss": 2.8236, + "theoretical_loss": 3.682040000862558, + "tokens_seen": 910871552 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003656469408224674, + "loss": 2.9999, + "theoretical_loss": 3.6820143538059673, + "tokens_seen": 910937088 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036563691073219655, + "loss": 2.9052, + "theoretical_loss": 3.681988709111047, + "tokens_seen": 911002624 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003656268806419258, + "loss": 2.9512, + "theoretical_loss": 3.681963066777408, + "tokens_seen": 911068160 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036561685055165496, + "loss": 2.8307, + "theoretical_loss": 3.681937426804663, + "tokens_seen": 911133696 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036560682046138414, + "loss": 2.9038, + "theoretical_loss": 3.681911789192426, + "tokens_seen": 911199232 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003655967903711134, + "loss": 2.8317, + "theoretical_loss": 3.681886153940309, + "tokens_seen": 911264768 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036558676028084256, + "loss": 2.9432, + "theoretical_loss": 3.681860521047926, + "tokens_seen": 911330304 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036557673019057174, + "loss": 2.9366, + "theoretical_loss": 3.6818348905148888, + "tokens_seen": 911395840 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003655667001003009, + "loss": 2.8927, + "theoretical_loss": 3.6818092623408116, + "tokens_seen": 911461376 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003655566700100301, + "loss": 2.8708, + "theoretical_loss": 3.6817836365253065, + "tokens_seen": 911526912 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003655466399197593, + "loss": 3.0318, + "theoretical_loss": 3.6817580130679888, + "tokens_seen": 911592448 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003655366098294885, + "loss": 2.9486, + "theoretical_loss": 3.6817323919684704, + "tokens_seen": 911657984 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036552657973921765, + "loss": 2.9628, + "theoretical_loss": 3.6817067732263653, + "tokens_seen": 911723520 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003655165496489469, + "loss": 2.7766, + "theoretical_loss": 3.6816811568412877, + "tokens_seen": 911789056 + }, + { + "epoch": 11.01, + "learning_rate": 0.000365506519558676, + "loss": 2.918, + "theoretical_loss": 3.68165554281285, + "tokens_seen": 911854592 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036549648946840525, + "loss": 2.9284, + "theoretical_loss": 3.6816299311406677, + "tokens_seen": 911920128 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003654864593781344, + "loss": 2.9589, + "theoretical_loss": 3.681604321824354, + "tokens_seen": 911985664 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003654764292878636, + "loss": 2.9211, + "theoretical_loss": 3.681578714863522, + "tokens_seen": 912051200 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003654663991975928, + "loss": 2.9617, + "theoretical_loss": 3.681553110257788, + "tokens_seen": 912116736 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036545636910732197, + "loss": 2.9648, + "theoretical_loss": 3.681527508006764, + "tokens_seen": 912182272 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036544633901705115, + "loss": 2.9146, + "theoretical_loss": 3.6815019081100657, + "tokens_seen": 912247808 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003654363089267804, + "loss": 2.8917, + "theoretical_loss": 3.681476310567307, + "tokens_seen": 912313344 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003654262788365095, + "loss": 2.8857, + "theoretical_loss": 3.6814507153781024, + "tokens_seen": 912378880 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036541624874623875, + "loss": 2.8513, + "theoretical_loss": 3.681425122542066, + "tokens_seen": 912444416 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2178998, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8429486751556396, + "objective/train/theoretical_loss": 3.6813995320588133, + "objective/train/tokens_used": 932969952, + "theoretical_loss": 3.6813995320588133, + "tokens_seen": 912509952 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036540621865596793, + "loss": 2.8087, + "theoretical_loss": 3.6813995320588133, + "tokens_seen": 912509952 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003653961885656971, + "loss": 2.8447, + "theoretical_loss": 3.6813739439279587, + "tokens_seen": 912575488 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003653861584754263, + "loss": 2.9189, + "theoretical_loss": 3.681348358149117, + "tokens_seen": 912641024 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036537612838515547, + "loss": 2.9575, + "theoretical_loss": 3.6813227747219033, + "tokens_seen": 912706560 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036536609829488465, + "loss": 2.9157, + "theoretical_loss": 3.681297193645932, + "tokens_seen": 912772096 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003653560682046139, + "loss": 2.9402, + "theoretical_loss": 3.681271614920819, + "tokens_seen": 912837632 + }, + { + "epoch": 11.01, + "learning_rate": 0.000365346038114343, + "loss": 2.8936, + "theoretical_loss": 3.6812460385461794, + "tokens_seen": 912903168 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036533600802407225, + "loss": 2.837, + "theoretical_loss": 3.681220464521628, + "tokens_seen": 912968704 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003653259779338014, + "loss": 2.9127, + "theoretical_loss": 3.68119489284678, + "tokens_seen": 913034240 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003653159478435306, + "loss": 2.8659, + "theoretical_loss": 3.681169323521252, + "tokens_seen": 913099776 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003653059177532598, + "loss": 2.9145, + "theoretical_loss": 3.6811437565446594, + "tokens_seen": 913165312 + }, + { + "epoch": 11.01, + "learning_rate": 0.000365295887662989, + "loss": 2.9169, + "theoretical_loss": 3.6811181919166165, + "tokens_seen": 913230848 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036528585757271816, + "loss": 2.919, + "theoretical_loss": 3.6810926296367406, + "tokens_seen": 913296384 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036527582748244734, + "loss": 2.8704, + "theoretical_loss": 3.6810670697046466, + "tokens_seen": 913361920 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003652657973921765, + "loss": 2.9369, + "theoretical_loss": 3.6810415121199505, + "tokens_seen": 913427456 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036525576730190575, + "loss": 2.9321, + "theoretical_loss": 3.681015956882269, + "tokens_seen": 913492992 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003652457372116349, + "loss": 2.9202, + "theoretical_loss": 3.680990403991218, + "tokens_seen": 913558528 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003652357071213641, + "loss": 2.8602, + "theoretical_loss": 3.6809648534464126, + "tokens_seen": 913624064 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003652256770310933, + "loss": 2.8375, + "theoretical_loss": 3.6809393052474704, + "tokens_seen": 913689600 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003652156469408225, + "loss": 2.82, + "theoretical_loss": 3.680913759394007, + "tokens_seen": 913755136 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036520561685055166, + "loss": 2.9005, + "theoretical_loss": 3.6808882158856395, + "tokens_seen": 913820672 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036519558676028084, + "loss": 2.855, + "theoretical_loss": 3.6808626747219844, + "tokens_seen": 913886208 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036518555667001, + "loss": 2.9595, + "theoretical_loss": 3.6808371359026575, + "tokens_seen": 913951744 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036517552657973926, + "loss": 2.9181, + "theoretical_loss": 3.680811599427277, + "tokens_seen": 914017280 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003651654964894684, + "loss": 2.7802, + "theoretical_loss": 3.6807860652954583, + "tokens_seen": 914082816 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2183908, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.956984043121338, + "objective/train/theoretical_loss": 3.680760533506819, + "objective/train/tokens_used": 934608352, + "theoretical_loss": 3.680760533506819, + "tokens_seen": 914148352 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003651554663991976, + "loss": 2.9483, + "theoretical_loss": 3.680760533506819, + "tokens_seen": 914148352 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036514543630892675, + "loss": 2.9032, + "theoretical_loss": 3.680735004060976, + "tokens_seen": 914213888 + }, + { + "epoch": 11.01, + "learning_rate": 0.000365135406218656, + "loss": 2.9605, + "theoretical_loss": 3.6807094769575466, + "tokens_seen": 914279424 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036512537612838516, + "loss": 2.908, + "theoretical_loss": 3.6806839521961474, + "tokens_seen": 914344960 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036511534603811434, + "loss": 2.9337, + "theoretical_loss": 3.6806584297763965, + "tokens_seen": 914410496 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003651053159478435, + "loss": 2.9784, + "theoretical_loss": 3.680632909697911, + "tokens_seen": 914476032 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036509528585757276, + "loss": 2.8103, + "theoretical_loss": 3.680607391960308, + "tokens_seen": 914541568 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003650852557673019, + "loss": 2.9281, + "theoretical_loss": 3.6805818765632052, + "tokens_seen": 914607104 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003650752256770311, + "loss": 2.9857, + "theoretical_loss": 3.6805563635062204, + "tokens_seen": 914672640 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036506519558676025, + "loss": 2.926, + "theoretical_loss": 3.680530852788971, + "tokens_seen": 914738176 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003650551654964895, + "loss": 2.926, + "theoretical_loss": 3.680505344411076, + "tokens_seen": 914803712 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036504513540621867, + "loss": 3.034, + "theoretical_loss": 3.6804798383721513, + "tokens_seen": 914869248 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036503510531594785, + "loss": 2.8874, + "theoretical_loss": 3.6804543346718166, + "tokens_seen": 914934784 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036502507522567703, + "loss": 3.0411, + "theoretical_loss": 3.6804288333096897, + "tokens_seen": 915000320 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003650150451354062, + "loss": 2.9997, + "theoretical_loss": 3.680403334285388, + "tokens_seen": 915065856 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003650050150451354, + "loss": 2.8177, + "theoretical_loss": 3.68037783759853, + "tokens_seen": 915131392 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003649949849548646, + "loss": 2.8351, + "theoretical_loss": 3.6803523432487353, + "tokens_seen": 915196928 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036498495486459375, + "loss": 2.9619, + "theoretical_loss": 3.6803268512356206, + "tokens_seen": 915262464 + }, + { + "epoch": 11.01, + "learning_rate": 0.000364974924774323, + "loss": 2.8958, + "theoretical_loss": 3.680301361558805, + "tokens_seen": 915328000 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003649648946840521, + "loss": 2.9349, + "theoretical_loss": 3.680275874217908, + "tokens_seen": 915393536 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036495486459378135, + "loss": 2.9925, + "theoretical_loss": 3.6802503892125475, + "tokens_seen": 915459072 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036494483450351053, + "loss": 2.9215, + "theoretical_loss": 3.6802249065423425, + "tokens_seen": 915524608 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003649348044132397, + "loss": 2.8617, + "theoretical_loss": 3.6801994262069115, + "tokens_seen": 915590144 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003649247743229689, + "loss": 2.8723, + "theoretical_loss": 3.680173948205874, + "tokens_seen": 915655680 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036491474423269813, + "loss": 2.7764, + "theoretical_loss": 3.680148472538849, + "tokens_seen": 915721216 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2186806, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8205175399780273, + "objective/train/theoretical_loss": 3.6801229992054556, + "objective/train/tokens_used": 936246752, + "theoretical_loss": 3.6801229992054556, + "tokens_seen": 915786752 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036490471414242726, + "loss": 2.8683, + "theoretical_loss": 3.6801229992054556, + "tokens_seen": 915786752 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003648946840521565, + "loss": 2.988, + "theoretical_loss": 3.680097528205313, + "tokens_seen": 915852288 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003648846539618856, + "loss": 3.0117, + "theoretical_loss": 3.6800720595380407, + "tokens_seen": 915917824 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036487462387161485, + "loss": 2.9107, + "theoretical_loss": 3.680046593203258, + "tokens_seen": 915983360 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003648645937813441, + "loss": 2.8594, + "theoretical_loss": 3.6800211292005844, + "tokens_seen": 916048896 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003648545636910732, + "loss": 2.9939, + "theoretical_loss": 3.67999566752964, + "tokens_seen": 916114432 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036484453360080245, + "loss": 2.9155, + "theoretical_loss": 3.6799702081900434, + "tokens_seen": 916179968 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003648345035105316, + "loss": 2.9179, + "theoretical_loss": 3.6799447511814156, + "tokens_seen": 916245504 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003648244734202608, + "loss": 2.9639, + "theoretical_loss": 3.6799192965033756, + "tokens_seen": 916311040 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036481444332999, + "loss": 2.8605, + "theoretical_loss": 3.6798938441555435, + "tokens_seen": 916376576 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003648044132397192, + "loss": 2.8498, + "theoretical_loss": 3.6798683941375403, + "tokens_seen": 916442112 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036479438314944836, + "loss": 2.9192, + "theoretical_loss": 3.679842946448985, + "tokens_seen": 916507648 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036478435305917754, + "loss": 2.8597, + "theoretical_loss": 3.679817501089498, + "tokens_seen": 916573184 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003647743229689067, + "loss": 2.7883, + "theoretical_loss": 3.6797920580587, + "tokens_seen": 916638720 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036476429287863595, + "loss": 2.9571, + "theoretical_loss": 3.6797666173562114, + "tokens_seen": 916704256 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003647542627883651, + "loss": 2.8431, + "theoretical_loss": 3.679741178981653, + "tokens_seen": 916769792 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003647442326980943, + "loss": 2.9309, + "theoretical_loss": 3.6797157429346448, + "tokens_seen": 916835328 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003647342026078235, + "loss": 2.863, + "theoretical_loss": 3.6796903092148074, + "tokens_seen": 916900864 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003647241725175527, + "loss": 2.9059, + "theoretical_loss": 3.679664877821762, + "tokens_seen": 916966400 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036471414242728186, + "loss": 2.9573, + "theoretical_loss": 3.67963944875513, + "tokens_seen": 917031936 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036470411233701104, + "loss": 3.0503, + "theoretical_loss": 3.6796140220145306, + "tokens_seen": 917097472 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003646940822467402, + "loss": 2.9326, + "theoretical_loss": 3.6795885975995866, + "tokens_seen": 917163008 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036468405215646946, + "loss": 2.9198, + "theoretical_loss": 3.6795631755099176, + "tokens_seen": 917228544 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003646740220661986, + "loss": 2.9935, + "theoretical_loss": 3.6795377557451463, + "tokens_seen": 917294080 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003646639919759278, + "loss": 2.9294, + "theoretical_loss": 3.6795123383048933, + "tokens_seen": 917359616 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2190603, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0161194801330566, + "objective/train/theoretical_loss": 3.6794869231887803, + "objective/train/tokens_used": 937885152, + "theoretical_loss": 3.6794869231887803, + "tokens_seen": 917425152 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036465396188565695, + "loss": 2.9374, + "theoretical_loss": 3.6794869231887803, + "tokens_seen": 917425152 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003646439317953862, + "loss": 2.831, + "theoretical_loss": 3.679461510396428, + "tokens_seen": 917490688 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036463390170511536, + "loss": 2.9567, + "theoretical_loss": 3.679436099927459, + "tokens_seen": 917556224 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036462387161484454, + "loss": 2.884, + "theoretical_loss": 3.679410691781494, + "tokens_seen": 917621760 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003646138415245737, + "loss": 2.9805, + "theoretical_loss": 3.6793852859581557, + "tokens_seen": 917687296 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036460381143430296, + "loss": 2.967, + "theoretical_loss": 3.679359882457065, + "tokens_seen": 917752832 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003645937813440321, + "loss": 2.8231, + "theoretical_loss": 3.6793344812778446, + "tokens_seen": 917818368 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003645837512537613, + "loss": 2.9679, + "theoretical_loss": 3.679309082420116, + "tokens_seen": 917883904 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036457372116349045, + "loss": 2.8626, + "theoretical_loss": 3.6792836858835014, + "tokens_seen": 917949440 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003645636910732197, + "loss": 2.8478, + "theoretical_loss": 3.6792582916676233, + "tokens_seen": 918014976 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036455366098294887, + "loss": 2.8166, + "theoretical_loss": 3.6792328997721038, + "tokens_seen": 918080512 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036454363089267805, + "loss": 2.8981, + "theoretical_loss": 3.679207510196565, + "tokens_seen": 918146048 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036453360080240723, + "loss": 2.9465, + "theoretical_loss": 3.679182122940629, + "tokens_seen": 918211584 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003645235707121364, + "loss": 2.8956, + "theoretical_loss": 3.67915673800392, + "tokens_seen": 918277120 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003645135406218656, + "loss": 2.9596, + "theoretical_loss": 3.679131355386059, + "tokens_seen": 918342656 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003645035105315948, + "loss": 2.9531, + "theoretical_loss": 3.679105975086669, + "tokens_seen": 918408192 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036449348044132395, + "loss": 2.9466, + "theoretical_loss": 3.679080597105374, + "tokens_seen": 918473728 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003644834503510532, + "loss": 2.9576, + "theoretical_loss": 3.6790552214417955, + "tokens_seen": 918539264 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003644734202607823, + "loss": 2.8874, + "theoretical_loss": 3.6790298480955568, + "tokens_seen": 918604800 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036446339017051155, + "loss": 2.8791, + "theoretical_loss": 3.6790044770662815, + "tokens_seen": 918670336 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036445336008024073, + "loss": 2.7991, + "theoretical_loss": 3.678979108353592, + "tokens_seen": 918735872 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003644433299899699, + "loss": 2.7633, + "theoretical_loss": 3.678953741957112, + "tokens_seen": 918801408 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003644332998996991, + "loss": 2.8476, + "theoretical_loss": 3.678928377876465, + "tokens_seen": 918866944 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036442326980942833, + "loss": 2.9581, + "theoretical_loss": 3.678903016111274, + "tokens_seen": 918932480 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036441323971915746, + "loss": 2.9273, + "theoretical_loss": 3.6788776566611627, + "tokens_seen": 918998016 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2195398, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.092406749725342, + "objective/train/theoretical_loss": 3.6788522995257553, + "objective/train/tokens_used": 939523552, + "theoretical_loss": 3.6788522995257553, + "tokens_seen": 919063552 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003644032096288867, + "loss": 2.9824, + "theoretical_loss": 3.6788522995257553, + "tokens_seen": 919063552 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003643931795386158, + "loss": 2.9249, + "theoretical_loss": 3.6788269447046744, + "tokens_seen": 919129088 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036438314944834505, + "loss": 2.7577, + "theoretical_loss": 3.6788015921975448, + "tokens_seen": 919194624 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036437311935807424, + "loss": 2.8413, + "theoretical_loss": 3.6787762420039893, + "tokens_seen": 919260160 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003643630892678034, + "loss": 2.9138, + "theoretical_loss": 3.678750894123633, + "tokens_seen": 919325696 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003643530591775326, + "loss": 2.9431, + "theoretical_loss": 3.6787255485560992, + "tokens_seen": 919391232 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003643430290872618, + "loss": 2.9302, + "theoretical_loss": 3.678700205301012, + "tokens_seen": 919456768 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036433299899699096, + "loss": 2.9198, + "theoretical_loss": 3.678674864357996, + "tokens_seen": 919522304 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003643229689067202, + "loss": 2.7842, + "theoretical_loss": 3.6786495257266747, + "tokens_seen": 919587840 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003643129388164493, + "loss": 2.8769, + "theoretical_loss": 3.6786241894066736, + "tokens_seen": 919653376 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036430290872617856, + "loss": 2.9311, + "theoretical_loss": 3.6785988553976168, + "tokens_seen": 919718912 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003642928786359077, + "loss": 2.9583, + "theoretical_loss": 3.6785735236991286, + "tokens_seen": 919784448 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003642828485456369, + "loss": 2.9648, + "theoretical_loss": 3.6785481943108342, + "tokens_seen": 919849984 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003642728184553661, + "loss": 2.9494, + "theoretical_loss": 3.6785228672323576, + "tokens_seen": 919915520 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003642627883650953, + "loss": 2.9664, + "theoretical_loss": 3.678497542463324, + "tokens_seen": 919981056 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036425275827482446, + "loss": 2.8922, + "theoretical_loss": 3.678472220003359, + "tokens_seen": 920046592 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003642427281845537, + "loss": 2.8967, + "theoretical_loss": 3.678446899852086, + "tokens_seen": 920112128 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003642326980942828, + "loss": 2.9176, + "theoretical_loss": 3.6784215820091313, + "tokens_seen": 920177664 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036422266800401206, + "loss": 2.8264, + "theoretical_loss": 3.6783962664741203, + "tokens_seen": 920243200 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003642126379137412, + "loss": 2.9109, + "theoretical_loss": 3.6783709532466773, + "tokens_seen": 920308736 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003642026078234704, + "loss": 2.9717, + "theoretical_loss": 3.6783456423264282, + "tokens_seen": 920374272 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003641925777331996, + "loss": 2.9062, + "theoretical_loss": 3.678320333712999, + "tokens_seen": 920439808 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003641825476429288, + "loss": 2.811, + "theoretical_loss": 3.6782950274060138, + "tokens_seen": 920505344 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036417251755265797, + "loss": 2.9611, + "theoretical_loss": 3.678269723405099, + "tokens_seen": 920570880 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036416248746238715, + "loss": 2.7438, + "theoretical_loss": 3.6782444217098806, + "tokens_seen": 920636416 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2198366, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8016464710235596, + "objective/train/theoretical_loss": 3.6782191223199847, + "objective/train/tokens_used": 941161952, + "theoretical_loss": 3.6782191223199847, + "tokens_seen": 920701952 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036415245737211633, + "loss": 2.951, + "theoretical_loss": 3.6782191223199847, + "tokens_seen": 920701952 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036414242728184556, + "loss": 3.0128, + "theoretical_loss": 3.678193825235036, + "tokens_seen": 920767488 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003641323971915747, + "loss": 2.9694, + "theoretical_loss": 3.6781685304546614, + "tokens_seen": 920833024 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003641223671013039, + "loss": 2.9471, + "theoretical_loss": 3.678143237978486, + "tokens_seen": 920898560 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036411233701103316, + "loss": 2.9318, + "theoretical_loss": 3.678117947806138, + "tokens_seen": 920964096 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003641023069207623, + "loss": 2.9465, + "theoretical_loss": 3.678092659937241, + "tokens_seen": 921029632 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003640922768304915, + "loss": 2.939, + "theoretical_loss": 3.678067374371423, + "tokens_seen": 921095168 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036408224674022065, + "loss": 2.8849, + "theoretical_loss": 3.67804209110831, + "tokens_seen": 921160704 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003640722166499499, + "loss": 2.9448, + "theoretical_loss": 3.678016810147528, + "tokens_seen": 921226240 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036406218655967907, + "loss": 2.8824, + "theoretical_loss": 3.677991531488705, + "tokens_seen": 921291776 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036405215646940825, + "loss": 2.969, + "theoretical_loss": 3.677966255131466, + "tokens_seen": 921357312 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036404212637913743, + "loss": 2.9563, + "theoretical_loss": 3.677940981075438, + "tokens_seen": 921422848 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003640320962888666, + "loss": 2.9023, + "theoretical_loss": 3.6779157093202492, + "tokens_seen": 921488384 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003640220661985958, + "loss": 2.957, + "theoretical_loss": 3.677890439865526, + "tokens_seen": 921553920 + }, + { + "epoch": 11.01, + "learning_rate": 0.000364012036108325, + "loss": 2.8995, + "theoretical_loss": 3.6778651727108937, + "tokens_seen": 921619456 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036400200601805415, + "loss": 2.9046, + "theoretical_loss": 3.6778399078559816, + "tokens_seen": 921684992 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003639919759277834, + "loss": 2.8952, + "theoretical_loss": 3.677814645300416, + "tokens_seen": 921750528 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003639819458375125, + "loss": 2.7809, + "theoretical_loss": 3.677789385043824, + "tokens_seen": 921816064 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036397191574724175, + "loss": 2.9392, + "theoretical_loss": 3.6777641270858332, + "tokens_seen": 921881600 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036396188565697093, + "loss": 3.0295, + "theoretical_loss": 3.6777388714260715, + "tokens_seen": 921947136 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003639518555667001, + "loss": 2.8506, + "theoretical_loss": 3.677713618064165, + "tokens_seen": 922012672 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003639418254764293, + "loss": 2.8132, + "theoretical_loss": 3.6776883669997433, + "tokens_seen": 922078208 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036393179538615853, + "loss": 2.9251, + "theoretical_loss": 3.6776631182324326, + "tokens_seen": 922143744 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036392176529588766, + "loss": 2.99, + "theoretical_loss": 3.6776378717618616, + "tokens_seen": 922209280 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003639117352056169, + "loss": 2.9202, + "theoretical_loss": 3.6776126275876573, + "tokens_seen": 922274816 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2201436, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.884345293045044, + "objective/train/theoretical_loss": 3.6775873857094483, + "objective/train/tokens_used": 942800352, + "theoretical_loss": 3.6775873857094483, + "tokens_seen": 922340352 + }, + { + "epoch": 11.01, + "learning_rate": 0.000363901705115346, + "loss": 2.8698, + "theoretical_loss": 3.6775873857094483, + "tokens_seen": 922340352 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036389167502507525, + "loss": 2.882, + "theoretical_loss": 3.6775621461268626, + "tokens_seen": 922405888 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036388164493480444, + "loss": 2.9306, + "theoretical_loss": 3.677536908839528, + "tokens_seen": 922471424 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003638716148445336, + "loss": 2.9491, + "theoretical_loss": 3.6775116738470732, + "tokens_seen": 922536960 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003638615847542628, + "loss": 3.0031, + "theoretical_loss": 3.6774864411491266, + "tokens_seen": 922602496 + }, + { + "epoch": 11.01, + "learning_rate": 0.000363851554663992, + "loss": 2.9977, + "theoretical_loss": 3.6774612107453164, + "tokens_seen": 922668032 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036384152457372116, + "loss": 2.9311, + "theoretical_loss": 3.6774359826352705, + "tokens_seen": 922733568 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003638314944834504, + "loss": 2.8439, + "theoretical_loss": 3.677410756818618, + "tokens_seen": 922799104 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003638214643931795, + "loss": 2.9517, + "theoretical_loss": 3.677385533294988, + "tokens_seen": 922864640 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036381143430290876, + "loss": 2.9018, + "theoretical_loss": 3.6773603120640086, + "tokens_seen": 922930176 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003638014042126379, + "loss": 2.9353, + "theoretical_loss": 3.6773350931253086, + "tokens_seen": 922995712 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003637913741223671, + "loss": 2.8842, + "theoretical_loss": 3.6773098764785175, + "tokens_seen": 923061248 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003637813440320963, + "loss": 2.8729, + "theoretical_loss": 3.677284662123264, + "tokens_seen": 923126784 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003637713139418255, + "loss": 3.0171, + "theoretical_loss": 3.677259450059177, + "tokens_seen": 923192320 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036376128385155466, + "loss": 2.9366, + "theoretical_loss": 3.6772342402858857, + "tokens_seen": 923257856 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003637512537612839, + "loss": 2.8382, + "theoretical_loss": 3.6772090328030194, + "tokens_seen": 923323392 + }, + { + "epoch": 11.01, + "learning_rate": 0.000363741223671013, + "loss": 2.8799, + "theoretical_loss": 3.6771838276102082, + "tokens_seen": 923388928 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036373119358074226, + "loss": 3.0235, + "theoretical_loss": 3.6771586247070807, + "tokens_seen": 923454464 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003637211634904714, + "loss": 2.9476, + "theoretical_loss": 3.6771334240932667, + "tokens_seen": 923520000 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003637111334002006, + "loss": 2.9551, + "theoretical_loss": 3.6771082257683956, + "tokens_seen": 923585536 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003637011033099298, + "loss": 2.9698, + "theoretical_loss": 3.6770830297320973, + "tokens_seen": 923651072 + }, + { + "epoch": 11.01, + "learning_rate": 0.000363691073219659, + "loss": 2.9222, + "theoretical_loss": 3.677057835984001, + "tokens_seen": 923716608 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036368104312938817, + "loss": 2.8333, + "theoretical_loss": 3.677032644523738, + "tokens_seen": 923782144 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036367101303911735, + "loss": 2.9625, + "theoretical_loss": 3.677007455350937, + "tokens_seen": 923847680 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036366098294884653, + "loss": 2.9226, + "theoretical_loss": 3.6769822684652285, + "tokens_seen": 923913216 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2205356, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.095860481262207, + "objective/train/theoretical_loss": 3.6769570838662426, + "objective/train/tokens_used": 944438752, + "theoretical_loss": 3.6769570838662426, + "tokens_seen": 923978752 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036365095285857576, + "loss": 2.8853, + "theoretical_loss": 3.6769570838662426, + "tokens_seen": 923978752 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003636409227683049, + "loss": 2.9439, + "theoretical_loss": 3.6769319015536093, + "tokens_seen": 924044288 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003636308926780341, + "loss": 2.9022, + "theoretical_loss": 3.6769067215269597, + "tokens_seen": 924109824 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036362086258776325, + "loss": 2.8385, + "theoretical_loss": 3.676881543785923, + "tokens_seen": 924175360 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003636108324974925, + "loss": 3.0471, + "theoretical_loss": 3.6768563683301303, + "tokens_seen": 924240896 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036360080240722167, + "loss": 2.7928, + "theoretical_loss": 3.6768311951592123, + "tokens_seen": 924306432 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036359077231695085, + "loss": 2.834, + "theoretical_loss": 3.676806024272799, + "tokens_seen": 924371968 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036358074222668003, + "loss": 3.017, + "theoretical_loss": 3.676780855670522, + "tokens_seen": 924437504 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036357071213640927, + "loss": 2.9654, + "theoretical_loss": 3.676755689352012, + "tokens_seen": 924503040 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003635606820461384, + "loss": 2.8807, + "theoretical_loss": 3.6767305253168994, + "tokens_seen": 924568576 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036355065195586763, + "loss": 2.9574, + "theoretical_loss": 3.676705363564815, + "tokens_seen": 924634112 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036354062186559676, + "loss": 2.8554, + "theoretical_loss": 3.6766802040953905, + "tokens_seen": 924699648 + }, + { + "epoch": 11.01, + "learning_rate": 0.000363530591775326, + "loss": 2.7761, + "theoretical_loss": 3.676655046908257, + "tokens_seen": 924765184 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036352056168505517, + "loss": 3.0141, + "theoretical_loss": 3.6766298920030454, + "tokens_seen": 924830720 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036351053159478435, + "loss": 2.9714, + "theoretical_loss": 3.6766047393793873, + "tokens_seen": 924896256 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036350050150451353, + "loss": 2.9296, + "theoretical_loss": 3.676579589036914, + "tokens_seen": 924961792 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003634904714142427, + "loss": 2.8629, + "theoretical_loss": 3.676554440975257, + "tokens_seen": 925027328 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003634804413239719, + "loss": 2.9106, + "theoretical_loss": 3.6765292951940483, + "tokens_seen": 925092864 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036347041123370113, + "loss": 3.0189, + "theoretical_loss": 3.676504151692919, + "tokens_seen": 925158400 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036346038114343026, + "loss": 2.9084, + "theoretical_loss": 3.6764790104715006, + "tokens_seen": 925223936 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003634503510531595, + "loss": 2.9641, + "theoretical_loss": 3.676453871529426, + "tokens_seen": 925289472 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003634403209628886, + "loss": 2.963, + "theoretical_loss": 3.6764287348663265, + "tokens_seen": 925355008 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036343029087261786, + "loss": 2.8011, + "theoretical_loss": 3.676403600481834, + "tokens_seen": 925420544 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036342026078234704, + "loss": 2.911, + "theoretical_loss": 3.6763784683755807, + "tokens_seen": 925486080 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003634102306920762, + "loss": 2.8215, + "theoretical_loss": 3.6763533385471994, + "tokens_seen": 925551616 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2209839, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9816226959228516, + "objective/train/theoretical_loss": 3.6763282109963216, + "objective/train/tokens_used": 946077152, + "theoretical_loss": 3.6763282109963216, + "tokens_seen": 925617152 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003634002006018054, + "loss": 2.9217, + "theoretical_loss": 3.6763282109963216, + "tokens_seen": 925617152 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036339017051153464, + "loss": 2.794, + "theoretical_loss": 3.6763030857225796, + "tokens_seen": 925682688 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036338014042126376, + "loss": 2.8808, + "theoretical_loss": 3.6762779627256066, + "tokens_seen": 925748224 + }, + { + "epoch": 11.01, + "learning_rate": 0.000363370110330993, + "loss": 2.9682, + "theoretical_loss": 3.6762528420050344, + "tokens_seen": 925813760 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003633600802407222, + "loss": 2.9269, + "theoretical_loss": 3.676227723560496, + "tokens_seen": 925879296 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036335005015045136, + "loss": 2.8923, + "theoretical_loss": 3.6762026073916236, + "tokens_seen": 925944832 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003633400200601806, + "loss": 2.9758, + "theoretical_loss": 3.6761774934980505, + "tokens_seen": 926010368 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003633299899699097, + "loss": 2.9763, + "theoretical_loss": 3.67615238187941, + "tokens_seen": 926075904 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036331995987963896, + "loss": 2.893, + "theoretical_loss": 3.676127272535334, + "tokens_seen": 926141440 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003633099297893681, + "loss": 2.8321, + "theoretical_loss": 3.676102165465456, + "tokens_seen": 926206976 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003632998996990973, + "loss": 2.8615, + "theoretical_loss": 3.6760770606694098, + "tokens_seen": 926272512 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003632898696088265, + "loss": 2.8467, + "theoretical_loss": 3.6760519581468274, + "tokens_seen": 926338048 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003632798395185557, + "loss": 2.7908, + "theoretical_loss": 3.6760268578973427, + "tokens_seen": 926403584 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036326980942828486, + "loss": 2.904, + "theoretical_loss": 3.6760017599205894, + "tokens_seen": 926469120 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003632597793380141, + "loss": 2.9167, + "theoretical_loss": 3.6759766642162006, + "tokens_seen": 926534656 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003632497492477432, + "loss": 3.0099, + "theoretical_loss": 3.67595157078381, + "tokens_seen": 926600192 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036323971915747246, + "loss": 2.9697, + "theoretical_loss": 3.6759264796230506, + "tokens_seen": 926665728 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003632296890672016, + "loss": 2.8688, + "theoretical_loss": 3.675901390733557, + "tokens_seen": 926731264 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003632196589769308, + "loss": 2.8304, + "theoretical_loss": 3.6758763041149627, + "tokens_seen": 926796800 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036320962888666, + "loss": 2.9467, + "theoretical_loss": 3.6758512197669013, + "tokens_seen": 926862336 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003631995987963892, + "loss": 2.9651, + "theoretical_loss": 3.6758261376890076, + "tokens_seen": 926927872 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036318956870611837, + "loss": 2.9206, + "theoretical_loss": 3.6758010578809146, + "tokens_seen": 926993408 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036317953861584755, + "loss": 2.8364, + "theoretical_loss": 3.675775980342257, + "tokens_seen": 927058944 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036316950852557673, + "loss": 2.8938, + "theoretical_loss": 3.675750905072669, + "tokens_seen": 927124480 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036315947843530596, + "loss": 2.9564, + "theoretical_loss": 3.6757258320717847, + "tokens_seen": 927190016 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2213247, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.798551082611084, + "objective/train/theoretical_loss": 3.6757007613392396, + "objective/train/tokens_used": 947715552, + "theoretical_loss": 3.6757007613392396, + "tokens_seen": 927255552 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003631494483450351, + "loss": 2.8752, + "theoretical_loss": 3.6757007613392396, + "tokens_seen": 927255552 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003631394182547643, + "loss": 2.9627, + "theoretical_loss": 3.675675692874666, + "tokens_seen": 927321088 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036312938816449345, + "loss": 2.9317, + "theoretical_loss": 3.6756506266777005, + "tokens_seen": 927386624 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003631193580742227, + "loss": 2.9257, + "theoretical_loss": 3.6756255627479772, + "tokens_seen": 927452160 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036310932798395187, + "loss": 2.966, + "theoretical_loss": 3.67560050108513, + "tokens_seen": 927517696 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036309929789368105, + "loss": 2.9481, + "theoretical_loss": 3.6755754416887947, + "tokens_seen": 927583232 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036308926780341023, + "loss": 2.9847, + "theoretical_loss": 3.6755503845586057, + "tokens_seen": 927648768 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036307923771313947, + "loss": 2.8495, + "theoretical_loss": 3.6755253296941985, + "tokens_seen": 927714304 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003630692076228686, + "loss": 2.8902, + "theoretical_loss": 3.6755002770952077, + "tokens_seen": 927779840 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036305917753259783, + "loss": 2.9663, + "theoretical_loss": 3.675475226761269, + "tokens_seen": 927845376 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036304914744232696, + "loss": 2.94, + "theoretical_loss": 3.6754501786920164, + "tokens_seen": 927910912 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003630391173520562, + "loss": 2.8669, + "theoretical_loss": 3.675425132887087, + "tokens_seen": 927976448 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036302908726178537, + "loss": 2.9211, + "theoretical_loss": 3.6754000893461147, + "tokens_seen": 928041984 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036301905717151455, + "loss": 2.7874, + "theoretical_loss": 3.6753750480687355, + "tokens_seen": 928107520 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036300902708124374, + "loss": 2.9167, + "theoretical_loss": 3.6753500090545854, + "tokens_seen": 928173056 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003629989969909729, + "loss": 2.863, + "theoretical_loss": 3.6753249723033, + "tokens_seen": 928238592 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003629889669007021, + "loss": 2.8994, + "theoretical_loss": 3.675299937814515, + "tokens_seen": 928304128 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036297893681043133, + "loss": 2.9031, + "theoretical_loss": 3.6752749055878655, + "tokens_seen": 928369664 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036296890672016046, + "loss": 2.96, + "theoretical_loss": 3.6752498756229883, + "tokens_seen": 928435200 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003629588766298897, + "loss": 2.9481, + "theoretical_loss": 3.6752248479195186, + "tokens_seen": 928500736 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003629488465396188, + "loss": 2.7802, + "theoretical_loss": 3.6751998224770936, + "tokens_seen": 928566272 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036293881644934806, + "loss": 3.0113, + "theoretical_loss": 3.6751747992953483, + "tokens_seen": 928631808 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036292878635907724, + "loss": 2.9516, + "theoretical_loss": 3.67514977837392, + "tokens_seen": 928697344 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003629187562688064, + "loss": 2.9703, + "theoretical_loss": 3.675124759712445, + "tokens_seen": 928762880 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003629087261785356, + "loss": 2.9868, + "theoretical_loss": 3.6750997433105583, + "tokens_seen": 928828416 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2217957, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.896436929702759, + "objective/train/theoretical_loss": 3.675074729167898, + "objective/train/tokens_used": 949353952, + "theoretical_loss": 3.675074729167898, + "tokens_seen": 928893952 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036289869608826484, + "loss": 2.8422, + "theoretical_loss": 3.675074729167898, + "tokens_seen": 928893952 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036288866599799396, + "loss": 2.9152, + "theoretical_loss": 3.6750497172841, + "tokens_seen": 928959488 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003628786359077232, + "loss": 2.9179, + "theoretical_loss": 3.6750247076588005, + "tokens_seen": 929025024 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003628686058174523, + "loss": 2.881, + "theoretical_loss": 3.6749997002916377, + "tokens_seen": 929090560 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036285857572718156, + "loss": 2.9204, + "theoretical_loss": 3.6749746951822475, + "tokens_seen": 929156096 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036284854563691074, + "loss": 2.8914, + "theoretical_loss": 3.6749496923302667, + "tokens_seen": 929221632 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003628385155466399, + "loss": 2.807, + "theoretical_loss": 3.674924691735333, + "tokens_seen": 929287168 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003628284854563691, + "loss": 2.9397, + "theoretical_loss": 3.674899693397082, + "tokens_seen": 929352704 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003628184553660983, + "loss": 2.8729, + "theoretical_loss": 3.674874697315153, + "tokens_seen": 929418240 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036280842527582747, + "loss": 2.9466, + "theoretical_loss": 3.6748497034891816, + "tokens_seen": 929483776 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003627983951855567, + "loss": 2.9564, + "theoretical_loss": 3.6748247119188058, + "tokens_seen": 929549312 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036278836509528583, + "loss": 2.9003, + "theoretical_loss": 3.6747997226036633, + "tokens_seen": 929614848 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036277833500501506, + "loss": 2.85, + "theoretical_loss": 3.6747747355433913, + "tokens_seen": 929680384 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036276830491474424, + "loss": 2.9216, + "theoretical_loss": 3.674749750737627, + "tokens_seen": 929745920 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003627582748244734, + "loss": 2.9211, + "theoretical_loss": 3.674724768186009, + "tokens_seen": 929811456 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003627482447342026, + "loss": 2.971, + "theoretical_loss": 3.6746997878881746, + "tokens_seen": 929876992 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003627382146439318, + "loss": 2.8442, + "theoretical_loss": 3.6746748098437614, + "tokens_seen": 929942528 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036272818455366097, + "loss": 2.9338, + "theoretical_loss": 3.674649834052407, + "tokens_seen": 930008064 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003627181544633902, + "loss": 2.8251, + "theoretical_loss": 3.674624860513751, + "tokens_seen": 930073600 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036270812437311933, + "loss": 2.9458, + "theoretical_loss": 3.67459988922743, + "tokens_seen": 930139136 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036269809428284857, + "loss": 2.934, + "theoretical_loss": 3.674574920193083, + "tokens_seen": 930204672 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003626880641925777, + "loss": 2.7967, + "theoretical_loss": 3.674549953410348, + "tokens_seen": 930270208 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036267803410230693, + "loss": 2.9584, + "theoretical_loss": 3.674524988878863, + "tokens_seen": 930335744 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003626680040120361, + "loss": 2.9871, + "theoretical_loss": 3.674500026598267, + "tokens_seen": 930401280 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003626579739217653, + "loss": 2.8878, + "theoretical_loss": 3.6744750665681982, + "tokens_seen": 930466816 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2220897, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9260380268096924, + "objective/train/theoretical_loss": 3.674450108788295, + "objective/train/tokens_used": 950992352, + "theoretical_loss": 3.674450108788295, + "tokens_seen": 930532352 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036264794383149447, + "loss": 2.8794, + "theoretical_loss": 3.674450108788295, + "tokens_seen": 930532352 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036263791374122365, + "loss": 2.9265, + "theoretical_loss": 3.6744251532581966, + "tokens_seen": 930597888 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036262788365095283, + "loss": 2.9503, + "theoretical_loss": 3.6744001999775415, + "tokens_seen": 930663424 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036261785356068207, + "loss": 3.0359, + "theoretical_loss": 3.674375248945969, + "tokens_seen": 930728960 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036260782347041125, + "loss": 2.9409, + "theoretical_loss": 3.6743503001631175, + "tokens_seen": 930794496 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036259779338014043, + "loss": 2.8982, + "theoretical_loss": 3.6743253536286264, + "tokens_seen": 930860032 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036258776328986967, + "loss": 2.9321, + "theoretical_loss": 3.674300409342134, + "tokens_seen": 930925568 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003625777331995988, + "loss": 2.9455, + "theoretical_loss": 3.6742754673032803, + "tokens_seen": 930991104 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036256770310932803, + "loss": 2.8812, + "theoretical_loss": 3.674250527511705, + "tokens_seen": 931056640 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036255767301905716, + "loss": 2.9038, + "theoretical_loss": 3.6742255899670457, + "tokens_seen": 931122176 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003625476429287864, + "loss": 2.9296, + "theoretical_loss": 3.674200654668944, + "tokens_seen": 931187712 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003625376128385156, + "loss": 2.9642, + "theoretical_loss": 3.674175721617038, + "tokens_seen": 931253248 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036252758274824475, + "loss": 2.8343, + "theoretical_loss": 3.674150790810968, + "tokens_seen": 931318784 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036251755265797394, + "loss": 2.9205, + "theoretical_loss": 3.6741258622503725, + "tokens_seen": 931384320 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003625075225677031, + "loss": 2.7638, + "theoretical_loss": 3.674100935934893, + "tokens_seen": 931449856 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003624974924774323, + "loss": 2.9551, + "theoretical_loss": 3.6740760118641678, + "tokens_seen": 931515392 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036248746238716153, + "loss": 2.9341, + "theoretical_loss": 3.674051090037838, + "tokens_seen": 931580928 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036247743229689066, + "loss": 2.9065, + "theoretical_loss": 3.6740261704555435, + "tokens_seen": 931646464 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003624674022066199, + "loss": 2.9182, + "theoretical_loss": 3.674001253116924, + "tokens_seen": 931712000 + }, + { + "epoch": 11.01, + "learning_rate": 0.000362457372116349, + "loss": 2.8352, + "theoretical_loss": 3.6739763380216193, + "tokens_seen": 931777536 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036244734202607826, + "loss": 2.9373, + "theoretical_loss": 3.6739514251692706, + "tokens_seen": 931843072 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036243731193580744, + "loss": 2.9585, + "theoretical_loss": 3.6739265145595175, + "tokens_seen": 931908608 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003624272818455366, + "loss": 2.8957, + "theoretical_loss": 3.6739016061920013, + "tokens_seen": 931974144 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003624172517552658, + "loss": 2.8846, + "theoretical_loss": 3.6738767000663612, + "tokens_seen": 932039680 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036240722166499504, + "loss": 2.9401, + "theoretical_loss": 3.6738517961822392, + "tokens_seen": 932105216 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2224530, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6038362979888916, + "objective/train/theoretical_loss": 3.673826894539275, + "objective/train/tokens_used": 952630752, + "theoretical_loss": 3.673826894539275, + "tokens_seen": 932170752 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036239719157472416, + "loss": 2.8329, + "theoretical_loss": 3.673826894539275, + "tokens_seen": 932170752 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003623871614844534, + "loss": 2.8399, + "theoretical_loss": 3.6738019951371097, + "tokens_seen": 932236288 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003623771313941825, + "loss": 3.018, + "theoretical_loss": 3.6737770979753845, + "tokens_seen": 932301824 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036236710130391176, + "loss": 3.0653, + "theoretical_loss": 3.67375220305374, + "tokens_seen": 932367360 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036235707121364094, + "loss": 2.8729, + "theoretical_loss": 3.673727310371817, + "tokens_seen": 932432896 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003623470411233701, + "loss": 2.8688, + "theoretical_loss": 3.673702419929257, + "tokens_seen": 932498432 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003623370110330993, + "loss": 2.8686, + "theoretical_loss": 3.673677531725701, + "tokens_seen": 932563968 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003623269809428285, + "loss": 2.8404, + "theoretical_loss": 3.6736526457607903, + "tokens_seen": 932629504 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036231695085255767, + "loss": 2.9109, + "theoretical_loss": 3.673627762034166, + "tokens_seen": 932695040 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003623069207622869, + "loss": 2.9101, + "theoretical_loss": 3.67360288054547, + "tokens_seen": 932760576 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036229689067201603, + "loss": 2.8849, + "theoretical_loss": 3.673578001294344, + "tokens_seen": 932826112 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036228686058174526, + "loss": 3.041, + "theoretical_loss": 3.6735531242804287, + "tokens_seen": 932891648 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036227683049147444, + "loss": 2.937, + "theoretical_loss": 3.6735282495033665, + "tokens_seen": 932957184 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003622668004012036, + "loss": 2.847, + "theoretical_loss": 3.673503376962799, + "tokens_seen": 933022720 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003622567703109328, + "loss": 2.935, + "theoretical_loss": 3.6734785066583675, + "tokens_seen": 933088256 + }, + { + "epoch": 11.01, + "learning_rate": 0.000362246740220662, + "loss": 2.9189, + "theoretical_loss": 3.6734536385897147, + "tokens_seen": 933153792 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036223671013039117, + "loss": 2.9985, + "theoretical_loss": 3.6734287727564823, + "tokens_seen": 933219328 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003622266800401204, + "loss": 3.0201, + "theoretical_loss": 3.6734039091583126, + "tokens_seen": 933284864 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036221664994984953, + "loss": 2.8709, + "theoretical_loss": 3.6733790477948474, + "tokens_seen": 933350400 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036220661985957877, + "loss": 2.9317, + "theoretical_loss": 3.673354188665729, + "tokens_seen": 933415936 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003621965897693079, + "loss": 2.9266, + "theoretical_loss": 3.6733293317706, + "tokens_seen": 933481472 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036218655967903713, + "loss": 2.8239, + "theoretical_loss": 3.6733044771091024, + "tokens_seen": 933547008 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003621765295887663, + "loss": 2.8631, + "theoretical_loss": 3.6732796246808794, + "tokens_seen": 933612544 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003621664994984955, + "loss": 2.8519, + "theoretical_loss": 3.6732547744855735, + "tokens_seen": 933678080 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036215646940822467, + "loss": 2.9057, + "theoretical_loss": 3.6732299265228265, + "tokens_seen": 933743616 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2229470, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0019688606262207, + "objective/train/theoretical_loss": 3.673205080792282, + "objective/train/tokens_used": 954269152, + "theoretical_loss": 3.673205080792282, + "tokens_seen": 933809152 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036214643931795385, + "loss": 2.907, + "theoretical_loss": 3.673205080792282, + "tokens_seen": 933809152 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036213640922768303, + "loss": 2.8823, + "theoretical_loss": 3.6731802372935825, + "tokens_seen": 933874688 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036212637913741227, + "loss": 2.9082, + "theoretical_loss": 3.6731553960263708, + "tokens_seen": 933940224 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003621163490471414, + "loss": 2.8469, + "theoretical_loss": 3.6731305569902903, + "tokens_seen": 934005760 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036210631895687063, + "loss": 2.9601, + "theoretical_loss": 3.6731057201849833, + "tokens_seen": 934071296 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003620962888665998, + "loss": 2.8353, + "theoretical_loss": 3.6730808856100943, + "tokens_seen": 934136832 + }, + { + "epoch": 11.01, + "learning_rate": 0.000362086258776329, + "loss": 2.9075, + "theoretical_loss": 3.6730560532652654, + "tokens_seen": 934202368 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003620762286860582, + "loss": 2.9014, + "theoretical_loss": 3.67303122315014, + "tokens_seen": 934267904 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036206619859578736, + "loss": 2.9859, + "theoretical_loss": 3.673006395264362, + "tokens_seen": 934333440 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036205616850551654, + "loss": 2.9727, + "theoretical_loss": 3.672981569607575, + "tokens_seen": 934398976 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003620461384152458, + "loss": 2.9287, + "theoretical_loss": 3.6729567461794215, + "tokens_seen": 934464512 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003620361083249749, + "loss": 3.0059, + "theoretical_loss": 3.6729319249795465, + "tokens_seen": 934530048 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036202607823470414, + "loss": 2.8226, + "theoretical_loss": 3.6729071060075933, + "tokens_seen": 934595584 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036201604814443326, + "loss": 2.8956, + "theoretical_loss": 3.672882289263205, + "tokens_seen": 934661120 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003620060180541625, + "loss": 2.9905, + "theoretical_loss": 3.672857474746026, + "tokens_seen": 934726656 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003619959879638917, + "loss": 2.9407, + "theoretical_loss": 3.672832662455701, + "tokens_seen": 934792192 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036198595787362086, + "loss": 2.8818, + "theoretical_loss": 3.6728078523918732, + "tokens_seen": 934857728 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036197592778335004, + "loss": 2.9774, + "theoretical_loss": 3.6727830445541865, + "tokens_seen": 934923264 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003619658976930792, + "loss": 2.8249, + "theoretical_loss": 3.672758238942286, + "tokens_seen": 934988800 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003619558676028084, + "loss": 3.0069, + "theoretical_loss": 3.6727334355558154, + "tokens_seen": 935054336 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036194583751253764, + "loss": 2.8698, + "theoretical_loss": 3.6727086343944193, + "tokens_seen": 935119872 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036193580742226677, + "loss": 2.9029, + "theoretical_loss": 3.6726838354577422, + "tokens_seen": 935185408 + }, + { + "epoch": 11.01, + "learning_rate": 0.000361925777331996, + "loss": 2.9162, + "theoretical_loss": 3.6726590387454285, + "tokens_seen": 935250944 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003619157472417252, + "loss": 2.9355, + "theoretical_loss": 3.672634244257123, + "tokens_seen": 935316480 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036190571715145436, + "loss": 2.9058, + "theoretical_loss": 3.67260945199247, + "tokens_seen": 935382016 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2232524, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.193175792694092, + "objective/train/theoretical_loss": 3.6725846619511158, + "objective/train/tokens_used": 955907552, + "theoretical_loss": 3.6725846619511158, + "tokens_seen": 935447552 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036189568706118354, + "loss": 2.9902, + "theoretical_loss": 3.6725846619511158, + "tokens_seen": 935447552 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003618856569709127, + "loss": 2.8104, + "theoretical_loss": 3.672559874132703, + "tokens_seen": 935513088 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003618756268806419, + "loss": 2.9581, + "theoretical_loss": 3.672535088536878, + "tokens_seen": 935578624 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036186559679037114, + "loss": 3.0064, + "theoretical_loss": 3.6725103051632857, + "tokens_seen": 935644160 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003618555667001003, + "loss": 2.9574, + "theoretical_loss": 3.672485524011571, + "tokens_seen": 935709696 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003618455366098295, + "loss": 2.9372, + "theoretical_loss": 3.672460745081379, + "tokens_seen": 935775232 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003618355065195587, + "loss": 2.8715, + "theoretical_loss": 3.6724359683723553, + "tokens_seen": 935840768 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036182547642928787, + "loss": 2.8842, + "theoretical_loss": 3.6724111938841455, + "tokens_seen": 935906304 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003618154463390171, + "loss": 2.9578, + "theoretical_loss": 3.6723864216163937, + "tokens_seen": 935971840 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036180541624874623, + "loss": 2.9197, + "theoretical_loss": 3.672361651568748, + "tokens_seen": 936037376 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036179538615847546, + "loss": 2.8798, + "theoretical_loss": 3.6723368837408508, + "tokens_seen": 936102912 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036178535606820464, + "loss": 2.8032, + "theoretical_loss": 3.6723121181323504, + "tokens_seen": 936168448 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003617753259779338, + "loss": 2.9756, + "theoretical_loss": 3.6722873547428914, + "tokens_seen": 936233984 + }, + { + "epoch": 11.01, + "learning_rate": 0.000361765295887663, + "loss": 2.9483, + "theoretical_loss": 3.67226259357212, + "tokens_seen": 936299520 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003617552657973922, + "loss": 2.8764, + "theoretical_loss": 3.672237834619682, + "tokens_seen": 936365056 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036174523570712137, + "loss": 2.9529, + "theoretical_loss": 3.672213077885223, + "tokens_seen": 936430592 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003617352056168506, + "loss": 2.8938, + "theoretical_loss": 3.67218832336839, + "tokens_seen": 936496128 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036172517552657973, + "loss": 2.9659, + "theoretical_loss": 3.672163571068828, + "tokens_seen": 936561664 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036171514543630897, + "loss": 2.9679, + "theoretical_loss": 3.672138820986185, + "tokens_seen": 936627200 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003617051153460381, + "loss": 2.8597, + "theoretical_loss": 3.672114073120106, + "tokens_seen": 936692736 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036169508525576733, + "loss": 2.8895, + "theoretical_loss": 3.6720893274702373, + "tokens_seen": 936758272 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003616850551654965, + "loss": 2.9216, + "theoretical_loss": 3.6720645840362263, + "tokens_seen": 936823808 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003616750250752257, + "loss": 2.9036, + "theoretical_loss": 3.6720398428177194, + "tokens_seen": 936889344 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036166499498495487, + "loss": 2.9197, + "theoretical_loss": 3.6720151038143625, + "tokens_seen": 936954880 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036165496489468405, + "loss": 2.9219, + "theoretical_loss": 3.6719903670258027, + "tokens_seen": 937020416 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2235468, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.981865406036377, + "objective/train/theoretical_loss": 3.6719656324516876, + "objective/train/tokens_used": 957545952, + "theoretical_loss": 3.6719656324516876, + "tokens_seen": 937085952 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036164493480441323, + "loss": 2.9457, + "theoretical_loss": 3.6719656324516876, + "tokens_seen": 937085952 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036163490471414247, + "loss": 2.9427, + "theoretical_loss": 3.6719409000916636, + "tokens_seen": 937151488 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003616248746238716, + "loss": 2.9674, + "theoretical_loss": 3.6719161699453773, + "tokens_seen": 937217024 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036161484453360083, + "loss": 2.9228, + "theoretical_loss": 3.671891442012476, + "tokens_seen": 937282560 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036160481444333, + "loss": 2.8663, + "theoretical_loss": 3.671866716292607, + "tokens_seen": 937348096 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003615947843530592, + "loss": 2.9971, + "theoretical_loss": 3.6718419927854176, + "tokens_seen": 937413632 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003615847542627884, + "loss": 2.8802, + "theoretical_loss": 3.6718172714905553, + "tokens_seen": 937479168 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036157472417251756, + "loss": 2.8045, + "theoretical_loss": 3.6717925524076667, + "tokens_seen": 937544704 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036156469408224674, + "loss": 2.8811, + "theoretical_loss": 3.6717678355364, + "tokens_seen": 937610240 + }, + { + "epoch": 11.01, + "learning_rate": 0.000361554663991976, + "loss": 2.986, + "theoretical_loss": 3.6717431208764024, + "tokens_seen": 937675776 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003615446339017051, + "loss": 2.7795, + "theoretical_loss": 3.671718408427322, + "tokens_seen": 937741312 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036153460381143434, + "loss": 2.8598, + "theoretical_loss": 3.6716936981888058, + "tokens_seen": 937806848 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036152457372116346, + "loss": 2.9581, + "theoretical_loss": 3.6716689901605024, + "tokens_seen": 937872384 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003615145436308927, + "loss": 2.7063, + "theoretical_loss": 3.6716442843420594, + "tokens_seen": 937937920 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003615045135406219, + "loss": 2.9372, + "theoretical_loss": 3.6716195807331244, + "tokens_seen": 938003456 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036149448345035106, + "loss": 2.8969, + "theoretical_loss": 3.6715948793333455, + "tokens_seen": 938068992 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036148445336008024, + "loss": 2.9157, + "theoretical_loss": 3.6715701801423717, + "tokens_seen": 938134528 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003614744232698094, + "loss": 3.0137, + "theoretical_loss": 3.67154548315985, + "tokens_seen": 938200064 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003614643931795386, + "loss": 2.8724, + "theoretical_loss": 3.6715207883854295, + "tokens_seen": 938265600 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036145436308926784, + "loss": 2.9543, + "theoretical_loss": 3.671496095818758, + "tokens_seen": 938331136 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036144433299899697, + "loss": 2.8869, + "theoretical_loss": 3.6714714054594846, + "tokens_seen": 938396672 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003614343029087262, + "loss": 2.8989, + "theoretical_loss": 3.6714467173072576, + "tokens_seen": 938462208 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003614242728184554, + "loss": 2.8272, + "theoretical_loss": 3.671422031361725, + "tokens_seen": 938527744 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036141424272818456, + "loss": 2.9473, + "theoretical_loss": 3.671397347622536, + "tokens_seen": 938593280 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036140421263791374, + "loss": 2.8232, + "theoretical_loss": 3.67137266608934, + "tokens_seen": 938658816 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2240238, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8858377933502197, + "objective/train/theoretical_loss": 3.671347986761784, + "objective/train/tokens_used": 959184352, + "theoretical_loss": 3.671347986761784, + "tokens_seen": 938724352 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003613941825476429, + "loss": 2.9629, + "theoretical_loss": 3.671347986761784, + "tokens_seen": 938724352 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003613841524573721, + "loss": 2.8526, + "theoretical_loss": 3.6713233096395195, + "tokens_seen": 938789888 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036137412236710134, + "loss": 2.8689, + "theoretical_loss": 3.671298634722193, + "tokens_seen": 938855424 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036136409227683047, + "loss": 2.9426, + "theoretical_loss": 3.6712739620094554, + "tokens_seen": 938920960 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003613540621865597, + "loss": 3.0299, + "theoretical_loss": 3.6712492915009554, + "tokens_seen": 938986496 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036134403209628883, + "loss": 2.9737, + "theoretical_loss": 3.6712246231963417, + "tokens_seen": 939052032 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036133400200601807, + "loss": 2.9105, + "theoretical_loss": 3.671199957095264, + "tokens_seen": 939117568 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036132397191574725, + "loss": 2.871, + "theoretical_loss": 3.6711752931973716, + "tokens_seen": 939183104 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036131394182547643, + "loss": 2.9379, + "theoretical_loss": 3.6711506315023144, + "tokens_seen": 939248640 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003613039117352056, + "loss": 2.9477, + "theoretical_loss": 3.6711259720097416, + "tokens_seen": 939314176 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036129388164493484, + "loss": 2.8188, + "theoretical_loss": 3.6711013147193023, + "tokens_seen": 939379712 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036128385155466397, + "loss": 2.8521, + "theoretical_loss": 3.671076659630648, + "tokens_seen": 939445248 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003612738214643932, + "loss": 2.8305, + "theoretical_loss": 3.6710520067434267, + "tokens_seen": 939510784 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036126379137412233, + "loss": 2.997, + "theoretical_loss": 3.671027356057289, + "tokens_seen": 939576320 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036125376128385157, + "loss": 2.9303, + "theoretical_loss": 3.6710027075718847, + "tokens_seen": 939641856 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036124373119358075, + "loss": 2.9556, + "theoretical_loss": 3.6709780612868643, + "tokens_seen": 939707392 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036123370110330993, + "loss": 2.957, + "theoretical_loss": 3.6709534172018774, + "tokens_seen": 939772928 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003612236710130391, + "loss": 3.0249, + "theoretical_loss": 3.6709287753165745, + "tokens_seen": 939838464 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003612136409227683, + "loss": 2.9906, + "theoretical_loss": 3.6709041356306065, + "tokens_seen": 939904000 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003612036108324975, + "loss": 2.8955, + "theoretical_loss": 3.670879498143622, + "tokens_seen": 939969536 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003611935807422267, + "loss": 2.9347, + "theoretical_loss": 3.6708548628552733, + "tokens_seen": 940035072 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036118355065195584, + "loss": 2.8671, + "theoretical_loss": 3.67083022976521, + "tokens_seen": 940100608 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036117352056168507, + "loss": 2.9008, + "theoretical_loss": 3.670805598873083, + "tokens_seen": 940166144 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003611634904714142, + "loss": 3.0086, + "theoretical_loss": 3.6707809701785425, + "tokens_seen": 940231680 + }, + { + "epoch": 11.01, + "learning_rate": 0.00036115346038114343, + "loss": 2.9475, + "theoretical_loss": 3.6707563436812403, + "tokens_seen": 940297216 + }, + { + "epoch": 11.01, + "objective/train/docs_used": 2244150, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.670147657394409, + "objective/train/theoretical_loss": 3.6707317193808264, + "objective/train/tokens_used": 960822752, + "theoretical_loss": 3.6707317193808264, + "tokens_seen": 940362752 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003611434302908726, + "loss": 2.8989, + "theoretical_loss": 3.6707317193808264, + "tokens_seen": 940362752 + }, + { + "epoch": 11.01, + "learning_rate": 0.0003611334002006018, + "loss": 2.9387, + "theoretical_loss": 3.670707097276952, + "tokens_seen": 940428288 + }, + { + "epoch": 11.02, + "learning_rate": 0.000361123370110331, + "loss": 2.9486, + "theoretical_loss": 3.670682477369268, + "tokens_seen": 940493824 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003611133400200602, + "loss": 3.0016, + "theoretical_loss": 3.6706578596574255, + "tokens_seen": 940559360 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003611033099297894, + "loss": 2.8917, + "theoretical_loss": 3.670633244141076, + "tokens_seen": 940624896 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003610932798395186, + "loss": 2.9605, + "theoretical_loss": 3.6706086308198707, + "tokens_seen": 940690432 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036108324974924776, + "loss": 2.9497, + "theoretical_loss": 3.67058401969346, + "tokens_seen": 940755968 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036107321965897694, + "loss": 2.7425, + "theoretical_loss": 3.6705594107614967, + "tokens_seen": 940821504 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003610631895687062, + "loss": 2.8954, + "theoretical_loss": 3.670534804023631, + "tokens_seen": 940887040 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003610531594784353, + "loss": 2.9858, + "theoretical_loss": 3.6705101994795157, + "tokens_seen": 940952576 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036104312938816454, + "loss": 2.8877, + "theoretical_loss": 3.6704855971288017, + "tokens_seen": 941018112 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036103309929789366, + "loss": 2.8586, + "theoretical_loss": 3.670460996971141, + "tokens_seen": 941083648 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003610230692076229, + "loss": 2.9627, + "theoretical_loss": 3.670436399006185, + "tokens_seen": 941149184 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003610130391173521, + "loss": 2.9086, + "theoretical_loss": 3.670411803233586, + "tokens_seen": 941214720 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036100300902708126, + "loss": 2.9694, + "theoretical_loss": 3.670387209652996, + "tokens_seen": 941280256 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036099297893681044, + "loss": 2.868, + "theoretical_loss": 3.6703626182640674, + "tokens_seen": 941345792 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003609829488465396, + "loss": 2.8463, + "theoretical_loss": 3.6703380290664507, + "tokens_seen": 941411328 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003609729187562688, + "loss": 2.9924, + "theoretical_loss": 3.6703134420598005, + "tokens_seen": 941476864 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036096288866599804, + "loss": 2.9596, + "theoretical_loss": 3.670288857243767, + "tokens_seen": 941542400 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036095285857572717, + "loss": 3.0424, + "theoretical_loss": 3.6702642746180034, + "tokens_seen": 941607936 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003609428284854564, + "loss": 2.9954, + "theoretical_loss": 3.6702396941821624, + "tokens_seen": 941673472 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003609327983951856, + "loss": 2.9908, + "theoretical_loss": 3.670215115935896, + "tokens_seen": 941739008 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036092276830491476, + "loss": 2.8543, + "theoretical_loss": 3.6701905398788575, + "tokens_seen": 941804544 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036091273821464394, + "loss": 2.9369, + "theoretical_loss": 3.670165966010699, + "tokens_seen": 941870080 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003609027081243731, + "loss": 2.9367, + "theoretical_loss": 3.6701413943310737, + "tokens_seen": 941935616 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2248659, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9798500537872314, + "objective/train/theoretical_loss": 3.670116824839633, + "objective/train/tokens_used": 962461152, + "theoretical_loss": 3.670116824839633, + "tokens_seen": 942001152 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003608926780341023, + "loss": 2.8915, + "theoretical_loss": 3.670116824839633, + "tokens_seen": 942001152 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036088264794383154, + "loss": 2.951, + "theoretical_loss": 3.6700922575360324, + "tokens_seen": 942066688 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036087261785356067, + "loss": 2.9865, + "theoretical_loss": 3.6700676924199227, + "tokens_seen": 942132224 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003608625877632899, + "loss": 2.8152, + "theoretical_loss": 3.670043129490958, + "tokens_seen": 942197760 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036085255767301903, + "loss": 2.9468, + "theoretical_loss": 3.6700185687487914, + "tokens_seen": 942263296 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036084252758274827, + "loss": 2.833, + "theoretical_loss": 3.6699940101930757, + "tokens_seen": 942328832 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036083249749247745, + "loss": 2.9647, + "theoretical_loss": 3.6699694538234646, + "tokens_seen": 942394368 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036082246740220663, + "loss": 2.961, + "theoretical_loss": 3.6699448996396113, + "tokens_seen": 942459904 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003608124373119358, + "loss": 3.0023, + "theoretical_loss": 3.6699203476411695, + "tokens_seen": 942525440 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036080240722166505, + "loss": 2.9143, + "theoretical_loss": 3.6698957978277926, + "tokens_seen": 942590976 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036079237713139417, + "loss": 2.9751, + "theoretical_loss": 3.669871250199134, + "tokens_seen": 942656512 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003607823470411234, + "loss": 2.9046, + "theoretical_loss": 3.6698467047548475, + "tokens_seen": 942722048 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036077231695085253, + "loss": 2.8396, + "theoretical_loss": 3.6698221614945874, + "tokens_seen": 942787584 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036076228686058177, + "loss": 2.8584, + "theoretical_loss": 3.669797620418007, + "tokens_seen": 942853120 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036075225677031095, + "loss": 2.7935, + "theoretical_loss": 3.6697730815247604, + "tokens_seen": 942918656 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036074222668004013, + "loss": 2.9725, + "theoretical_loss": 3.669748544814502, + "tokens_seen": 942984192 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003607321965897693, + "loss": 3.0129, + "theoretical_loss": 3.6697240102868856, + "tokens_seen": 943049728 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003607221664994985, + "loss": 2.8936, + "theoretical_loss": 3.669699477941565, + "tokens_seen": 943115264 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003607121364092277, + "loss": 2.9433, + "theoretical_loss": 3.669674947778195, + "tokens_seen": 943180800 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003607021063189569, + "loss": 2.8682, + "theoretical_loss": 3.66965041979643, + "tokens_seen": 943246336 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036069207622868604, + "loss": 2.8946, + "theoretical_loss": 3.6696258939959243, + "tokens_seen": 943311872 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036068204613841527, + "loss": 2.8746, + "theoretical_loss": 3.6696013703763315, + "tokens_seen": 943377408 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003606720160481444, + "loss": 2.8204, + "theoretical_loss": 3.6695768489373077, + "tokens_seen": 943442944 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036066198595787364, + "loss": 2.7645, + "theoretical_loss": 3.669552329678507, + "tokens_seen": 943508480 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003606519558676028, + "loss": 2.9142, + "theoretical_loss": 3.669527812599583, + "tokens_seen": 943574016 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2251927, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9529049396514893, + "objective/train/theoretical_loss": 3.6695032977001922, + "objective/train/tokens_used": 964099552, + "theoretical_loss": 3.6695032977001922, + "tokens_seen": 943639552 + }, + { + "epoch": 11.02, + "learning_rate": 0.000360641925777332, + "loss": 2.9044, + "theoretical_loss": 3.6695032977001922, + "tokens_seen": 943639552 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003606318956870612, + "loss": 2.8582, + "theoretical_loss": 3.6694787849799892, + "tokens_seen": 943705088 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003606218655967904, + "loss": 2.8623, + "theoretical_loss": 3.669454274438628, + "tokens_seen": 943770624 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036061183550651954, + "loss": 2.9805, + "theoretical_loss": 3.669429766075764, + "tokens_seen": 943836160 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003606018054162488, + "loss": 2.8667, + "theoretical_loss": 3.669405259891053, + "tokens_seen": 943901696 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003605917753259779, + "loss": 2.9757, + "theoretical_loss": 3.6693807558841494, + "tokens_seen": 943967232 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036058174523570714, + "loss": 2.9204, + "theoretical_loss": 3.6693562540547093, + "tokens_seen": 944032768 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003605717151454363, + "loss": 2.9679, + "theoretical_loss": 3.6693317544023873, + "tokens_seen": 944098304 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003605616850551655, + "loss": 2.9719, + "theoretical_loss": 3.669307256926839, + "tokens_seen": 944163840 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003605516549648947, + "loss": 2.8527, + "theoretical_loss": 3.6692827616277204, + "tokens_seen": 944229376 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036054162487462386, + "loss": 3.0429, + "theoretical_loss": 3.6692582685046866, + "tokens_seen": 944294912 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036053159478435304, + "loss": 2.8988, + "theoretical_loss": 3.6692337775573938, + "tokens_seen": 944360448 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003605215646940823, + "loss": 2.9267, + "theoretical_loss": 3.6692092887854963, + "tokens_seen": 944425984 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003605115346038114, + "loss": 2.8447, + "theoretical_loss": 3.669184802188652, + "tokens_seen": 944491520 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036050150451354064, + "loss": 2.9429, + "theoretical_loss": 3.669160317766516, + "tokens_seen": 944557056 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036049147442326977, + "loss": 2.9353, + "theoretical_loss": 3.669135835518744, + "tokens_seen": 944622592 + }, + { + "epoch": 11.02, + "learning_rate": 0.000360481444332999, + "loss": 2.9562, + "theoretical_loss": 3.6691113554449917, + "tokens_seen": 944688128 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003604714142427282, + "loss": 2.9208, + "theoretical_loss": 3.669086877544916, + "tokens_seen": 944753664 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036046138415245737, + "loss": 2.8907, + "theoretical_loss": 3.669062401818173, + "tokens_seen": 944819200 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036045135406218655, + "loss": 2.9677, + "theoretical_loss": 3.6690379282644185, + "tokens_seen": 944884736 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003604413239719158, + "loss": 2.9236, + "theoretical_loss": 3.66901345688331, + "tokens_seen": 944950272 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003604312938816449, + "loss": 2.7996, + "theoretical_loss": 3.6689889876745028, + "tokens_seen": 945015808 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036042126379137414, + "loss": 2.976, + "theoretical_loss": 3.6689645206376538, + "tokens_seen": 945081344 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036041123370110327, + "loss": 3.0243, + "theoretical_loss": 3.66894005577242, + "tokens_seen": 945146880 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003604012036108325, + "loss": 2.8801, + "theoretical_loss": 3.6689155930784576, + "tokens_seen": 945212416 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2254850, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8093366622924805, + "objective/train/theoretical_loss": 3.668891132555424, + "objective/train/tokens_used": 965737952, + "theoretical_loss": 3.668891132555424, + "tokens_seen": 945277952 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003603911735205617, + "loss": 2.919, + "theoretical_loss": 3.668891132555424, + "tokens_seen": 945277952 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036038114343029087, + "loss": 2.8329, + "theoretical_loss": 3.668866674202975, + "tokens_seen": 945343488 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036037111334002005, + "loss": 2.867, + "theoretical_loss": 3.6688422180207683, + "tokens_seen": 945409024 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036036108324974923, + "loss": 2.8106, + "theoretical_loss": 3.668817764008461, + "tokens_seen": 945474560 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036035105315947847, + "loss": 2.8793, + "theoretical_loss": 3.66879331216571, + "tokens_seen": 945540096 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036034102306920765, + "loss": 2.8887, + "theoretical_loss": 3.668768862492172, + "tokens_seen": 945605632 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036033099297893683, + "loss": 2.9142, + "theoretical_loss": 3.668744414987505, + "tokens_seen": 945671168 + }, + { + "epoch": 11.02, + "learning_rate": 0.000360320962888666, + "loss": 2.9114, + "theoretical_loss": 3.6687199696513657, + "tokens_seen": 945736704 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036031093279839525, + "loss": 2.845, + "theoretical_loss": 3.668695526483412, + "tokens_seen": 945802240 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036030090270812437, + "loss": 2.9624, + "theoretical_loss": 3.668671085483301, + "tokens_seen": 945867776 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003602908726178536, + "loss": 2.9174, + "theoretical_loss": 3.6686466466506906, + "tokens_seen": 945933312 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036028084252758273, + "loss": 3.0049, + "theoretical_loss": 3.668622209985238, + "tokens_seen": 945998848 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036027081243731197, + "loss": 2.8208, + "theoretical_loss": 3.6685977754866013, + "tokens_seen": 946064384 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036026078234704115, + "loss": 2.9259, + "theoretical_loss": 3.668573343154438, + "tokens_seen": 946129920 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036025075225677033, + "loss": 2.8957, + "theoretical_loss": 3.668548912988406, + "tokens_seen": 946195456 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003602407221664995, + "loss": 3.0042, + "theoretical_loss": 3.6685244849881635, + "tokens_seen": 946260992 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003602306920762287, + "loss": 2.8694, + "theoretical_loss": 3.6685000591533683, + "tokens_seen": 946326528 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003602206619859579, + "loss": 2.9416, + "theoretical_loss": 3.6684756354836785, + "tokens_seen": 946392064 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003602106318956871, + "loss": 2.9402, + "theoretical_loss": 3.668451213978752, + "tokens_seen": 946457600 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036020060180541624, + "loss": 2.8721, + "theoretical_loss": 3.6684267946382483, + "tokens_seen": 946523136 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003601905717151455, + "loss": 2.9039, + "theoretical_loss": 3.6684023774618244, + "tokens_seen": 946588672 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003601805416248746, + "loss": 2.9271, + "theoretical_loss": 3.6683779624491386, + "tokens_seen": 946654208 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036017051153460384, + "loss": 2.9574, + "theoretical_loss": 3.6683535495998507, + "tokens_seen": 946719744 + }, + { + "epoch": 11.02, + "learning_rate": 0.000360160481444333, + "loss": 2.9147, + "theoretical_loss": 3.668329138913618, + "tokens_seen": 946785280 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003601504513540622, + "loss": 2.9318, + "theoretical_loss": 3.6683047303900995, + "tokens_seen": 946850816 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2258710, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9074478149414062, + "objective/train/theoretical_loss": 3.668280324028954, + "objective/train/tokens_used": 967376352, + "theoretical_loss": 3.668280324028954, + "tokens_seen": 946916352 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003601404212637914, + "loss": 2.8972, + "theoretical_loss": 3.668280324028954, + "tokens_seen": 946916352 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003601303911735206, + "loss": 2.8924, + "theoretical_loss": 3.6682559198298406, + "tokens_seen": 946981888 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036012036108324974, + "loss": 2.9116, + "theoretical_loss": 3.668231517792418, + "tokens_seen": 947047424 + }, + { + "epoch": 11.02, + "learning_rate": 0.000360110330992979, + "loss": 2.8557, + "theoretical_loss": 3.668207117916345, + "tokens_seen": 947112960 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003601003009027081, + "loss": 2.964, + "theoretical_loss": 3.6681827202012802, + "tokens_seen": 947178496 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036009027081243734, + "loss": 2.8713, + "theoretical_loss": 3.6681583246468836, + "tokens_seen": 947244032 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003600802407221665, + "loss": 2.9658, + "theoretical_loss": 3.6681339312528136, + "tokens_seen": 947309568 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003600702106318957, + "loss": 2.8621, + "theoretical_loss": 3.6681095400187305, + "tokens_seen": 947375104 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003600601805416249, + "loss": 2.981, + "theoretical_loss": 3.668085150944292, + "tokens_seen": 947440640 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036005015045135406, + "loss": 2.9391, + "theoretical_loss": 3.668060764029159, + "tokens_seen": 947506176 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036004012036108324, + "loss": 2.9913, + "theoretical_loss": 3.6680363792729906, + "tokens_seen": 947571712 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003600300902708125, + "loss": 2.9881, + "theoretical_loss": 3.668011996675446, + "tokens_seen": 947637248 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003600200601805416, + "loss": 2.8992, + "theoretical_loss": 3.667987616236185, + "tokens_seen": 947702784 + }, + { + "epoch": 11.02, + "learning_rate": 0.00036001003009027084, + "loss": 2.929, + "theoretical_loss": 3.6679632379548677, + "tokens_seen": 947768320 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035999999999999997, + "loss": 2.9046, + "theoretical_loss": 3.6679388618311535, + "tokens_seen": 947833856 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003599899699097292, + "loss": 3.0128, + "theoretical_loss": 3.667914487864702, + "tokens_seen": 947899392 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003599799398194584, + "loss": 2.9781, + "theoretical_loss": 3.6678901160551742, + "tokens_seen": 947964928 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035996990972918757, + "loss": 2.9615, + "theoretical_loss": 3.667865746402229, + "tokens_seen": 948030464 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035995987963891675, + "loss": 2.9202, + "theoretical_loss": 3.6678413789055275, + "tokens_seen": 948096000 + }, + { + "epoch": 11.02, + "learning_rate": 0.000359949849548646, + "loss": 2.877, + "theoretical_loss": 3.667817013564729, + "tokens_seen": 948161536 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003599398194583751, + "loss": 3.0036, + "theoretical_loss": 3.6677926503794938, + "tokens_seen": 948227072 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035992978936810434, + "loss": 2.8277, + "theoretical_loss": 3.6677682893494827, + "tokens_seen": 948292608 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035991975927783347, + "loss": 2.9105, + "theoretical_loss": 3.6677439304743564, + "tokens_seen": 948358144 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003599097291875627, + "loss": 2.9745, + "theoretical_loss": 3.6677195737537747, + "tokens_seen": 948423680 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003598996990972919, + "loss": 2.9859, + "theoretical_loss": 3.667695219187398, + "tokens_seen": 948489216 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2263365, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7486765384674072, + "objective/train/theoretical_loss": 3.6676708667748885, + "objective/train/tokens_used": 969014752, + "theoretical_loss": 3.6676708667748885, + "tokens_seen": 948554752 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035988966900702107, + "loss": 2.8669, + "theoretical_loss": 3.6676708667748885, + "tokens_seen": 948554752 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035987963891675025, + "loss": 2.9897, + "theoretical_loss": 3.667646516515905, + "tokens_seen": 948620288 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035986960882647943, + "loss": 2.9849, + "theoretical_loss": 3.6676221684101096, + "tokens_seen": 948685824 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003598595787362086, + "loss": 2.9583, + "theoretical_loss": 3.667597822457162, + "tokens_seen": 948751360 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035984954864593785, + "loss": 3.0075, + "theoretical_loss": 3.6675734786567245, + "tokens_seen": 948816896 + }, + { + "epoch": 11.02, + "learning_rate": 0.000359839518555667, + "loss": 3.0147, + "theoretical_loss": 3.667549137008457, + "tokens_seen": 948882432 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003598294884653962, + "loss": 2.89, + "theoretical_loss": 3.6675247975120215, + "tokens_seen": 948947968 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035981945837512534, + "loss": 2.9538, + "theoretical_loss": 3.6675004601670786, + "tokens_seen": 949013504 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035980942828485457, + "loss": 2.9284, + "theoretical_loss": 3.66747612497329, + "tokens_seen": 949079040 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035979939819458375, + "loss": 2.9873, + "theoretical_loss": 3.6674517919303167, + "tokens_seen": 949144576 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035978936810431293, + "loss": 2.8368, + "theoretical_loss": 3.66742746103782, + "tokens_seen": 949210112 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003597793380140421, + "loss": 2.886, + "theoretical_loss": 3.6674031322954614, + "tokens_seen": 949275648 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035976930792377135, + "loss": 2.8746, + "theoretical_loss": 3.667378805702903, + "tokens_seen": 949341184 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003597592778335005, + "loss": 2.9127, + "theoretical_loss": 3.667354481259806, + "tokens_seen": 949406720 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003597492477432297, + "loss": 2.9524, + "theoretical_loss": 3.6673301589658323, + "tokens_seen": 949472256 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035973921765295884, + "loss": 2.9284, + "theoretical_loss": 3.667305838820644, + "tokens_seen": 949537792 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003597291875626881, + "loss": 2.8913, + "theoretical_loss": 3.6672815208239022, + "tokens_seen": 949603328 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035971915747241726, + "loss": 2.8107, + "theoretical_loss": 3.667257204975269, + "tokens_seen": 949668864 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035970912738214644, + "loss": 2.956, + "theoretical_loss": 3.667232891274407, + "tokens_seen": 949734400 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003596990972918756, + "loss": 2.8915, + "theoretical_loss": 3.6672085797209784, + "tokens_seen": 949799936 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003596890672016048, + "loss": 2.9232, + "theoretical_loss": 3.6671842703146442, + "tokens_seen": 949865472 + }, + { + "epoch": 11.02, + "learning_rate": 0.000359679037111334, + "loss": 2.9262, + "theoretical_loss": 3.6671599630550675, + "tokens_seen": 949931008 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003596690070210632, + "loss": 2.9367, + "theoretical_loss": 3.6671356579419108, + "tokens_seen": 949996544 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035965897693079234, + "loss": 2.9203, + "theoretical_loss": 3.6671113549748364, + "tokens_seen": 950062080 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003596489468405216, + "loss": 2.9731, + "theoretical_loss": 3.667087054153507, + "tokens_seen": 950127616 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2266131, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.944831371307373, + "objective/train/theoretical_loss": 3.667062755477584, + "objective/train/tokens_used": 970653152, + "theoretical_loss": 3.667062755477584, + "tokens_seen": 950193152 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035963891675025076, + "loss": 3.0015, + "theoretical_loss": 3.667062755477584, + "tokens_seen": 950193152 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035962888665997994, + "loss": 2.9571, + "theoretical_loss": 3.6670384589467315, + "tokens_seen": 950258688 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003596188565697091, + "loss": 2.8752, + "theoretical_loss": 3.6670141645606114, + "tokens_seen": 950324224 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003596088264794383, + "loss": 2.8449, + "theoretical_loss": 3.6669898723188865, + "tokens_seen": 950389760 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035959879638916754, + "loss": 2.8407, + "theoretical_loss": 3.66696558222122, + "tokens_seen": 950455296 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003595887662988967, + "loss": 2.9759, + "theoretical_loss": 3.666941294267275, + "tokens_seen": 950520832 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003595787362086259, + "loss": 2.931, + "theoretical_loss": 3.666917008456714, + "tokens_seen": 950586368 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003595687061183551, + "loss": 2.9261, + "theoretical_loss": 3.6668927247892, + "tokens_seen": 950651904 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035955867602808426, + "loss": 2.8249, + "theoretical_loss": 3.6668684432643968, + "tokens_seen": 950717440 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035954864593781344, + "loss": 2.9367, + "theoretical_loss": 3.6668441638819673, + "tokens_seen": 950782976 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003595386158475427, + "loss": 2.8916, + "theoretical_loss": 3.666819886641575, + "tokens_seen": 950848512 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003595285857572718, + "loss": 2.9881, + "theoretical_loss": 3.666795611542883, + "tokens_seen": 950914048 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035951855566700104, + "loss": 2.9112, + "theoretical_loss": 3.666771338585555, + "tokens_seen": 950979584 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035950852557673017, + "loss": 3.017, + "theoretical_loss": 3.666747067769254, + "tokens_seen": 951045120 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003594984954864594, + "loss": 2.9651, + "theoretical_loss": 3.666722799093645, + "tokens_seen": 951110656 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003594884653961886, + "loss": 2.9969, + "theoretical_loss": 3.6666985325583905, + "tokens_seen": 951176192 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035947843530591777, + "loss": 2.9356, + "theoretical_loss": 3.6666742681631543, + "tokens_seen": 951241728 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035946840521564695, + "loss": 2.8952, + "theoretical_loss": 3.6666500059076013, + "tokens_seen": 951307264 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003594583751253762, + "loss": 2.9729, + "theoretical_loss": 3.6666257457913938, + "tokens_seen": 951372800 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003594483450351053, + "loss": 2.9903, + "theoretical_loss": 3.666601487814197, + "tokens_seen": 951438336 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035943831494483454, + "loss": 2.9624, + "theoretical_loss": 3.6665772319756753, + "tokens_seen": 951503872 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035942828485456367, + "loss": 2.8544, + "theoretical_loss": 3.6665529782754915, + "tokens_seen": 951569408 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003594182547642929, + "loss": 2.9859, + "theoretical_loss": 3.6665287267133113, + "tokens_seen": 951634944 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003594082246740221, + "loss": 2.9185, + "theoretical_loss": 3.6665044772887976, + "tokens_seen": 951700480 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035939819458375127, + "loss": 2.9681, + "theoretical_loss": 3.666480230001616, + "tokens_seen": 951766016 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.943784236907959, + "objective/train/theoretical_loss": 3.6664559848514306, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.6664559848514306, + "tokens_seen": 951831552 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035938816449348045, + "loss": 2.916, + "theoretical_loss": 3.6664559848514306, + "tokens_seen": 951831552 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035937813440320963, + "loss": 2.803, + "theoretical_loss": 3.666431741837905, + "tokens_seen": 951897088 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003593681043129388, + "loss": 2.8494, + "theoretical_loss": 3.6664075009607053, + "tokens_seen": 951962624 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035935807422266805, + "loss": 2.892, + "theoretical_loss": 3.6663832622194956, + "tokens_seen": 952028160 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003593480441323972, + "loss": 2.7763, + "theoretical_loss": 3.6663590256139402, + "tokens_seen": 952093696 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003593380140421264, + "loss": 2.9045, + "theoretical_loss": 3.6663347911437043, + "tokens_seen": 952159232 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035932798395185554, + "loss": 2.9569, + "theoretical_loss": 3.6663105588084535, + "tokens_seen": 952224768 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035931795386158477, + "loss": 2.7922, + "theoretical_loss": 3.6662863286078515, + "tokens_seen": 952290304 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035930792377131395, + "loss": 2.8837, + "theoretical_loss": 3.6662621005415645, + "tokens_seen": 952355840 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035929789368104313, + "loss": 2.9635, + "theoretical_loss": 3.6662378746092568, + "tokens_seen": 952421376 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003592878635907723, + "loss": 2.8723, + "theoretical_loss": 3.6662136508105942, + "tokens_seen": 952486912 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035927783350050155, + "loss": 2.8815, + "theoretical_loss": 3.666189429145242, + "tokens_seen": 952552448 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003592678034102307, + "loss": 2.9471, + "theoretical_loss": 3.666165209612865, + "tokens_seen": 952617984 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003592577733199599, + "loss": 2.9837, + "theoretical_loss": 3.666140992213129, + "tokens_seen": 952683520 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035924774322968904, + "loss": 2.9318, + "theoretical_loss": 3.6661167769456995, + "tokens_seen": 952749056 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003592377131394183, + "loss": 2.858, + "theoretical_loss": 3.666092563810243, + "tokens_seen": 952814592 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035922768304914746, + "loss": 2.9268, + "theoretical_loss": 3.6660683528064233, + "tokens_seen": 952880128 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035921765295887664, + "loss": 2.9961, + "theoretical_loss": 3.6660441439339073, + "tokens_seen": 952945664 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003592076228686058, + "loss": 2.8189, + "theoretical_loss": 3.666019937192361, + "tokens_seen": 953011200 + }, + { + "epoch": 11.02, + "learning_rate": 0.000359197592778335, + "loss": 2.9174, + "theoretical_loss": 3.66599573258145, + "tokens_seen": 953076736 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003591875626880642, + "loss": 2.8964, + "theoretical_loss": 3.6659715301008395, + "tokens_seen": 953142272 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003591775325977934, + "loss": 2.8021, + "theoretical_loss": 3.6659473297501974, + "tokens_seen": 953207808 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035916750250752254, + "loss": 2.9836, + "theoretical_loss": 3.6659231315291883, + "tokens_seen": 953273344 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003591574724172518, + "loss": 2.882, + "theoretical_loss": 3.665898935437479, + "tokens_seen": 953338880 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035914744232698096, + "loss": 2.9462, + "theoretical_loss": 3.6658747414747355, + "tokens_seen": 953404416 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8782799243927, + "objective/train/theoretical_loss": 3.665850549640624, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.665850549640624, + "tokens_seen": 953469952 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035913741223671014, + "loss": 2.9455, + "theoretical_loss": 3.665850549640624, + "tokens_seen": 953469952 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003591273821464393, + "loss": 2.9629, + "theoretical_loss": 3.6658263599348118, + "tokens_seen": 953535488 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003591173520561685, + "loss": 2.9148, + "theoretical_loss": 3.665802172356964, + "tokens_seen": 953601024 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003591073219658977, + "loss": 3.0279, + "theoretical_loss": 3.6657779869067486, + "tokens_seen": 953666560 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003590972918756269, + "loss": 2.8837, + "theoretical_loss": 3.665753803583831, + "tokens_seen": 953732096 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035908726178535605, + "loss": 2.8751, + "theoretical_loss": 3.6657296223878797, + "tokens_seen": 953797632 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003590772316950853, + "loss": 2.9291, + "theoretical_loss": 3.6657054433185596, + "tokens_seen": 953863168 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003590672016048144, + "loss": 3.0176, + "theoretical_loss": 3.6656812663755383, + "tokens_seen": 953928704 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035905717151454364, + "loss": 2.9924, + "theoretical_loss": 3.665657091558483, + "tokens_seen": 953994240 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003590471414242728, + "loss": 2.8979, + "theoretical_loss": 3.66563291886706, + "tokens_seen": 954059776 + }, + { + "epoch": 11.02, + "learning_rate": 0.000359037111334002, + "loss": 2.8522, + "theoretical_loss": 3.665608748300938, + "tokens_seen": 954125312 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003590270812437312, + "loss": 2.8456, + "theoretical_loss": 3.6655845798597824, + "tokens_seen": 954190848 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035901705115346037, + "loss": 2.9305, + "theoretical_loss": 3.6655604135432616, + "tokens_seen": 954256384 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035900702106318955, + "loss": 2.9644, + "theoretical_loss": 3.6655362493510424, + "tokens_seen": 954321920 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003589969909729188, + "loss": 2.9424, + "theoretical_loss": 3.665512087282792, + "tokens_seen": 954387456 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003589869608826479, + "loss": 2.9565, + "theoretical_loss": 3.6654879273381784, + "tokens_seen": 954452992 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035897693079237715, + "loss": 2.9179, + "theoretical_loss": 3.665463769516869, + "tokens_seen": 954518528 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035896690070210633, + "loss": 2.8564, + "theoretical_loss": 3.6654396138185312, + "tokens_seen": 954584064 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003589568706118355, + "loss": 2.9183, + "theoretical_loss": 3.6654154602428326, + "tokens_seen": 954649600 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003589468405215647, + "loss": 2.8164, + "theoretical_loss": 3.6653913087894416, + "tokens_seen": 954715136 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035893681043129387, + "loss": 2.8754, + "theoretical_loss": 3.665367159458026, + "tokens_seen": 954780672 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035892678034102305, + "loss": 2.8664, + "theoretical_loss": 3.665343012248253, + "tokens_seen": 954846208 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003589167502507523, + "loss": 2.9656, + "theoretical_loss": 3.665318867159791, + "tokens_seen": 954911744 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003589067201604814, + "loss": 2.9261, + "theoretical_loss": 3.665294724192308, + "tokens_seen": 954977280 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035889669007021065, + "loss": 2.9179, + "theoretical_loss": 3.665270583345473, + "tokens_seen": 955042816 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0287020206451416, + "objective/train/theoretical_loss": 3.665246444618953, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.665246444618953, + "tokens_seen": 955108352 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003588866599799398, + "loss": 2.9944, + "theoretical_loss": 3.665246444618953, + "tokens_seen": 955108352 + }, + { + "epoch": 11.02, + "learning_rate": 0.000358876629889669, + "loss": 2.9326, + "theoretical_loss": 3.665222308012417, + "tokens_seen": 955173888 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003588665997993982, + "loss": 2.9271, + "theoretical_loss": 3.665198173525533, + "tokens_seen": 955239424 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003588565697091274, + "loss": 2.8709, + "theoretical_loss": 3.6651740411579694, + "tokens_seen": 955304960 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003588465396188566, + "loss": 2.9527, + "theoretical_loss": 3.6651499109093955, + "tokens_seen": 955370496 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035883650952858574, + "loss": 2.9202, + "theoretical_loss": 3.6651257827794788, + "tokens_seen": 955436032 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035882647943831497, + "loss": 2.9496, + "theoretical_loss": 3.665101656767889, + "tokens_seen": 955501568 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035881644934804415, + "loss": 2.8142, + "theoretical_loss": 3.6650775328742937, + "tokens_seen": 955567104 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035880641925777333, + "loss": 2.9287, + "theoretical_loss": 3.665053411098363, + "tokens_seen": 955632640 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003587963891675025, + "loss": 2.9444, + "theoretical_loss": 3.6650292914397644, + "tokens_seen": 955698176 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035878635907723175, + "loss": 2.9114, + "theoretical_loss": 3.6650051738981686, + "tokens_seen": 955763712 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003587763289869609, + "loss": 2.9618, + "theoretical_loss": 3.664981058473243, + "tokens_seen": 955829248 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003587662988966901, + "loss": 2.9336, + "theoretical_loss": 3.664956945164658, + "tokens_seen": 955894784 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035875626880641924, + "loss": 2.8648, + "theoretical_loss": 3.664932833972082, + "tokens_seen": 955960320 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003587462387161485, + "loss": 2.9624, + "theoretical_loss": 3.6649087248951844, + "tokens_seen": 956025856 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035873620862587766, + "loss": 2.9688, + "theoretical_loss": 3.6648846179336347, + "tokens_seen": 956091392 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035872617853560684, + "loss": 2.9246, + "theoretical_loss": 3.6648605130871026, + "tokens_seen": 956156928 + }, + { + "epoch": 11.02, + "learning_rate": 0.000358716148445336, + "loss": 2.9453, + "theoretical_loss": 3.664836410355257, + "tokens_seen": 956222464 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003587061183550652, + "loss": 2.9374, + "theoretical_loss": 3.664812309737768, + "tokens_seen": 956288000 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003586960882647944, + "loss": 2.938, + "theoretical_loss": 3.6647882112343044, + "tokens_seen": 956353536 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003586860581745236, + "loss": 2.9953, + "theoretical_loss": 3.6647641148445365, + "tokens_seen": 956419072 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035867602808425274, + "loss": 2.8714, + "theoretical_loss": 3.6647400205681344, + "tokens_seen": 956484608 + }, + { + "epoch": 11.02, + "learning_rate": 0.000358665997993982, + "loss": 2.954, + "theoretical_loss": 3.6647159284047675, + "tokens_seen": 956550144 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035865596790371116, + "loss": 2.9761, + "theoretical_loss": 3.6646918383541056, + "tokens_seen": 956615680 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035864593781344034, + "loss": 2.9411, + "theoretical_loss": 3.66466775041582, + "tokens_seen": 956681216 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.945890426635742, + "objective/train/theoretical_loss": 3.664643664589579, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.664643664589579, + "tokens_seen": 956746752 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003586359077231695, + "loss": 2.873, + "theoretical_loss": 3.664643664589579, + "tokens_seen": 956746752 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003586258776328987, + "loss": 2.9616, + "theoretical_loss": 3.664619580875053, + "tokens_seen": 956812288 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003586158475426279, + "loss": 2.9534, + "theoretical_loss": 3.664595499271914, + "tokens_seen": 956877824 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003586058174523571, + "loss": 2.9309, + "theoretical_loss": 3.66457141977983, + "tokens_seen": 956943360 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035859578736208625, + "loss": 2.9779, + "theoretical_loss": 3.664547342398473, + "tokens_seen": 957008896 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003585857572718155, + "loss": 3.0553, + "theoretical_loss": 3.6645232671275125, + "tokens_seen": 957074432 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003585757271815446, + "loss": 2.9174, + "theoretical_loss": 3.66449919396662, + "tokens_seen": 957139968 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035856569709127384, + "loss": 2.9602, + "theoretical_loss": 3.6644751229154653, + "tokens_seen": 957205504 + }, + { + "epoch": 11.02, + "learning_rate": 0.000358555667001003, + "loss": 2.8562, + "theoretical_loss": 3.6644510539737194, + "tokens_seen": 957271040 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003585456369107322, + "loss": 2.9708, + "theoretical_loss": 3.6644269871410526, + "tokens_seen": 957336576 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003585356068204614, + "loss": 2.965, + "theoretical_loss": 3.6644029224171364, + "tokens_seen": 957402112 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035852557673019057, + "loss": 3.0275, + "theoretical_loss": 3.6643788598016416, + "tokens_seen": 957467648 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035851554663991975, + "loss": 2.9317, + "theoretical_loss": 3.664354799294239, + "tokens_seen": 957533184 + }, + { + "epoch": 11.02, + "learning_rate": 0.000358505516549649, + "loss": 2.8704, + "theoretical_loss": 3.6643307408945995, + "tokens_seen": 957598720 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003584954864593781, + "loss": 2.9446, + "theoretical_loss": 3.664306684602394, + "tokens_seen": 957664256 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035848545636910735, + "loss": 2.9846, + "theoretical_loss": 3.6642826304172944, + "tokens_seen": 957729792 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035847542627883653, + "loss": 3.0198, + "theoretical_loss": 3.6642585783389716, + "tokens_seen": 957795328 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003584653961885657, + "loss": 2.8966, + "theoretical_loss": 3.664234528367097, + "tokens_seen": 957860864 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003584553660982949, + "loss": 2.9184, + "theoretical_loss": 3.6642104805013425, + "tokens_seen": 957926400 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035844533600802407, + "loss": 2.8237, + "theoretical_loss": 3.664186434741378, + "tokens_seen": 957991936 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035843530591775325, + "loss": 2.9249, + "theoretical_loss": 3.664162391086877, + "tokens_seen": 958057472 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003584252758274825, + "loss": 2.9324, + "theoretical_loss": 3.6641383495375095, + "tokens_seen": 958123008 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003584152457372116, + "loss": 2.8577, + "theoretical_loss": 3.6641143100929487, + "tokens_seen": 958188544 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035840521564694085, + "loss": 2.9403, + "theoretical_loss": 3.6640902727528655, + "tokens_seen": 958254080 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035839518555667, + "loss": 2.9848, + "theoretical_loss": 3.664066237516932, + "tokens_seen": 958319616 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.93461012840271, + "objective/train/theoretical_loss": 3.66404220438482, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.66404220438482, + "tokens_seen": 958385152 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003583851554663992, + "loss": 2.8774, + "theoretical_loss": 3.66404220438482, + "tokens_seen": 958385152 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003583751253761284, + "loss": 2.9208, + "theoretical_loss": 3.6640181733562014, + "tokens_seen": 958450688 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003583650952858576, + "loss": 2.9114, + "theoretical_loss": 3.663994144430749, + "tokens_seen": 958516224 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035835506519558676, + "loss": 2.8791, + "theoretical_loss": 3.663970117608134, + "tokens_seen": 958581760 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035834503510531594, + "loss": 2.8844, + "theoretical_loss": 3.663946092888029, + "tokens_seen": 958647296 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003583350050150451, + "loss": 2.8868, + "theoretical_loss": 3.6639220702701065, + "tokens_seen": 958712832 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035832497492477435, + "loss": 2.9647, + "theoretical_loss": 3.6638980497540383, + "tokens_seen": 958778368 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003583149448345035, + "loss": 2.911, + "theoretical_loss": 3.663874031339498, + "tokens_seen": 958843904 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003583049147442327, + "loss": 2.858, + "theoretical_loss": 3.663850015026157, + "tokens_seen": 958909440 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003582948846539619, + "loss": 2.8557, + "theoretical_loss": 3.663826000813688, + "tokens_seen": 958974976 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003582848545636911, + "loss": 2.9981, + "theoretical_loss": 3.6638019887017643, + "tokens_seen": 959040512 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035827482447342026, + "loss": 2.9536, + "theoretical_loss": 3.6637779786900584, + "tokens_seen": 959106048 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035826479438314944, + "loss": 2.9288, + "theoretical_loss": 3.6637539707782425, + "tokens_seen": 959171584 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003582547642928786, + "loss": 2.8867, + "theoretical_loss": 3.6637299649659907, + "tokens_seen": 959237120 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035824473420260786, + "loss": 3.0086, + "theoretical_loss": 3.663705961252975, + "tokens_seen": 959302656 + }, + { + "epoch": 11.02, + "learning_rate": 0.000358234704112337, + "loss": 2.9861, + "theoretical_loss": 3.6636819596388683, + "tokens_seen": 959368192 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003582246740220662, + "loss": 2.9122, + "theoretical_loss": 3.6636579601233445, + "tokens_seen": 959433728 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035821464393179535, + "loss": 2.8581, + "theoretical_loss": 3.6636339627060766, + "tokens_seen": 959499264 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003582046138415246, + "loss": 2.8656, + "theoretical_loss": 3.663609967386737, + "tokens_seen": 959564800 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035819458375125376, + "loss": 2.9669, + "theoretical_loss": 3.663585974165, + "tokens_seen": 959630336 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035818455366098294, + "loss": 2.9548, + "theoretical_loss": 3.663561983040539, + "tokens_seen": 959695872 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003581745235707121, + "loss": 2.9485, + "theoretical_loss": 3.6635379940130264, + "tokens_seen": 959761408 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035816449348044136, + "loss": 2.9146, + "theoretical_loss": 3.6635140070821373, + "tokens_seen": 959826944 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003581544633901705, + "loss": 3.023, + "theoretical_loss": 3.6634900222475437, + "tokens_seen": 959892480 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003581444332998997, + "loss": 2.8633, + "theoretical_loss": 3.663466039508921, + "tokens_seen": 959958016 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.795987844467163, + "objective/train/theoretical_loss": 3.6634420588659413, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.6634420588659413, + "tokens_seen": 960023552 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035813440320962885, + "loss": 2.8535, + "theoretical_loss": 3.6634420588659413, + "tokens_seen": 960023552 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003581243731193581, + "loss": 2.9897, + "theoretical_loss": 3.6634180803182796, + "tokens_seen": 960089088 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035811434302908727, + "loss": 2.9071, + "theoretical_loss": 3.6633941038656093, + "tokens_seen": 960154624 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035810431293881645, + "loss": 2.9302, + "theoretical_loss": 3.663370129507604, + "tokens_seen": 960220160 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003580942828485457, + "loss": 2.9382, + "theoretical_loss": 3.6633461572439394, + "tokens_seen": 960285696 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003580842527582748, + "loss": 2.8643, + "theoretical_loss": 3.6633221870742876, + "tokens_seen": 960351232 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035807422266800404, + "loss": 2.9743, + "theoretical_loss": 3.663298218998324, + "tokens_seen": 960416768 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003580641925777332, + "loss": 3.0981, + "theoretical_loss": 3.6632742530157225, + "tokens_seen": 960482304 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003580541624874624, + "loss": 2.9321, + "theoretical_loss": 3.6632502891261574, + "tokens_seen": 960547840 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003580441323971916, + "loss": 2.9466, + "theoretical_loss": 3.663226327329304, + "tokens_seen": 960613376 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035803410230692077, + "loss": 2.8674, + "theoretical_loss": 3.6632023676248355, + "tokens_seen": 960678912 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035802407221664995, + "loss": 2.8792, + "theoretical_loss": 3.663178410012427, + "tokens_seen": 960744448 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003580140421263792, + "loss": 2.9074, + "theoretical_loss": 3.663154454491753, + "tokens_seen": 960809984 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003580040120361083, + "loss": 2.9372, + "theoretical_loss": 3.6631305010624886, + "tokens_seen": 960875520 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035799398194583755, + "loss": 2.9483, + "theoretical_loss": 3.663106549724308, + "tokens_seen": 960941056 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035798395185556673, + "loss": 2.8476, + "theoretical_loss": 3.663082600476887, + "tokens_seen": 961006592 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003579739217652959, + "loss": 2.8618, + "theoretical_loss": 3.6630586533198994, + "tokens_seen": 961072128 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003579638916750251, + "loss": 2.8766, + "theoretical_loss": 3.6630347082530212, + "tokens_seen": 961137664 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035795386158475427, + "loss": 2.8531, + "theoretical_loss": 3.663010765275927, + "tokens_seen": 961203200 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035794383149448345, + "loss": 2.9879, + "theoretical_loss": 3.6629868243882915, + "tokens_seen": 961268736 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003579338014042127, + "loss": 2.9283, + "theoretical_loss": 3.662962885589791, + "tokens_seen": 961334272 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003579237713139418, + "loss": 2.9952, + "theoretical_loss": 3.6629389488800994, + "tokens_seen": 961399808 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035791374122367105, + "loss": 2.8508, + "theoretical_loss": 3.6629150142588935, + "tokens_seen": 961465344 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003579037111334002, + "loss": 2.8771, + "theoretical_loss": 3.6628910817258484, + "tokens_seen": 961530880 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003578936810431294, + "loss": 2.8457, + "theoretical_loss": 3.662867151280639, + "tokens_seen": 961596416 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8634836673736572, + "objective/train/theoretical_loss": 3.662843222922941, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.662843222922941, + "tokens_seen": 961661952 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003578836509528586, + "loss": 2.9436, + "theoretical_loss": 3.662843222922941, + "tokens_seen": 961661952 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003578736208625878, + "loss": 2.9581, + "theoretical_loss": 3.6628192966524304, + "tokens_seen": 961727488 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035786359077231696, + "loss": 2.9574, + "theoretical_loss": 3.662795372468783, + "tokens_seen": 961793024 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035785356068204614, + "loss": 2.9855, + "theoretical_loss": 3.6627714503716744, + "tokens_seen": 961858560 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003578435305917753, + "loss": 2.9499, + "theoretical_loss": 3.66274753036078, + "tokens_seen": 961924096 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035783350050150455, + "loss": 2.9389, + "theoretical_loss": 3.6627236124357765, + "tokens_seen": 961989632 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003578234704112337, + "loss": 2.9503, + "theoretical_loss": 3.66269969659634, + "tokens_seen": 962055168 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003578134403209629, + "loss": 2.9499, + "theoretical_loss": 3.662675782842146, + "tokens_seen": 962120704 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003578034102306921, + "loss": 2.9202, + "theoretical_loss": 3.662651871172871, + "tokens_seen": 962186240 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003577933801404213, + "loss": 2.9148, + "theoretical_loss": 3.662627961588191, + "tokens_seen": 962251776 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035778335005015046, + "loss": 2.8624, + "theoretical_loss": 3.6626040540877822, + "tokens_seen": 962317312 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035777331995987964, + "loss": 2.9525, + "theoretical_loss": 3.6625801486713216, + "tokens_seen": 962382848 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003577632898696088, + "loss": 2.8828, + "theoretical_loss": 3.662556245338485, + "tokens_seen": 962448384 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035775325977933806, + "loss": 3.0756, + "theoretical_loss": 3.66253234408895, + "tokens_seen": 962513920 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003577432296890672, + "loss": 2.9629, + "theoretical_loss": 3.6625084449223917, + "tokens_seen": 962579456 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003577331995987964, + "loss": 2.9649, + "theoretical_loss": 3.6624845478384875, + "tokens_seen": 962644992 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035772316950852555, + "loss": 2.9303, + "theoretical_loss": 3.6624606528369146, + "tokens_seen": 962710528 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003577131394182548, + "loss": 2.9305, + "theoretical_loss": 3.662436759917349, + "tokens_seen": 962776064 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035770310932798396, + "loss": 2.9045, + "theoretical_loss": 3.6624128690794677, + "tokens_seen": 962841600 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035769307923771314, + "loss": 2.9438, + "theoretical_loss": 3.662388980322948, + "tokens_seen": 962907136 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003576830491474423, + "loss": 2.9975, + "theoretical_loss": 3.6623650936474674, + "tokens_seen": 962972672 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035767301905717156, + "loss": 2.7787, + "theoretical_loss": 3.662341209052702, + "tokens_seen": 963038208 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003576629889669007, + "loss": 2.9404, + "theoretical_loss": 3.6623173265383295, + "tokens_seen": 963103744 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003576529588766299, + "loss": 2.8316, + "theoretical_loss": 3.6622934461040275, + "tokens_seen": 963169280 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035764292878635905, + "loss": 2.8441, + "theoretical_loss": 3.6622695677494725, + "tokens_seen": 963234816 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8630871772766113, + "objective/train/theoretical_loss": 3.6622456914743418, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.6622456914743418, + "tokens_seen": 963300352 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003576328986960883, + "loss": 2.822, + "theoretical_loss": 3.6622456914743418, + "tokens_seen": 963300352 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035762286860581747, + "loss": 2.8932, + "theoretical_loss": 3.662221817278314, + "tokens_seen": 963365888 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035761283851554665, + "loss": 2.8642, + "theoretical_loss": 3.662197945161066, + "tokens_seen": 963431424 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035760280842527583, + "loss": 2.9813, + "theoretical_loss": 3.662174075122275, + "tokens_seen": 963496960 + }, + { + "epoch": 11.02, + "learning_rate": 0.000357592778335005, + "loss": 2.8444, + "theoretical_loss": 3.662150207161619, + "tokens_seen": 963562496 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003575827482447342, + "loss": 2.8958, + "theoretical_loss": 3.6621263412787757, + "tokens_seen": 963628032 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003575727181544634, + "loss": 2.8819, + "theoretical_loss": 3.6621024774734234, + "tokens_seen": 963693568 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035756268806419255, + "loss": 2.8084, + "theoretical_loss": 3.66207861574524, + "tokens_seen": 963759104 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003575526579739218, + "loss": 2.8463, + "theoretical_loss": 3.6620547560939025, + "tokens_seen": 963824640 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003575426278836509, + "loss": 3.0105, + "theoretical_loss": 3.66203089851909, + "tokens_seen": 963890176 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035753259779338015, + "loss": 2.9779, + "theoretical_loss": 3.6620070430204796, + "tokens_seen": 963955712 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035752256770310933, + "loss": 2.9119, + "theoretical_loss": 3.661983189597751, + "tokens_seen": 964021248 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003575125376128385, + "loss": 2.9724, + "theoretical_loss": 3.661959338250581, + "tokens_seen": 964086784 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003575025075225677, + "loss": 2.9455, + "theoretical_loss": 3.661935488978648, + "tokens_seen": 964152320 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035749247743229693, + "loss": 2.9183, + "theoretical_loss": 3.6619116417816313, + "tokens_seen": 964217856 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035748244734202606, + "loss": 2.9041, + "theoretical_loss": 3.6618877966592085, + "tokens_seen": 964283392 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003574724172517553, + "loss": 2.8839, + "theoretical_loss": 3.661863953611059, + "tokens_seen": 964348928 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003574623871614844, + "loss": 2.971, + "theoretical_loss": 3.6618401126368605, + "tokens_seen": 964414464 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035745235707121365, + "loss": 2.9274, + "theoretical_loss": 3.6618162737362923, + "tokens_seen": 964480000 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035744232698094283, + "loss": 2.9045, + "theoretical_loss": 3.661792436909033, + "tokens_seen": 964545536 + }, + { + "epoch": 11.02, + "learning_rate": 0.000357432296890672, + "loss": 2.9514, + "theoretical_loss": 3.6617686021547615, + "tokens_seen": 964611072 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003574222668004012, + "loss": 2.7715, + "theoretical_loss": 3.661744769473157, + "tokens_seen": 964676608 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003574122367101304, + "loss": 2.9367, + "theoretical_loss": 3.6617209388638976, + "tokens_seen": 964742144 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035740220661985956, + "loss": 2.8856, + "theoretical_loss": 3.6616971103266627, + "tokens_seen": 964807680 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003573921765295888, + "loss": 2.9517, + "theoretical_loss": 3.6616732838611314, + "tokens_seen": 964873216 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8777992725372314, + "objective/train/theoretical_loss": 3.6616494594669837, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.6616494594669837, + "tokens_seen": 964938752 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003573821464393179, + "loss": 2.8081, + "theoretical_loss": 3.6616494594669837, + "tokens_seen": 964938752 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035737211634904716, + "loss": 2.8893, + "theoretical_loss": 3.661625637143898, + "tokens_seen": 965004288 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003573620862587763, + "loss": 3.0087, + "theoretical_loss": 3.6616018168915536, + "tokens_seen": 965069824 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003573520561685055, + "loss": 3.0222, + "theoretical_loss": 3.66157799870963, + "tokens_seen": 965135360 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035734202607823475, + "loss": 2.8779, + "theoretical_loss": 3.6615541825978077, + "tokens_seen": 965200896 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003573319959879639, + "loss": 2.923, + "theoretical_loss": 3.6615303685557645, + "tokens_seen": 965266432 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003573219658976931, + "loss": 3.0033, + "theoretical_loss": 3.661506556583181, + "tokens_seen": 965331968 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003573119358074223, + "loss": 2.9845, + "theoretical_loss": 3.6614827466797366, + "tokens_seen": 965397504 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003573019057171515, + "loss": 2.7361, + "theoretical_loss": 3.6614589388451115, + "tokens_seen": 965463040 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035729187562688066, + "loss": 2.7728, + "theoretical_loss": 3.661435133078985, + "tokens_seen": 965528576 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035728184553660984, + "loss": 3.0361, + "theoretical_loss": 3.6614113293810373, + "tokens_seen": 965594112 + }, + { + "epoch": 11.02, + "learning_rate": 0.000357271815446339, + "loss": 2.8367, + "theoretical_loss": 3.6613875277509482, + "tokens_seen": 965659648 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035726178535606826, + "loss": 2.9835, + "theoretical_loss": 3.661363728188398, + "tokens_seen": 965725184 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003572517552657974, + "loss": 2.9048, + "theoretical_loss": 3.6613399306930665, + "tokens_seen": 965790720 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003572417251755266, + "loss": 2.827, + "theoretical_loss": 3.6613161352646344, + "tokens_seen": 965856256 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035723169508525575, + "loss": 2.9447, + "theoretical_loss": 3.661292341902781, + "tokens_seen": 965921792 + }, + { + "epoch": 11.02, + "learning_rate": 0.000357221664994985, + "loss": 2.9654, + "theoretical_loss": 3.6612685506071876, + "tokens_seen": 965987328 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035721163490471416, + "loss": 2.9343, + "theoretical_loss": 3.6612447613775343, + "tokens_seen": 966052864 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035720160481444334, + "loss": 2.9669, + "theoretical_loss": 3.661220974213501, + "tokens_seen": 966118400 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003571915747241725, + "loss": 2.9255, + "theoretical_loss": 3.6611971891147697, + "tokens_seen": 966183936 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035718154463390176, + "loss": 2.9832, + "theoretical_loss": 3.6611734060810193, + "tokens_seen": 966249472 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003571715145436309, + "loss": 3.0156, + "theoretical_loss": 3.661149625111931, + "tokens_seen": 966315008 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003571614844533601, + "loss": 2.9583, + "theoretical_loss": 3.6611258462071863, + "tokens_seen": 966380544 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035715145436308925, + "loss": 2.9224, + "theoretical_loss": 3.661102069366465, + "tokens_seen": 966446080 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003571414242728185, + "loss": 2.9462, + "theoretical_loss": 3.6610782945894487, + "tokens_seen": 966511616 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.957657814025879, + "objective/train/theoretical_loss": 3.661054521875818, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.661054521875818, + "tokens_seen": 966577152 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035713139418254767, + "loss": 2.9201, + "theoretical_loss": 3.661054521875818, + "tokens_seen": 966577152 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035712136409227685, + "loss": 2.8928, + "theoretical_loss": 3.6610307512252547, + "tokens_seen": 966642688 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035711133400200603, + "loss": 2.9704, + "theoretical_loss": 3.661006982637439, + "tokens_seen": 966708224 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003571013039117352, + "loss": 2.9578, + "theoretical_loss": 3.660983216112052, + "tokens_seen": 966773760 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003570912738214644, + "loss": 2.9847, + "theoretical_loss": 3.6609594516487762, + "tokens_seen": 966839296 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003570812437311936, + "loss": 2.9776, + "theoretical_loss": 3.6609356892472915, + "tokens_seen": 966904832 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035707121364092275, + "loss": 2.9873, + "theoretical_loss": 3.6609119289072796, + "tokens_seen": 966970368 + }, + { + "epoch": 11.02, + "learning_rate": 0.000357061183550652, + "loss": 2.9658, + "theoretical_loss": 3.6608881706284224, + "tokens_seen": 967035904 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003570511534603811, + "loss": 2.9612, + "theoretical_loss": 3.6608644144104012, + "tokens_seen": 967101440 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035704112337011035, + "loss": 2.9758, + "theoretical_loss": 3.660840660252898, + "tokens_seen": 967166976 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035703109327983953, + "loss": 2.9178, + "theoretical_loss": 3.660816908155594, + "tokens_seen": 967232512 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003570210631895687, + "loss": 3.0033, + "theoretical_loss": 3.6607931581181705, + "tokens_seen": 967298048 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003570110330992979, + "loss": 2.8556, + "theoretical_loss": 3.6607694101403103, + "tokens_seen": 967363584 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035700100300902713, + "loss": 2.942, + "theoretical_loss": 3.660745664221695, + "tokens_seen": 967429120 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035699097291875626, + "loss": 2.9429, + "theoretical_loss": 3.6607219203620063, + "tokens_seen": 967494656 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003569809428284855, + "loss": 2.9123, + "theoretical_loss": 3.660698178560926, + "tokens_seen": 967560192 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003569709127382146, + "loss": 2.9451, + "theoretical_loss": 3.660674438818137, + "tokens_seen": 967625728 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035696088264794385, + "loss": 2.9523, + "theoretical_loss": 3.6606507011333216, + "tokens_seen": 967691264 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035695085255767303, + "loss": 2.9734, + "theoretical_loss": 3.6606269655061605, + "tokens_seen": 967756800 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003569408224674022, + "loss": 2.9339, + "theoretical_loss": 3.6606032319363373, + "tokens_seen": 967822336 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003569307923771314, + "loss": 2.9298, + "theoretical_loss": 3.660579500423534, + "tokens_seen": 967887872 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003569207622868606, + "loss": 3.0065, + "theoretical_loss": 3.660555770967433, + "tokens_seen": 967953408 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035691073219658976, + "loss": 2.8768, + "theoretical_loss": 3.6605320435677173, + "tokens_seen": 968018944 + }, + { + "epoch": 11.02, + "learning_rate": 0.000356900702106319, + "loss": 2.9502, + "theoretical_loss": 3.6605083182240685, + "tokens_seen": 968084480 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003568906720160481, + "loss": 2.9472, + "theoretical_loss": 3.660484594936171, + "tokens_seen": 968150016 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.906515121459961, + "objective/train/theoretical_loss": 3.6604608737037054, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.6604608737037054, + "tokens_seen": 968215552 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035688064192577736, + "loss": 2.9122, + "theoretical_loss": 3.6604608737037054, + "tokens_seen": 968215552 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003568706118355065, + "loss": 2.9342, + "theoretical_loss": 3.6604371545263557, + "tokens_seen": 968281088 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003568605817452357, + "loss": 2.9955, + "theoretical_loss": 3.660413437403805, + "tokens_seen": 968346624 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003568505516549649, + "loss": 2.9364, + "theoretical_loss": 3.6603897223357356, + "tokens_seen": 968412160 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003568405215646941, + "loss": 2.9429, + "theoretical_loss": 3.66036600932183, + "tokens_seen": 968477696 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035683049147442326, + "loss": 3.0015, + "theoretical_loss": 3.6603422983617735, + "tokens_seen": 968543232 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003568204613841525, + "loss": 2.9807, + "theoretical_loss": 3.6603185894552466, + "tokens_seen": 968608768 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003568104312938816, + "loss": 2.8037, + "theoretical_loss": 3.660294882601934, + "tokens_seen": 968674304 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035680040120361086, + "loss": 2.8851, + "theoretical_loss": 3.6602711778015196, + "tokens_seen": 968739840 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035679037111334, + "loss": 2.8628, + "theoretical_loss": 3.6602474750536853, + "tokens_seen": 968805376 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003567803410230692, + "loss": 2.9284, + "theoretical_loss": 3.6602237743581147, + "tokens_seen": 968870912 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003567703109327984, + "loss": 2.8289, + "theoretical_loss": 3.660200075714492, + "tokens_seen": 968936448 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003567602808425276, + "loss": 2.9382, + "theoretical_loss": 3.6601763791225013, + "tokens_seen": 969001984 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035675025075225677, + "loss": 2.9486, + "theoretical_loss": 3.660152684581824, + "tokens_seen": 969067520 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035674022066198595, + "loss": 2.9872, + "theoretical_loss": 3.6601289920921465, + "tokens_seen": 969133056 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035673019057171513, + "loss": 2.9293, + "theoretical_loss": 3.660105301653151, + "tokens_seen": 969198592 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035672016048144436, + "loss": 3.0264, + "theoretical_loss": 3.6600816132645217, + "tokens_seen": 969264128 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003567101303911735, + "loss": 2.8262, + "theoretical_loss": 3.6600579269259423, + "tokens_seen": 969329664 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003567001003009027, + "loss": 2.9826, + "theoretical_loss": 3.6600342426370975, + "tokens_seen": 969395200 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035669007021063185, + "loss": 2.881, + "theoretical_loss": 3.660010560397671, + "tokens_seen": 969460736 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003566800401203611, + "loss": 2.979, + "theoretical_loss": 3.6599868802073465, + "tokens_seen": 969526272 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035667001003009027, + "loss": 2.8982, + "theoretical_loss": 3.6599632020658084, + "tokens_seen": 969591808 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035665997993981945, + "loss": 2.8501, + "theoretical_loss": 3.6599395259727414, + "tokens_seen": 969657344 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035664994984954863, + "loss": 2.9767, + "theoretical_loss": 3.65991585192783, + "tokens_seen": 969722880 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035663991975927787, + "loss": 2.8614, + "theoretical_loss": 3.6598921799307575, + "tokens_seen": 969788416 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9397683143615723, + "objective/train/theoretical_loss": 3.6598685099812096, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.6598685099812096, + "tokens_seen": 969853952 + }, + { + "epoch": 11.02, + "learning_rate": 0.000356629889669007, + "loss": 2.9154, + "theoretical_loss": 3.6598685099812096, + "tokens_seen": 969853952 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035661985957873623, + "loss": 2.9157, + "theoretical_loss": 3.6598448420788703, + "tokens_seen": 969919488 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035660982948846536, + "loss": 2.9056, + "theoretical_loss": 3.6598211762234243, + "tokens_seen": 969985024 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003565997993981946, + "loss": 2.9186, + "theoretical_loss": 3.6597975124145563, + "tokens_seen": 970050560 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003565897693079238, + "loss": 3.0406, + "theoretical_loss": 3.659773850651951, + "tokens_seen": 970116096 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035657973921765295, + "loss": 3.0097, + "theoretical_loss": 3.659750190935293, + "tokens_seen": 970181632 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003565697091273822, + "loss": 2.891, + "theoretical_loss": 3.659726533264268, + "tokens_seen": 970247168 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003565596790371113, + "loss": 2.9331, + "theoretical_loss": 3.6597028776385603, + "tokens_seen": 970312704 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035654964894684055, + "loss": 2.9034, + "theoretical_loss": 3.659679224057855, + "tokens_seen": 970378240 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035653961885656973, + "loss": 2.9666, + "theoretical_loss": 3.6596555725218374, + "tokens_seen": 970443776 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003565295887662989, + "loss": 2.9489, + "theoretical_loss": 3.659631923030193, + "tokens_seen": 970509312 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003565195586760281, + "loss": 2.9672, + "theoretical_loss": 3.659608275582606, + "tokens_seen": 970574848 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035650952858575733, + "loss": 2.9717, + "theoretical_loss": 3.659584630178763, + "tokens_seen": 970640384 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035649949849548646, + "loss": 2.9192, + "theoretical_loss": 3.6595609868183487, + "tokens_seen": 970705920 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003564894684052157, + "loss": 2.8987, + "theoretical_loss": 3.6595373455010485, + "tokens_seen": 970771456 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003564794383149448, + "loss": 2.8622, + "theoretical_loss": 3.659513706226548, + "tokens_seen": 970836992 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035646940822467405, + "loss": 2.9636, + "theoretical_loss": 3.6594900689945336, + "tokens_seen": 970902528 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035645937813440323, + "loss": 2.8755, + "theoretical_loss": 3.6594664338046896, + "tokens_seen": 970968064 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003564493480441324, + "loss": 2.8887, + "theoretical_loss": 3.659442800656703, + "tokens_seen": 971033600 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003564393179538616, + "loss": 2.9254, + "theoretical_loss": 3.6594191695502585, + "tokens_seen": 971099136 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003564292878635908, + "loss": 2.8573, + "theoretical_loss": 3.659395540485043, + "tokens_seen": 971164672 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035641925777331996, + "loss": 2.9659, + "theoretical_loss": 3.659371913460742, + "tokens_seen": 971230208 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003564092276830492, + "loss": 2.9873, + "theoretical_loss": 3.659348288477041, + "tokens_seen": 971295744 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003563991975927783, + "loss": 2.8859, + "theoretical_loss": 3.6593246655336267, + "tokens_seen": 971361280 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035638916750250756, + "loss": 2.8808, + "theoretical_loss": 3.659301044630185, + "tokens_seen": 971426816 + }, + { + "epoch": 11.02, + "objective/train/docs_used": 2269056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0157132148742676, + "objective/train/theoretical_loss": 3.659277425766403, + "objective/train/tokens_used": 971923936, + "theoretical_loss": 3.659277425766403, + "tokens_seen": 971492352 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003563791374122367, + "loss": 2.8766, + "theoretical_loss": 3.659277425766403, + "tokens_seen": 971492352 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003563691073219659, + "loss": 3.0334, + "theoretical_loss": 3.6592538089419664, + "tokens_seen": 971557888 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003563590772316951, + "loss": 3.0517, + "theoretical_loss": 3.6592301941565606, + "tokens_seen": 971623424 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003563490471414243, + "loss": 3.0328, + "theoretical_loss": 3.6592065814098733, + "tokens_seen": 971688960 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035633901705115346, + "loss": 2.9312, + "theoretical_loss": 3.6591829707015906, + "tokens_seen": 971754496 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003563289869608827, + "loss": 2.9263, + "theoretical_loss": 3.6591593620313994, + "tokens_seen": 971820032 + }, + { + "epoch": 11.02, + "learning_rate": 0.0003563189568706118, + "loss": 2.8566, + "theoretical_loss": 3.659135755398986, + "tokens_seen": 971885568 + }, + { + "epoch": 11.02, + "learning_rate": 0.00035630892678034106, + "loss": 2.9748, + "theoretical_loss": 3.659114732457382, + "tokens_seen": 971943936 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003562988966900702, + "loss": 2.8215, + "theoretical_loss": 3.6590911296767863, + "tokens_seen": 972009472 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003562888665997994, + "loss": 2.8797, + "theoretical_loss": 3.659067528933063, + "tokens_seen": 972075008 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003562788365095286, + "loss": 2.8167, + "theoretical_loss": 3.6590439302258986, + "tokens_seen": 972140544 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003562688064192578, + "loss": 2.7863, + "theoretical_loss": 3.659020333554981, + "tokens_seen": 972206080 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035625877632898697, + "loss": 2.8592, + "theoretical_loss": 3.6589967389199964, + "tokens_seen": 972271616 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035624874623871615, + "loss": 2.7912, + "theoretical_loss": 3.658973146320633, + "tokens_seen": 972337152 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035623871614844533, + "loss": 2.8157, + "theoretical_loss": 3.658949555756577, + "tokens_seen": 972402688 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035622868605817456, + "loss": 2.8909, + "theoretical_loss": 3.658925967227516, + "tokens_seen": 972468224 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003562186559679037, + "loss": 2.8138, + "theoretical_loss": 3.658902380733137, + "tokens_seen": 972533760 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003562086258776329, + "loss": 2.9765, + "theoretical_loss": 3.658878796273128, + "tokens_seen": 972599296 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035619859578736205, + "loss": 2.9141, + "theoretical_loss": 3.658855213847176, + "tokens_seen": 972664832 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003561885656970913, + "loss": 2.8883, + "theoretical_loss": 3.658831633454969, + "tokens_seen": 972730368 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035617853560682047, + "loss": 2.819, + "theoretical_loss": 3.658808055096194, + "tokens_seen": 972795904 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035616850551654965, + "loss": 2.9237, + "theoretical_loss": 3.6587844787705395, + "tokens_seen": 972861440 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035615847542627883, + "loss": 2.7637, + "theoretical_loss": 3.6587609044776923, + "tokens_seen": 972926976 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035614844533600807, + "loss": 2.8616, + "theoretical_loss": 3.6587373322173407, + "tokens_seen": 972992512 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003561384152457372, + "loss": 2.9031, + "theoretical_loss": 3.6587137619891728, + "tokens_seen": 973058048 + }, + { + "epoch": 12.0, + "objective/train/docs_used": 2319500, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.92916202545166, + "objective/train/theoretical_loss": 3.658690193792876, + "objective/train/tokens_used": 993583584, + "theoretical_loss": 3.658690193792876, + "tokens_seen": 973123584 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035612838515546643, + "loss": 2.9719, + "theoretical_loss": 3.658690193792876, + "tokens_seen": 973123584 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035611835506519556, + "loss": 2.8488, + "theoretical_loss": 3.658666627628139, + "tokens_seen": 973189120 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003561083249749248, + "loss": 2.881, + "theoretical_loss": 3.658643063494649, + "tokens_seen": 973254656 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035609829488465397, + "loss": 2.8777, + "theoretical_loss": 3.6586195013920952, + "tokens_seen": 973320192 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035608826479438315, + "loss": 2.8905, + "theoretical_loss": 3.658595941320165, + "tokens_seen": 973385728 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035607823470411233, + "loss": 2.8919, + "theoretical_loss": 3.658572383278547, + "tokens_seen": 973451264 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003560682046138415, + "loss": 2.8729, + "theoretical_loss": 3.65854882726693, + "tokens_seen": 973516800 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003560581745235707, + "loss": 2.8486, + "theoretical_loss": 3.6585252732850018, + "tokens_seen": 973582336 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035604814443329993, + "loss": 2.8253, + "theoretical_loss": 3.658501721332451, + "tokens_seen": 973647872 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035603811434302906, + "loss": 2.7505, + "theoretical_loss": 3.658478171408966, + "tokens_seen": 973713408 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003560280842527583, + "loss": 2.8037, + "theoretical_loss": 3.658454623514236, + "tokens_seen": 973778944 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003560180541624874, + "loss": 2.921, + "theoretical_loss": 3.6584310776479496, + "tokens_seen": 973844480 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035600802407221666, + "loss": 2.9045, + "theoretical_loss": 3.658407533809795, + "tokens_seen": 973910016 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035599799398194584, + "loss": 2.8461, + "theoretical_loss": 3.658383991999462, + "tokens_seen": 973975552 + }, + { + "epoch": 12.0, + "learning_rate": 0.000355987963891675, + "loss": 2.8667, + "theoretical_loss": 3.6583604522166384, + "tokens_seen": 974041088 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003559779338014042, + "loss": 2.936, + "theoretical_loss": 3.658336914461014, + "tokens_seen": 974106624 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035596790371113344, + "loss": 2.8604, + "theoretical_loss": 3.6583133787322777, + "tokens_seen": 974172160 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035595787362086256, + "loss": 2.979, + "theoretical_loss": 3.6582898450301187, + "tokens_seen": 974237696 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003559478435305918, + "loss": 2.838, + "theoretical_loss": 3.6582663133542255, + "tokens_seen": 974303232 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003559378134403209, + "loss": 2.816, + "theoretical_loss": 3.6582427837042886, + "tokens_seen": 974368768 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035592778335005016, + "loss": 2.8345, + "theoretical_loss": 3.658219256079996, + "tokens_seen": 974434304 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035591775325977934, + "loss": 2.7611, + "theoretical_loss": 3.6581957304810375, + "tokens_seen": 974499840 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003559077231695085, + "loss": 2.8883, + "theoretical_loss": 3.6581722069071034, + "tokens_seen": 974565376 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003558976930792377, + "loss": 2.9279, + "theoretical_loss": 3.6581486853578826, + "tokens_seen": 974630912 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003558876629889669, + "loss": 2.8965, + "theoretical_loss": 3.6581251658330647, + "tokens_seen": 974696448 + }, + { + "epoch": 12.0, + "objective/train/docs_used": 2324465, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.001706600189209, + "objective/train/theoretical_loss": 3.6581016483323388, + "objective/train/tokens_used": 995221984, + "theoretical_loss": 3.6581016483323388, + "tokens_seen": 974761984 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035587763289869607, + "loss": 2.9147, + "theoretical_loss": 3.6581016483323388, + "tokens_seen": 974761984 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003558676028084253, + "loss": 2.8853, + "theoretical_loss": 3.6580781328553957, + "tokens_seen": 974827520 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035585757271815443, + "loss": 2.7712, + "theoretical_loss": 3.6580546194019243, + "tokens_seen": 974893056 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035584754262788366, + "loss": 2.9331, + "theoretical_loss": 3.6580311079716155, + "tokens_seen": 974958592 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003558375125376129, + "loss": 2.8191, + "theoretical_loss": 3.6580075985641582, + "tokens_seen": 975024128 + }, + { + "epoch": 12.0, + "learning_rate": 0.000355827482447342, + "loss": 2.9301, + "theoretical_loss": 3.6579840911792427, + "tokens_seen": 975089664 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035581745235707126, + "loss": 2.856, + "theoretical_loss": 3.65796058581656, + "tokens_seen": 975155200 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003558074222668004, + "loss": 2.7243, + "theoretical_loss": 3.657937082475799, + "tokens_seen": 975220736 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003557973921765296, + "loss": 2.8114, + "theoretical_loss": 3.6579135811566506, + "tokens_seen": 975286272 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003557873620862588, + "loss": 2.9223, + "theoretical_loss": 3.6578900818588043, + "tokens_seen": 975351808 + }, + { + "epoch": 12.0, + "learning_rate": 0.000355777331995988, + "loss": 2.932, + "theoretical_loss": 3.6578665845819516, + "tokens_seen": 975417344 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035576730190571717, + "loss": 2.8917, + "theoretical_loss": 3.657843089325782, + "tokens_seen": 975482880 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035575727181544635, + "loss": 2.9038, + "theoretical_loss": 3.657819596089987, + "tokens_seen": 975548416 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035574724172517553, + "loss": 2.937, + "theoretical_loss": 3.6577961048742558, + "tokens_seen": 975613952 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035573721163490476, + "loss": 2.8131, + "theoretical_loss": 3.65777261567828, + "tokens_seen": 975679488 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003557271815446339, + "loss": 2.8367, + "theoretical_loss": 3.65774912850175, + "tokens_seen": 975745024 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003557171514543631, + "loss": 2.8349, + "theoretical_loss": 3.6577256433443566, + "tokens_seen": 975810560 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035570712136409225, + "loss": 2.8743, + "theoretical_loss": 3.6577021602057904, + "tokens_seen": 975876096 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003556970912738215, + "loss": 2.7422, + "theoretical_loss": 3.657678679085743, + "tokens_seen": 975941632 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035568706118355067, + "loss": 2.8007, + "theoretical_loss": 3.6576551999839046, + "tokens_seen": 976007168 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035567703109327985, + "loss": 2.7891, + "theoretical_loss": 3.657631722899967, + "tokens_seen": 976072704 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035566700100300903, + "loss": 2.8496, + "theoretical_loss": 3.65760824783362, + "tokens_seen": 976138240 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035565697091273827, + "loss": 2.8046, + "theoretical_loss": 3.6575847747845556, + "tokens_seen": 976203776 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003556469408224674, + "loss": 2.8105, + "theoretical_loss": 3.6575613037524657, + "tokens_seen": 976269312 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035563691073219663, + "loss": 2.7838, + "theoretical_loss": 3.657537834737041, + "tokens_seen": 976334848 + }, + { + "epoch": 12.0, + "objective/train/docs_used": 2327583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.841701030731201, + "objective/train/theoretical_loss": 3.657514367737972, + "objective/train/tokens_used": 996860384, + "theoretical_loss": 3.657514367737972, + "tokens_seen": 976400384 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035562688064192576, + "loss": 2.7727, + "theoretical_loss": 3.657514367737972, + "tokens_seen": 976400384 + }, + { + "epoch": 12.0, + "learning_rate": 0.000355616850551655, + "loss": 2.8762, + "theoretical_loss": 3.657490902754952, + "tokens_seen": 976465920 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035560682046138417, + "loss": 2.7938, + "theoretical_loss": 3.6574674397876707, + "tokens_seen": 976531456 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035559679037111335, + "loss": 2.9006, + "theoretical_loss": 3.6574439788358206, + "tokens_seen": 976596992 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035558676028084253, + "loss": 2.8006, + "theoretical_loss": 3.6574205198990932, + "tokens_seen": 976662528 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003555767301905717, + "loss": 2.8725, + "theoretical_loss": 3.6573970629771804, + "tokens_seen": 976728064 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003555667001003009, + "loss": 2.9619, + "theoretical_loss": 3.657373608069774, + "tokens_seen": 976793600 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035555667001003013, + "loss": 2.8384, + "theoretical_loss": 3.657350155176565, + "tokens_seen": 976859136 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035554663991975926, + "loss": 2.7675, + "theoretical_loss": 3.6573267042972457, + "tokens_seen": 976924672 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003555366098294885, + "loss": 2.7939, + "theoretical_loss": 3.6573032554315095, + "tokens_seen": 976990208 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003555265797392176, + "loss": 2.894, + "theoretical_loss": 3.6572798085790463, + "tokens_seen": 977055744 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035551654964894686, + "loss": 2.8552, + "theoretical_loss": 3.65725636373955, + "tokens_seen": 977121280 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035550651955867604, + "loss": 2.7881, + "theoretical_loss": 3.6572329209127115, + "tokens_seen": 977186816 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003554964894684052, + "loss": 2.8622, + "theoretical_loss": 3.6572094800982238, + "tokens_seen": 977252352 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003554864593781344, + "loss": 2.8017, + "theoretical_loss": 3.6571860412957786, + "tokens_seen": 977317888 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035547642928786364, + "loss": 2.8101, + "theoretical_loss": 3.6571626045050696, + "tokens_seen": 977383424 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035546639919759276, + "loss": 2.9136, + "theoretical_loss": 3.657139169725787, + "tokens_seen": 977448960 + }, + { + "epoch": 12.0, + "learning_rate": 0.000355456369107322, + "loss": 2.8116, + "theoretical_loss": 3.6571157369576257, + "tokens_seen": 977514496 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003554463390170511, + "loss": 2.8694, + "theoretical_loss": 3.657092306200277, + "tokens_seen": 977580032 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035543630892678036, + "loss": 2.8187, + "theoretical_loss": 3.657068877453434, + "tokens_seen": 977645568 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035542627883650954, + "loss": 2.8446, + "theoretical_loss": 3.6570454507167893, + "tokens_seen": 977711104 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003554162487462387, + "loss": 2.8106, + "theoretical_loss": 3.6570220259900355, + "tokens_seen": 977776640 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003554062186559679, + "loss": 2.9319, + "theoretical_loss": 3.656998603272865, + "tokens_seen": 977842176 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003553961885656971, + "loss": 2.8119, + "theoretical_loss": 3.656975182564972, + "tokens_seen": 977907712 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035538615847542627, + "loss": 2.8983, + "theoretical_loss": 3.656951763866049, + "tokens_seen": 977973248 + }, + { + "epoch": 12.0, + "objective/train/docs_used": 2331466, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.749829053878784, + "objective/train/theoretical_loss": 3.6569283471757883, + "objective/train/tokens_used": 998498784, + "theoretical_loss": 3.6569283471757883, + "tokens_seen": 978038784 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003553761283851555, + "loss": 2.8259, + "theoretical_loss": 3.6569283471757883, + "tokens_seen": 978038784 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035536609829488463, + "loss": 2.8706, + "theoretical_loss": 3.6569049324938847, + "tokens_seen": 978104320 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035535606820461386, + "loss": 2.8418, + "theoretical_loss": 3.656881519820029, + "tokens_seen": 978169856 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035534603811434304, + "loss": 2.8674, + "theoretical_loss": 3.6568581091539167, + "tokens_seen": 978235392 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003553360080240722, + "loss": 2.7976, + "theoretical_loss": 3.6568347004952404, + "tokens_seen": 978300928 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003553259779338014, + "loss": 2.8573, + "theoretical_loss": 3.6568112938436927, + "tokens_seen": 978366464 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003553159478435306, + "loss": 2.7403, + "theoretical_loss": 3.6567878891989682, + "tokens_seen": 978432000 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035530591775325977, + "loss": 2.816, + "theoretical_loss": 3.6567644865607605, + "tokens_seen": 978497536 + }, + { + "epoch": 12.0, + "learning_rate": 0.000355295887662989, + "loss": 2.8193, + "theoretical_loss": 3.6567410859287617, + "tokens_seen": 978563072 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035528585757271813, + "loss": 2.8133, + "theoretical_loss": 3.6567176873026668, + "tokens_seen": 978628608 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035527582748244737, + "loss": 2.9816, + "theoretical_loss": 3.6566942906821693, + "tokens_seen": 978694144 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003552657973921765, + "loss": 2.8449, + "theoretical_loss": 3.656670896066963, + "tokens_seen": 978759680 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035525576730190573, + "loss": 2.7671, + "theoretical_loss": 3.6566475034567416, + "tokens_seen": 978825216 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003552457372116349, + "loss": 2.83, + "theoretical_loss": 3.656624112851199, + "tokens_seen": 978890752 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003552357071213641, + "loss": 2.7685, + "theoretical_loss": 3.6566007242500294, + "tokens_seen": 978956288 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035522567703109327, + "loss": 2.7761, + "theoretical_loss": 3.656577337652927, + "tokens_seen": 979021824 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035521564694082245, + "loss": 2.9258, + "theoretical_loss": 3.6565539530595856, + "tokens_seen": 979087360 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035520561685055163, + "loss": 2.837, + "theoretical_loss": 3.6565305704697, + "tokens_seen": 979152896 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035519558676028087, + "loss": 2.7504, + "theoretical_loss": 3.656507189882963, + "tokens_seen": 979218432 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035518555667001, + "loss": 2.8384, + "theoretical_loss": 3.656483811299071, + "tokens_seen": 979283968 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035517552657973923, + "loss": 2.8697, + "theoretical_loss": 3.656460434717717, + "tokens_seen": 979349504 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003551654964894684, + "loss": 2.8165, + "theoretical_loss": 3.6564370601385963, + "tokens_seen": 979415040 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003551554663991976, + "loss": 2.8654, + "theoretical_loss": 3.6564136875614026, + "tokens_seen": 979480576 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003551454363089268, + "loss": 2.9315, + "theoretical_loss": 3.656390316985831, + "tokens_seen": 979546112 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035513540621865596, + "loss": 2.8707, + "theoretical_loss": 3.656366948411576, + "tokens_seen": 979611648 + }, + { + "epoch": 12.0, + "objective/train/docs_used": 2336039, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.900575876235962, + "objective/train/theoretical_loss": 3.656343581838333, + "objective/train/tokens_used": 1000137184, + "theoretical_loss": 3.656343581838333, + "tokens_seen": 979677184 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035512537612838514, + "loss": 2.7528, + "theoretical_loss": 3.656343581838333, + "tokens_seen": 979677184 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035511534603811437, + "loss": 2.8811, + "theoretical_loss": 3.656320217265796, + "tokens_seen": 979742720 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003551053159478435, + "loss": 2.8334, + "theoretical_loss": 3.65629685469366, + "tokens_seen": 979808256 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035509528585757273, + "loss": 2.7948, + "theoretical_loss": 3.6562734941216206, + "tokens_seen": 979873792 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003550852557673019, + "loss": 2.8536, + "theoretical_loss": 3.6562501355493726, + "tokens_seen": 979939328 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003550752256770311, + "loss": 3.0209, + "theoretical_loss": 3.65622677897661, + "tokens_seen": 980004864 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035506519558676033, + "loss": 2.8863, + "theoretical_loss": 3.6562034244030293, + "tokens_seen": 980070400 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035505516549648946, + "loss": 2.7654, + "theoretical_loss": 3.6561800718283246, + "tokens_seen": 980135936 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003550451354062187, + "loss": 2.9087, + "theoretical_loss": 3.6561567212521924, + "tokens_seen": 980201472 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003550351053159478, + "loss": 2.7378, + "theoretical_loss": 3.6561333726743275, + "tokens_seen": 980267008 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035502507522567706, + "loss": 2.8351, + "theoretical_loss": 3.656110026094425, + "tokens_seen": 980332544 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035501504513540624, + "loss": 2.8344, + "theoretical_loss": 3.6560866815121806, + "tokens_seen": 980398080 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003550050150451354, + "loss": 2.8278, + "theoretical_loss": 3.6560633389272903, + "tokens_seen": 980463616 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003549949849548646, + "loss": 2.8876, + "theoretical_loss": 3.6560399983394483, + "tokens_seen": 980529152 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035498495486459384, + "loss": 2.9226, + "theoretical_loss": 3.656016659748352, + "tokens_seen": 980594688 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035497492477432296, + "loss": 2.9, + "theoretical_loss": 3.6559933231536963, + "tokens_seen": 980660224 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003549648946840522, + "loss": 2.8716, + "theoretical_loss": 3.6559699885551775, + "tokens_seen": 980725760 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003549548645937813, + "loss": 2.9012, + "theoretical_loss": 3.6559466559524907, + "tokens_seen": 980791296 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035494483450351056, + "loss": 2.897, + "theoretical_loss": 3.6559233253453325, + "tokens_seen": 980856832 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035493480441323974, + "loss": 2.8543, + "theoretical_loss": 3.6558999967333987, + "tokens_seen": 980922368 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003549247743229689, + "loss": 2.9032, + "theoretical_loss": 3.6558766701163856, + "tokens_seen": 980987904 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003549147442326981, + "loss": 2.9377, + "theoretical_loss": 3.6558533454939885, + "tokens_seen": 981053440 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003549047141424273, + "loss": 2.7772, + "theoretical_loss": 3.6558300228659046, + "tokens_seen": 981118976 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035489468405215647, + "loss": 2.8209, + "theoretical_loss": 3.65580670223183, + "tokens_seen": 981184512 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003548846539618857, + "loss": 2.727, + "theoretical_loss": 3.6557833835914613, + "tokens_seen": 981250048 + }, + { + "epoch": 12.0, + "objective/train/docs_used": 2339224, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.939506769180298, + "objective/train/theoretical_loss": 3.6557600669444934, + "objective/train/tokens_used": 1001775584, + "theoretical_loss": 3.6557600669444934, + "tokens_seen": 981315584 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035487462387161483, + "loss": 2.9297, + "theoretical_loss": 3.6557600669444934, + "tokens_seen": 981315584 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035486459378134406, + "loss": 2.8638, + "theoretical_loss": 3.6557367522906246, + "tokens_seen": 981381120 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035485456369107324, + "loss": 2.8864, + "theoretical_loss": 3.655713439629551, + "tokens_seen": 981446656 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003548445336008024, + "loss": 2.8957, + "theoretical_loss": 3.6556901289609685, + "tokens_seen": 981512192 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003548345035105316, + "loss": 2.9194, + "theoretical_loss": 3.6556668202845746, + "tokens_seen": 981577728 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003548244734202608, + "loss": 2.7355, + "theoretical_loss": 3.655643513600065, + "tokens_seen": 981643264 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035481444332998997, + "loss": 2.8422, + "theoretical_loss": 3.6556202089071377, + "tokens_seen": 981708800 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003548044132397192, + "loss": 2.8845, + "theoretical_loss": 3.6555969062054894, + "tokens_seen": 981774336 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035479438314944833, + "loss": 2.9537, + "theoretical_loss": 3.6555736054948165, + "tokens_seen": 981839872 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035478435305917757, + "loss": 2.9111, + "theoretical_loss": 3.655550306774816, + "tokens_seen": 981905408 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003547743229689067, + "loss": 2.7807, + "theoretical_loss": 3.6555270100451853, + "tokens_seen": 981970944 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035476429287863593, + "loss": 2.8365, + "theoretical_loss": 3.655503715305622, + "tokens_seen": 982036480 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003547542627883651, + "loss": 2.7883, + "theoretical_loss": 3.655480422555822, + "tokens_seen": 982102016 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003547442326980943, + "loss": 2.9645, + "theoretical_loss": 3.655457131795484, + "tokens_seen": 982167552 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035473420260782347, + "loss": 2.7815, + "theoretical_loss": 3.6554338430243045, + "tokens_seen": 982233088 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035472417251755265, + "loss": 2.8359, + "theoretical_loss": 3.655410556241981, + "tokens_seen": 982298624 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035471414242728183, + "loss": 2.8192, + "theoretical_loss": 3.6553872714482116, + "tokens_seen": 982364160 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035470411233701107, + "loss": 2.8244, + "theoretical_loss": 3.6553639886426925, + "tokens_seen": 982429696 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003546940822467402, + "loss": 2.9375, + "theoretical_loss": 3.6553407078251228, + "tokens_seen": 982495232 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035468405215646943, + "loss": 2.9707, + "theoretical_loss": 3.6553174289951995, + "tokens_seen": 982560768 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003546740220661986, + "loss": 2.7658, + "theoretical_loss": 3.6552941521526203, + "tokens_seen": 982626304 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003546639919759278, + "loss": 2.695, + "theoretical_loss": 3.655270877297083, + "tokens_seen": 982691840 + }, + { + "epoch": 12.0, + "learning_rate": 0.000354653961885657, + "loss": 2.7854, + "theoretical_loss": 3.655247604428286, + "tokens_seen": 982757376 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035464393179538616, + "loss": 2.7698, + "theoretical_loss": 3.6552243335459265, + "tokens_seen": 982822912 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035463390170511534, + "loss": 2.7725, + "theoretical_loss": 3.6552010646497024, + "tokens_seen": 982888448 + }, + { + "debugging/Self-BLEU-5": 0.6758849331525665, + "debugging/distinct-1-grams": 0.7483349012438663, + "debugging/distinct-2-grams": 0.9578163084521312, + "debugging/entropy-1-grams": 6.339552358511071, + "debugging/entropy-2-grams": 7.633405934469852, + "debugging/length": 567.5862068965517, + "debugging/num_segments": 29, + "epoch": 12.0, + "objective/train/docs_used": 2343915, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8543343544006348, + "objective/train/theoretical_loss": 3.655177797739313, + "objective/train/tokens_used": 1003413984, + "theoretical_loss": 3.655177797739313, + "tokens_seen": 982953984 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035462387161484457, + "loss": 2.854, + "theoretical_loss": 3.655177797739313, + "tokens_seen": 982953984 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003546138415245737, + "loss": 2.899, + "theoretical_loss": 3.6551545328144552, + "tokens_seen": 983019520 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035460381143430293, + "loss": 2.869, + "theoretical_loss": 3.6551312698748277, + "tokens_seen": 983085056 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035459378134403206, + "loss": 2.8344, + "theoretical_loss": 3.655108008920129, + "tokens_seen": 983150592 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003545837512537613, + "loss": 2.8901, + "theoretical_loss": 3.6550847499500567, + "tokens_seen": 983216128 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003545737211634905, + "loss": 2.8933, + "theoretical_loss": 3.65506149296431, + "tokens_seen": 983281664 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035456369107321966, + "loss": 2.7968, + "theoretical_loss": 3.6550382379625868, + "tokens_seen": 983347200 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035455366098294884, + "loss": 2.9323, + "theoretical_loss": 3.655014984944586, + "tokens_seen": 983412736 + }, + { + "epoch": 12.0, + "learning_rate": 0.000354543630892678, + "loss": 2.858, + "theoretical_loss": 3.6549917339100055, + "tokens_seen": 983478272 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003545336008024072, + "loss": 2.7548, + "theoretical_loss": 3.6549684848585455, + "tokens_seen": 983543808 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035452357071213644, + "loss": 2.8184, + "theoretical_loss": 3.6549452377899034, + "tokens_seen": 983609344 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035451354062186556, + "loss": 2.7712, + "theoretical_loss": 3.654921992703778, + "tokens_seen": 983674880 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003545035105315948, + "loss": 2.8922, + "theoretical_loss": 3.6548987495998686, + "tokens_seen": 983740416 + }, + { + "epoch": 12.0, + "learning_rate": 0.000354493480441324, + "loss": 2.8516, + "theoretical_loss": 3.6548755084778746, + "tokens_seen": 983805952 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035448345035105316, + "loss": 2.7551, + "theoretical_loss": 3.654852269337494, + "tokens_seen": 983871488 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035447342026078234, + "loss": 2.8541, + "theoretical_loss": 3.6548290321784265, + "tokens_seen": 983937024 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003544633901705115, + "loss": 2.808, + "theoretical_loss": 3.654805797000371, + "tokens_seen": 984002560 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003544533600802407, + "loss": 2.7842, + "theoretical_loss": 3.6547825638030265, + "tokens_seen": 984068096 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035444332998996994, + "loss": 2.9446, + "theoretical_loss": 3.654759332586093, + "tokens_seen": 984133632 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035443329989969907, + "loss": 2.8655, + "theoretical_loss": 3.654736103349269, + "tokens_seen": 984199168 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003544232698094283, + "loss": 2.7587, + "theoretical_loss": 3.654712876092254, + "tokens_seen": 984264704 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035441323971915743, + "loss": 2.9548, + "theoretical_loss": 3.654689650814748, + "tokens_seen": 984330240 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035440320962888667, + "loss": 2.7977, + "theoretical_loss": 3.6546664275164504, + "tokens_seen": 984395776 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035439317953861585, + "loss": 2.8419, + "theoretical_loss": 3.6546432061970604, + "tokens_seen": 984461312 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035438314944834503, + "loss": 2.8059, + "theoretical_loss": 3.6546199868562774, + "tokens_seen": 984526848 + }, + { + "epoch": 12.0, + "objective/train/docs_used": 2346871, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1162140369415283, + "objective/train/theoretical_loss": 3.6545967694938017, + "objective/train/tokens_used": 1005052384, + "theoretical_loss": 3.6545967694938017, + "tokens_seen": 984592384 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003543731193580742, + "loss": 2.9831, + "theoretical_loss": 3.6545967694938017, + "tokens_seen": 984592384 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035436308926780344, + "loss": 2.9066, + "theoretical_loss": 3.6545735541093336, + "tokens_seen": 984657920 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003543530591775326, + "loss": 2.9419, + "theoretical_loss": 3.6545503407025715, + "tokens_seen": 984723456 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003543430290872618, + "loss": 2.8538, + "theoretical_loss": 3.6545271292732164, + "tokens_seen": 984788992 + }, + { + "epoch": 12.0, + "learning_rate": 0.000354332998996991, + "loss": 2.8055, + "theoretical_loss": 3.654503919820968, + "tokens_seen": 984854528 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035432296890672017, + "loss": 2.8211, + "theoretical_loss": 3.654480712345526, + "tokens_seen": 984920064 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003543129388164494, + "loss": 2.9169, + "theoretical_loss": 3.654457506846591, + "tokens_seen": 984985600 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035430290872617853, + "loss": 2.8094, + "theoretical_loss": 3.6544343033238635, + "tokens_seen": 985051136 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035429287863590777, + "loss": 2.9054, + "theoretical_loss": 3.654411101777043, + "tokens_seen": 985116672 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003542828485456369, + "loss": 2.9525, + "theoretical_loss": 3.65438790220583, + "tokens_seen": 985182208 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035427281845536613, + "loss": 2.9745, + "theoretical_loss": 3.654364704609925, + "tokens_seen": 985247744 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003542627883650953, + "loss": 2.8813, + "theoretical_loss": 3.6543415089890283, + "tokens_seen": 985313280 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003542527582748245, + "loss": 2.9122, + "theoretical_loss": 3.6543183153428402, + "tokens_seen": 985378816 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035424272818455367, + "loss": 2.9204, + "theoretical_loss": 3.654295123671062, + "tokens_seen": 985444352 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035423269809428285, + "loss": 2.7508, + "theoretical_loss": 3.6542719339733933, + "tokens_seen": 985509888 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035422266800401203, + "loss": 2.8757, + "theoretical_loss": 3.654248746249536, + "tokens_seen": 985575424 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035421263791374127, + "loss": 2.9268, + "theoretical_loss": 3.6542255604991905, + "tokens_seen": 985640960 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003542026078234704, + "loss": 2.9508, + "theoretical_loss": 3.654202376722057, + "tokens_seen": 985706496 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035419257773319963, + "loss": 2.9042, + "theoretical_loss": 3.6541791949178366, + "tokens_seen": 985772032 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003541825476429288, + "loss": 2.8978, + "theoretical_loss": 3.6541560150862304, + "tokens_seen": 985837568 + }, + { + "epoch": 12.0, + "learning_rate": 0.000354172517552658, + "loss": 2.7183, + "theoretical_loss": 3.6541328372269395, + "tokens_seen": 985903104 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003541624874623872, + "loss": 2.9203, + "theoretical_loss": 3.654109661339665, + "tokens_seen": 985968640 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035415245737211636, + "loss": 2.8347, + "theoretical_loss": 3.654086487424108, + "tokens_seen": 986034176 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035414242728184554, + "loss": 2.8961, + "theoretical_loss": 3.65406331547997, + "tokens_seen": 986099712 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035413239719157477, + "loss": 2.8682, + "theoretical_loss": 3.6540401455069516, + "tokens_seen": 986165248 + }, + { + "epoch": 12.0, + "objective/train/docs_used": 2350574, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.862433910369873, + "objective/train/theoretical_loss": 3.654016977504755, + "objective/train/tokens_used": 1006690784, + "theoretical_loss": 3.654016977504755, + "tokens_seen": 986230784 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003541223671013039, + "loss": 2.8141, + "theoretical_loss": 3.654016977504755, + "tokens_seen": 986230784 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035411233701103314, + "loss": 2.8266, + "theoretical_loss": 3.653993811473081, + "tokens_seen": 986296320 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035410230692076226, + "loss": 2.8361, + "theoretical_loss": 3.653970647411631, + "tokens_seen": 986361856 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003540922768304915, + "loss": 2.8609, + "theoretical_loss": 3.6539474853201073, + "tokens_seen": 986427392 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003540822467402207, + "loss": 2.8659, + "theoretical_loss": 3.653924325198211, + "tokens_seen": 986492928 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035407221664994986, + "loss": 2.9299, + "theoretical_loss": 3.6539011670456434, + "tokens_seen": 986558464 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035406218655967904, + "loss": 2.8594, + "theoretical_loss": 3.6538780108621074, + "tokens_seen": 986624000 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003540521564694082, + "loss": 2.8273, + "theoretical_loss": 3.6538548566473033, + "tokens_seen": 986689536 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003540421263791374, + "loss": 2.7793, + "theoretical_loss": 3.6538317044009343, + "tokens_seen": 986755072 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035403209628886664, + "loss": 2.8268, + "theoretical_loss": 3.6538085541227012, + "tokens_seen": 986820608 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035402206619859577, + "loss": 2.8946, + "theoretical_loss": 3.6537854058123065, + "tokens_seen": 986886144 + }, + { + "epoch": 12.0, + "learning_rate": 0.000354012036108325, + "loss": 2.8387, + "theoretical_loss": 3.653762259469453, + "tokens_seen": 986951680 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003540020060180542, + "loss": 2.845, + "theoretical_loss": 3.653739115093842, + "tokens_seen": 987017216 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035399197592778336, + "loss": 2.8852, + "theoretical_loss": 3.653715972685175, + "tokens_seen": 987082752 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035398194583751254, + "loss": 2.8315, + "theoretical_loss": 3.6536928322431566, + "tokens_seen": 987148288 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003539719157472417, + "loss": 2.819, + "theoretical_loss": 3.653669693767487, + "tokens_seen": 987213824 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003539618856569709, + "loss": 2.8517, + "theoretical_loss": 3.6536465572578685, + "tokens_seen": 987279360 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035395185556670014, + "loss": 2.9009, + "theoretical_loss": 3.653623422714005, + "tokens_seen": 987344896 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035394182547642927, + "loss": 2.9394, + "theoretical_loss": 3.653600290135598, + "tokens_seen": 987410432 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003539317953861585, + "loss": 2.9657, + "theoretical_loss": 3.653577159522351, + "tokens_seen": 987475968 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035392176529588763, + "loss": 2.9295, + "theoretical_loss": 3.653554030873966, + "tokens_seen": 987541504 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035391173520561687, + "loss": 2.8846, + "theoretical_loss": 3.653530904190145, + "tokens_seen": 987607040 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035390170511534605, + "loss": 2.8324, + "theoretical_loss": 3.6535077794705924, + "tokens_seen": 987672576 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035389167502507523, + "loss": 2.9582, + "theoretical_loss": 3.6534846567150097, + "tokens_seen": 987738112 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003538816449348044, + "loss": 2.8988, + "theoretical_loss": 3.6534615359231006, + "tokens_seen": 987803648 + }, + { + "epoch": 12.0, + "objective/train/docs_used": 2355434, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0259718894958496, + "objective/train/theoretical_loss": 3.6534384170945673, + "objective/train/tokens_used": 1008329184, + "theoretical_loss": 3.6534384170945673, + "tokens_seen": 987869184 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035387161484453364, + "loss": 2.8506, + "theoretical_loss": 3.6534384170945673, + "tokens_seen": 987869184 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035386158475426277, + "loss": 2.8893, + "theoretical_loss": 3.653415300229114, + "tokens_seen": 987934720 + }, + { + "epoch": 12.0, + "learning_rate": 0.000353851554663992, + "loss": 2.9369, + "theoretical_loss": 3.6533921853264424, + "tokens_seen": 988000256 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035384152457372113, + "loss": 2.911, + "theoretical_loss": 3.6533690723862566, + "tokens_seen": 988065792 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035383149448345037, + "loss": 2.7934, + "theoretical_loss": 3.65334596140826, + "tokens_seen": 988131328 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035382146439317955, + "loss": 2.885, + "theoretical_loss": 3.6533228523921553, + "tokens_seen": 988196864 + }, + { + "epoch": 12.0, + "learning_rate": 0.00035381143430290873, + "loss": 2.7555, + "theoretical_loss": 3.6532997453376463, + "tokens_seen": 988262400 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003538014042126379, + "loss": 2.8797, + "theoretical_loss": 3.6532766402444357, + "tokens_seen": 988327936 + }, + { + "epoch": 12.0, + "learning_rate": 0.0003537913741223671, + "loss": 2.9647, + "theoretical_loss": 3.653253537112228, + "tokens_seen": 988393472 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003537813440320963, + "loss": 2.8904, + "theoretical_loss": 3.653230435940726, + "tokens_seen": 988459008 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003537713139418255, + "loss": 2.8678, + "theoretical_loss": 3.6532073367296336, + "tokens_seen": 988524544 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035376128385155464, + "loss": 2.8555, + "theoretical_loss": 3.6531842394786542, + "tokens_seen": 988590080 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035375125376128387, + "loss": 2.8756, + "theoretical_loss": 3.653161144187492, + "tokens_seen": 988655616 + }, + { + "epoch": 12.01, + "learning_rate": 0.000353741223671013, + "loss": 2.8384, + "theoretical_loss": 3.653138050855851, + "tokens_seen": 988721152 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035373119358074223, + "loss": 2.9053, + "theoretical_loss": 3.653114959483434, + "tokens_seen": 988786688 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003537211634904714, + "loss": 2.8987, + "theoretical_loss": 3.6530918700699457, + "tokens_seen": 988852224 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003537111334002006, + "loss": 2.9254, + "theoretical_loss": 3.6530687826150907, + "tokens_seen": 988917760 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003537011033099298, + "loss": 2.8065, + "theoretical_loss": 3.653045697118572, + "tokens_seen": 988983296 + }, + { + "epoch": 12.01, + "learning_rate": 0.000353691073219659, + "loss": 2.8159, + "theoretical_loss": 3.6530226135800943, + "tokens_seen": 989048832 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035368104312938814, + "loss": 2.8053, + "theoretical_loss": 3.6529995319993613, + "tokens_seen": 989114368 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003536710130391174, + "loss": 2.8146, + "theoretical_loss": 3.6529764523760777, + "tokens_seen": 989179904 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003536609829488465, + "loss": 2.8643, + "theoretical_loss": 3.6529533747099476, + "tokens_seen": 989245440 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035365095285857574, + "loss": 2.9277, + "theoretical_loss": 3.652930299000676, + "tokens_seen": 989310976 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003536409227683049, + "loss": 2.9165, + "theoretical_loss": 3.652907225247967, + "tokens_seen": 989376512 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003536308926780341, + "loss": 2.8661, + "theoretical_loss": 3.652884153451524, + "tokens_seen": 989442048 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2358353, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0790271759033203, + "objective/train/theoretical_loss": 3.652861083611053, + "objective/train/tokens_used": 1009967584, + "theoretical_loss": 3.652861083611053, + "tokens_seen": 989507584 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003536208625877633, + "loss": 2.9647, + "theoretical_loss": 3.652861083611053, + "tokens_seen": 989507584 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035361083249749246, + "loss": 2.7812, + "theoretical_loss": 3.6528380157262585, + "tokens_seen": 989573120 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003536008024072217, + "loss": 2.7364, + "theoretical_loss": 3.652814949796845, + "tokens_seen": 989638656 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003535907723169509, + "loss": 2.8935, + "theoretical_loss": 3.6527918858225172, + "tokens_seen": 989704192 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035358074222668006, + "loss": 2.8317, + "theoretical_loss": 3.6527688238029796, + "tokens_seen": 989769728 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035357071213640924, + "loss": 2.9339, + "theoretical_loss": 3.6527457637379372, + "tokens_seen": 989835264 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003535606820461384, + "loss": 2.8264, + "theoretical_loss": 3.652722705627096, + "tokens_seen": 989900800 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003535506519558676, + "loss": 2.867, + "theoretical_loss": 3.65269964947016, + "tokens_seen": 989966336 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035354062186559684, + "loss": 2.9274, + "theoretical_loss": 3.6526765952668345, + "tokens_seen": 990031872 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035353059177532597, + "loss": 2.9457, + "theoretical_loss": 3.6526535430168248, + "tokens_seen": 990097408 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003535205616850552, + "loss": 2.8293, + "theoretical_loss": 3.652630492719836, + "tokens_seen": 990162944 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003535105315947844, + "loss": 2.9703, + "theoretical_loss": 3.6526074443755734, + "tokens_seen": 990228480 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035350050150451356, + "loss": 2.822, + "theoretical_loss": 3.652584397983742, + "tokens_seen": 990294016 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035349047141424274, + "loss": 2.6873, + "theoretical_loss": 3.6525613535440487, + "tokens_seen": 990359552 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003534804413239719, + "loss": 2.9328, + "theoretical_loss": 3.6525383110561966, + "tokens_seen": 990425088 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003534704112337011, + "loss": 2.7727, + "theoretical_loss": 3.6525152705198933, + "tokens_seen": 990490624 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035346038114343034, + "loss": 2.8577, + "theoretical_loss": 3.6524922319348434, + "tokens_seen": 990556160 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035345035105315947, + "loss": 2.943, + "theoretical_loss": 3.652469195300753, + "tokens_seen": 990621696 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003534403209628887, + "loss": 2.8332, + "theoretical_loss": 3.6524461606173273, + "tokens_seen": 990687232 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035343029087261783, + "loss": 2.9316, + "theoretical_loss": 3.6524231278842727, + "tokens_seen": 990752768 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035342026078234707, + "loss": 2.9788, + "theoretical_loss": 3.6524000971012947, + "tokens_seen": 990818304 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035341023069207625, + "loss": 2.9524, + "theoretical_loss": 3.652377068268099, + "tokens_seen": 990883840 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035340020060180543, + "loss": 2.8704, + "theoretical_loss": 3.6523540413843927, + "tokens_seen": 990949376 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003533901705115346, + "loss": 2.8356, + "theoretical_loss": 3.652331016449881, + "tokens_seen": 991014912 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035338014042126384, + "loss": 2.8862, + "theoretical_loss": 3.6523079934642695, + "tokens_seen": 991080448 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2363184, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8340678215026855, + "objective/train/theoretical_loss": 3.6522849724272652, + "objective/train/tokens_used": 1011605984, + "theoretical_loss": 3.6522849724272652, + "tokens_seen": 991145984 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035337011033099297, + "loss": 2.8788, + "theoretical_loss": 3.6522849724272652, + "tokens_seen": 991145984 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003533600802407222, + "loss": 2.8404, + "theoretical_loss": 3.6522619533385745, + "tokens_seen": 991211520 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035335005015045133, + "loss": 2.9058, + "theoretical_loss": 3.652238936197903, + "tokens_seen": 991277056 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035334002006018057, + "loss": 2.8033, + "theoretical_loss": 3.652215921004957, + "tokens_seen": 991342592 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035332998996990975, + "loss": 2.8803, + "theoretical_loss": 3.652192907759444, + "tokens_seen": 991408128 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035331995987963893, + "loss": 2.8907, + "theoretical_loss": 3.6521698964610696, + "tokens_seen": 991473664 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003533099297893681, + "loss": 2.9826, + "theoretical_loss": 3.6521468871095406, + "tokens_seen": 991539200 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003532998996990973, + "loss": 2.8616, + "theoretical_loss": 3.652123879704564, + "tokens_seen": 991604736 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003532898696088265, + "loss": 2.8794, + "theoretical_loss": 3.652100874245846, + "tokens_seen": 991670272 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003532798395185557, + "loss": 2.8776, + "theoretical_loss": 3.652077870733093, + "tokens_seen": 991735808 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035326980942828484, + "loss": 2.7983, + "theoretical_loss": 3.6520548691660126, + "tokens_seen": 991801344 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035325977933801407, + "loss": 2.8651, + "theoretical_loss": 3.6520318695443117, + "tokens_seen": 991866880 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003532497492477432, + "loss": 2.8396, + "theoretical_loss": 3.6520088718676966, + "tokens_seen": 991932416 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035323971915747243, + "loss": 2.8988, + "theoretical_loss": 3.6519858761358748, + "tokens_seen": 991997952 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003532296890672016, + "loss": 2.8337, + "theoretical_loss": 3.6519628823485535, + "tokens_seen": 992063488 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003532196589769308, + "loss": 2.8937, + "theoretical_loss": 3.651939890505439, + "tokens_seen": 992129024 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035320962888666, + "loss": 2.7771, + "theoretical_loss": 3.6519169006062393, + "tokens_seen": 992194560 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003531995987963892, + "loss": 2.9205, + "theoretical_loss": 3.6518939126506615, + "tokens_seen": 992260096 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035318956870611834, + "loss": 2.8457, + "theoretical_loss": 3.651870926638413, + "tokens_seen": 992325632 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003531795386158476, + "loss": 2.9317, + "theoretical_loss": 3.6518479425692005, + "tokens_seen": 992391168 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003531695085255767, + "loss": 2.6951, + "theoretical_loss": 3.651824960442733, + "tokens_seen": 992456704 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035315947843530594, + "loss": 2.8957, + "theoretical_loss": 3.651801980258716, + "tokens_seen": 992522240 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003531494483450351, + "loss": 2.7809, + "theoretical_loss": 3.6517790020168586, + "tokens_seen": 992587776 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003531394182547643, + "loss": 2.9315, + "theoretical_loss": 3.6517560257168675, + "tokens_seen": 992653312 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003531293881644935, + "loss": 2.9366, + "theoretical_loss": 3.651733051358451, + "tokens_seen": 992718848 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2366173, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9958088397979736, + "objective/train/theoretical_loss": 3.651710078941317, + "objective/train/tokens_used": 1013244384, + "theoretical_loss": 3.651710078941317, + "tokens_seen": 992784384 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035311935807422266, + "loss": 2.8495, + "theoretical_loss": 3.651710078941317, + "tokens_seen": 992784384 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035310932798395184, + "loss": 2.8269, + "theoretical_loss": 3.6516871084651727, + "tokens_seen": 992849920 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003530992978936811, + "loss": 2.938, + "theoretical_loss": 3.6516641399297267, + "tokens_seen": 992915456 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003530892678034102, + "loss": 2.9385, + "theoretical_loss": 3.6516411733346867, + "tokens_seen": 992980992 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035307923771313944, + "loss": 2.7767, + "theoretical_loss": 3.6516182086797606, + "tokens_seen": 993046528 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035306920762286857, + "loss": 2.937, + "theoretical_loss": 3.651595245964656, + "tokens_seen": 993112064 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003530591775325978, + "loss": 2.9439, + "theoretical_loss": 3.651572285189082, + "tokens_seen": 993177600 + }, + { + "epoch": 12.01, + "learning_rate": 0.000353049147442327, + "loss": 2.9216, + "theoretical_loss": 3.6515493263527463, + "tokens_seen": 993243136 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035303911735205617, + "loss": 2.9227, + "theoretical_loss": 3.651526369455357, + "tokens_seen": 993308672 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035302908726178535, + "loss": 2.7919, + "theoretical_loss": 3.651503414496623, + "tokens_seen": 993374208 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003530190571715146, + "loss": 2.8885, + "theoretical_loss": 3.651480461476252, + "tokens_seen": 993439744 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003530090270812437, + "loss": 2.8971, + "theoretical_loss": 3.6514575103939535, + "tokens_seen": 993505280 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035299899699097294, + "loss": 2.7858, + "theoretical_loss": 3.651434561249435, + "tokens_seen": 993570816 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035298896690070207, + "loss": 2.8694, + "theoretical_loss": 3.6514116140424058, + "tokens_seen": 993636352 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003529789368104313, + "loss": 2.9173, + "theoretical_loss": 3.651388668772574, + "tokens_seen": 993701888 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003529689067201605, + "loss": 2.8758, + "theoretical_loss": 3.6513657254396485, + "tokens_seen": 993767424 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035295887662988967, + "loss": 2.9341, + "theoretical_loss": 3.6513427840433383, + "tokens_seen": 993832960 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035294884653961885, + "loss": 2.7813, + "theoretical_loss": 3.6513198445833517, + "tokens_seen": 993898496 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035293881644934803, + "loss": 2.8461, + "theoretical_loss": 3.6512969070593986, + "tokens_seen": 993964032 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003529287863590772, + "loss": 2.8527, + "theoretical_loss": 3.651273971471187, + "tokens_seen": 994029568 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035291875626880645, + "loss": 2.8616, + "theoretical_loss": 3.651251037818426, + "tokens_seen": 994095104 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003529087261785356, + "loss": 2.7952, + "theoretical_loss": 3.6512281061008256, + "tokens_seen": 994160640 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003528986960882648, + "loss": 2.7507, + "theoretical_loss": 3.6512051763180935, + "tokens_seen": 994226176 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035288866599799394, + "loss": 2.9328, + "theoretical_loss": 3.65118224846994, + "tokens_seen": 994291712 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035287863590772317, + "loss": 2.7804, + "theoretical_loss": 3.651159322556075, + "tokens_seen": 994357248 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2370019, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.920151948928833, + "objective/train/theoretical_loss": 3.6511363985762064, + "objective/train/tokens_used": 1014882784, + "theoretical_loss": 3.6511363985762064, + "tokens_seen": 994422784 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035286860581745235, + "loss": 2.9438, + "theoretical_loss": 3.6511363985762064, + "tokens_seen": 994422784 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035285857572718153, + "loss": 2.9732, + "theoretical_loss": 3.6511134765300444, + "tokens_seen": 994488320 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035284854563691077, + "loss": 2.8623, + "theoretical_loss": 3.651090556417298, + "tokens_seen": 994553856 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035283851554663995, + "loss": 2.8646, + "theoretical_loss": 3.6510676382376768, + "tokens_seen": 994619392 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035282848545636913, + "loss": 2.8937, + "theoretical_loss": 3.651044721990891, + "tokens_seen": 994684928 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003528184553660983, + "loss": 2.9608, + "theoretical_loss": 3.65102180767665, + "tokens_seen": 994750464 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003528084252758275, + "loss": 2.8265, + "theoretical_loss": 3.650998895294663, + "tokens_seen": 994816000 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003527983951855567, + "loss": 2.8083, + "theoretical_loss": 3.6509759848446404, + "tokens_seen": 994881536 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003527883650952859, + "loss": 2.7728, + "theoretical_loss": 3.650953076326292, + "tokens_seen": 994947072 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035277833500501504, + "loss": 2.9109, + "theoretical_loss": 3.6509301697393273, + "tokens_seen": 995012608 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035276830491474427, + "loss": 3.0029, + "theoretical_loss": 3.650907265083456, + "tokens_seen": 995078144 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003527582748244734, + "loss": 2.7466, + "theoretical_loss": 3.6508843623583895, + "tokens_seen": 995143680 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035274824473420263, + "loss": 2.6957, + "theoretical_loss": 3.650861461563837, + "tokens_seen": 995209216 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003527382146439318, + "loss": 2.976, + "theoretical_loss": 3.6508385626995086, + "tokens_seen": 995274752 + }, + { + "epoch": 12.01, + "learning_rate": 0.000352728184553661, + "loss": 2.853, + "theoretical_loss": 3.650815665765114, + "tokens_seen": 995340288 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003527181544633902, + "loss": 2.8999, + "theoretical_loss": 3.650792770760365, + "tokens_seen": 995405824 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003527081243731194, + "loss": 2.8537, + "theoretical_loss": 3.6507698776849704, + "tokens_seen": 995471360 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035269809428284854, + "loss": 2.968, + "theoretical_loss": 3.6507469865386417, + "tokens_seen": 995536896 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003526880641925778, + "loss": 2.8213, + "theoretical_loss": 3.6507240973210884, + "tokens_seen": 995602432 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003526780341023069, + "loss": 2.9454, + "theoretical_loss": 3.6507012100320226, + "tokens_seen": 995667968 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035266800401203614, + "loss": 2.8494, + "theoretical_loss": 3.650678324671153, + "tokens_seen": 995733504 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003526579739217653, + "loss": 2.8527, + "theoretical_loss": 3.6506554412381917, + "tokens_seen": 995799040 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003526479438314945, + "loss": 2.8084, + "theoretical_loss": 3.6506325597328484, + "tokens_seen": 995864576 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003526379137412237, + "loss": 2.9595, + "theoretical_loss": 3.650609680154835, + "tokens_seen": 995930112 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035262788365095286, + "loss": 2.8355, + "theoretical_loss": 3.6505868025038613, + "tokens_seen": 995995648 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2374772, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5016207695007324, + "objective/train/theoretical_loss": 3.650563926779639, + "objective/train/tokens_used": 1016521184, + "theoretical_loss": 3.650563926779639, + "tokens_seen": 996061184 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035261785356068204, + "loss": 2.8597, + "theoretical_loss": 3.650563926779639, + "tokens_seen": 996061184 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003526078234704113, + "loss": 2.7333, + "theoretical_loss": 3.650541052981878, + "tokens_seen": 996126720 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003525977933801404, + "loss": 2.9539, + "theoretical_loss": 3.6505181811102907, + "tokens_seen": 996192256 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035258776328986964, + "loss": 2.8614, + "theoretical_loss": 3.6504953111645873, + "tokens_seen": 996257792 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035257773319959877, + "loss": 2.8966, + "theoretical_loss": 3.650472443144479, + "tokens_seen": 996323328 + }, + { + "epoch": 12.01, + "learning_rate": 0.000352567703109328, + "loss": 2.7754, + "theoretical_loss": 3.650449577049678, + "tokens_seen": 996388864 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003525576730190572, + "loss": 2.8571, + "theoretical_loss": 3.650426712879894, + "tokens_seen": 996454400 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035254764292878637, + "loss": 2.8285, + "theoretical_loss": 3.6504038506348393, + "tokens_seen": 996519936 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035253761283851555, + "loss": 2.9146, + "theoretical_loss": 3.6503809903142255, + "tokens_seen": 996585472 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003525275827482448, + "loss": 2.8767, + "theoretical_loss": 3.6503581319177636, + "tokens_seen": 996651008 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003525175526579739, + "loss": 2.8444, + "theoretical_loss": 3.6503352754451655, + "tokens_seen": 996716544 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035250752256770314, + "loss": 2.8104, + "theoretical_loss": 3.6503124208961424, + "tokens_seen": 996782080 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035249749247743227, + "loss": 2.8658, + "theoretical_loss": 3.6502895682704057, + "tokens_seen": 996847616 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003524874623871615, + "loss": 2.8117, + "theoretical_loss": 3.650266717567668, + "tokens_seen": 996913152 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003524774322968907, + "loss": 2.9052, + "theoretical_loss": 3.6502438687876406, + "tokens_seen": 996978688 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035246740220661987, + "loss": 2.7864, + "theoretical_loss": 3.6502210219300357, + "tokens_seen": 997044224 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035245737211634905, + "loss": 2.8913, + "theoretical_loss": 3.6501981769945644, + "tokens_seen": 997109760 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035244734202607823, + "loss": 2.9557, + "theoretical_loss": 3.6501753339809397, + "tokens_seen": 997175296 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003524373119358074, + "loss": 2.9462, + "theoretical_loss": 3.6501524928888722, + "tokens_seen": 997240832 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035242728184553665, + "loss": 2.8967, + "theoretical_loss": 3.650129653718076, + "tokens_seen": 997306368 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003524172517552658, + "loss": 2.8648, + "theoretical_loss": 3.6501068164682615, + "tokens_seen": 997371904 + }, + { + "epoch": 12.01, + "learning_rate": 0.000352407221664995, + "loss": 2.8443, + "theoretical_loss": 3.650083981139142, + "tokens_seen": 997437440 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035239719157472414, + "loss": 2.9782, + "theoretical_loss": 3.650061147730429, + "tokens_seen": 997502976 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035238716148445337, + "loss": 2.8843, + "theoretical_loss": 3.650038316241835, + "tokens_seen": 997568512 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035237713139418255, + "loss": 2.8022, + "theoretical_loss": 3.650015486673073, + "tokens_seen": 997634048 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2377852, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.010340452194214, + "objective/train/theoretical_loss": 3.649992659023855, + "objective/train/tokens_used": 1018159584, + "theoretical_loss": 3.649992659023855, + "tokens_seen": 997699584 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035236710130391173, + "loss": 2.8843, + "theoretical_loss": 3.649992659023855, + "tokens_seen": 997699584 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003523570712136409, + "loss": 2.7592, + "theoretical_loss": 3.6499698332938935, + "tokens_seen": 997765120 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035234704112337015, + "loss": 2.8648, + "theoretical_loss": 3.649947009482901, + "tokens_seen": 997830656 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003523370110330993, + "loss": 2.8613, + "theoretical_loss": 3.6499241875905906, + "tokens_seen": 997896192 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003523269809428285, + "loss": 2.8291, + "theoretical_loss": 3.6499013676166747, + "tokens_seen": 997961728 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035231695085255764, + "loss": 2.8429, + "theoretical_loss": 3.6498785495608663, + "tokens_seen": 998027264 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003523069207622869, + "loss": 2.8573, + "theoretical_loss": 3.6498557334228785, + "tokens_seen": 998092800 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035229689067201606, + "loss": 2.7917, + "theoretical_loss": 3.649832919202423, + "tokens_seen": 998158336 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035228686058174524, + "loss": 2.8423, + "theoretical_loss": 3.6498101068992135, + "tokens_seen": 998223872 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003522768304914744, + "loss": 2.8146, + "theoretical_loss": 3.6497872965129634, + "tokens_seen": 998289408 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003522668004012036, + "loss": 2.8873, + "theoretical_loss": 3.6497644880433855, + "tokens_seen": 998354944 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003522567703109328, + "loss": 2.8471, + "theoretical_loss": 3.649741681490193, + "tokens_seen": 998420480 + }, + { + "epoch": 12.01, + "learning_rate": 0.000352246740220662, + "loss": 2.911, + "theoretical_loss": 3.6497188768530986, + "tokens_seen": 998486016 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035223671013039114, + "loss": 2.9074, + "theoretical_loss": 3.649696074131816, + "tokens_seen": 998551552 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003522266800401204, + "loss": 2.8988, + "theoretical_loss": 3.6496732733260586, + "tokens_seen": 998617088 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035221664994984956, + "loss": 2.8833, + "theoretical_loss": 3.6496504744355396, + "tokens_seen": 998682624 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035220661985957874, + "loss": 2.9968, + "theoretical_loss": 3.6496276774599723, + "tokens_seen": 998748160 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003521965897693079, + "loss": 2.8863, + "theoretical_loss": 3.649604882399071, + "tokens_seen": 998813696 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003521865596790371, + "loss": 2.9473, + "theoretical_loss": 3.6495820892525486, + "tokens_seen": 998879232 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003521765295887663, + "loss": 2.8823, + "theoretical_loss": 3.649559298020119, + "tokens_seen": 998944768 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003521664994984955, + "loss": 2.6756, + "theoretical_loss": 3.6495365087014955, + "tokens_seen": 999010304 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035215646940822465, + "loss": 2.9124, + "theoretical_loss": 3.649513721296392, + "tokens_seen": 999075840 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003521464393179539, + "loss": 2.8873, + "theoretical_loss": 3.649490935804523, + "tokens_seen": 999141376 + }, + { + "epoch": 12.01, + "learning_rate": 0.000352136409227683, + "loss": 2.8774, + "theoretical_loss": 3.6494681522256016, + "tokens_seen": 999206912 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035212637913741224, + "loss": 2.8616, + "theoretical_loss": 3.649445370559342, + "tokens_seen": 999272448 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2382654, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8756210803985596, + "objective/train/theoretical_loss": 3.6494225908054583, + "objective/train/tokens_used": 1019797984, + "theoretical_loss": 3.6494225908054583, + "tokens_seen": 999337984 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003521163490471414, + "loss": 2.9181, + "theoretical_loss": 3.6494225908054583, + "tokens_seen": 999337984 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003521063189568706, + "loss": 2.8157, + "theoretical_loss": 3.6493998129636642, + "tokens_seen": 999403520 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035209628886659984, + "loss": 2.8341, + "theoretical_loss": 3.649377037033674, + "tokens_seen": 999469056 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035208625877632897, + "loss": 2.8912, + "theoretical_loss": 3.6493542630152027, + "tokens_seen": 999534592 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003520762286860582, + "loss": 2.9898, + "theoretical_loss": 3.6493314909079637, + "tokens_seen": 999600128 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003520661985957874, + "loss": 2.8231, + "theoretical_loss": 3.6493087207116712, + "tokens_seen": 999665664 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035205616850551657, + "loss": 2.8084, + "theoretical_loss": 3.64928595242604, + "tokens_seen": 999731200 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035204613841524575, + "loss": 2.8133, + "theoretical_loss": 3.6492631860507845, + "tokens_seen": 999796736 + }, + { + "epoch": 12.01, + "learning_rate": 0.000352036108324975, + "loss": 2.9112, + "theoretical_loss": 3.649240421585619, + "tokens_seen": 999862272 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003520260782347041, + "loss": 2.8352, + "theoretical_loss": 3.649217659030258, + "tokens_seen": 999927808 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035201604814443334, + "loss": 2.8453, + "theoretical_loss": 3.649194898384417, + "tokens_seen": 999993344 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035200601805416247, + "loss": 2.9463, + "theoretical_loss": 3.6491721396478094, + "tokens_seen": 1000058880 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003519959879638917, + "loss": 2.9132, + "theoretical_loss": 3.6491493828201507, + "tokens_seen": 1000124416 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003519859578736209, + "loss": 2.8867, + "theoretical_loss": 3.6491266279011554, + "tokens_seen": 1000189952 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035197592778335007, + "loss": 2.8077, + "theoretical_loss": 3.649103874890539, + "tokens_seen": 1000255488 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035196589769307925, + "loss": 2.8532, + "theoretical_loss": 3.649081123788015, + "tokens_seen": 1000321024 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035195586760280843, + "loss": 2.8896, + "theoretical_loss": 3.6490583745933, + "tokens_seen": 1000386560 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003519458375125376, + "loss": 2.9539, + "theoretical_loss": 3.6490356273061084, + "tokens_seen": 1000452096 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035193580742226685, + "loss": 2.8279, + "theoretical_loss": 3.649012881926155, + "tokens_seen": 1000517632 + }, + { + "epoch": 12.01, + "learning_rate": 0.000351925777331996, + "loss": 2.8603, + "theoretical_loss": 3.648990138453156, + "tokens_seen": 1000583168 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003519157472417252, + "loss": 2.8678, + "theoretical_loss": 3.648967396886825, + "tokens_seen": 1000648704 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035190571715145434, + "loss": 2.888, + "theoretical_loss": 3.6489446572268784, + "tokens_seen": 1000714240 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035189568706118357, + "loss": 2.8183, + "theoretical_loss": 3.6489219194730316, + "tokens_seen": 1000779776 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035188565697091275, + "loss": 2.9453, + "theoretical_loss": 3.6488991836249998, + "tokens_seen": 1000845312 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035187562688064193, + "loss": 3.008, + "theoretical_loss": 3.648876449682499, + "tokens_seen": 1000910848 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2385627, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.881676197052002, + "objective/train/theoretical_loss": 3.648853717645243, + "objective/train/tokens_used": 1021436384, + "theoretical_loss": 3.648853717645243, + "tokens_seen": 1000976384 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003518655967903711, + "loss": 2.8077, + "theoretical_loss": 3.648853717645243, + "tokens_seen": 1000976384 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035185556670010035, + "loss": 2.8352, + "theoretical_loss": 3.6488309875129494, + "tokens_seen": 1001041920 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003518455366098295, + "loss": 2.907, + "theoretical_loss": 3.648808259285333, + "tokens_seen": 1001107456 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003518355065195587, + "loss": 2.8485, + "theoretical_loss": 3.6487855329621093, + "tokens_seen": 1001172992 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035182547642928784, + "loss": 2.9643, + "theoretical_loss": 3.648762808542995, + "tokens_seen": 1001238528 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003518154463390171, + "loss": 2.9046, + "theoretical_loss": 3.648740086027705, + "tokens_seen": 1001304064 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035180541624874626, + "loss": 2.8318, + "theoretical_loss": 3.6487173654159557, + "tokens_seen": 1001369600 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035179538615847544, + "loss": 2.9114, + "theoretical_loss": 3.6486946467074626, + "tokens_seen": 1001435136 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003517853560682046, + "loss": 2.8635, + "theoretical_loss": 3.6486719299019423, + "tokens_seen": 1001500672 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003517753259779338, + "loss": 2.9108, + "theoretical_loss": 3.6486492149991108, + "tokens_seen": 1001566208 + }, + { + "epoch": 12.01, + "learning_rate": 0.000351765295887663, + "loss": 2.8457, + "theoretical_loss": 3.6486265019986837, + "tokens_seen": 1001631744 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003517552657973922, + "loss": 2.8567, + "theoretical_loss": 3.6486037909003777, + "tokens_seen": 1001697280 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035174523570712134, + "loss": 2.8854, + "theoretical_loss": 3.6485810817039095, + "tokens_seen": 1001762816 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003517352056168506, + "loss": 2.7985, + "theoretical_loss": 3.6485583744089944, + "tokens_seen": 1001828352 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035172517552657976, + "loss": 2.8821, + "theoretical_loss": 3.6485356690153496, + "tokens_seen": 1001893888 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035171514543630894, + "loss": 2.8499, + "theoretical_loss": 3.6485129655226913, + "tokens_seen": 1001959424 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003517051153460381, + "loss": 2.9, + "theoretical_loss": 3.6484902639307357, + "tokens_seen": 1002024960 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003516950852557673, + "loss": 2.8795, + "theoretical_loss": 3.6484675642391995, + "tokens_seen": 1002090496 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003516850551654965, + "loss": 2.8568, + "theoretical_loss": 3.6484448664478, + "tokens_seen": 1002156032 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003516750250752257, + "loss": 2.8273, + "theoretical_loss": 3.6484221705562536, + "tokens_seen": 1002221568 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035166499498495485, + "loss": 2.8723, + "theoretical_loss": 3.6483994765642764, + "tokens_seen": 1002287104 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003516549648946841, + "loss": 3.0179, + "theoretical_loss": 3.6483767844715858, + "tokens_seen": 1002352640 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003516449348044132, + "loss": 2.7783, + "theoretical_loss": 3.6483540942778987, + "tokens_seen": 1002418176 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035163490471414244, + "loss": 2.9567, + "theoretical_loss": 3.648331405982932, + "tokens_seen": 1002483712 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003516248746238716, + "loss": 2.8507, + "theoretical_loss": 3.648308719586402, + "tokens_seen": 1002549248 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2389296, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.983617067337036, + "objective/train/theoretical_loss": 3.6482860350880273, + "objective/train/tokens_used": 1023074784, + "theoretical_loss": 3.6482860350880273, + "tokens_seen": 1002614784 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003516148445336008, + "loss": 2.9022, + "theoretical_loss": 3.6482860350880273, + "tokens_seen": 1002614784 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035160481444333, + "loss": 2.8294, + "theoretical_loss": 3.6482633524875236, + "tokens_seen": 1002680320 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035159478435305917, + "loss": 2.8705, + "theoretical_loss": 3.6482406717846088, + "tokens_seen": 1002745856 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035158475426278835, + "loss": 2.9192, + "theoretical_loss": 3.6482179929789997, + "tokens_seen": 1002811392 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003515747241725176, + "loss": 2.9591, + "theoretical_loss": 3.648195316070414, + "tokens_seen": 1002876928 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003515646940822467, + "loss": 2.8343, + "theoretical_loss": 3.648172641058569, + "tokens_seen": 1002942464 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035155466399197595, + "loss": 2.9156, + "theoretical_loss": 3.648149967943182, + "tokens_seen": 1003008000 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035154463390170513, + "loss": 2.8578, + "theoretical_loss": 3.6481272967239704, + "tokens_seen": 1003073536 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003515346038114343, + "loss": 2.9489, + "theoretical_loss": 3.6481046274006523, + "tokens_seen": 1003139072 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003515245737211635, + "loss": 2.9337, + "theoretical_loss": 3.6480819599729446, + "tokens_seen": 1003204608 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035151454363089267, + "loss": 2.9046, + "theoretical_loss": 3.648059294440565, + "tokens_seen": 1003270144 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035150451354062185, + "loss": 2.832, + "theoretical_loss": 3.648036630803232, + "tokens_seen": 1003335680 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003514944834503511, + "loss": 2.763, + "theoretical_loss": 3.648013969060663, + "tokens_seen": 1003401216 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003514844533600802, + "loss": 3.0283, + "theoretical_loss": 3.6479913092125753, + "tokens_seen": 1003466752 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035147442326980945, + "loss": 2.9255, + "theoretical_loss": 3.6479686512586875, + "tokens_seen": 1003532288 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003514643931795386, + "loss": 2.8285, + "theoretical_loss": 3.647945995198717, + "tokens_seen": 1003597824 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003514543630892678, + "loss": 2.7626, + "theoretical_loss": 3.6479233410323824, + "tokens_seen": 1003663360 + }, + { + "epoch": 12.01, + "learning_rate": 0.000351444332998997, + "loss": 2.7977, + "theoretical_loss": 3.647900688759402, + "tokens_seen": 1003728896 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003514343029087262, + "loss": 2.6959, + "theoretical_loss": 3.647878038379493, + "tokens_seen": 1003794432 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035142427281845536, + "loss": 2.9365, + "theoretical_loss": 3.6478553898923742, + "tokens_seen": 1003859968 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035141424272818454, + "loss": 2.9409, + "theoretical_loss": 3.647832743297764, + "tokens_seen": 1003925504 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003514042126379137, + "loss": 2.9009, + "theoretical_loss": 3.6478100985953805, + "tokens_seen": 1003991040 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035139418254764295, + "loss": 2.8846, + "theoretical_loss": 3.647787455784942, + "tokens_seen": 1004056576 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003513841524573721, + "loss": 2.9278, + "theoretical_loss": 3.647764814866167, + "tokens_seen": 1004122112 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003513741223671013, + "loss": 2.8884, + "theoretical_loss": 3.6477421758387742, + "tokens_seen": 1004187648 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2394444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7734971046447754, + "objective/train/theoretical_loss": 3.647719538702482, + "objective/train/tokens_used": 1024713184, + "theoretical_loss": 3.647719538702482, + "tokens_seen": 1004253184 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003513640922768305, + "loss": 2.8861, + "theoretical_loss": 3.647719538702482, + "tokens_seen": 1004253184 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003513540621865597, + "loss": 2.9288, + "theoretical_loss": 3.6476969034570095, + "tokens_seen": 1004318720 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003513440320962889, + "loss": 2.9188, + "theoretical_loss": 3.6476742701020743, + "tokens_seen": 1004384256 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035133400200601804, + "loss": 2.8951, + "theoretical_loss": 3.6476516386373965, + "tokens_seen": 1004449792 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003513239719157473, + "loss": 2.8454, + "theoretical_loss": 3.647629009062694, + "tokens_seen": 1004515328 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035131394182547646, + "loss": 2.8142, + "theoretical_loss": 3.6476063813776864, + "tokens_seen": 1004580864 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035130391173520564, + "loss": 2.9291, + "theoretical_loss": 3.6475837555820916, + "tokens_seen": 1004646400 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003512938816449348, + "loss": 2.8919, + "theoretical_loss": 3.6475611316756296, + "tokens_seen": 1004711936 + }, + { + "epoch": 12.01, + "learning_rate": 0.000351283851554664, + "loss": 2.8781, + "theoretical_loss": 3.647538509658019, + "tokens_seen": 1004777472 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003512738214643932, + "loss": 2.7013, + "theoretical_loss": 3.6475158895289788, + "tokens_seen": 1004843008 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003512637913741224, + "loss": 2.8211, + "theoretical_loss": 3.647493271288228, + "tokens_seen": 1004908544 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035125376128385154, + "loss": 2.9026, + "theoretical_loss": 3.647470654935487, + "tokens_seen": 1004974080 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003512437311935808, + "loss": 2.9181, + "theoretical_loss": 3.647448040470474, + "tokens_seen": 1005039616 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035123370110330996, + "loss": 2.8597, + "theoretical_loss": 3.6474254278929084, + "tokens_seen": 1005105152 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035122367101303914, + "loss": 2.8363, + "theoretical_loss": 3.64740281720251, + "tokens_seen": 1005170688 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003512136409227683, + "loss": 2.8386, + "theoretical_loss": 3.6473802083989986, + "tokens_seen": 1005236224 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003512036108324975, + "loss": 2.8211, + "theoretical_loss": 3.6473576014820925, + "tokens_seen": 1005301760 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003511935807422267, + "loss": 2.8546, + "theoretical_loss": 3.6473349964515123, + "tokens_seen": 1005367296 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003511835506519559, + "loss": 2.9673, + "theoretical_loss": 3.6473123933069775, + "tokens_seen": 1005432832 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035117352056168505, + "loss": 2.9098, + "theoretical_loss": 3.647289792048208, + "tokens_seen": 1005498368 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003511634904714143, + "loss": 2.8694, + "theoretical_loss": 3.6472671926749234, + "tokens_seen": 1005563904 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003511534603811434, + "loss": 2.8465, + "theoretical_loss": 3.6472445951868426, + "tokens_seen": 1005629440 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035114343029087264, + "loss": 2.7937, + "theoretical_loss": 3.647221999583687, + "tokens_seen": 1005694976 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003511334002006018, + "loss": 2.9479, + "theoretical_loss": 3.647199405865176, + "tokens_seen": 1005760512 + }, + { + "epoch": 12.01, + "learning_rate": 0.000351123370110331, + "loss": 2.8849, + "theoretical_loss": 3.6471768140310297, + "tokens_seen": 1005826048 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2397129, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8947620391845703, + "objective/train/theoretical_loss": 3.6471542240809676, + "objective/train/tokens_used": 1026351584, + "theoretical_loss": 3.6471542240809676, + "tokens_seen": 1005891584 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003511133400200602, + "loss": 2.8947, + "theoretical_loss": 3.6471542240809676, + "tokens_seen": 1005891584 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035110330992978937, + "loss": 2.9052, + "theoretical_loss": 3.6471316360147106, + "tokens_seen": 1005957120 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035109327983951855, + "loss": 2.9079, + "theoretical_loss": 3.647109049831978, + "tokens_seen": 1006022656 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003510832497492478, + "loss": 2.9019, + "theoretical_loss": 3.6470864655324915, + "tokens_seen": 1006088192 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003510732196589769, + "loss": 2.9533, + "theoretical_loss": 3.64706388311597, + "tokens_seen": 1006153728 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035106318956870615, + "loss": 2.8373, + "theoretical_loss": 3.647041302582134, + "tokens_seen": 1006219264 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035105315947843533, + "loss": 2.8906, + "theoretical_loss": 3.647018723930705, + "tokens_seen": 1006284800 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003510431293881645, + "loss": 2.8145, + "theoretical_loss": 3.6469961471614023, + "tokens_seen": 1006350336 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003510330992978937, + "loss": 2.8387, + "theoretical_loss": 3.6469735722739474, + "tokens_seen": 1006415872 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035102306920762287, + "loss": 2.9317, + "theoretical_loss": 3.64695099926806, + "tokens_seen": 1006481408 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035101303911735205, + "loss": 2.9666, + "theoretical_loss": 3.646928428143462, + "tokens_seen": 1006546944 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003510030090270813, + "loss": 2.9284, + "theoretical_loss": 3.646905858899873, + "tokens_seen": 1006612480 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003509929789368104, + "loss": 3.0101, + "theoretical_loss": 3.646883291537014, + "tokens_seen": 1006678016 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035098294884653965, + "loss": 2.919, + "theoretical_loss": 3.646860726054606, + "tokens_seen": 1006743552 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003509729187562688, + "loss": 2.9386, + "theoretical_loss": 3.6468381624523705, + "tokens_seen": 1006809088 + }, + { + "epoch": 12.01, + "learning_rate": 0.000350962888665998, + "loss": 2.8182, + "theoretical_loss": 3.6468156007300276, + "tokens_seen": 1006874624 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003509528585757272, + "loss": 2.7938, + "theoretical_loss": 3.6467930408872986, + "tokens_seen": 1006940160 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003509428284854564, + "loss": 2.808, + "theoretical_loss": 3.646770482923905, + "tokens_seen": 1007005696 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035093279839518556, + "loss": 2.9465, + "theoretical_loss": 3.6467479268395673, + "tokens_seen": 1007071232 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035092276830491474, + "loss": 2.8457, + "theoretical_loss": 3.646725372634007, + "tokens_seen": 1007136768 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003509127382146439, + "loss": 2.8875, + "theoretical_loss": 3.646702820306946, + "tokens_seen": 1007202304 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035090270812437315, + "loss": 2.9342, + "theoretical_loss": 3.6466802698581042, + "tokens_seen": 1007267840 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003508926780341023, + "loss": 2.8878, + "theoretical_loss": 3.6466577212872044, + "tokens_seen": 1007333376 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003508826479438315, + "loss": 2.8995, + "theoretical_loss": 3.646635174593967, + "tokens_seen": 1007398912 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003508726178535607, + "loss": 2.9509, + "theoretical_loss": 3.6466126297781143, + "tokens_seen": 1007464448 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2400395, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9684865474700928, + "objective/train/theoretical_loss": 3.6465900868393675, + "objective/train/tokens_used": 1027989984, + "theoretical_loss": 3.6465900868393675, + "tokens_seen": 1007529984 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003508625877632899, + "loss": 2.8007, + "theoretical_loss": 3.6465900868393675, + "tokens_seen": 1007529984 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035085255767301906, + "loss": 2.9615, + "theoretical_loss": 3.6465675457774482, + "tokens_seen": 1007595520 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035084252758274824, + "loss": 2.8462, + "theoretical_loss": 3.6465450065920777, + "tokens_seen": 1007661056 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003508324974924774, + "loss": 2.9281, + "theoretical_loss": 3.6465224692829787, + "tokens_seen": 1007726592 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035082246740220666, + "loss": 2.9243, + "theoretical_loss": 3.6464999338498725, + "tokens_seen": 1007792128 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003508124373119358, + "loss": 2.8274, + "theoretical_loss": 3.646477400292481, + "tokens_seen": 1007857664 + }, + { + "epoch": 12.01, + "learning_rate": 0.000350802407221665, + "loss": 2.843, + "theoretical_loss": 3.646454868610526, + "tokens_seen": 1007923200 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035079237713139415, + "loss": 2.9173, + "theoretical_loss": 3.64643233880373, + "tokens_seen": 1007988736 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003507823470411234, + "loss": 2.899, + "theoretical_loss": 3.6464098108718135, + "tokens_seen": 1008054272 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035077231695085256, + "loss": 2.8364, + "theoretical_loss": 3.6463872848145007, + "tokens_seen": 1008119808 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035076228686058174, + "loss": 2.8854, + "theoretical_loss": 3.6463647606315126, + "tokens_seen": 1008185344 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003507522567703109, + "loss": 2.8169, + "theoretical_loss": 3.646342238322571, + "tokens_seen": 1008250880 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035074222668004016, + "loss": 2.897, + "theoretical_loss": 3.6463197178873994, + "tokens_seen": 1008316416 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003507321965897693, + "loss": 2.8137, + "theoretical_loss": 3.646297199325719, + "tokens_seen": 1008381952 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003507221664994985, + "loss": 2.8784, + "theoretical_loss": 3.6462746826372534, + "tokens_seen": 1008447488 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035071213640922765, + "loss": 2.8833, + "theoretical_loss": 3.646252167821724, + "tokens_seen": 1008513024 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003507021063189569, + "loss": 2.9411, + "theoretical_loss": 3.6462296548788533, + "tokens_seen": 1008578560 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035069207622868607, + "loss": 2.8548, + "theoretical_loss": 3.6462071438083647, + "tokens_seen": 1008644096 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035068204613841525, + "loss": 2.9162, + "theoretical_loss": 3.6461846346099804, + "tokens_seen": 1008709632 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035067201604814443, + "loss": 2.6762, + "theoretical_loss": 3.6461621272834233, + "tokens_seen": 1008775168 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003506619859578736, + "loss": 2.8344, + "theoretical_loss": 3.6461396218284157, + "tokens_seen": 1008840704 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003506519558676028, + "loss": 2.9508, + "theoretical_loss": 3.6461171182446805, + "tokens_seen": 1008906240 + }, + { + "epoch": 12.01, + "learning_rate": 0.000350641925777332, + "loss": 2.9939, + "theoretical_loss": 3.646094616531941, + "tokens_seen": 1008971776 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035063189568706115, + "loss": 2.9066, + "theoretical_loss": 3.6460721166899193, + "tokens_seen": 1009037312 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003506218655967904, + "loss": 2.9051, + "theoretical_loss": 3.6460496187183393, + "tokens_seen": 1009102848 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2404182, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7902474403381348, + "objective/train/theoretical_loss": 3.646027122616924, + "objective/train/tokens_used": 1029628384, + "theoretical_loss": 3.646027122616924, + "tokens_seen": 1009168384 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003506118355065195, + "loss": 2.8008, + "theoretical_loss": 3.646027122616924, + "tokens_seen": 1009168384 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035060180541624875, + "loss": 2.9176, + "theoretical_loss": 3.6460046283853957, + "tokens_seen": 1009233920 + }, + { + "epoch": 12.01, + "learning_rate": 0.000350591775325978, + "loss": 2.9109, + "theoretical_loss": 3.645982136023478, + "tokens_seen": 1009299456 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003505817452357071, + "loss": 2.8966, + "theoretical_loss": 3.645959645530895, + "tokens_seen": 1009364992 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035057171514543635, + "loss": 2.64, + "theoretical_loss": 3.645937156907368, + "tokens_seen": 1009430528 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035056168505516553, + "loss": 2.9007, + "theoretical_loss": 3.6459146701526226, + "tokens_seen": 1009496064 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003505516549648947, + "loss": 2.8639, + "theoretical_loss": 3.64589218526638, + "tokens_seen": 1009561600 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003505416248746239, + "loss": 2.864, + "theoretical_loss": 3.6458697022483655, + "tokens_seen": 1009627136 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035053159478435307, + "loss": 2.9202, + "theoretical_loss": 3.6458472210983017, + "tokens_seen": 1009692672 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035052156469408225, + "loss": 2.8609, + "theoretical_loss": 3.6458247418159124, + "tokens_seen": 1009758208 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003505115346038115, + "loss": 2.8044, + "theoretical_loss": 3.6458022644009214, + "tokens_seen": 1009823744 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003505015045135406, + "loss": 2.8568, + "theoretical_loss": 3.6457797888530514, + "tokens_seen": 1009889280 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035049147442326985, + "loss": 2.7857, + "theoretical_loss": 3.6457573151720277, + "tokens_seen": 1009954816 + }, + { + "epoch": 12.01, + "learning_rate": 0.000350481444332999, + "loss": 3.0032, + "theoretical_loss": 3.645734843357573, + "tokens_seen": 1010020352 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003504714142427282, + "loss": 2.9162, + "theoretical_loss": 3.6457123734094123, + "tokens_seen": 1010085888 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003504613841524574, + "loss": 2.9104, + "theoretical_loss": 3.645689905327268, + "tokens_seen": 1010151424 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003504513540621866, + "loss": 2.8378, + "theoretical_loss": 3.645667439110865, + "tokens_seen": 1010216960 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035044132397191576, + "loss": 2.8499, + "theoretical_loss": 3.645644974759927, + "tokens_seen": 1010282496 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035043129388164494, + "loss": 2.9136, + "theoretical_loss": 3.645622512274179, + "tokens_seen": 1010348032 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003504212637913741, + "loss": 2.9427, + "theoretical_loss": 3.6456000516533438, + "tokens_seen": 1010413568 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035041123370110335, + "loss": 2.9575, + "theoretical_loss": 3.6455775928971463, + "tokens_seen": 1010479104 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003504012036108325, + "loss": 2.8845, + "theoretical_loss": 3.645555136005311, + "tokens_seen": 1010544640 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003503911735205617, + "loss": 2.9199, + "theoretical_loss": 3.645532680977562, + "tokens_seen": 1010610176 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003503811434302909, + "loss": 2.8801, + "theoretical_loss": 3.6455102278136238, + "tokens_seen": 1010675712 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003503711133400201, + "loss": 2.9038, + "theoretical_loss": 3.645487776513221, + "tokens_seen": 1010741248 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2408685, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.016160249710083, + "objective/train/theoretical_loss": 3.6454653270760775, + "objective/train/tokens_used": 1031266784, + "theoretical_loss": 3.6454653270760775, + "tokens_seen": 1010806784 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035036108324974926, + "loss": 2.9604, + "theoretical_loss": 3.6454653270760775, + "tokens_seen": 1010806784 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035035105315947844, + "loss": 2.871, + "theoretical_loss": 3.6454428795019185, + "tokens_seen": 1010872320 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003503410230692076, + "loss": 2.9777, + "theoretical_loss": 3.645420433790468, + "tokens_seen": 1010937856 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035033099297893686, + "loss": 2.9242, + "theoretical_loss": 3.6453979899414515, + "tokens_seen": 1011003392 + }, + { + "epoch": 12.01, + "learning_rate": 0.000350320962888666, + "loss": 2.9666, + "theoretical_loss": 3.6453755479545933, + "tokens_seen": 1011068928 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003503109327983952, + "loss": 2.8684, + "theoretical_loss": 3.645353107829618, + "tokens_seen": 1011134464 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035030090270812435, + "loss": 2.8595, + "theoretical_loss": 3.645330669566251, + "tokens_seen": 1011200000 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003502908726178536, + "loss": 2.9931, + "theoretical_loss": 3.645308233164217, + "tokens_seen": 1011265536 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035028084252758276, + "loss": 2.9316, + "theoretical_loss": 3.6452857986232408, + "tokens_seen": 1011331072 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035027081243731194, + "loss": 2.8559, + "theoretical_loss": 3.6452633659430473, + "tokens_seen": 1011396608 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003502607823470411, + "loss": 2.9954, + "theoretical_loss": 3.645240935123363, + "tokens_seen": 1011462144 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035025075225677036, + "loss": 2.8875, + "theoretical_loss": 3.6452185061639106, + "tokens_seen": 1011527680 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003502407221664995, + "loss": 2.8517, + "theoretical_loss": 3.6451960790644176, + "tokens_seen": 1011593216 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003502306920762287, + "loss": 2.8364, + "theoretical_loss": 3.6451736538246076, + "tokens_seen": 1011658752 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035022066198595785, + "loss": 2.886, + "theoretical_loss": 3.645151230444207, + "tokens_seen": 1011724288 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003502106318956871, + "loss": 2.852, + "theoretical_loss": 3.645128808922941, + "tokens_seen": 1011789824 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035020060180541627, + "loss": 2.9504, + "theoretical_loss": 3.645106389260535, + "tokens_seen": 1011855360 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035019057171514545, + "loss": 2.9927, + "theoretical_loss": 3.6450839714567143, + "tokens_seen": 1011920896 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035018054162487463, + "loss": 3.0178, + "theoretical_loss": 3.6450615555112047, + "tokens_seen": 1011986432 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003501705115346038, + "loss": 3.0419, + "theoretical_loss": 3.6450391414237315, + "tokens_seen": 1012051968 + }, + { + "epoch": 12.01, + "learning_rate": 0.000350160481444333, + "loss": 2.8726, + "theoretical_loss": 3.6450167291940208, + "tokens_seen": 1012117504 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003501504513540622, + "loss": 2.8607, + "theoretical_loss": 3.644994318821798, + "tokens_seen": 1012183040 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035014042126379135, + "loss": 2.875, + "theoretical_loss": 3.6449719103067895, + "tokens_seen": 1012248576 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003501303911735206, + "loss": 2.8828, + "theoretical_loss": 3.6449495036487205, + "tokens_seen": 1012314112 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003501203610832497, + "loss": 2.9444, + "theoretical_loss": 3.644927098847317, + "tokens_seen": 1012379648 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2411820, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.818852424621582, + "objective/train/theoretical_loss": 3.644904695902305, + "objective/train/tokens_used": 1032905184, + "theoretical_loss": 3.644904695902305, + "tokens_seen": 1012445184 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035011033099297895, + "loss": 2.8711, + "theoretical_loss": 3.644904695902305, + "tokens_seen": 1012445184 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035010030090270813, + "loss": 2.8943, + "theoretical_loss": 3.644882294813411, + "tokens_seen": 1012510720 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003500902708124373, + "loss": 2.8639, + "theoretical_loss": 3.644859895580361, + "tokens_seen": 1012576256 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003500802407221665, + "loss": 2.7904, + "theoretical_loss": 3.6448374982028806, + "tokens_seen": 1012641792 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035007021063189573, + "loss": 2.8923, + "theoretical_loss": 3.6448151026806963, + "tokens_seen": 1012707328 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035006018054162486, + "loss": 2.8233, + "theoretical_loss": 3.644792709013534, + "tokens_seen": 1012772864 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003500501504513541, + "loss": 2.9555, + "theoretical_loss": 3.6447703172011208, + "tokens_seen": 1012838400 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003500401203610832, + "loss": 2.9756, + "theoretical_loss": 3.6447479272431824, + "tokens_seen": 1012903936 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035003009027081245, + "loss": 2.8695, + "theoretical_loss": 3.644725539139446, + "tokens_seen": 1012969472 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035002006018054163, + "loss": 2.8706, + "theoretical_loss": 3.6447031528896376, + "tokens_seen": 1013035008 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003500100300902708, + "loss": 2.9678, + "theoretical_loss": 3.6446807684934837, + "tokens_seen": 1013100544 + }, + { + "epoch": 12.01, + "learning_rate": 0.00035, + "loss": 2.7381, + "theoretical_loss": 3.6446583859507107, + "tokens_seen": 1013166080 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003499899699097292, + "loss": 2.9302, + "theoretical_loss": 3.644636005261046, + "tokens_seen": 1013231616 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034997993981945836, + "loss": 2.8432, + "theoretical_loss": 3.6446136264242153, + "tokens_seen": 1013297152 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003499699097291876, + "loss": 2.8927, + "theoretical_loss": 3.644591249439947, + "tokens_seen": 1013362688 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003499598796389167, + "loss": 2.8779, + "theoretical_loss": 3.644568874307966, + "tokens_seen": 1013428224 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034994984954864596, + "loss": 2.78, + "theoretical_loss": 3.6445465010280005, + "tokens_seen": 1013493760 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003499398194583751, + "loss": 2.8404, + "theoretical_loss": 3.6445241295997772, + "tokens_seen": 1013559296 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003499297893681043, + "loss": 2.8619, + "theoretical_loss": 3.644501760023023, + "tokens_seen": 1013624832 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003499197592778335, + "loss": 2.8436, + "theoretical_loss": 3.644479392297465, + "tokens_seen": 1013690368 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003499097291875627, + "loss": 2.9328, + "theoretical_loss": 3.64445702642283, + "tokens_seen": 1013755904 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034989969909729186, + "loss": 2.908, + "theoretical_loss": 3.6444346623988464, + "tokens_seen": 1013821440 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003498896690070211, + "loss": 2.8812, + "theoretical_loss": 3.64441230022524, + "tokens_seen": 1013886976 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003498796389167502, + "loss": 2.9275, + "theoretical_loss": 3.644389939901739, + "tokens_seen": 1013952512 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034986960882647946, + "loss": 2.845, + "theoretical_loss": 3.6443675814280705, + "tokens_seen": 1014018048 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2416661, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9006659984588623, + "objective/train/theoretical_loss": 3.6443452248039616, + "objective/train/tokens_used": 1034543584, + "theoretical_loss": 3.6443452248039616, + "tokens_seen": 1014083584 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003498595787362086, + "loss": 2.8896, + "theoretical_loss": 3.6443452248039616, + "tokens_seen": 1014083584 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003498495486459378, + "loss": 2.8918, + "theoretical_loss": 3.6443228700291406, + "tokens_seen": 1014149120 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034983951855566706, + "loss": 2.7839, + "theoretical_loss": 3.6443005171033342, + "tokens_seen": 1014214656 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003498294884653962, + "loss": 2.8164, + "theoretical_loss": 3.6442781660262704, + "tokens_seen": 1014280192 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003498194583751254, + "loss": 3.0533, + "theoretical_loss": 3.644255816797677, + "tokens_seen": 1014345728 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034980942828485455, + "loss": 2.9261, + "theoretical_loss": 3.644233469417281, + "tokens_seen": 1014411264 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003497993981945838, + "loss": 2.8402, + "theoretical_loss": 3.644211123884811, + "tokens_seen": 1014476800 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034978936810431296, + "loss": 2.9287, + "theoretical_loss": 3.6441887801999946, + "tokens_seen": 1014542336 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034977933801404214, + "loss": 2.7918, + "theoretical_loss": 3.6441664383625594, + "tokens_seen": 1014607872 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003497693079237713, + "loss": 2.87, + "theoretical_loss": 3.6441440983722337, + "tokens_seen": 1014673408 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034975927783350056, + "loss": 2.9794, + "theoretical_loss": 3.644121760228745, + "tokens_seen": 1014738944 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003497492477432297, + "loss": 2.94, + "theoretical_loss": 3.6440994239318223, + "tokens_seen": 1014804480 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003497392176529589, + "loss": 2.8887, + "theoretical_loss": 3.644077089481193, + "tokens_seen": 1014870016 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034972918756268805, + "loss": 2.7529, + "theoretical_loss": 3.644054756876585, + "tokens_seen": 1014935552 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003497191574724173, + "loss": 2.9532, + "theoretical_loss": 3.644032426117727, + "tokens_seen": 1015001088 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034970912738214647, + "loss": 2.8976, + "theoretical_loss": 3.644010097204348, + "tokens_seen": 1015066624 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034969909729187565, + "loss": 2.867, + "theoretical_loss": 3.643987770136175, + "tokens_seen": 1015132160 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034968906720160483, + "loss": 2.8307, + "theoretical_loss": 3.6439654449129373, + "tokens_seen": 1015197696 + }, + { + "epoch": 12.01, + "learning_rate": 0.000349679037111334, + "loss": 2.8365, + "theoretical_loss": 3.6439431215343623, + "tokens_seen": 1015263232 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003496690070210632, + "loss": 2.9312, + "theoretical_loss": 3.6439208000001795, + "tokens_seen": 1015328768 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003496589769307924, + "loss": 2.8371, + "theoretical_loss": 3.643898480310118, + "tokens_seen": 1015394304 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034964894684052155, + "loss": 2.9368, + "theoretical_loss": 3.643876162463905, + "tokens_seen": 1015459840 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003496389167502508, + "loss": 2.8633, + "theoretical_loss": 3.64385384646127, + "tokens_seen": 1015525376 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003496288866599799, + "loss": 2.8465, + "theoretical_loss": 3.643831532301942, + "tokens_seen": 1015590912 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034961885656970915, + "loss": 2.9587, + "theoretical_loss": 3.6438092199856493, + "tokens_seen": 1015656448 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2419664, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8936917781829834, + "objective/train/theoretical_loss": 3.643786909512121, + "objective/train/tokens_used": 1036181984, + "theoretical_loss": 3.643786909512121, + "tokens_seen": 1015721984 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034960882647943833, + "loss": 2.8014, + "theoretical_loss": 3.643786909512121, + "tokens_seen": 1015721984 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003495987963891675, + "loss": 2.8935, + "theoretical_loss": 3.6437646008810853, + "tokens_seen": 1015787520 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003495887662988967, + "loss": 2.8119, + "theoretical_loss": 3.643742294092273, + "tokens_seen": 1015853056 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034957873620862593, + "loss": 2.8663, + "theoretical_loss": 3.643719989145411, + "tokens_seen": 1015918592 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034956870611835506, + "loss": 2.8095, + "theoretical_loss": 3.6436976860402295, + "tokens_seen": 1015984128 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003495586760280843, + "loss": 2.9781, + "theoretical_loss": 3.643675384776458, + "tokens_seen": 1016049664 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003495486459378134, + "loss": 2.9592, + "theoretical_loss": 3.6436530853538245, + "tokens_seen": 1016115200 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034953861584754265, + "loss": 2.8566, + "theoretical_loss": 3.643630787772059, + "tokens_seen": 1016180736 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034952858575727183, + "loss": 2.9673, + "theoretical_loss": 3.643608492030891, + "tokens_seen": 1016246272 + }, + { + "epoch": 12.01, + "learning_rate": 0.000349518555667001, + "loss": 2.9341, + "theoretical_loss": 3.64358619813005, + "tokens_seen": 1016311808 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003495085255767302, + "loss": 2.8703, + "theoretical_loss": 3.6435639060692653, + "tokens_seen": 1016377344 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003494984954864594, + "loss": 2.9152, + "theoretical_loss": 3.6435416158482656, + "tokens_seen": 1016442880 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034948846539618856, + "loss": 3.0082, + "theoretical_loss": 3.643519327466781, + "tokens_seen": 1016508416 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003494784353059178, + "loss": 2.8922, + "theoretical_loss": 3.643497040924542, + "tokens_seen": 1016573952 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003494684052156469, + "loss": 2.8545, + "theoretical_loss": 3.643474756221277, + "tokens_seen": 1016639488 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034945837512537616, + "loss": 2.9069, + "theoretical_loss": 3.6434524733567164, + "tokens_seen": 1016705024 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003494483450351053, + "loss": 2.8098, + "theoretical_loss": 3.643430192330589, + "tokens_seen": 1016770560 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003494383149448345, + "loss": 2.9228, + "theoretical_loss": 3.6434079131426262, + "tokens_seen": 1016836096 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003494282848545637, + "loss": 2.9065, + "theoretical_loss": 3.6433856357925567, + "tokens_seen": 1016901632 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003494182547642929, + "loss": 2.8522, + "theoretical_loss": 3.6433633602801114, + "tokens_seen": 1016967168 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034940822467402206, + "loss": 2.9203, + "theoretical_loss": 3.6433410866050195, + "tokens_seen": 1017032704 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003493981945837513, + "loss": 2.9645, + "theoretical_loss": 3.643318814767011, + "tokens_seen": 1017098240 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003493881644934804, + "loss": 2.7756, + "theoretical_loss": 3.6432965447658168, + "tokens_seen": 1017163776 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034937813440320966, + "loss": 2.9327, + "theoretical_loss": 3.6432742766011663, + "tokens_seen": 1017229312 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003493681043129388, + "loss": 2.8636, + "theoretical_loss": 3.64325201027279, + "tokens_seen": 1017294848 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2423556, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0627188682556152, + "objective/train/theoretical_loss": 3.6432297457804186, + "objective/train/tokens_used": 1037820384, + "theoretical_loss": 3.6432297457804186, + "tokens_seen": 1017360384 + }, + { + "epoch": 12.01, + "learning_rate": 0.000349358074222668, + "loss": 2.9727, + "theoretical_loss": 3.6432297457804186, + "tokens_seen": 1017360384 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003493480441323972, + "loss": 2.8907, + "theoretical_loss": 3.643207483123782, + "tokens_seen": 1017425920 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003493380140421264, + "loss": 2.8025, + "theoretical_loss": 3.643185222302611, + "tokens_seen": 1017491456 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034932798395185557, + "loss": 2.8418, + "theoretical_loss": 3.6431629633166356, + "tokens_seen": 1017556992 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034931795386158475, + "loss": 2.8405, + "theoretical_loss": 3.6431407061655863, + "tokens_seen": 1017622528 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034930792377131393, + "loss": 2.9211, + "theoretical_loss": 3.643118450849194, + "tokens_seen": 1017688064 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034929789368104316, + "loss": 2.9036, + "theoretical_loss": 3.6430961973671896, + "tokens_seen": 1017753600 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003492878635907723, + "loss": 2.8145, + "theoretical_loss": 3.643073945719303, + "tokens_seen": 1017819136 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003492778335005015, + "loss": 2.7716, + "theoretical_loss": 3.6430516959052657, + "tokens_seen": 1017884672 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034926780341023065, + "loss": 2.8892, + "theoretical_loss": 3.6430294479248078, + "tokens_seen": 1017950208 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003492577733199599, + "loss": 2.8149, + "theoretical_loss": 3.643007201777661, + "tokens_seen": 1018015744 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034924774322968907, + "loss": 2.8311, + "theoretical_loss": 3.6429849574635558, + "tokens_seen": 1018081280 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034923771313941825, + "loss": 2.8759, + "theoretical_loss": 3.642962714982223, + "tokens_seen": 1018146816 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034922768304914743, + "loss": 2.8585, + "theoretical_loss": 3.642940474333394, + "tokens_seen": 1018212352 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034921765295887667, + "loss": 2.8704, + "theoretical_loss": 3.6429182355167993, + "tokens_seen": 1018277888 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003492076228686058, + "loss": 2.8973, + "theoretical_loss": 3.642895998532171, + "tokens_seen": 1018343424 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034919759277833503, + "loss": 3.0071, + "theoretical_loss": 3.6428737633792387, + "tokens_seen": 1018408960 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034918756268806416, + "loss": 2.9021, + "theoretical_loss": 3.6428515300577358, + "tokens_seen": 1018474496 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003491775325977934, + "loss": 2.9634, + "theoretical_loss": 3.642829298567392, + "tokens_seen": 1018540032 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034916750250752257, + "loss": 2.9015, + "theoretical_loss": 3.6428070689079397, + "tokens_seen": 1018605568 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034915747241725175, + "loss": 2.8564, + "theoretical_loss": 3.6427848410791093, + "tokens_seen": 1018671104 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034914744232698093, + "loss": 2.7547, + "theoretical_loss": 3.642762615080633, + "tokens_seen": 1018736640 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003491374122367101, + "loss": 2.747, + "theoretical_loss": 3.642740390912242, + "tokens_seen": 1018802176 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003491273821464393, + "loss": 2.792, + "theoretical_loss": 3.6427181685736687, + "tokens_seen": 1018867712 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034911735205616853, + "loss": 2.8214, + "theoretical_loss": 3.6426959480646435, + "tokens_seen": 1018933248 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2428250, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8995399475097656, + "objective/train/theoretical_loss": 3.642673729384899, + "objective/train/tokens_used": 1039458784, + "theoretical_loss": 3.642673729384899, + "tokens_seen": 1018998784 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034910732196589766, + "loss": 2.7009, + "theoretical_loss": 3.642673729384899, + "tokens_seen": 1018998784 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003490972918756269, + "loss": 2.9286, + "theoretical_loss": 3.642651512534167, + "tokens_seen": 1019064320 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034908726178535613, + "loss": 2.7919, + "theoretical_loss": 3.642629297512179, + "tokens_seen": 1019129856 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034907723169508526, + "loss": 2.8967, + "theoretical_loss": 3.642607084318666, + "tokens_seen": 1019195392 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003490672016048145, + "loss": 2.976, + "theoretical_loss": 3.6425848729533614, + "tokens_seen": 1019260928 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003490571715145436, + "loss": 2.8823, + "theoretical_loss": 3.642562663415997, + "tokens_seen": 1019326464 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034904714142427285, + "loss": 2.8882, + "theoretical_loss": 3.6425404557063046, + "tokens_seen": 1019392000 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034903711133400203, + "loss": 2.7539, + "theoretical_loss": 3.642518249824016, + "tokens_seen": 1019457536 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003490270812437312, + "loss": 2.8846, + "theoretical_loss": 3.6424960457688638, + "tokens_seen": 1019523072 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003490170511534604, + "loss": 2.8501, + "theoretical_loss": 3.64247384354058, + "tokens_seen": 1019588608 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003490070210631896, + "loss": 2.8632, + "theoretical_loss": 3.6424516431388962, + "tokens_seen": 1019654144 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034899699097291876, + "loss": 2.824, + "theoretical_loss": 3.6424294445635463, + "tokens_seen": 1019719680 + }, + { + "epoch": 12.01, + "learning_rate": 0.000348986960882648, + "loss": 2.8762, + "theoretical_loss": 3.642407247814262, + "tokens_seen": 1019785216 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003489769307923771, + "loss": 2.8023, + "theoretical_loss": 3.642385052890775, + "tokens_seen": 1019850752 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034896690070210636, + "loss": 2.9031, + "theoretical_loss": 3.6423628597928186, + "tokens_seen": 1019916288 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003489568706118355, + "loss": 2.8972, + "theoretical_loss": 3.6423406685201254, + "tokens_seen": 1019981824 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003489468405215647, + "loss": 2.8716, + "theoretical_loss": 3.6423184790724275, + "tokens_seen": 1020047360 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003489368104312939, + "loss": 2.984, + "theoretical_loss": 3.642296291449458, + "tokens_seen": 1020112896 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003489267803410231, + "loss": 2.9121, + "theoretical_loss": 3.6422741056509493, + "tokens_seen": 1020178432 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034891675025075226, + "loss": 2.9288, + "theoretical_loss": 3.6422519216766345, + "tokens_seen": 1020243968 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003489067201604815, + "loss": 2.8306, + "theoretical_loss": 3.6422297395262464, + "tokens_seen": 1020309504 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003488966900702106, + "loss": 2.9813, + "theoretical_loss": 3.6422075591995178, + "tokens_seen": 1020375040 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034888665997993986, + "loss": 2.8852, + "theoretical_loss": 3.6421853806961817, + "tokens_seen": 1020440576 + }, + { + "epoch": 12.01, + "learning_rate": 0.000348876629889669, + "loss": 2.8721, + "theoretical_loss": 3.6421632040159713, + "tokens_seen": 1020506112 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003488665997993982, + "loss": 2.8781, + "theoretical_loss": 3.642141029158619, + "tokens_seen": 1020571648 + }, + { + "epoch": 12.01, + "objective/train/docs_used": 2431286, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.828155279159546, + "objective/train/theoretical_loss": 3.642118856123859, + "objective/train/tokens_used": 1041097184, + "theoretical_loss": 3.642118856123859, + "tokens_seen": 1020637184 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003488565697091274, + "loss": 2.7923, + "theoretical_loss": 3.642118856123859, + "tokens_seen": 1020637184 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003488465396188566, + "loss": 2.9027, + "theoretical_loss": 3.642096684911423, + "tokens_seen": 1020702720 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034883650952858577, + "loss": 2.726, + "theoretical_loss": 3.6420745155210463, + "tokens_seen": 1020768256 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034882647943831495, + "loss": 2.9255, + "theoretical_loss": 3.6420523479524602, + "tokens_seen": 1020833792 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034881644934804413, + "loss": 2.9851, + "theoretical_loss": 3.6420301822053993, + "tokens_seen": 1020899328 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034880641925777336, + "loss": 2.8233, + "theoretical_loss": 3.642008018279596, + "tokens_seen": 1020964864 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003487963891675025, + "loss": 2.8453, + "theoretical_loss": 3.6419858561747844, + "tokens_seen": 1021030400 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003487863590772317, + "loss": 2.9627, + "theoretical_loss": 3.6419636958906985, + "tokens_seen": 1021095936 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034877632898696085, + "loss": 2.9476, + "theoretical_loss": 3.641941537427071, + "tokens_seen": 1021161472 + }, + { + "epoch": 12.01, + "learning_rate": 0.0003487662988966901, + "loss": 2.9233, + "theoretical_loss": 3.641919380783636, + "tokens_seen": 1021227008 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034875626880641927, + "loss": 2.8866, + "theoretical_loss": 3.641897225960127, + "tokens_seen": 1021292544 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034874623871614845, + "loss": 2.9242, + "theoretical_loss": 3.6418750729562777, + "tokens_seen": 1021358080 + }, + { + "epoch": 12.01, + "learning_rate": 0.00034873620862587763, + "loss": 2.809, + "theoretical_loss": 3.6418529217718216, + "tokens_seen": 1021423616 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034872617853560687, + "loss": 2.8471, + "theoretical_loss": 3.6418307724064936, + "tokens_seen": 1021489152 + }, + { + "epoch": 12.02, + "learning_rate": 0.000348716148445336, + "loss": 2.7599, + "theoretical_loss": 3.641808624860027, + "tokens_seen": 1021554688 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034870611835506523, + "loss": 2.8939, + "theoretical_loss": 3.641786479132155, + "tokens_seen": 1021620224 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034869608826479436, + "loss": 2.7008, + "theoretical_loss": 3.641764335222613, + "tokens_seen": 1021685760 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003486860581745236, + "loss": 2.8254, + "theoretical_loss": 3.641742193131134, + "tokens_seen": 1021751296 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034867602808425277, + "loss": 2.8496, + "theoretical_loss": 3.641720052857453, + "tokens_seen": 1021816832 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034866599799398195, + "loss": 2.9086, + "theoretical_loss": 3.6416979144013037, + "tokens_seen": 1021882368 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034865596790371113, + "loss": 2.8897, + "theoretical_loss": 3.64167577776242, + "tokens_seen": 1021947904 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003486459378134403, + "loss": 2.8049, + "theoretical_loss": 3.641653642940537, + "tokens_seen": 1022013440 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003486359077231695, + "loss": 2.9149, + "theoretical_loss": 3.6416315099353884, + "tokens_seen": 1022078976 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034862587763289873, + "loss": 2.9254, + "theoretical_loss": 3.6416093787467094, + "tokens_seen": 1022144512 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034861584754262786, + "loss": 2.835, + "theoretical_loss": 3.641587249374233, + "tokens_seen": 1022210048 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2435994, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.872642755508423, + "objective/train/theoretical_loss": 3.6415651218176954, + "objective/train/tokens_used": 1042735584, + "theoretical_loss": 3.6415651218176954, + "tokens_seen": 1022275584 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003486058174523571, + "loss": 2.8645, + "theoretical_loss": 3.6415651218176954, + "tokens_seen": 1022275584 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003485957873620862, + "loss": 2.872, + "theoretical_loss": 3.64154299607683, + "tokens_seen": 1022341120 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034858575727181546, + "loss": 2.9332, + "theoretical_loss": 3.6415208721513723, + "tokens_seen": 1022406656 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034857572718154464, + "loss": 2.893, + "theoretical_loss": 3.6414987500410563, + "tokens_seen": 1022472192 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003485656970912738, + "loss": 2.9682, + "theoretical_loss": 3.6414766297456174, + "tokens_seen": 1022537728 + }, + { + "epoch": 12.02, + "learning_rate": 0.000348555667001003, + "loss": 2.8445, + "theoretical_loss": 3.64145451126479, + "tokens_seen": 1022603264 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034854563691073223, + "loss": 2.9061, + "theoretical_loss": 3.6414323945983087, + "tokens_seen": 1022668800 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034853560682046136, + "loss": 2.8272, + "theoretical_loss": 3.6414102797459087, + "tokens_seen": 1022734336 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003485255767301906, + "loss": 2.8222, + "theoretical_loss": 3.6413881667073253, + "tokens_seen": 1022799872 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003485155466399197, + "loss": 2.8708, + "theoretical_loss": 3.641366055482293, + "tokens_seen": 1022865408 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034850551654964896, + "loss": 2.9003, + "theoretical_loss": 3.6413439460705472, + "tokens_seen": 1022930944 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034849548645937814, + "loss": 2.9417, + "theoretical_loss": 3.6413218384718227, + "tokens_seen": 1022996480 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003484854563691073, + "loss": 2.9795, + "theoretical_loss": 3.641299732685855, + "tokens_seen": 1023062016 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003484754262788365, + "loss": 2.8483, + "theoretical_loss": 3.6412776287123796, + "tokens_seen": 1023127552 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003484653961885657, + "loss": 3.0006, + "theoretical_loss": 3.6412555265511317, + "tokens_seen": 1023193088 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034845536609829486, + "loss": 2.9636, + "theoretical_loss": 3.641233426201846, + "tokens_seen": 1023258624 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003484453360080241, + "loss": 2.838, + "theoretical_loss": 3.641211327664258, + "tokens_seen": 1023324160 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003484353059177532, + "loss": 2.9468, + "theoretical_loss": 3.6411892309381044, + "tokens_seen": 1023389696 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034842527582748246, + "loss": 2.9074, + "theoretical_loss": 3.6411671360231193, + "tokens_seen": 1023455232 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034841524573721164, + "loss": 2.8536, + "theoretical_loss": 3.6411450429190393, + "tokens_seen": 1023520768 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003484052156469408, + "loss": 2.8818, + "theoretical_loss": 3.641122951625599, + "tokens_seen": 1023586304 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034839518555667, + "loss": 2.9411, + "theoretical_loss": 3.641100862142535, + "tokens_seen": 1023651840 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003483851554663992, + "loss": 2.8911, + "theoretical_loss": 3.641078774469583, + "tokens_seen": 1023717376 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034837512537612837, + "loss": 2.9089, + "theoretical_loss": 3.6410566886064784, + "tokens_seen": 1023782912 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003483650952858576, + "loss": 2.8461, + "theoretical_loss": 3.6410346045529574, + "tokens_seen": 1023848448 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2439028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.875286340713501, + "objective/train/theoretical_loss": 3.6410125223087553, + "objective/train/tokens_used": 1044373984, + "theoretical_loss": 3.6410125223087553, + "tokens_seen": 1023913984 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034835506519558673, + "loss": 2.8153, + "theoretical_loss": 3.6410125223087553, + "tokens_seen": 1023913984 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034834503510531597, + "loss": 2.7661, + "theoretical_loss": 3.6409904418736088, + "tokens_seen": 1023979520 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034833500501504515, + "loss": 2.7867, + "theoretical_loss": 3.6409683632472536, + "tokens_seen": 1024045056 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034832497492477433, + "loss": 2.9163, + "theoretical_loss": 3.6409462864294255, + "tokens_seen": 1024110592 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034831494483450356, + "loss": 2.7799, + "theoretical_loss": 3.6409242114198612, + "tokens_seen": 1024176128 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003483049147442327, + "loss": 2.9081, + "theoretical_loss": 3.640902138218297, + "tokens_seen": 1024241664 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003482948846539619, + "loss": 2.9871, + "theoretical_loss": 3.6408800668244687, + "tokens_seen": 1024307200 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034828485456369105, + "loss": 2.9143, + "theoretical_loss": 3.640857997238112, + "tokens_seen": 1024372736 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003482748244734203, + "loss": 2.904, + "theoretical_loss": 3.6408359294589645, + "tokens_seen": 1024438272 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034826479438314947, + "loss": 3.0132, + "theoretical_loss": 3.640813863486762, + "tokens_seen": 1024503808 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034825476429287865, + "loss": 2.8003, + "theoretical_loss": 3.6407917993212413, + "tokens_seen": 1024569344 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034824473420260783, + "loss": 2.8593, + "theoretical_loss": 3.6407697369621386, + "tokens_seen": 1024634880 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034823470411233707, + "loss": 2.8818, + "theoretical_loss": 3.64074767640919, + "tokens_seen": 1024700416 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003482246740220662, + "loss": 2.9034, + "theoretical_loss": 3.6407256176621337, + "tokens_seen": 1024765952 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034821464393179543, + "loss": 2.7902, + "theoretical_loss": 3.640703560720705, + "tokens_seen": 1024831488 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034820461384152456, + "loss": 2.7975, + "theoretical_loss": 3.6406815055846407, + "tokens_seen": 1024897024 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003481945837512538, + "loss": 2.8181, + "theoretical_loss": 3.640659452253678, + "tokens_seen": 1024962560 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034818455366098297, + "loss": 2.9023, + "theoretical_loss": 3.6406374007275537, + "tokens_seen": 1025028096 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034817452357071215, + "loss": 2.9378, + "theoretical_loss": 3.640615351006005, + "tokens_seen": 1025093632 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034816449348044133, + "loss": 2.859, + "theoretical_loss": 3.6405933030887683, + "tokens_seen": 1025159168 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003481544633901705, + "loss": 2.9725, + "theoretical_loss": 3.640571256975581, + "tokens_seen": 1025224704 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003481444332998997, + "loss": 2.9059, + "theoretical_loss": 3.64054921266618, + "tokens_seen": 1025290240 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034813440320962893, + "loss": 2.8686, + "theoretical_loss": 3.6405271701603024, + "tokens_seen": 1025355776 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034812437311935806, + "loss": 2.89, + "theoretical_loss": 3.640505129457686, + "tokens_seen": 1025421312 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003481143430290873, + "loss": 2.7234, + "theoretical_loss": 3.6404830905580674, + "tokens_seen": 1025486848 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2443003, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7351675033569336, + "objective/train/theoretical_loss": 3.640461053461184, + "objective/train/tokens_used": 1046012384, + "theoretical_loss": 3.640461053461184, + "tokens_seen": 1025552384 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003481043129388164, + "loss": 2.8798, + "theoretical_loss": 3.640461053461184, + "tokens_seen": 1025552384 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034809428284854566, + "loss": 2.9114, + "theoretical_loss": 3.6404390181667727, + "tokens_seen": 1025617920 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034808425275827484, + "loss": 2.789, + "theoretical_loss": 3.640416984674572, + "tokens_seen": 1025683456 + }, + { + "epoch": 12.02, + "learning_rate": 0.000348074222668004, + "loss": 2.8647, + "theoretical_loss": 3.6403949529843187, + "tokens_seen": 1025748992 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003480641925777332, + "loss": 2.8639, + "theoretical_loss": 3.6403729230957502, + "tokens_seen": 1025814528 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034805416248746243, + "loss": 2.9481, + "theoretical_loss": 3.6403508950086043, + "tokens_seen": 1025880064 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034804413239719156, + "loss": 2.9373, + "theoretical_loss": 3.6403288687226185, + "tokens_seen": 1025945600 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003480341023069208, + "loss": 2.9707, + "theoretical_loss": 3.6403068442375313, + "tokens_seen": 1026011136 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003480240722166499, + "loss": 2.9132, + "theoretical_loss": 3.6402848215530788, + "tokens_seen": 1026076672 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034801404212637916, + "loss": 2.8431, + "theoretical_loss": 3.640262800669, + "tokens_seen": 1026142208 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034800401203610834, + "loss": 2.8841, + "theoretical_loss": 3.6402407815850326, + "tokens_seen": 1026207744 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003479939819458375, + "loss": 2.9227, + "theoretical_loss": 3.640218764300914, + "tokens_seen": 1026273280 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003479839518555667, + "loss": 2.9438, + "theoretical_loss": 3.6401967488163827, + "tokens_seen": 1026338816 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003479739217652959, + "loss": 2.8547, + "theoretical_loss": 3.640174735131177, + "tokens_seen": 1026404352 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034796389167502506, + "loss": 2.9256, + "theoretical_loss": 3.640152723245034, + "tokens_seen": 1026469888 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003479538615847543, + "loss": 2.8189, + "theoretical_loss": 3.640130713157692, + "tokens_seen": 1026535424 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034794383149448343, + "loss": 2.9584, + "theoretical_loss": 3.6401087048688905, + "tokens_seen": 1026600960 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034793380140421266, + "loss": 2.9443, + "theoretical_loss": 3.640086698378366, + "tokens_seen": 1026666496 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034792377131394184, + "loss": 2.8459, + "theoretical_loss": 3.640064693685858, + "tokens_seen": 1026732032 + }, + { + "epoch": 12.02, + "learning_rate": 0.000347913741223671, + "loss": 2.8911, + "theoretical_loss": 3.6400426907911037, + "tokens_seen": 1026797568 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003479037111334002, + "loss": 2.8282, + "theoretical_loss": 3.6400206896938427, + "tokens_seen": 1026863104 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003478936810431294, + "loss": 2.8081, + "theoretical_loss": 3.6399986903938126, + "tokens_seen": 1026928640 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034788365095285857, + "loss": 2.8955, + "theoretical_loss": 3.6399766928907527, + "tokens_seen": 1026994176 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003478736208625878, + "loss": 2.8315, + "theoretical_loss": 3.639954697184401, + "tokens_seen": 1027059712 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034786359077231693, + "loss": 2.7782, + "theoretical_loss": 3.6399327032744955, + "tokens_seen": 1027125248 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2447798, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9221596717834473, + "objective/train/theoretical_loss": 3.639910711160776, + "objective/train/tokens_used": 1047650784, + "theoretical_loss": 3.639910711160776, + "tokens_seen": 1027190784 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034785356068204617, + "loss": 2.8327, + "theoretical_loss": 3.639910711160776, + "tokens_seen": 1027190784 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003478435305917753, + "loss": 2.7518, + "theoretical_loss": 3.639888720842981, + "tokens_seen": 1027256320 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034783350050150453, + "loss": 2.732, + "theoretical_loss": 3.639866732320849, + "tokens_seen": 1027321856 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003478234704112337, + "loss": 2.9174, + "theoretical_loss": 3.639844745594119, + "tokens_seen": 1027387392 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003478134403209629, + "loss": 2.9125, + "theoretical_loss": 3.6398227606625295, + "tokens_seen": 1027452928 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034780341023069207, + "loss": 2.9084, + "theoretical_loss": 3.63980077752582, + "tokens_seen": 1027518464 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034779338014042125, + "loss": 2.8908, + "theoretical_loss": 3.639778796183729, + "tokens_seen": 1027584000 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034778335005015043, + "loss": 2.8184, + "theoretical_loss": 3.6397568166359955, + "tokens_seen": 1027649536 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034777331995987967, + "loss": 2.9038, + "theoretical_loss": 3.63973483888236, + "tokens_seen": 1027715072 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003477632898696088, + "loss": 2.8336, + "theoretical_loss": 3.6397128629225595, + "tokens_seen": 1027780608 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034775325977933803, + "loss": 3.0209, + "theoretical_loss": 3.6396908887563346, + "tokens_seen": 1027846144 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003477432296890672, + "loss": 2.9482, + "theoretical_loss": 3.639668916383424, + "tokens_seen": 1027911680 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003477331995987964, + "loss": 2.7925, + "theoretical_loss": 3.639646945803568, + "tokens_seen": 1027977216 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003477231695085256, + "loss": 2.8908, + "theoretical_loss": 3.6396249770165046, + "tokens_seen": 1028042752 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034771313941825476, + "loss": 2.8847, + "theoretical_loss": 3.6396030100219745, + "tokens_seen": 1028108288 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034770310932798394, + "loss": 2.8041, + "theoretical_loss": 3.639581044819716, + "tokens_seen": 1028173824 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034769307923771317, + "loss": 2.819, + "theoretical_loss": 3.6395590814094696, + "tokens_seen": 1028239360 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003476830491474423, + "loss": 2.8473, + "theoretical_loss": 3.6395371197909743, + "tokens_seen": 1028304896 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034767301905717153, + "loss": 2.8305, + "theoretical_loss": 3.6395151599639703, + "tokens_seen": 1028370432 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034766298896690066, + "loss": 2.951, + "theoretical_loss": 3.6394932019281967, + "tokens_seen": 1028435968 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003476529588766299, + "loss": 2.8866, + "theoretical_loss": 3.6394712456833935, + "tokens_seen": 1028501504 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003476429287863591, + "loss": 3.0426, + "theoretical_loss": 3.639449291229301, + "tokens_seen": 1028567040 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034763289869608826, + "loss": 2.8383, + "theoretical_loss": 3.639427338565658, + "tokens_seen": 1028632576 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034762286860581744, + "loss": 2.8512, + "theoretical_loss": 3.6394053876922055, + "tokens_seen": 1028698112 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003476128385155466, + "loss": 2.9204, + "theoretical_loss": 3.639383438608683, + "tokens_seen": 1028763648 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2450597, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.975043773651123, + "objective/train/theoretical_loss": 3.6393614913148307, + "objective/train/tokens_used": 1049289184, + "theoretical_loss": 3.6393614913148307, + "tokens_seen": 1028829184 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003476028084252758, + "loss": 2.8405, + "theoretical_loss": 3.6393614913148307, + "tokens_seen": 1028829184 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034759277833500504, + "loss": 2.7743, + "theoretical_loss": 3.6393395458103877, + "tokens_seen": 1028894720 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003475827482447342, + "loss": 2.9783, + "theoretical_loss": 3.6393176020950957, + "tokens_seen": 1028960256 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003475727181544634, + "loss": 2.9516, + "theoretical_loss": 3.6392956601686945, + "tokens_seen": 1029025792 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034756268806419263, + "loss": 2.9268, + "theoretical_loss": 3.6392737200309235, + "tokens_seen": 1029091328 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034755265797392176, + "loss": 3.0225, + "theoretical_loss": 3.6392517816815237, + "tokens_seen": 1029156864 + }, + { + "epoch": 12.02, + "learning_rate": 0.000347542627883651, + "loss": 2.7367, + "theoretical_loss": 3.6392298451202354, + "tokens_seen": 1029222400 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003475325977933801, + "loss": 2.7217, + "theoretical_loss": 3.6392079103467987, + "tokens_seen": 1029287936 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034752256770310936, + "loss": 2.9059, + "theoretical_loss": 3.6391859773609547, + "tokens_seen": 1029353472 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034751253761283854, + "loss": 2.9786, + "theoretical_loss": 3.6391640461624433, + "tokens_seen": 1029419008 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003475025075225677, + "loss": 2.9057, + "theoretical_loss": 3.6391421167510054, + "tokens_seen": 1029484544 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003474924774322969, + "loss": 2.8562, + "theoretical_loss": 3.6391201891263814, + "tokens_seen": 1029550080 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003474824473420261, + "loss": 2.7979, + "theoretical_loss": 3.6390982632883118, + "tokens_seen": 1029615616 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034747241725175527, + "loss": 2.9435, + "theoretical_loss": 3.6390763392365386, + "tokens_seen": 1029681152 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003474623871614845, + "loss": 2.8168, + "theoretical_loss": 3.639054416970801, + "tokens_seen": 1029746688 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034745235707121363, + "loss": 2.9306, + "theoretical_loss": 3.6390324964908403, + "tokens_seen": 1029812224 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034744232698094286, + "loss": 2.8671, + "theoretical_loss": 3.6390105777963977, + "tokens_seen": 1029877760 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034743229689067204, + "loss": 2.8111, + "theoretical_loss": 3.638988660887214, + "tokens_seen": 1029943296 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003474222668004012, + "loss": 2.9496, + "theoretical_loss": 3.6389667457630304, + "tokens_seen": 1030008832 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003474122367101304, + "loss": 2.9107, + "theoretical_loss": 3.638944832423588, + "tokens_seen": 1030074368 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003474022066198596, + "loss": 2.8979, + "theoretical_loss": 3.6389229208686276, + "tokens_seen": 1030139904 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034739217652958877, + "loss": 2.8888, + "theoretical_loss": 3.6389010110978903, + "tokens_seen": 1030205440 + }, + { + "epoch": 12.02, + "learning_rate": 0.000347382146439318, + "loss": 2.9366, + "theoretical_loss": 3.638879103111117, + "tokens_seen": 1030270976 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034737211634904713, + "loss": 2.8247, + "theoretical_loss": 3.6388571969080505, + "tokens_seen": 1030336512 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034736208625877637, + "loss": 2.8293, + "theoretical_loss": 3.638835292488431, + "tokens_seen": 1030402048 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2455219, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.948676109313965, + "objective/train/theoretical_loss": 3.6388133898519994, + "objective/train/tokens_used": 1050927584, + "theoretical_loss": 3.6388133898519994, + "tokens_seen": 1030467584 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003473520561685055, + "loss": 2.8842, + "theoretical_loss": 3.6388133898519994, + "tokens_seen": 1030467584 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034734202607823473, + "loss": 2.7915, + "theoretical_loss": 3.6387914889984985, + "tokens_seen": 1030533120 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003473319959879639, + "loss": 2.9318, + "theoretical_loss": 3.6387695899276684, + "tokens_seen": 1030598656 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003473219658976931, + "loss": 2.8511, + "theoretical_loss": 3.638747692639252, + "tokens_seen": 1030664192 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034731193580742227, + "loss": 2.8255, + "theoretical_loss": 3.63872579713299, + "tokens_seen": 1030729728 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034730190571715145, + "loss": 2.8632, + "theoretical_loss": 3.638703903408624, + "tokens_seen": 1030795264 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034729187562688063, + "loss": 2.9194, + "theoretical_loss": 3.638682011465896, + "tokens_seen": 1030860800 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034728184553660987, + "loss": 2.956, + "theoretical_loss": 3.6386601213045475, + "tokens_seen": 1030926336 + }, + { + "epoch": 12.02, + "learning_rate": 0.000347271815446339, + "loss": 3.0407, + "theoretical_loss": 3.638638232924321, + "tokens_seen": 1030991872 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034726178535606823, + "loss": 2.7571, + "theoretical_loss": 3.638616346324958, + "tokens_seen": 1031057408 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003472517552657974, + "loss": 2.8052, + "theoretical_loss": 3.6385944615062, + "tokens_seen": 1031122944 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003472417251755266, + "loss": 2.8863, + "theoretical_loss": 3.638572578467789, + "tokens_seen": 1031188480 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003472316950852558, + "loss": 2.9413, + "theoretical_loss": 3.6385506972094683, + "tokens_seen": 1031254016 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034722166499498496, + "loss": 2.8942, + "theoretical_loss": 3.638528817730978, + "tokens_seen": 1031319552 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034721163490471414, + "loss": 2.9968, + "theoretical_loss": 3.6385069400320624, + "tokens_seen": 1031385088 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034720160481444337, + "loss": 2.9018, + "theoretical_loss": 3.638485064112462, + "tokens_seen": 1031450624 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003471915747241725, + "loss": 2.9902, + "theoretical_loss": 3.6384631899719198, + "tokens_seen": 1031516160 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034718154463390173, + "loss": 2.9534, + "theoretical_loss": 3.6384413176101775, + "tokens_seen": 1031581696 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034717151454363086, + "loss": 2.8825, + "theoretical_loss": 3.638419447026979, + "tokens_seen": 1031647232 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003471614844533601, + "loss": 2.9258, + "theoretical_loss": 3.6383975782220643, + "tokens_seen": 1031712768 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003471514543630893, + "loss": 2.8711, + "theoretical_loss": 3.6383757111951778, + "tokens_seen": 1031778304 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034714142427281846, + "loss": 2.9777, + "theoretical_loss": 3.638353845946061, + "tokens_seen": 1031843840 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034713139418254764, + "loss": 3.0069, + "theoretical_loss": 3.6383319824744573, + "tokens_seen": 1031909376 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003471213640922768, + "loss": 2.9139, + "theoretical_loss": 3.638310120780109, + "tokens_seen": 1031974912 + }, + { + "epoch": 12.02, + "learning_rate": 0.000347111334002006, + "loss": 2.8986, + "theoretical_loss": 3.6382882608627582, + "tokens_seen": 1032040448 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.904987335205078, + "objective/train/theoretical_loss": 3.6382664027221487, + "objective/train/tokens_used": 1052565984, + "theoretical_loss": 3.6382664027221487, + "tokens_seen": 1032105984 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034710130391173524, + "loss": 2.961, + "theoretical_loss": 3.6382664027221487, + "tokens_seen": 1032105984 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034709127382146436, + "loss": 2.843, + "theoretical_loss": 3.638244546358022, + "tokens_seen": 1032171520 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003470812437311936, + "loss": 2.9158, + "theoretical_loss": 3.638222691770122, + "tokens_seen": 1032237056 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003470712136409228, + "loss": 2.909, + "theoretical_loss": 3.6382008389581912, + "tokens_seen": 1032302592 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034706118355065196, + "loss": 2.9592, + "theoretical_loss": 3.6381789879219726, + "tokens_seen": 1032368128 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034705115346038114, + "loss": 2.949, + "theoretical_loss": 3.6381571386612084, + "tokens_seen": 1032433664 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003470411233701103, + "loss": 2.9823, + "theoretical_loss": 3.6381352911756433, + "tokens_seen": 1032499200 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003470310932798395, + "loss": 2.8037, + "theoretical_loss": 3.6381134454650192, + "tokens_seen": 1032564736 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034702106318956874, + "loss": 2.8077, + "theoretical_loss": 3.638091601529079, + "tokens_seen": 1032630272 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034701103309929787, + "loss": 2.9711, + "theoretical_loss": 3.6380697593675677, + "tokens_seen": 1032695808 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003470010030090271, + "loss": 2.9007, + "theoretical_loss": 3.6380479189802264, + "tokens_seen": 1032761344 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034699097291875623, + "loss": 2.8927, + "theoretical_loss": 3.6380260803667994, + "tokens_seen": 1032826880 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034698094282848547, + "loss": 2.8272, + "theoretical_loss": 3.63800424352703, + "tokens_seen": 1032892416 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034697091273821465, + "loss": 2.9625, + "theoretical_loss": 3.6379824084606627, + "tokens_seen": 1032957952 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034696088264794383, + "loss": 2.8379, + "theoretical_loss": 3.6379605751674386, + "tokens_seen": 1033023488 + }, + { + "epoch": 12.02, + "learning_rate": 0.000346950852557673, + "loss": 2.8039, + "theoretical_loss": 3.637938743647103, + "tokens_seen": 1033089024 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034694082246740224, + "loss": 2.849, + "theoretical_loss": 3.6379169138993994, + "tokens_seen": 1033154560 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034693079237713137, + "loss": 2.8553, + "theoretical_loss": 3.637895085924071, + "tokens_seen": 1033220096 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003469207622868606, + "loss": 2.915, + "theoretical_loss": 3.6378732597208616, + "tokens_seen": 1033285632 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034691073219658973, + "loss": 2.8573, + "theoretical_loss": 3.637851435289515, + "tokens_seen": 1033351168 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034690070210631897, + "loss": 2.8406, + "theoretical_loss": 3.6378296126297744, + "tokens_seen": 1033416704 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034689067201604815, + "loss": 2.8866, + "theoretical_loss": 3.637807791741385, + "tokens_seen": 1033482240 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034688064192577733, + "loss": 2.9342, + "theoretical_loss": 3.6377859726240893, + "tokens_seen": 1033547776 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003468706118355065, + "loss": 2.9083, + "theoretical_loss": 3.637764155277632, + "tokens_seen": 1033613312 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003468605817452357, + "loss": 2.8743, + "theoretical_loss": 3.6377423397017568, + "tokens_seen": 1033678848 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9553468227386475, + "objective/train/theoretical_loss": 3.637720525896208, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.637720525896208, + "tokens_seen": 1033744384 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003468505516549649, + "loss": 2.9193, + "theoretical_loss": 3.637720525896208, + "tokens_seen": 1033744384 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003468405215646941, + "loss": 2.8657, + "theoretical_loss": 3.63769871386073, + "tokens_seen": 1033809920 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003468304914744233, + "loss": 2.9343, + "theoretical_loss": 3.6376769035950662, + "tokens_seen": 1033875456 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034682046138415247, + "loss": 2.9397, + "theoretical_loss": 3.6376550950989617, + "tokens_seen": 1033940992 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034681043129388165, + "loss": 2.9113, + "theoretical_loss": 3.63763328837216, + "tokens_seen": 1034006528 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034680040120361083, + "loss": 2.9485, + "theoretical_loss": 3.637611483414406, + "tokens_seen": 1034072064 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034679037111334007, + "loss": 2.8285, + "theoretical_loss": 3.6375896802254437, + "tokens_seen": 1034137600 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003467803410230692, + "loss": 2.9005, + "theoretical_loss": 3.637567878805018, + "tokens_seen": 1034203136 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034677031093279843, + "loss": 2.8707, + "theoretical_loss": 3.6375460791528726, + "tokens_seen": 1034268672 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003467602808425276, + "loss": 2.7637, + "theoretical_loss": 3.6375242812687527, + "tokens_seen": 1034334208 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003467502507522568, + "loss": 2.8493, + "theoretical_loss": 3.637502485152403, + "tokens_seen": 1034399744 + }, + { + "epoch": 12.02, + "learning_rate": 0.000346740220661986, + "loss": 2.8477, + "theoretical_loss": 3.6374806908035673, + "tokens_seen": 1034465280 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034673019057171516, + "loss": 2.7907, + "theoretical_loss": 3.6374588982219915, + "tokens_seen": 1034530816 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034672016048144434, + "loss": 2.9738, + "theoretical_loss": 3.637437107407419, + "tokens_seen": 1034596352 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034671013039117357, + "loss": 2.7752, + "theoretical_loss": 3.6374153183595963, + "tokens_seen": 1034661888 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003467001003009027, + "loss": 2.9412, + "theoretical_loss": 3.637393531078267, + "tokens_seen": 1034727424 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034669007021063193, + "loss": 2.9427, + "theoretical_loss": 3.637371745563176, + "tokens_seen": 1034792960 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034668004012036106, + "loss": 2.8518, + "theoretical_loss": 3.6373499618140688, + "tokens_seen": 1034858496 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003466700100300903, + "loss": 2.843, + "theoretical_loss": 3.63732817983069, + "tokens_seen": 1034924032 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003466599799398195, + "loss": 2.8677, + "theoretical_loss": 3.6373063996127852, + "tokens_seen": 1034989568 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034664994984954866, + "loss": 2.8905, + "theoretical_loss": 3.6372846211600987, + "tokens_seen": 1035055104 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034663991975927784, + "loss": 2.8586, + "theoretical_loss": 3.637262844472377, + "tokens_seen": 1035120640 + }, + { + "epoch": 12.02, + "learning_rate": 0.000346629889669007, + "loss": 2.8307, + "theoretical_loss": 3.637241069549364, + "tokens_seen": 1035186176 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003466198595787362, + "loss": 2.9235, + "theoretical_loss": 3.6372192963908057, + "tokens_seen": 1035251712 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034660982948846544, + "loss": 2.891, + "theoretical_loss": 3.6371975249964468, + "tokens_seen": 1035317248 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8901422023773193, + "objective/train/theoretical_loss": 3.6371757553660338, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.6371757553660338, + "tokens_seen": 1035382784 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034659979939819456, + "loss": 2.7743, + "theoretical_loss": 3.6371757553660338, + "tokens_seen": 1035382784 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003465897693079238, + "loss": 2.8825, + "theoretical_loss": 3.6371539874993113, + "tokens_seen": 1035448320 + }, + { + "epoch": 12.02, + "learning_rate": 0.000346579739217653, + "loss": 2.9157, + "theoretical_loss": 3.6371322213960253, + "tokens_seen": 1035513856 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034656970912738216, + "loss": 2.8727, + "theoretical_loss": 3.63711045705592, + "tokens_seen": 1035579392 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034655967903711134, + "loss": 2.8431, + "theoretical_loss": 3.6370886944787433, + "tokens_seen": 1035644928 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003465496489468405, + "loss": 2.832, + "theoretical_loss": 3.637066933664239, + "tokens_seen": 1035710464 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003465396188565697, + "loss": 2.8198, + "theoretical_loss": 3.6370451746121533, + "tokens_seen": 1035776000 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034652958876629894, + "loss": 2.9477, + "theoretical_loss": 3.637023417322232, + "tokens_seen": 1035841536 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034651955867602807, + "loss": 2.7599, + "theoretical_loss": 3.6370016617942214, + "tokens_seen": 1035907072 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003465095285857573, + "loss": 2.9064, + "theoretical_loss": 3.636979908027867, + "tokens_seen": 1035972608 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034649949849548643, + "loss": 2.8267, + "theoretical_loss": 3.6369581560229145, + "tokens_seen": 1036038144 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034648946840521567, + "loss": 2.8889, + "theoretical_loss": 3.63693640577911, + "tokens_seen": 1036103680 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034647943831494485, + "loss": 2.8723, + "theoretical_loss": 3.6369146572961997, + "tokens_seen": 1036169216 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034646940822467403, + "loss": 2.8666, + "theoretical_loss": 3.636892910573929, + "tokens_seen": 1036234752 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003464593781344032, + "loss": 2.8857, + "theoretical_loss": 3.636871165612045, + "tokens_seen": 1036300288 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034644934804413244, + "loss": 2.9642, + "theoretical_loss": 3.636849422410294, + "tokens_seen": 1036365824 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034643931795386157, + "loss": 2.9286, + "theoretical_loss": 3.636827680968421, + "tokens_seen": 1036431360 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003464292878635908, + "loss": 2.9121, + "theoretical_loss": 3.6368059412861733, + "tokens_seen": 1036496896 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034641925777331993, + "loss": 2.6652, + "theoretical_loss": 3.636784203363297, + "tokens_seen": 1036562432 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034640922768304917, + "loss": 2.8704, + "theoretical_loss": 3.636762467199538, + "tokens_seen": 1036627968 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034639919759277835, + "loss": 2.8201, + "theoretical_loss": 3.6367407327946437, + "tokens_seen": 1036693504 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034638916750250753, + "loss": 2.8107, + "theoretical_loss": 3.63671900014836, + "tokens_seen": 1036759040 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003463791374122367, + "loss": 2.8769, + "theoretical_loss": 3.6366972692604334, + "tokens_seen": 1036824576 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003463691073219659, + "loss": 2.8055, + "theoretical_loss": 3.6366755401306103, + "tokens_seen": 1036890112 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003463590772316951, + "loss": 2.8316, + "theoretical_loss": 3.636653812758638, + "tokens_seen": 1036955648 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.923288583755493, + "objective/train/theoretical_loss": 3.636632087144263, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.636632087144263, + "tokens_seen": 1037021184 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003463490471414243, + "loss": 2.8482, + "theoretical_loss": 3.636632087144263, + "tokens_seen": 1037021184 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034633901705115344, + "loss": 2.8548, + "theoretical_loss": 3.636610363287232, + "tokens_seen": 1037086720 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034632898696088267, + "loss": 2.922, + "theoretical_loss": 3.6365886411872914, + "tokens_seen": 1037152256 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003463189568706118, + "loss": 3.019, + "theoretical_loss": 3.6365669208441886, + "tokens_seen": 1037217792 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034630892678034103, + "loss": 2.946, + "theoretical_loss": 3.636545202257671, + "tokens_seen": 1037283328 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003462988966900702, + "loss": 2.8391, + "theoretical_loss": 3.6365234854274844, + "tokens_seen": 1037348864 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003462888665997994, + "loss": 2.8627, + "theoretical_loss": 3.636501770353376, + "tokens_seen": 1037414400 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003462788365095286, + "loss": 2.9281, + "theoretical_loss": 3.6364800570350937, + "tokens_seen": 1037479936 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003462688064192578, + "loss": 2.9165, + "theoretical_loss": 3.6364583454723842, + "tokens_seen": 1037545472 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034625877632898694, + "loss": 2.8576, + "theoretical_loss": 3.636436635664994, + "tokens_seen": 1037611008 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003462487462387162, + "loss": 3.0275, + "theoretical_loss": 3.6364149276126723, + "tokens_seen": 1037676544 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003462387161484453, + "loss": 2.891, + "theoretical_loss": 3.636393221315164, + "tokens_seen": 1037742080 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034622868605817454, + "loss": 2.9663, + "theoretical_loss": 3.6363715167722175, + "tokens_seen": 1037807616 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003462186559679037, + "loss": 3.0028, + "theoretical_loss": 3.636349813983581, + "tokens_seen": 1037873152 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003462086258776329, + "loss": 2.8274, + "theoretical_loss": 3.636328112949, + "tokens_seen": 1037938688 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003461985957873621, + "loss": 3.0287, + "theoretical_loss": 3.636306413668224, + "tokens_seen": 1038004224 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034618856569709126, + "loss": 2.841, + "theoretical_loss": 3.6362847161409997, + "tokens_seen": 1038069760 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034617853560682044, + "loss": 2.9443, + "theoretical_loss": 3.6362630203670743, + "tokens_seen": 1038135296 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003461685055165497, + "loss": 2.8375, + "theoretical_loss": 3.636241326346196, + "tokens_seen": 1038200832 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003461584754262788, + "loss": 2.8981, + "theoretical_loss": 3.6362196340781123, + "tokens_seen": 1038266368 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034614844533600804, + "loss": 2.9425, + "theoretical_loss": 3.6361979435625713, + "tokens_seen": 1038331904 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034613841524573717, + "loss": 2.78, + "theoretical_loss": 3.63617625479932, + "tokens_seen": 1038397440 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003461283851554664, + "loss": 2.7902, + "theoretical_loss": 3.6361545677881066, + "tokens_seen": 1038462976 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003461183550651956, + "loss": 2.9398, + "theoretical_loss": 3.63613288252868, + "tokens_seen": 1038528512 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034610832497492476, + "loss": 2.8868, + "theoretical_loss": 3.6361111990207866, + "tokens_seen": 1038594048 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7132678031921387, + "objective/train/theoretical_loss": 3.6360895172641756, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.6360895172641756, + "tokens_seen": 1038659584 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034609829488465395, + "loss": 2.833, + "theoretical_loss": 3.6360895172641756, + "tokens_seen": 1038659584 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003460882647943832, + "loss": 2.8182, + "theoretical_loss": 3.6360678372585946, + "tokens_seen": 1038725120 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034607823470411236, + "loss": 2.8454, + "theoretical_loss": 3.636046159003792, + "tokens_seen": 1038790656 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034606820461384154, + "loss": 2.7971, + "theoretical_loss": 3.636024482499515, + "tokens_seen": 1038856192 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003460581745235707, + "loss": 2.9212, + "theoretical_loss": 3.6360028077455127, + "tokens_seen": 1038921728 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003460481444332999, + "loss": 2.7965, + "theoretical_loss": 3.6359811347415336, + "tokens_seen": 1038987264 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034603811434302914, + "loss": 2.8774, + "theoretical_loss": 3.6359594634873256, + "tokens_seen": 1039052800 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034602808425275827, + "loss": 2.9609, + "theoretical_loss": 3.635937793982637, + "tokens_seen": 1039118336 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003460180541624875, + "loss": 2.7875, + "theoretical_loss": 3.6359161262272166, + "tokens_seen": 1039183872 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034600802407221663, + "loss": 3.03, + "theoretical_loss": 3.6358944602208125, + "tokens_seen": 1039249408 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034599799398194587, + "loss": 2.9507, + "theoretical_loss": 3.6358727959631736, + "tokens_seen": 1039314944 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034598796389167505, + "loss": 2.9304, + "theoretical_loss": 3.6358511334540475, + "tokens_seen": 1039380480 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034597793380140423, + "loss": 2.8415, + "theoretical_loss": 3.635829472693185, + "tokens_seen": 1039446016 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003459679037111334, + "loss": 2.8097, + "theoretical_loss": 3.6358078136803327, + "tokens_seen": 1039511552 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034595787362086264, + "loss": 2.9887, + "theoretical_loss": 3.63578615641524, + "tokens_seen": 1039577088 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034594784353059177, + "loss": 2.9089, + "theoretical_loss": 3.635764500897656, + "tokens_seen": 1039642624 + }, + { + "epoch": 12.02, + "learning_rate": 0.000345937813440321, + "loss": 2.79, + "theoretical_loss": 3.6357428471273288, + "tokens_seen": 1039708160 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034592778335005013, + "loss": 2.7819, + "theoretical_loss": 3.6357211951040087, + "tokens_seen": 1039773696 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034591775325977937, + "loss": 2.8606, + "theoretical_loss": 3.6356995448274434, + "tokens_seen": 1039839232 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034590772316950855, + "loss": 3.0008, + "theoretical_loss": 3.6356778962973815, + "tokens_seen": 1039904768 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034589769307923773, + "loss": 2.9219, + "theoretical_loss": 3.6356562495135742, + "tokens_seen": 1039970304 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003458876629889669, + "loss": 2.864, + "theoretical_loss": 3.6356346044757686, + "tokens_seen": 1040035840 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003458776328986961, + "loss": 2.7991, + "theoretical_loss": 3.635612961183714, + "tokens_seen": 1040101376 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003458676028084253, + "loss": 2.8901, + "theoretical_loss": 3.635591319637161, + "tokens_seen": 1040166912 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003458575727181545, + "loss": 2.9391, + "theoretical_loss": 3.6355696798358577, + "tokens_seen": 1040232448 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9913017749786377, + "objective/train/theoretical_loss": 3.635548041779554, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.635548041779554, + "tokens_seen": 1040297984 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034584754262788364, + "loss": 2.8515, + "theoretical_loss": 3.635548041779554, + "tokens_seen": 1040297984 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034583751253761287, + "loss": 2.9319, + "theoretical_loss": 3.6355264054679983, + "tokens_seen": 1040363520 + }, + { + "epoch": 12.02, + "learning_rate": 0.000345827482447342, + "loss": 2.8794, + "theoretical_loss": 3.6355047709009414, + "tokens_seen": 1040429056 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034581745235707123, + "loss": 2.9271, + "theoretical_loss": 3.635483138078132, + "tokens_seen": 1040494592 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003458074222668004, + "loss": 3.0043, + "theoretical_loss": 3.6354615069993192, + "tokens_seen": 1040560128 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003457973921765296, + "loss": 2.8596, + "theoretical_loss": 3.6354398776642536, + "tokens_seen": 1040625664 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003457873620862588, + "loss": 2.9225, + "theoretical_loss": 3.635418250072684, + "tokens_seen": 1040691200 + }, + { + "epoch": 12.02, + "learning_rate": 0.000345777331995988, + "loss": 2.9647, + "theoretical_loss": 3.6353966242243603, + "tokens_seen": 1040756736 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034576730190571714, + "loss": 2.9599, + "theoretical_loss": 3.6353750001190326, + "tokens_seen": 1040822272 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003457572718154464, + "loss": 2.9655, + "theoretical_loss": 3.6353533777564504, + "tokens_seen": 1040887808 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003457472417251755, + "loss": 2.7905, + "theoretical_loss": 3.6353317571363633, + "tokens_seen": 1040953344 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034573721163490474, + "loss": 2.9258, + "theoretical_loss": 3.6353101382585216, + "tokens_seen": 1041018880 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003457271815446339, + "loss": 2.9012, + "theoretical_loss": 3.635288521122675, + "tokens_seen": 1041084416 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003457171514543631, + "loss": 2.8203, + "theoretical_loss": 3.6352669057285736, + "tokens_seen": 1041149952 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003457071213640923, + "loss": 2.861, + "theoretical_loss": 3.6352452920759677, + "tokens_seen": 1041215488 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034569709127382146, + "loss": 2.9451, + "theoretical_loss": 3.6352236801646063, + "tokens_seen": 1041281024 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034568706118355064, + "loss": 2.9413, + "theoretical_loss": 3.635202069994241, + "tokens_seen": 1041346560 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003456770310932799, + "loss": 2.9217, + "theoretical_loss": 3.6351804615646213, + "tokens_seen": 1041412096 + }, + { + "epoch": 12.02, + "learning_rate": 0.000345667001003009, + "loss": 2.8399, + "theoretical_loss": 3.635158854875497, + "tokens_seen": 1041477632 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034565697091273824, + "loss": 2.8431, + "theoretical_loss": 3.6351372499266192, + "tokens_seen": 1041543168 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034564694082246737, + "loss": 2.8659, + "theoretical_loss": 3.635115646717738, + "tokens_seen": 1041608704 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003456369107321966, + "loss": 2.9337, + "theoretical_loss": 3.6350940452486036, + "tokens_seen": 1041674240 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003456268806419258, + "loss": 2.8353, + "theoretical_loss": 3.6350724455189667, + "tokens_seen": 1041739776 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034561685055165496, + "loss": 2.8496, + "theoretical_loss": 3.6350508475285777, + "tokens_seen": 1041805312 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034560682046138415, + "loss": 2.9922, + "theoretical_loss": 3.635029251277187, + "tokens_seen": 1041870848 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9794132709503174, + "objective/train/theoretical_loss": 3.6350076567645453, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.6350076567645453, + "tokens_seen": 1041936384 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003455967903711134, + "loss": 2.9432, + "theoretical_loss": 3.6350076567645453, + "tokens_seen": 1041936384 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003455867602808425, + "loss": 2.8133, + "theoretical_loss": 3.6349860639904032, + "tokens_seen": 1042001920 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034557673019057174, + "loss": 2.8504, + "theoretical_loss": 3.634964472954512, + "tokens_seen": 1042067456 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034556670010030087, + "loss": 2.812, + "theoretical_loss": 3.6349428836566213, + "tokens_seen": 1042132992 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003455566700100301, + "loss": 2.9854, + "theoretical_loss": 3.634921296096483, + "tokens_seen": 1042198528 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003455466399197593, + "loss": 2.8892, + "theoretical_loss": 3.6348997102738476, + "tokens_seen": 1042264064 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034553660982948847, + "loss": 2.8949, + "theoretical_loss": 3.634878126188466, + "tokens_seen": 1042329600 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034552657973921765, + "loss": 2.8668, + "theoretical_loss": 3.634856543840089, + "tokens_seen": 1042395136 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034551654964894683, + "loss": 2.9897, + "theoretical_loss": 3.6348349632284678, + "tokens_seen": 1042460672 + }, + { + "epoch": 12.02, + "learning_rate": 0.000345506519558676, + "loss": 2.9455, + "theoretical_loss": 3.6348133843533534, + "tokens_seen": 1042526208 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034549648946840525, + "loss": 2.9024, + "theoretical_loss": 3.634791807214497, + "tokens_seen": 1042591744 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003454864593781344, + "loss": 2.8816, + "theoretical_loss": 3.6347702318116495, + "tokens_seen": 1042657280 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003454764292878636, + "loss": 2.8013, + "theoretical_loss": 3.634748658144563, + "tokens_seen": 1042722816 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034546639919759274, + "loss": 2.8499, + "theoretical_loss": 3.6347270862129877, + "tokens_seen": 1042788352 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034545636910732197, + "loss": 2.8417, + "theoretical_loss": 3.634705516016675, + "tokens_seen": 1042853888 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034544633901705115, + "loss": 3.0387, + "theoretical_loss": 3.634683947555377, + "tokens_seen": 1042919424 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034543630892678033, + "loss": 2.9275, + "theoretical_loss": 3.634662380828845, + "tokens_seen": 1042984960 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003454262788365095, + "loss": 2.8172, + "theoretical_loss": 3.6346408158368297, + "tokens_seen": 1043050496 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034541624874623875, + "loss": 2.7753, + "theoretical_loss": 3.6346192525790837, + "tokens_seen": 1043116032 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003454062186559679, + "loss": 2.9112, + "theoretical_loss": 3.634597691055358, + "tokens_seen": 1043181568 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003453961885656971, + "loss": 2.9696, + "theoretical_loss": 3.634576131265404, + "tokens_seen": 1043247104 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034538615847542624, + "loss": 2.8892, + "theoretical_loss": 3.6345545732089737, + "tokens_seen": 1043312640 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003453761283851555, + "loss": 2.865, + "theoretical_loss": 3.6345330168858188, + "tokens_seen": 1043378176 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034536609829488466, + "loss": 2.9282, + "theoretical_loss": 3.6345114622956913, + "tokens_seen": 1043443712 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034535606820461384, + "loss": 2.8948, + "theoretical_loss": 3.6344899094383427, + "tokens_seen": 1043509248 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0025129318237305, + "objective/train/theoretical_loss": 3.634468358313525, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.634468358313525, + "tokens_seen": 1043574784 + }, + { + "epoch": 12.02, + "learning_rate": 0.000345346038114343, + "loss": 2.9448, + "theoretical_loss": 3.634468358313525, + "tokens_seen": 1043574784 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003453360080240722, + "loss": 2.8886, + "theoretical_loss": 3.6344468089209903, + "tokens_seen": 1043640320 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034532597793380143, + "loss": 2.9031, + "theoretical_loss": 3.634425261260491, + "tokens_seen": 1043705856 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003453159478435306, + "loss": 2.9926, + "theoretical_loss": 3.6344037153317776, + "tokens_seen": 1043771392 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003453059177532598, + "loss": 2.9072, + "theoretical_loss": 3.634382171134604, + "tokens_seen": 1043836928 + }, + { + "epoch": 12.02, + "learning_rate": 0.000345295887662989, + "loss": 2.9948, + "theoretical_loss": 3.6343606286687207, + "tokens_seen": 1043902464 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003452858575727182, + "loss": 2.8766, + "theoretical_loss": 3.6343390879338813, + "tokens_seen": 1043968000 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034527582748244734, + "loss": 3.0306, + "theoretical_loss": 3.634317548929837, + "tokens_seen": 1044033536 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003452657973921766, + "loss": 3.0135, + "theoretical_loss": 3.6342960116563416, + "tokens_seen": 1044099072 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003452557673019057, + "loss": 2.8204, + "theoretical_loss": 3.634274476113146, + "tokens_seen": 1044164608 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034524573721163494, + "loss": 2.9773, + "theoretical_loss": 3.6342529423000025, + "tokens_seen": 1044230144 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003452357071213641, + "loss": 2.9351, + "theoretical_loss": 3.6342314102166644, + "tokens_seen": 1044295680 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003452256770310933, + "loss": 2.8719, + "theoretical_loss": 3.634209879862884, + "tokens_seen": 1044361216 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003452156469408225, + "loss": 2.8959, + "theoretical_loss": 3.6341883512384134, + "tokens_seen": 1044426752 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034520561685055166, + "loss": 2.8766, + "theoretical_loss": 3.6341668243430063, + "tokens_seen": 1044492288 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034519558676028084, + "loss": 2.904, + "theoretical_loss": 3.634145299176414, + "tokens_seen": 1044557824 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003451855566700101, + "loss": 2.8751, + "theoretical_loss": 3.6341237757383897, + "tokens_seen": 1044623360 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003451755265797392, + "loss": 2.8822, + "theoretical_loss": 3.634102254028687, + "tokens_seen": 1044688896 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034516549648946844, + "loss": 2.9783, + "theoretical_loss": 3.6340807340470573, + "tokens_seen": 1044754432 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034515546639919757, + "loss": 2.9731, + "theoretical_loss": 3.6340592157932545, + "tokens_seen": 1044819968 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003451454363089268, + "loss": 2.8221, + "theoretical_loss": 3.634037699267031, + "tokens_seen": 1044885504 + }, + { + "epoch": 12.02, + "learning_rate": 0.000345135406218656, + "loss": 2.9043, + "theoretical_loss": 3.63401618446814, + "tokens_seen": 1044951040 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034512537612838517, + "loss": 3.0037, + "theoretical_loss": 3.6339946713963345, + "tokens_seen": 1045016576 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034511534603811435, + "loss": 2.8376, + "theoretical_loss": 3.633973160051367, + "tokens_seen": 1045082112 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003451053159478436, + "loss": 2.8218, + "theoretical_loss": 3.6339516504329916, + "tokens_seen": 1045147648 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.932687282562256, + "objective/train/theoretical_loss": 3.6339301425409607, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.6339301425409607, + "tokens_seen": 1045213184 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003450952858575727, + "loss": 2.9067, + "theoretical_loss": 3.6339301425409607, + "tokens_seen": 1045213184 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034508525576730194, + "loss": 2.8159, + "theoretical_loss": 3.6339086363750277, + "tokens_seen": 1045278720 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034507522567703107, + "loss": 2.8778, + "theoretical_loss": 3.633887131934946, + "tokens_seen": 1045344256 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003450651955867603, + "loss": 2.9091, + "theoretical_loss": 3.6338656292204687, + "tokens_seen": 1045409792 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003450551654964895, + "loss": 2.9395, + "theoretical_loss": 3.63384412823135, + "tokens_seen": 1045475328 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034504513540621867, + "loss": 2.8646, + "theoretical_loss": 3.6338226289673417, + "tokens_seen": 1045540864 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034503510531594785, + "loss": 2.9023, + "theoretical_loss": 3.633801131428199, + "tokens_seen": 1045606400 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034502507522567703, + "loss": 2.9395, + "theoretical_loss": 3.6337796356136742, + "tokens_seen": 1045671936 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003450150451354062, + "loss": 2.8116, + "theoretical_loss": 3.633758141523521, + "tokens_seen": 1045737472 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034500501504513545, + "loss": 2.9296, + "theoretical_loss": 3.633736649157494, + "tokens_seen": 1045803008 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003449949849548646, + "loss": 2.8019, + "theoretical_loss": 3.633715158515346, + "tokens_seen": 1045868544 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003449849548645938, + "loss": 2.9097, + "theoretical_loss": 3.633693669596831, + "tokens_seen": 1045934080 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034497492477432294, + "loss": 2.9599, + "theoretical_loss": 3.6336721824017024, + "tokens_seen": 1045999616 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034496489468405217, + "loss": 2.897, + "theoretical_loss": 3.633650696929714, + "tokens_seen": 1046065152 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034495486459378135, + "loss": 2.8945, + "theoretical_loss": 3.6336292131806203, + "tokens_seen": 1046130688 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034494483450351053, + "loss": 2.9667, + "theoretical_loss": 3.633607731154175, + "tokens_seen": 1046196224 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003449348044132397, + "loss": 2.8283, + "theoretical_loss": 3.633586250850132, + "tokens_seen": 1046261760 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034492477432296895, + "loss": 2.8286, + "theoretical_loss": 3.6335647722682447, + "tokens_seen": 1046327296 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003449147442326981, + "loss": 2.9315, + "theoretical_loss": 3.633543295408269, + "tokens_seen": 1046392832 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003449047141424273, + "loss": 2.956, + "theoretical_loss": 3.6335218202699564, + "tokens_seen": 1046458368 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034489468405215644, + "loss": 2.8387, + "theoretical_loss": 3.6335003468530633, + "tokens_seen": 1046523904 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003448846539618857, + "loss": 2.9147, + "theoretical_loss": 3.633478875157343, + "tokens_seen": 1046589440 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034487462387161486, + "loss": 2.9022, + "theoretical_loss": 3.6334574051825497, + "tokens_seen": 1046654976 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034486459378134404, + "loss": 2.9285, + "theoretical_loss": 3.633435936928438, + "tokens_seen": 1046720512 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003448545636910732, + "loss": 2.9621, + "theoretical_loss": 3.633414470394762, + "tokens_seen": 1046786048 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6718428134918213, + "objective/train/theoretical_loss": 3.6333930055812766, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.6333930055812766, + "tokens_seen": 1046851584 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003448445336008024, + "loss": 2.9316, + "theoretical_loss": 3.6333930055812766, + "tokens_seen": 1046851584 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003448345035105316, + "loss": 2.8644, + "theoretical_loss": 3.6333715424877355, + "tokens_seen": 1046917120 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003448244734202608, + "loss": 2.871, + "theoretical_loss": 3.6333500811138943, + "tokens_seen": 1046982656 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034481444332998994, + "loss": 2.9528, + "theoretical_loss": 3.6333286214595066, + "tokens_seen": 1047048192 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003448044132397192, + "loss": 2.8443, + "theoretical_loss": 3.6333071635243277, + "tokens_seen": 1047113728 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034479438314944836, + "loss": 2.9128, + "theoretical_loss": 3.6332857073081115, + "tokens_seen": 1047179264 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034478435305917754, + "loss": 2.9177, + "theoretical_loss": 3.633264252810614, + "tokens_seen": 1047244800 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003447743229689067, + "loss": 2.9222, + "theoretical_loss": 3.6332428000315886, + "tokens_seen": 1047310336 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003447642928786359, + "loss": 2.9233, + "theoretical_loss": 3.6332213489707907, + "tokens_seen": 1047375872 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003447542627883651, + "loss": 2.9436, + "theoretical_loss": 3.633199899627976, + "tokens_seen": 1047441408 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003447442326980943, + "loss": 3.0001, + "theoretical_loss": 3.633178452002898, + "tokens_seen": 1047506944 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034473420260782345, + "loss": 2.9356, + "theoretical_loss": 3.6331570060953124, + "tokens_seen": 1047572480 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003447241725175527, + "loss": 2.8968, + "theoretical_loss": 3.6331355619049743, + "tokens_seen": 1047638016 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003447141424272818, + "loss": 2.8957, + "theoretical_loss": 3.6331141194316388, + "tokens_seen": 1047703552 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034470411233701104, + "loss": 3.0205, + "theoretical_loss": 3.6330926786750606, + "tokens_seen": 1047769088 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003446940822467402, + "loss": 2.8817, + "theoretical_loss": 3.633071239634995, + "tokens_seen": 1047834624 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003446840521564694, + "loss": 2.7345, + "theoretical_loss": 3.6330498023111977, + "tokens_seen": 1047900160 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003446740220661986, + "loss": 2.8798, + "theoretical_loss": 3.6330283667034236, + "tokens_seen": 1047965696 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034466399197592777, + "loss": 2.8765, + "theoretical_loss": 3.6330069328114285, + "tokens_seen": 1048031232 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034465396188565695, + "loss": 2.983, + "theoretical_loss": 3.632985500634967, + "tokens_seen": 1048096768 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003446439317953862, + "loss": 2.9556, + "theoretical_loss": 3.632964070173795, + "tokens_seen": 1048162304 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003446339017051153, + "loss": 2.9092, + "theoretical_loss": 3.632942641427668, + "tokens_seen": 1048227840 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034462387161484455, + "loss": 2.7838, + "theoretical_loss": 3.632921214396342, + "tokens_seen": 1048293376 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034461384152457373, + "loss": 2.9603, + "theoretical_loss": 3.6328997890795716, + "tokens_seen": 1048358912 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003446038114343029, + "loss": 2.9467, + "theoretical_loss": 3.6328783654771124, + "tokens_seen": 1048424448 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.05375599861145, + "objective/train/theoretical_loss": 3.6328569435887212, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.6328569435887212, + "tokens_seen": 1048489984 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034459378134403214, + "loss": 2.9548, + "theoretical_loss": 3.6328569435887212, + "tokens_seen": 1048489984 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034458375125376127, + "loss": 2.9247, + "theoretical_loss": 3.632835523414153, + "tokens_seen": 1048555520 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003445737211634905, + "loss": 2.8452, + "theoretical_loss": 3.6328141049531633, + "tokens_seen": 1048621056 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003445636910732197, + "loss": 2.8861, + "theoretical_loss": 3.632792688205509, + "tokens_seen": 1048686592 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034455366098294887, + "loss": 2.9808, + "theoretical_loss": 3.6327712731709445, + "tokens_seen": 1048752128 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034454363089267805, + "loss": 2.8757, + "theoretical_loss": 3.632749859849227, + "tokens_seen": 1048817664 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034453360080240723, + "loss": 2.9007, + "theoretical_loss": 3.632728448240112, + "tokens_seen": 1048883200 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003445235707121364, + "loss": 2.9363, + "theoretical_loss": 3.632707038343356, + "tokens_seen": 1048948736 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034451354062186565, + "loss": 2.9116, + "theoretical_loss": 3.632685630158714, + "tokens_seen": 1049014272 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003445035105315948, + "loss": 2.9944, + "theoretical_loss": 3.6326642236859428, + "tokens_seen": 1049079808 + }, + { + "epoch": 12.02, + "learning_rate": 0.000344493480441324, + "loss": 2.911, + "theoretical_loss": 3.632642818924799, + "tokens_seen": 1049145344 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034448345035105314, + "loss": 2.8791, + "theoretical_loss": 3.6326214158750383, + "tokens_seen": 1049210880 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034447342026078237, + "loss": 2.9029, + "theoretical_loss": 3.6326000145364166, + "tokens_seen": 1049276416 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034446339017051155, + "loss": 2.9007, + "theoretical_loss": 3.6325786149086916, + "tokens_seen": 1049341952 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034445336008024073, + "loss": 2.8738, + "theoretical_loss": 3.632557216991618, + "tokens_seen": 1049407488 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003444433299899699, + "loss": 2.8968, + "theoretical_loss": 3.6325358207849536, + "tokens_seen": 1049473024 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034443329989969915, + "loss": 2.926, + "theoretical_loss": 3.632514426288454, + "tokens_seen": 1049538560 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003444232698094283, + "loss": 2.842, + "theoretical_loss": 3.632493033501876, + "tokens_seen": 1049604096 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003444132397191575, + "loss": 2.9733, + "theoretical_loss": 3.632471642424977, + "tokens_seen": 1049669632 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034440320962888664, + "loss": 3.0153, + "theoretical_loss": 3.6324502530575122, + "tokens_seen": 1049735168 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003443931795386159, + "loss": 2.9299, + "theoretical_loss": 3.632428865399239, + "tokens_seen": 1049800704 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034438314944834506, + "loss": 2.9429, + "theoretical_loss": 3.632407479449914, + "tokens_seen": 1049866240 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034437311935807424, + "loss": 2.9413, + "theoretical_loss": 3.6323860952092946, + "tokens_seen": 1049931776 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003443630892678034, + "loss": 2.9693, + "theoretical_loss": 3.632364712677137, + "tokens_seen": 1049997312 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003443530591775326, + "loss": 2.8299, + "theoretical_loss": 3.632343331853198, + "tokens_seen": 1050062848 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0133776664733887, + "objective/train/theoretical_loss": 3.632321952737235, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.632321952737235, + "tokens_seen": 1050128384 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003443430290872618, + "loss": 2.9257, + "theoretical_loss": 3.632321952737235, + "tokens_seen": 1050128384 + }, + { + "epoch": 12.02, + "learning_rate": 0.000344332998996991, + "loss": 2.8998, + "theoretical_loss": 3.632300575329004, + "tokens_seen": 1050193920 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034432296890672014, + "loss": 2.9273, + "theoretical_loss": 3.6322791996282637, + "tokens_seen": 1050259456 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003443129388164494, + "loss": 2.9321, + "theoretical_loss": 3.6322578256347695, + "tokens_seen": 1050324992 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034430290872617856, + "loss": 2.8781, + "theoretical_loss": 3.6322364533482796, + "tokens_seen": 1050390528 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034429287863590774, + "loss": 2.9406, + "theoretical_loss": 3.6322150827685507, + "tokens_seen": 1050456064 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003442828485456369, + "loss": 2.9407, + "theoretical_loss": 3.6321937138953406, + "tokens_seen": 1050521600 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003442728184553661, + "loss": 3.0157, + "theoretical_loss": 3.6321723467284057, + "tokens_seen": 1050587136 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003442627883650953, + "loss": 2.9455, + "theoretical_loss": 3.632150981267504, + "tokens_seen": 1050652672 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003442527582748245, + "loss": 2.8494, + "theoretical_loss": 3.6321296175123927, + "tokens_seen": 1050718208 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034424272818455365, + "loss": 2.9175, + "theoretical_loss": 3.6321082554628292, + "tokens_seen": 1050783744 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003442326980942829, + "loss": 3.0271, + "theoretical_loss": 3.632086895118571, + "tokens_seen": 1050849280 + }, + { + "epoch": 12.02, + "learning_rate": 0.000344222668004012, + "loss": 2.8024, + "theoretical_loss": 3.6320655364793755, + "tokens_seen": 1050914816 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034421263791374124, + "loss": 2.8726, + "theoretical_loss": 3.6320441795450007, + "tokens_seen": 1050980352 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003442026078234704, + "loss": 2.9166, + "theoretical_loss": 3.6320228243152037, + "tokens_seen": 1051045888 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003441925777331996, + "loss": 2.9225, + "theoretical_loss": 3.632001470789742, + "tokens_seen": 1051111424 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003441825476429288, + "loss": 2.9426, + "theoretical_loss": 3.6319801189683742, + "tokens_seen": 1051176960 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034417251755265797, + "loss": 2.9421, + "theoretical_loss": 3.6319587688508577, + "tokens_seen": 1051242496 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034416248746238715, + "loss": 2.8968, + "theoretical_loss": 3.63193742043695, + "tokens_seen": 1051308032 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003441524573721164, + "loss": 2.8445, + "theoretical_loss": 3.6319160737264093, + "tokens_seen": 1051373568 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003441424272818455, + "loss": 2.9133, + "theoretical_loss": 3.631894728718993, + "tokens_seen": 1051439104 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034413239719157475, + "loss": 2.9356, + "theoretical_loss": 3.63187338541446, + "tokens_seen": 1051504640 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034412236710130393, + "loss": 2.8761, + "theoretical_loss": 3.631852043812568, + "tokens_seen": 1051570176 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003441123370110331, + "loss": 2.8484, + "theoretical_loss": 3.6318307039130744, + "tokens_seen": 1051635712 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003441023069207623, + "loss": 2.8523, + "theoretical_loss": 3.631809365715738, + "tokens_seen": 1051701248 + }, + { + "epoch": 12.02, + "objective/train/docs_used": 2458144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8188655376434326, + "objective/train/theoretical_loss": 3.631788029220317, + "objective/train/tokens_used": 1052919264, + "theoretical_loss": 3.631788029220317, + "tokens_seen": 1051766784 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034409227683049147, + "loss": 2.8593, + "theoretical_loss": 3.631788029220317, + "tokens_seen": 1051766784 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034408224674022065, + "loss": 2.9219, + "theoretical_loss": 3.6317666944265694, + "tokens_seen": 1051832320 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003440722166499499, + "loss": 2.9708, + "theoretical_loss": 3.6317453613342536, + "tokens_seen": 1051897856 + }, + { + "epoch": 12.02, + "learning_rate": 0.000344062186559679, + "loss": 2.9651, + "theoretical_loss": 3.6317240299431273, + "tokens_seen": 1051963392 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034405215646940825, + "loss": 2.9495, + "theoretical_loss": 3.63170270025295, + "tokens_seen": 1052028928 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003440421263791374, + "loss": 2.911, + "theoretical_loss": 3.631681372263479, + "tokens_seen": 1052094464 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003440320962888666, + "loss": 3.0171, + "theoretical_loss": 3.6316600459744737, + "tokens_seen": 1052160000 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003440220661985958, + "loss": 3.01, + "theoretical_loss": 3.631638721385692, + "tokens_seen": 1052225536 + }, + { + "epoch": 12.02, + "learning_rate": 0.000344012036108325, + "loss": 2.9727, + "theoretical_loss": 3.6316173984968927, + "tokens_seen": 1052291072 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034400200601805416, + "loss": 2.9401, + "theoretical_loss": 3.6315960773078344, + "tokens_seen": 1052356608 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034399197592778334, + "loss": 2.9353, + "theoretical_loss": 3.6315747578182758, + "tokens_seen": 1052422144 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003439819458375125, + "loss": 2.902, + "theoretical_loss": 3.631553440027975, + "tokens_seen": 1052487680 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034397191574724175, + "loss": 2.9094, + "theoretical_loss": 3.631532123936692, + "tokens_seen": 1052553216 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003439618856569709, + "loss": 2.9471, + "theoretical_loss": 3.631510809544185, + "tokens_seen": 1052618752 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003439518555667001, + "loss": 2.9978, + "theoretical_loss": 3.6314894968502127, + "tokens_seen": 1052684288 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003439418254764293, + "loss": 2.8968, + "theoretical_loss": 3.631468185854534, + "tokens_seen": 1052749824 + }, + { + "epoch": 12.02, + "learning_rate": 0.0003439317953861585, + "loss": 2.9478, + "theoretical_loss": 3.6314468765569083, + "tokens_seen": 1052815360 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034392176529588766, + "loss": 2.8271, + "theoretical_loss": 3.631425568957094, + "tokens_seen": 1052880896 + }, + { + "epoch": 12.02, + "learning_rate": 0.00034391173520561684, + "loss": 2.941, + "theoretical_loss": 3.6314065933052335, + "tokens_seen": 1052939264 + }, + { + "epoch": 13.0, + "learning_rate": 0.000343901705115346, + "loss": 2.8445, + "theoretical_loss": 3.631385288914686, + "tokens_seen": 1053004800 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034389167502507526, + "loss": 2.8647, + "theoretical_loss": 3.6313639862212543, + "tokens_seen": 1053070336 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003438816449348044, + "loss": 2.8897, + "theoretical_loss": 3.6313426852246966, + "tokens_seen": 1053135872 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003438716148445336, + "loss": 2.7358, + "theoretical_loss": 3.631321385924773, + "tokens_seen": 1053201408 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034386158475426275, + "loss": 2.8306, + "theoretical_loss": 3.6313000883212423, + "tokens_seen": 1053266944 + }, + { + "epoch": 13.0, + "learning_rate": 0.000343851554663992, + "loss": 2.7904, + "theoretical_loss": 3.6312787924138643, + "tokens_seen": 1053332480 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2508588, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.899001121520996, + "objective/train/theoretical_loss": 3.631257498202398, + "objective/train/tokens_used": 1073858016, + "theoretical_loss": 3.631257498202398, + "tokens_seen": 1053398016 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003438415245737212, + "loss": 2.8641, + "theoretical_loss": 3.631257498202398, + "tokens_seen": 1053398016 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034383149448345034, + "loss": 2.8151, + "theoretical_loss": 3.6312362056866037, + "tokens_seen": 1053463552 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003438214643931796, + "loss": 2.8354, + "theoretical_loss": 3.6312149148662396, + "tokens_seen": 1053529088 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034381143430290876, + "loss": 2.8795, + "theoretical_loss": 3.6311936257410657, + "tokens_seen": 1053594624 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034380140421263794, + "loss": 2.6832, + "theoretical_loss": 3.6311723383108423, + "tokens_seen": 1053660160 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003437913741223671, + "loss": 2.8952, + "theoretical_loss": 3.631151052575328, + "tokens_seen": 1053725696 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003437813440320963, + "loss": 2.8043, + "theoretical_loss": 3.6311297685342834, + "tokens_seen": 1053791232 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003437713139418255, + "loss": 2.7475, + "theoretical_loss": 3.6311084861874674, + "tokens_seen": 1053856768 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003437612838515547, + "loss": 2.6203, + "theoretical_loss": 3.631087205534641, + "tokens_seen": 1053922304 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034375125376128385, + "loss": 2.8396, + "theoretical_loss": 3.6310659265755625, + "tokens_seen": 1053987840 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003437412236710131, + "loss": 2.7383, + "theoretical_loss": 3.6310446493099926, + "tokens_seen": 1054053376 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003437311935807422, + "loss": 2.8079, + "theoretical_loss": 3.631023373737692, + "tokens_seen": 1054118912 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034372116349047144, + "loss": 2.7459, + "theoretical_loss": 3.631002099858419, + "tokens_seen": 1054184448 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003437111334002006, + "loss": 2.7497, + "theoretical_loss": 3.6309808276719346, + "tokens_seen": 1054249984 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003437011033099298, + "loss": 2.8608, + "theoretical_loss": 3.6309595571779996, + "tokens_seen": 1054315520 + }, + { + "epoch": 13.0, + "learning_rate": 0.000343691073219659, + "loss": 2.7481, + "theoretical_loss": 3.6309382883763726, + "tokens_seen": 1054381056 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034368104312938817, + "loss": 2.6954, + "theoretical_loss": 3.6309170212668147, + "tokens_seen": 1054446592 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034367101303911735, + "loss": 2.7269, + "theoretical_loss": 3.630895755849086, + "tokens_seen": 1054512128 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003436609829488466, + "loss": 2.863, + "theoretical_loss": 3.6308744921229468, + "tokens_seen": 1054577664 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003436509528585757, + "loss": 2.8433, + "theoretical_loss": 3.6308532300881575, + "tokens_seen": 1054643200 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034364092276830495, + "loss": 2.8375, + "theoretical_loss": 3.630831969744478, + "tokens_seen": 1054708736 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034363089267803413, + "loss": 2.8701, + "theoretical_loss": 3.6308107110916694, + "tokens_seen": 1054774272 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003436208625877633, + "loss": 2.7963, + "theoretical_loss": 3.6307894541294914, + "tokens_seen": 1054839808 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003436108324974925, + "loss": 2.8682, + "theoretical_loss": 3.630768198857705, + "tokens_seen": 1054905344 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034360080240722167, + "loss": 2.9781, + "theoretical_loss": 3.6307469452760714, + "tokens_seen": 1054970880 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2511734, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.744760274887085, + "objective/train/theoretical_loss": 3.6307256933843504, + "objective/train/tokens_used": 1075496416, + "theoretical_loss": 3.6307256933843504, + "tokens_seen": 1055036416 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034359077231695085, + "loss": 2.7598, + "theoretical_loss": 3.6307256933843504, + "tokens_seen": 1055036416 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003435807422266801, + "loss": 2.7024, + "theoretical_loss": 3.6307044431823026, + "tokens_seen": 1055101952 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003435707121364092, + "loss": 2.8735, + "theoretical_loss": 3.6306831946696887, + "tokens_seen": 1055167488 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034356068204613845, + "loss": 2.8023, + "theoretical_loss": 3.6306619478462707, + "tokens_seen": 1055233024 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003435506519558676, + "loss": 2.8575, + "theoretical_loss": 3.6306407027118075, + "tokens_seen": 1055298560 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003435406218655968, + "loss": 2.7698, + "theoretical_loss": 3.630619459266061, + "tokens_seen": 1055364096 + }, + { + "epoch": 13.0, + "learning_rate": 0.000343530591775326, + "loss": 2.8968, + "theoretical_loss": 3.6305982175087923, + "tokens_seen": 1055429632 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003435205616850552, + "loss": 2.7967, + "theoretical_loss": 3.6305769774397625, + "tokens_seen": 1055495168 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034351053159478436, + "loss": 2.865, + "theoretical_loss": 3.6305557390587317, + "tokens_seen": 1055560704 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034350050150451354, + "loss": 2.8132, + "theoretical_loss": 3.6305345023654616, + "tokens_seen": 1055626240 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003434904714142427, + "loss": 2.88, + "theoretical_loss": 3.630513267359713, + "tokens_seen": 1055691776 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034348044132397195, + "loss": 2.8476, + "theoretical_loss": 3.6304920340412474, + "tokens_seen": 1055757312 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003434704112337011, + "loss": 2.7887, + "theoretical_loss": 3.630470802409826, + "tokens_seen": 1055822848 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003434603811434303, + "loss": 2.8198, + "theoretical_loss": 3.63044957246521, + "tokens_seen": 1055888384 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003434503510531595, + "loss": 2.797, + "theoretical_loss": 3.63042834420716, + "tokens_seen": 1055953920 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003434403209628887, + "loss": 2.8159, + "theoretical_loss": 3.630407117635438, + "tokens_seen": 1056019456 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034343029087261786, + "loss": 2.7948, + "theoretical_loss": 3.630385892749806, + "tokens_seen": 1056084992 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034342026078234704, + "loss": 2.9275, + "theoretical_loss": 3.6303646695500245, + "tokens_seen": 1056150528 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003434102306920762, + "loss": 2.8083, + "theoretical_loss": 3.6303434480358554, + "tokens_seen": 1056216064 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034340020060180546, + "loss": 2.7823, + "theoretical_loss": 3.6303222282070604, + "tokens_seen": 1056281600 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003433901705115346, + "loss": 2.9031, + "theoretical_loss": 3.6303010100634006, + "tokens_seen": 1056347136 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003433801404212638, + "loss": 2.7692, + "theoretical_loss": 3.630279793604638, + "tokens_seen": 1056412672 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034337011033099295, + "loss": 2.8368, + "theoretical_loss": 3.630258578830534, + "tokens_seen": 1056478208 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003433600802407222, + "loss": 2.9324, + "theoretical_loss": 3.63023736574085, + "tokens_seen": 1056543744 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034335005015045136, + "loss": 2.8239, + "theoretical_loss": 3.6302161543353493, + "tokens_seen": 1056609280 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2515407, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.963581085205078, + "objective/train/theoretical_loss": 3.6301949446137924, + "objective/train/tokens_used": 1077134816, + "theoretical_loss": 3.6301949446137924, + "tokens_seen": 1056674816 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034334002006018054, + "loss": 2.7948, + "theoretical_loss": 3.6301949446137924, + "tokens_seen": 1056674816 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003433299899699097, + "loss": 2.7927, + "theoretical_loss": 3.6301737365759417, + "tokens_seen": 1056740352 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003433199598796389, + "loss": 2.8586, + "theoretical_loss": 3.630152530221559, + "tokens_seen": 1056805888 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003433099297893681, + "loss": 2.7914, + "theoretical_loss": 3.6301313255504057, + "tokens_seen": 1056871424 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003432998996990973, + "loss": 2.7889, + "theoretical_loss": 3.630110122562245, + "tokens_seen": 1056936960 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034328986960882645, + "loss": 2.8756, + "theoretical_loss": 3.630088921256838, + "tokens_seen": 1057002496 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003432798395185557, + "loss": 2.9424, + "theoretical_loss": 3.6300677216339476, + "tokens_seen": 1057068032 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034326980942828487, + "loss": 2.7988, + "theoretical_loss": 3.630046523693335, + "tokens_seen": 1057133568 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034325977933801405, + "loss": 2.8107, + "theoretical_loss": 3.6300253274347636, + "tokens_seen": 1057199104 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034324974924774323, + "loss": 2.7907, + "theoretical_loss": 3.6300041328579953, + "tokens_seen": 1057264640 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003432397191574724, + "loss": 2.8411, + "theoretical_loss": 3.629982939962791, + "tokens_seen": 1057330176 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003432296890672016, + "loss": 2.7562, + "theoretical_loss": 3.629961748748915, + "tokens_seen": 1057395712 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003432196589769308, + "loss": 2.8388, + "theoretical_loss": 3.62994055921613, + "tokens_seen": 1057461248 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034320962888665995, + "loss": 2.7477, + "theoretical_loss": 3.629919371364196, + "tokens_seen": 1057526784 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003431995987963892, + "loss": 2.8597, + "theoretical_loss": 3.6298981851928773, + "tokens_seen": 1057592320 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003431895687061183, + "loss": 2.8458, + "theoretical_loss": 3.6298770007019363, + "tokens_seen": 1057657856 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034317953861584755, + "loss": 2.6879, + "theoretical_loss": 3.6298558178911353, + "tokens_seen": 1057723392 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034316950852557673, + "loss": 2.8345, + "theoretical_loss": 3.6298346367602377, + "tokens_seen": 1057788928 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003431594784353059, + "loss": 2.8737, + "theoretical_loss": 3.6298134573090053, + "tokens_seen": 1057854464 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003431494483450351, + "loss": 2.84, + "theoretical_loss": 3.6297922795372006, + "tokens_seen": 1057920000 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034313941825476433, + "loss": 2.864, + "theoretical_loss": 3.629771103444588, + "tokens_seen": 1057985536 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034312938816449345, + "loss": 2.7476, + "theoretical_loss": 3.629749929030929, + "tokens_seen": 1058051072 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003431193580742227, + "loss": 2.7671, + "theoretical_loss": 3.6297287562959863, + "tokens_seen": 1058116608 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003431093279839518, + "loss": 2.8831, + "theoretical_loss": 3.629707585239524, + "tokens_seen": 1058182144 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034309929789368105, + "loss": 2.8064, + "theoretical_loss": 3.629686415861304, + "tokens_seen": 1058247680 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2518591, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9141550064086914, + "objective/train/theoretical_loss": 3.62966524816109, + "objective/train/tokens_used": 1078773216, + "theoretical_loss": 3.62966524816109, + "tokens_seen": 1058313216 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003430892678034103, + "loss": 2.8756, + "theoretical_loss": 3.62966524816109, + "tokens_seen": 1058313216 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003430792377131394, + "loss": 2.8308, + "theoretical_loss": 3.629644082138645, + "tokens_seen": 1058378752 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034306920762286865, + "loss": 2.7863, + "theoretical_loss": 3.629622917793732, + "tokens_seen": 1058444288 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003430591775325978, + "loss": 2.8505, + "theoretical_loss": 3.6296017551261146, + "tokens_seen": 1058509824 + }, + { + "epoch": 13.0, + "learning_rate": 0.000343049147442327, + "loss": 2.8505, + "theoretical_loss": 3.629580594135555, + "tokens_seen": 1058575360 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003430391173520562, + "loss": 2.8031, + "theoretical_loss": 3.6295594348218176, + "tokens_seen": 1058640896 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003430290872617854, + "loss": 2.8117, + "theoretical_loss": 3.6295382771846656, + "tokens_seen": 1058706432 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034301905717151456, + "loss": 2.8124, + "theoretical_loss": 3.6295171212238615, + "tokens_seen": 1058771968 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034300902708124374, + "loss": 2.7562, + "theoretical_loss": 3.6294959669391695, + "tokens_seen": 1058837504 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003429989969909729, + "loss": 2.7986, + "theoretical_loss": 3.629474814330353, + "tokens_seen": 1058903040 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034298896690070215, + "loss": 2.8824, + "theoretical_loss": 3.6294536633971752, + "tokens_seen": 1058968576 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003429789368104313, + "loss": 2.8895, + "theoretical_loss": 3.6294325141394004, + "tokens_seen": 1059034112 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003429689067201605, + "loss": 2.7453, + "theoretical_loss": 3.6294113665567913, + "tokens_seen": 1059099648 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003429588766298897, + "loss": 2.7594, + "theoretical_loss": 3.629390220649112, + "tokens_seen": 1059165184 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003429488465396189, + "loss": 2.8818, + "theoretical_loss": 3.6293690764161264, + "tokens_seen": 1059230720 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034293881644934806, + "loss": 2.789, + "theoretical_loss": 3.6293479338575985, + "tokens_seen": 1059296256 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034292878635907724, + "loss": 2.8268, + "theoretical_loss": 3.629326792973291, + "tokens_seen": 1059361792 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003429187562688064, + "loss": 2.8378, + "theoretical_loss": 3.6293056537629687, + "tokens_seen": 1059427328 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034290872617853566, + "loss": 2.7949, + "theoretical_loss": 3.629284516226395, + "tokens_seen": 1059492864 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003428986960882648, + "loss": 2.8581, + "theoretical_loss": 3.6292633803633345, + "tokens_seen": 1059558400 + }, + { + "epoch": 13.0, + "learning_rate": 0.000342888665997994, + "loss": 2.7953, + "theoretical_loss": 3.6292422461735505, + "tokens_seen": 1059623936 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034287863590772315, + "loss": 2.8036, + "theoretical_loss": 3.6292211136568078, + "tokens_seen": 1059689472 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003428686058174524, + "loss": 2.8222, + "theoretical_loss": 3.62919998281287, + "tokens_seen": 1059755008 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034285857572718156, + "loss": 2.895, + "theoretical_loss": 3.629178853641501, + "tokens_seen": 1059820544 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034284854563691074, + "loss": 2.8032, + "theoretical_loss": 3.6291577261424655, + "tokens_seen": 1059886080 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2523251, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.801368236541748, + "objective/train/theoretical_loss": 3.629136600315528, + "objective/train/tokens_used": 1080411616, + "theoretical_loss": 3.629136600315528, + "tokens_seen": 1059951616 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003428385155466399, + "loss": 2.9522, + "theoretical_loss": 3.629136600315528, + "tokens_seen": 1059951616 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003428284854563691, + "loss": 2.8473, + "theoretical_loss": 3.6291154761604516, + "tokens_seen": 1060017152 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003428184553660983, + "loss": 2.8588, + "theoretical_loss": 3.6290943536770017, + "tokens_seen": 1060082688 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003428084252758275, + "loss": 2.7912, + "theoretical_loss": 3.6290732328649424, + "tokens_seen": 1060148224 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034279839518555665, + "loss": 2.8176, + "theoretical_loss": 3.6290521137240384, + "tokens_seen": 1060213760 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003427883650952859, + "loss": 2.777, + "theoretical_loss": 3.6290309962540537, + "tokens_seen": 1060279296 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034277833500501507, + "loss": 2.8853, + "theoretical_loss": 3.6290098804547526, + "tokens_seen": 1060344832 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034276830491474425, + "loss": 2.8373, + "theoretical_loss": 3.628988766325901, + "tokens_seen": 1060410368 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034275827482447343, + "loss": 2.7149, + "theoretical_loss": 3.628967653867262, + "tokens_seen": 1060475904 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003427482447342026, + "loss": 2.8493, + "theoretical_loss": 3.628946543078601, + "tokens_seen": 1060541440 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003427382146439318, + "loss": 2.8815, + "theoretical_loss": 3.628925433959683, + "tokens_seen": 1060606976 + }, + { + "epoch": 13.0, + "learning_rate": 0.000342728184553661, + "loss": 2.609, + "theoretical_loss": 3.6289043265102725, + "tokens_seen": 1060672512 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034271815446339015, + "loss": 2.8964, + "theoretical_loss": 3.628883220730134, + "tokens_seen": 1060738048 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003427081243731194, + "loss": 2.7417, + "theoretical_loss": 3.6288621166190325, + "tokens_seen": 1060803584 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003426980942828485, + "loss": 2.7898, + "theoretical_loss": 3.628841014176733, + "tokens_seen": 1060869120 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034268806419257775, + "loss": 2.7789, + "theoretical_loss": 3.6288199134030004, + "tokens_seen": 1060934656 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034267803410230693, + "loss": 2.8616, + "theoretical_loss": 3.6287988142975998, + "tokens_seen": 1061000192 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003426680040120361, + "loss": 2.9074, + "theoretical_loss": 3.6287777168602964, + "tokens_seen": 1061065728 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003426579739217653, + "loss": 2.8078, + "theoretical_loss": 3.6287566210908553, + "tokens_seen": 1061131264 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034264794383149453, + "loss": 2.875, + "theoretical_loss": 3.6287355269890416, + "tokens_seen": 1061196800 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034263791374122366, + "loss": 2.8498, + "theoretical_loss": 3.62871443455462, + "tokens_seen": 1061262336 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003426278836509529, + "loss": 2.9006, + "theoretical_loss": 3.628693343787356, + "tokens_seen": 1061327872 + }, + { + "epoch": 13.0, + "learning_rate": 0.000342617853560682, + "loss": 2.8034, + "theoretical_loss": 3.6286722546870154, + "tokens_seen": 1061393408 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034260782347041125, + "loss": 2.8398, + "theoretical_loss": 3.628651167253363, + "tokens_seen": 1061458944 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034259779338014043, + "loss": 2.9198, + "theoretical_loss": 3.628630081486164, + "tokens_seen": 1061524480 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2526352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.849714756011963, + "objective/train/theoretical_loss": 3.628608997385185, + "objective/train/tokens_used": 1082050016, + "theoretical_loss": 3.628608997385185, + "tokens_seen": 1061590016 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003425877632898696, + "loss": 2.8428, + "theoretical_loss": 3.628608997385185, + "tokens_seen": 1061590016 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003425777331995988, + "loss": 2.8619, + "theoretical_loss": 3.6285879149501903, + "tokens_seen": 1061655552 + }, + { + "epoch": 13.0, + "learning_rate": 0.000342567703109328, + "loss": 2.7451, + "theoretical_loss": 3.628566834180946, + "tokens_seen": 1061721088 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034255767301905716, + "loss": 2.7859, + "theoretical_loss": 3.628545755077217, + "tokens_seen": 1061786624 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003425476429287864, + "loss": 2.8138, + "theoretical_loss": 3.6285246776387696, + "tokens_seen": 1061852160 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003425376128385155, + "loss": 2.8784, + "theoretical_loss": 3.6285036018653694, + "tokens_seen": 1061917696 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034252758274824476, + "loss": 2.8556, + "theoretical_loss": 3.6284825277567823, + "tokens_seen": 1061983232 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003425175526579739, + "loss": 2.7345, + "theoretical_loss": 3.628461455312773, + "tokens_seen": 1062048768 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003425075225677031, + "loss": 2.8319, + "theoretical_loss": 3.628440384533109, + "tokens_seen": 1062114304 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003424974924774323, + "loss": 2.9231, + "theoretical_loss": 3.628419315417555, + "tokens_seen": 1062179840 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003424874623871615, + "loss": 2.9318, + "theoretical_loss": 3.628398247965877, + "tokens_seen": 1062245376 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034247743229689066, + "loss": 2.8402, + "theoretical_loss": 3.628377182177841, + "tokens_seen": 1062310912 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003424674022066199, + "loss": 2.81, + "theoretical_loss": 3.628356118053213, + "tokens_seen": 1062376448 + }, + { + "epoch": 13.0, + "learning_rate": 0.000342457372116349, + "loss": 2.8194, + "theoretical_loss": 3.6283350555917595, + "tokens_seen": 1062441984 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034244734202607826, + "loss": 2.8342, + "theoretical_loss": 3.6283139947932463, + "tokens_seen": 1062507520 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003424373119358074, + "loss": 2.7495, + "theoretical_loss": 3.6282929356574396, + "tokens_seen": 1062573056 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003424272818455366, + "loss": 2.7537, + "theoretical_loss": 3.6282718781841057, + "tokens_seen": 1062638592 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003424172517552658, + "loss": 2.8251, + "theoretical_loss": 3.62825082237301, + "tokens_seen": 1062704128 + }, + { + "epoch": 13.0, + "learning_rate": 0.000342407221664995, + "loss": 2.8288, + "theoretical_loss": 3.62822976822392, + "tokens_seen": 1062769664 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034239719157472416, + "loss": 2.8709, + "theoretical_loss": 3.6282087157366014, + "tokens_seen": 1062835200 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034238716148445335, + "loss": 2.7993, + "theoretical_loss": 3.6281876649108207, + "tokens_seen": 1062900736 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003423771313941825, + "loss": 2.6908, + "theoretical_loss": 3.628166615746344, + "tokens_seen": 1062966272 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034236710130391176, + "loss": 2.7417, + "theoretical_loss": 3.628145568242939, + "tokens_seen": 1063031808 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003423570712136409, + "loss": 2.7694, + "theoretical_loss": 3.62812452240037, + "tokens_seen": 1063097344 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003423470411233701, + "loss": 2.7672, + "theoretical_loss": 3.6281034782184056, + "tokens_seen": 1063162880 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2530203, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.832376003265381, + "objective/train/theoretical_loss": 3.628082435696812, + "objective/train/tokens_used": 1083688416, + "theoretical_loss": 3.628082435696812, + "tokens_seen": 1063228416 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003423370110330993, + "loss": 2.7596, + "theoretical_loss": 3.628082435696812, + "tokens_seen": 1063228416 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003423269809428285, + "loss": 2.8369, + "theoretical_loss": 3.6280613948353553, + "tokens_seen": 1063293952 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003423169508525577, + "loss": 2.8542, + "theoretical_loss": 3.6280403556338023, + "tokens_seen": 1063359488 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034230692076228685, + "loss": 2.8377, + "theoretical_loss": 3.62801931809192, + "tokens_seen": 1063425024 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003422968906720161, + "loss": 2.8813, + "theoretical_loss": 3.627998282209475, + "tokens_seen": 1063490560 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034228686058174527, + "loss": 2.8543, + "theoretical_loss": 3.6279772479862347, + "tokens_seen": 1063556096 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034227683049147445, + "loss": 2.8579, + "theoretical_loss": 3.627956215421966, + "tokens_seen": 1063621632 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034226680040120363, + "loss": 2.8863, + "theoretical_loss": 3.627935184516435, + "tokens_seen": 1063687168 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003422567703109328, + "loss": 2.8327, + "theoretical_loss": 3.6279141552694085, + "tokens_seen": 1063752704 + }, + { + "epoch": 13.0, + "learning_rate": 0.000342246740220662, + "loss": 2.8223, + "theoretical_loss": 3.6278931276806548, + "tokens_seen": 1063818240 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003422367101303912, + "loss": 2.8213, + "theoretical_loss": 3.627872101749941, + "tokens_seen": 1063883776 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034222668004012035, + "loss": 2.8225, + "theoretical_loss": 3.6278510774770325, + "tokens_seen": 1063949312 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003422166499498496, + "loss": 2.7815, + "theoretical_loss": 3.6278300548616986, + "tokens_seen": 1064014848 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003422066198595787, + "loss": 2.7292, + "theoretical_loss": 3.627809033903705, + "tokens_seen": 1064080384 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034219658976930795, + "loss": 2.8334, + "theoretical_loss": 3.6277880146028196, + "tokens_seen": 1064145920 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034218655967903713, + "loss": 2.7922, + "theoretical_loss": 3.62776699695881, + "tokens_seen": 1064211456 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003421765295887663, + "loss": 2.8188, + "theoretical_loss": 3.6277459809714427, + "tokens_seen": 1064276992 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003421664994984955, + "loss": 2.9351, + "theoretical_loss": 3.6277249666404856, + "tokens_seen": 1064342528 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034215646940822473, + "loss": 2.8187, + "theoretical_loss": 3.6277039539657068, + "tokens_seen": 1064408064 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034214643931795386, + "loss": 2.8436, + "theoretical_loss": 3.627682942946873, + "tokens_seen": 1064473600 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003421364092276831, + "loss": 2.9096, + "theoretical_loss": 3.627661933583752, + "tokens_seen": 1064539136 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003421263791374122, + "loss": 2.8655, + "theoretical_loss": 3.6276409258761113, + "tokens_seen": 1064604672 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034211634904714145, + "loss": 2.8995, + "theoretical_loss": 3.6276199198237187, + "tokens_seen": 1064670208 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034210631895687063, + "loss": 2.7242, + "theoretical_loss": 3.6275989154263417, + "tokens_seen": 1064735744 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003420962888665998, + "loss": 2.8014, + "theoretical_loss": 3.6275779126837486, + "tokens_seen": 1064801280 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2534803, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9428391456604004, + "objective/train/theoretical_loss": 3.6275569115957067, + "objective/train/tokens_used": 1085326816, + "theoretical_loss": 3.6275569115957067, + "tokens_seen": 1064866816 + }, + { + "epoch": 13.0, + "learning_rate": 0.000342086258776329, + "loss": 2.8995, + "theoretical_loss": 3.6275569115957067, + "tokens_seen": 1064866816 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003420762286860582, + "loss": 2.8096, + "theoretical_loss": 3.627535912161984, + "tokens_seen": 1064932352 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034206619859578736, + "loss": 2.7719, + "theoretical_loss": 3.627514914382348, + "tokens_seen": 1064997888 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003420561685055166, + "loss": 2.8941, + "theoretical_loss": 3.627493918256567, + "tokens_seen": 1065063424 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003420461384152457, + "loss": 2.8704, + "theoretical_loss": 3.6274729237844094, + "tokens_seen": 1065128960 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034203610832497496, + "loss": 2.7897, + "theoretical_loss": 3.6274519309656426, + "tokens_seen": 1065194496 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003420260782347041, + "loss": 2.7971, + "theoretical_loss": 3.6274309398000346, + "tokens_seen": 1065260032 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003420160481444333, + "loss": 2.8812, + "theoretical_loss": 3.627409950287354, + "tokens_seen": 1065325568 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003420060180541625, + "loss": 2.6458, + "theoretical_loss": 3.6273889624273687, + "tokens_seen": 1065391104 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003419959879638917, + "loss": 2.8485, + "theoretical_loss": 3.627367976219847, + "tokens_seen": 1065456640 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034198595787362086, + "loss": 2.8496, + "theoretical_loss": 3.6273469916645573, + "tokens_seen": 1065522176 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003419759277833501, + "loss": 2.9299, + "theoretical_loss": 3.6273260087612673, + "tokens_seen": 1065587712 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003419658976930792, + "loss": 2.663, + "theoretical_loss": 3.6273050275097463, + "tokens_seen": 1065653248 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034195586760280846, + "loss": 2.8915, + "theoretical_loss": 3.6272840479097623, + "tokens_seen": 1065718784 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003419458375125376, + "loss": 2.7906, + "theoretical_loss": 3.6272630699610833, + "tokens_seen": 1065784320 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003419358074222668, + "loss": 2.8964, + "theoretical_loss": 3.627242093663478, + "tokens_seen": 1065849856 + }, + { + "epoch": 13.0, + "learning_rate": 0.000341925777331996, + "loss": 2.8683, + "theoretical_loss": 3.6272211190167156, + "tokens_seen": 1065915392 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003419157472417252, + "loss": 2.8528, + "theoretical_loss": 3.627200146020564, + "tokens_seen": 1065980928 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034190571715145436, + "loss": 2.685, + "theoretical_loss": 3.627179174674792, + "tokens_seen": 1066046464 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034189568706118355, + "loss": 2.8707, + "theoretical_loss": 3.6271582049791684, + "tokens_seen": 1066112000 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003418856569709127, + "loss": 2.8015, + "theoretical_loss": 3.6271372369334616, + "tokens_seen": 1066177536 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034187562688064196, + "loss": 2.772, + "theoretical_loss": 3.6271162705374405, + "tokens_seen": 1066243072 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003418655967903711, + "loss": 2.9515, + "theoretical_loss": 3.6270953057908746, + "tokens_seen": 1066308608 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003418555667001003, + "loss": 2.8234, + "theoretical_loss": 3.6270743426935317, + "tokens_seen": 1066374144 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034184553660982945, + "loss": 2.8234, + "theoretical_loss": 3.6270533812451817, + "tokens_seen": 1066439680 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2537925, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9378468990325928, + "objective/train/theoretical_loss": 3.627032421445593, + "objective/train/tokens_used": 1086965216, + "theoretical_loss": 3.627032421445593, + "tokens_seen": 1066505216 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003418355065195587, + "loss": 2.8872, + "theoretical_loss": 3.627032421445593, + "tokens_seen": 1066505216 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034182547642928787, + "loss": 2.902, + "theoretical_loss": 3.6270114632945347, + "tokens_seen": 1066570752 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034181544633901705, + "loss": 2.8537, + "theoretical_loss": 3.626990506791776, + "tokens_seen": 1066636288 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034180541624874623, + "loss": 2.7764, + "theoretical_loss": 3.6269695519370853, + "tokens_seen": 1066701824 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034179538615847547, + "loss": 2.7787, + "theoretical_loss": 3.626948598730232, + "tokens_seen": 1066767360 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003417853560682046, + "loss": 2.8509, + "theoretical_loss": 3.626927647170987, + "tokens_seen": 1066832896 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034177532597793383, + "loss": 2.8448, + "theoretical_loss": 3.626906697259117, + "tokens_seen": 1066898432 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034176529588766295, + "loss": 2.8246, + "theoretical_loss": 3.6268857489943933, + "tokens_seen": 1066963968 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003417552657973922, + "loss": 2.7853, + "theoretical_loss": 3.6268648023765837, + "tokens_seen": 1067029504 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034174523570712137, + "loss": 2.8577, + "theoretical_loss": 3.6268438574054587, + "tokens_seen": 1067095040 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034173520561685055, + "loss": 2.8507, + "theoretical_loss": 3.626822914080787, + "tokens_seen": 1067160576 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034172517552657973, + "loss": 2.8388, + "theoretical_loss": 3.6268019724023386, + "tokens_seen": 1067226112 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003417151454363089, + "loss": 2.8237, + "theoretical_loss": 3.6267810323698826, + "tokens_seen": 1067291648 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003417051153460381, + "loss": 2.8452, + "theoretical_loss": 3.6267600939831888, + "tokens_seen": 1067357184 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034169508525576733, + "loss": 2.8118, + "theoretical_loss": 3.626739157242027, + "tokens_seen": 1067422720 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034168505516549646, + "loss": 2.8802, + "theoretical_loss": 3.6267182221461667, + "tokens_seen": 1067488256 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003416750250752257, + "loss": 2.8095, + "theoretical_loss": 3.626697288695377, + "tokens_seen": 1067553792 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003416649949849548, + "loss": 2.7338, + "theoretical_loss": 3.6266763568894285, + "tokens_seen": 1067619328 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034165496489468406, + "loss": 2.8212, + "theoretical_loss": 3.6266554267280906, + "tokens_seen": 1067684864 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034164493480441324, + "loss": 2.8053, + "theoretical_loss": 3.6266344982111334, + "tokens_seen": 1067750400 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003416349047141424, + "loss": 2.8555, + "theoretical_loss": 3.6266135713383267, + "tokens_seen": 1067815936 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003416248746238716, + "loss": 2.814, + "theoretical_loss": 3.62659264610944, + "tokens_seen": 1067881472 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034161484453360083, + "loss": 2.7473, + "theoretical_loss": 3.626571722524244, + "tokens_seen": 1067947008 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034160481444332996, + "loss": 2.7427, + "theoretical_loss": 3.6265508005825087, + "tokens_seen": 1068012544 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003415947843530592, + "loss": 2.7828, + "theoretical_loss": 3.626529880284003, + "tokens_seen": 1068078080 + }, + { + "epoch": 13.0, + "objective/train/docs_used": 2542566, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9877328872680664, + "objective/train/theoretical_loss": 3.626508961628499, + "objective/train/tokens_used": 1088603616, + "theoretical_loss": 3.626508961628499, + "tokens_seen": 1068143616 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003415847542627884, + "loss": 2.863, + "theoretical_loss": 3.626508961628499, + "tokens_seen": 1068143616 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034157472417251756, + "loss": 2.8086, + "theoretical_loss": 3.6264880446157646, + "tokens_seen": 1068209152 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003415646940822468, + "loss": 2.8322, + "theoretical_loss": 3.626467129245572, + "tokens_seen": 1068274688 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003415546639919759, + "loss": 2.9093, + "theoretical_loss": 3.6264462155176904, + "tokens_seen": 1068340224 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034154463390170516, + "loss": 2.8821, + "theoretical_loss": 3.6264253034318905, + "tokens_seen": 1068405760 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003415346038114343, + "loss": 2.7926, + "theoretical_loss": 3.626404392987942, + "tokens_seen": 1068471296 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003415245737211635, + "loss": 2.7377, + "theoretical_loss": 3.626383484185616, + "tokens_seen": 1068536832 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003415145436308927, + "loss": 2.8152, + "theoretical_loss": 3.6263625770246835, + "tokens_seen": 1068602368 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003415045135406219, + "loss": 2.8562, + "theoretical_loss": 3.626341671504913, + "tokens_seen": 1068667904 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034149448345035106, + "loss": 2.8799, + "theoretical_loss": 3.626320767626077, + "tokens_seen": 1068733440 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003414844533600803, + "loss": 2.8281, + "theoretical_loss": 3.6262998653879457, + "tokens_seen": 1068798976 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003414744232698094, + "loss": 2.8603, + "theoretical_loss": 3.626278964790289, + "tokens_seen": 1068864512 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034146439317953866, + "loss": 2.8189, + "theoretical_loss": 3.6262580658328782, + "tokens_seen": 1068930048 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003414543630892678, + "loss": 2.8065, + "theoretical_loss": 3.626237168515484, + "tokens_seen": 1068995584 + }, + { + "epoch": 13.0, + "learning_rate": 0.000341444332998997, + "loss": 2.8839, + "theoretical_loss": 3.626216272837876, + "tokens_seen": 1069061120 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003414343029087262, + "loss": 2.8694, + "theoretical_loss": 3.626195378799827, + "tokens_seen": 1069126656 + }, + { + "epoch": 13.0, + "learning_rate": 0.0003414242728184554, + "loss": 2.8025, + "theoretical_loss": 3.6261744864011067, + "tokens_seen": 1069192192 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034141424272818456, + "loss": 2.79, + "theoretical_loss": 3.626153595641486, + "tokens_seen": 1069257728 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034140421263791375, + "loss": 2.7347, + "theoretical_loss": 3.6261327065207363, + "tokens_seen": 1069323264 + }, + { + "epoch": 13.0, + "learning_rate": 0.00034139418254764293, + "loss": 2.8674, + "theoretical_loss": 3.626111819038628, + "tokens_seen": 1069388800 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034138415245737216, + "loss": 2.8227, + "theoretical_loss": 3.6260909331949325, + "tokens_seen": 1069454336 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003413741223671013, + "loss": 2.8199, + "theoretical_loss": 3.626070048989421, + "tokens_seen": 1069519872 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003413640922768305, + "loss": 2.8906, + "theoretical_loss": 3.6260491664218644, + "tokens_seen": 1069585408 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034135406218655965, + "loss": 2.7624, + "theoretical_loss": 3.626028285492034, + "tokens_seen": 1069650944 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003413440320962889, + "loss": 2.8047, + "theoretical_loss": 3.626007406199701, + "tokens_seen": 1069716480 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2545552, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9821791648864746, + "objective/train/theoretical_loss": 3.6259865285446367, + "objective/train/tokens_used": 1090242016, + "theoretical_loss": 3.6259865285446367, + "tokens_seen": 1069782016 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034133400200601807, + "loss": 2.9061, + "theoretical_loss": 3.6259865285446367, + "tokens_seen": 1069782016 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034132397191574725, + "loss": 2.809, + "theoretical_loss": 3.6259656525266126, + "tokens_seen": 1069847552 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034131394182547643, + "loss": 2.8625, + "theoretical_loss": 3.6259447781453997, + "tokens_seen": 1069913088 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034130391173520567, + "loss": 2.8567, + "theoretical_loss": 3.6259239054007697, + "tokens_seen": 1069978624 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003412938816449348, + "loss": 2.9093, + "theoretical_loss": 3.6259030342924943, + "tokens_seen": 1070044160 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034128385155466403, + "loss": 2.7974, + "theoretical_loss": 3.625882164820344, + "tokens_seen": 1070109696 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034127382146439315, + "loss": 2.7869, + "theoretical_loss": 3.625861296984091, + "tokens_seen": 1070175232 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003412637913741224, + "loss": 2.8276, + "theoretical_loss": 3.6258404307835077, + "tokens_seen": 1070240768 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034125376128385157, + "loss": 2.9018, + "theoretical_loss": 3.6258195662183645, + "tokens_seen": 1070306304 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034124373119358075, + "loss": 2.7987, + "theoretical_loss": 3.625798703288434, + "tokens_seen": 1070371840 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034123370110330993, + "loss": 2.8018, + "theoretical_loss": 3.625777841993487, + "tokens_seen": 1070437376 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003412236710130391, + "loss": 2.8374, + "theoretical_loss": 3.625756982333296, + "tokens_seen": 1070502912 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003412136409227683, + "loss": 2.8313, + "theoretical_loss": 3.6257361243076325, + "tokens_seen": 1070568448 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034120361083249753, + "loss": 2.7947, + "theoretical_loss": 3.6257152679162683, + "tokens_seen": 1070633984 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034119358074222666, + "loss": 2.9014, + "theoretical_loss": 3.6256944131589757, + "tokens_seen": 1070699520 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003411835506519559, + "loss": 2.8114, + "theoretical_loss": 3.6256735600355268, + "tokens_seen": 1070765056 + }, + { + "epoch": 13.01, + "learning_rate": 0.000341173520561685, + "loss": 2.8244, + "theoretical_loss": 3.625652708545693, + "tokens_seen": 1070830592 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034116349047141426, + "loss": 2.808, + "theoretical_loss": 3.625631858689247, + "tokens_seen": 1070896128 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034115346038114344, + "loss": 2.7596, + "theoretical_loss": 3.62561101046596, + "tokens_seen": 1070961664 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003411434302908726, + "loss": 2.923, + "theoretical_loss": 3.6255901638756045, + "tokens_seen": 1071027200 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003411334002006018, + "loss": 2.7482, + "theoretical_loss": 3.625569318917954, + "tokens_seen": 1071092736 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034112337011033103, + "loss": 2.8317, + "theoretical_loss": 3.6255484755927787, + "tokens_seen": 1071158272 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034111334002006016, + "loss": 2.8846, + "theoretical_loss": 3.6255276338998517, + "tokens_seen": 1071223808 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003411033099297894, + "loss": 2.907, + "theoretical_loss": 3.6255067938389454, + "tokens_seen": 1071289344 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003410932798395185, + "loss": 2.8322, + "theoretical_loss": 3.625485955409833, + "tokens_seen": 1071354880 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2549282, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6766340732574463, + "objective/train/theoretical_loss": 3.6254651186122855, + "objective/train/tokens_used": 1091880416, + "theoretical_loss": 3.6254651186122855, + "tokens_seen": 1071420416 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034108324974924776, + "loss": 2.8199, + "theoretical_loss": 3.6254651186122855, + "tokens_seen": 1071420416 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034107321965897694, + "loss": 2.9112, + "theoretical_loss": 3.6254442834460763, + "tokens_seen": 1071485952 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003410631895687061, + "loss": 2.7497, + "theoretical_loss": 3.6254234499109774, + "tokens_seen": 1071551488 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003410531594784353, + "loss": 2.8826, + "theoretical_loss": 3.6254026180067616, + "tokens_seen": 1071617024 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003410431293881645, + "loss": 2.9667, + "theoretical_loss": 3.6253817877332013, + "tokens_seen": 1071682560 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034103309929789366, + "loss": 2.8356, + "theoretical_loss": 3.6253609590900697, + "tokens_seen": 1071748096 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003410230692076229, + "loss": 2.9363, + "theoretical_loss": 3.625340132077139, + "tokens_seen": 1071813632 + }, + { + "epoch": 13.01, + "learning_rate": 0.000341013039117352, + "loss": 2.8286, + "theoretical_loss": 3.6253193066941822, + "tokens_seen": 1071879168 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034100300902708126, + "loss": 2.7759, + "theoretical_loss": 3.6252984829409716, + "tokens_seen": 1071944704 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034099297893681044, + "loss": 2.8505, + "theoretical_loss": 3.625277660817281, + "tokens_seen": 1072010240 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003409829488465396, + "loss": 2.8329, + "theoretical_loss": 3.625256840322882, + "tokens_seen": 1072075776 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003409729187562688, + "loss": 2.8274, + "theoretical_loss": 3.6252360214575488, + "tokens_seen": 1072141312 + }, + { + "epoch": 13.01, + "learning_rate": 0.000340962888665998, + "loss": 2.7781, + "theoretical_loss": 3.6252152042210533, + "tokens_seen": 1072206848 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034095285857572717, + "loss": 2.7872, + "theoretical_loss": 3.6251943886131692, + "tokens_seen": 1072272384 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003409428284854564, + "loss": 2.831, + "theoretical_loss": 3.6251735746336693, + "tokens_seen": 1072337920 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034093279839518553, + "loss": 2.7899, + "theoretical_loss": 3.625152762282327, + "tokens_seen": 1072403456 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034092276830491477, + "loss": 2.8499, + "theoretical_loss": 3.625131951558915, + "tokens_seen": 1072468992 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003409127382146439, + "loss": 2.7554, + "theoretical_loss": 3.625111142463206, + "tokens_seen": 1072534528 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034090270812437313, + "loss": 2.8602, + "theoretical_loss": 3.6250903349949746, + "tokens_seen": 1072600064 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003408926780341023, + "loss": 2.8153, + "theoretical_loss": 3.6250695291539934, + "tokens_seen": 1072665600 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003408826479438315, + "loss": 2.844, + "theoretical_loss": 3.6250487249400356, + "tokens_seen": 1072731136 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034087261785356067, + "loss": 2.9094, + "theoretical_loss": 3.625027922352875, + "tokens_seen": 1072796672 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034086258776328985, + "loss": 2.8732, + "theoretical_loss": 3.6250071213922848, + "tokens_seen": 1072862208 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034085255767301903, + "loss": 2.8179, + "theoretical_loss": 3.624986322058038, + "tokens_seen": 1072927744 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034084252758274827, + "loss": 2.8727, + "theoretical_loss": 3.624965524349909, + "tokens_seen": 1072993280 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2554192, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.787851333618164, + "objective/train/theoretical_loss": 3.6249447282676703, + "objective/train/tokens_used": 1093518816, + "theoretical_loss": 3.6249447282676703, + "tokens_seen": 1073058816 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034083249749247745, + "loss": 2.8747, + "theoretical_loss": 3.6249447282676703, + "tokens_seen": 1073058816 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034082246740220663, + "loss": 2.7957, + "theoretical_loss": 3.624923933811097, + "tokens_seen": 1073124352 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034081243731193587, + "loss": 2.6777, + "theoretical_loss": 3.6249031409799612, + "tokens_seen": 1073189888 + }, + { + "epoch": 13.01, + "learning_rate": 0.000340802407221665, + "loss": 2.8834, + "theoretical_loss": 3.6248823497740377, + "tokens_seen": 1073255424 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034079237713139423, + "loss": 2.7456, + "theoretical_loss": 3.624861560193099, + "tokens_seen": 1073320960 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034078234704112335, + "loss": 2.6805, + "theoretical_loss": 3.6248407722369205, + "tokens_seen": 1073386496 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003407723169508526, + "loss": 2.7515, + "theoretical_loss": 3.624819985905275, + "tokens_seen": 1073452032 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034076228686058177, + "loss": 2.8598, + "theoretical_loss": 3.6247992011979364, + "tokens_seen": 1073517568 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034075225677031095, + "loss": 2.8165, + "theoretical_loss": 3.624778418114679, + "tokens_seen": 1073583104 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034074222668004013, + "loss": 2.8371, + "theoretical_loss": 3.6247576366552767, + "tokens_seen": 1073648640 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003407321965897693, + "loss": 2.9261, + "theoretical_loss": 3.624736856819503, + "tokens_seen": 1073714176 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003407221664994985, + "loss": 2.9662, + "theoretical_loss": 3.6247160786071326, + "tokens_seen": 1073779712 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034071213640922773, + "loss": 2.8963, + "theoretical_loss": 3.6246953020179395, + "tokens_seen": 1073845248 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034070210631895686, + "loss": 2.8048, + "theoretical_loss": 3.624674527051697, + "tokens_seen": 1073910784 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003406920762286861, + "loss": 2.8279, + "theoretical_loss": 3.6246537537081807, + "tokens_seen": 1073976320 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003406820461384152, + "loss": 2.9052, + "theoretical_loss": 3.624632981987164, + "tokens_seen": 1074041856 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034067201604814446, + "loss": 2.9001, + "theoretical_loss": 3.6246122118884214, + "tokens_seen": 1074107392 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034066198595787364, + "loss": 2.8825, + "theoretical_loss": 3.6245914434117275, + "tokens_seen": 1074172928 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003406519558676028, + "loss": 2.8404, + "theoretical_loss": 3.6245706765568557, + "tokens_seen": 1074238464 + }, + { + "epoch": 13.01, + "learning_rate": 0.000340641925777332, + "loss": 2.7435, + "theoretical_loss": 3.624549911323581, + "tokens_seen": 1074304000 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034063189568706123, + "loss": 2.9415, + "theoretical_loss": 3.6245291477116783, + "tokens_seen": 1074369536 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034062186559679036, + "loss": 2.8764, + "theoretical_loss": 3.6245083857209215, + "tokens_seen": 1074435072 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003406118355065196, + "loss": 2.7723, + "theoretical_loss": 3.624487625351086, + "tokens_seen": 1074500608 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003406018054162487, + "loss": 2.8328, + "theoretical_loss": 3.624466866601945, + "tokens_seen": 1074566144 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034059177532597796, + "loss": 2.7557, + "theoretical_loss": 3.6244461094732743, + "tokens_seen": 1074631680 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2557174, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.992907762527466, + "objective/train/theoretical_loss": 3.624425353964848, + "objective/train/tokens_used": 1095157216, + "theoretical_loss": 3.624425353964848, + "tokens_seen": 1074697216 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034058174523570714, + "loss": 2.9323, + "theoretical_loss": 3.624425353964848, + "tokens_seen": 1074697216 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003405717151454363, + "loss": 2.9029, + "theoretical_loss": 3.624404600076441, + "tokens_seen": 1074762752 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003405616850551655, + "loss": 2.8405, + "theoretical_loss": 3.6243838478078283, + "tokens_seen": 1074828288 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003405516549648947, + "loss": 2.7806, + "theoretical_loss": 3.624363097158785, + "tokens_seen": 1074893824 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034054162487462386, + "loss": 2.9158, + "theoretical_loss": 3.624342348129085, + "tokens_seen": 1074959360 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003405315947843531, + "loss": 2.7704, + "theoretical_loss": 3.624321600718504, + "tokens_seen": 1075024896 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003405215646940822, + "loss": 2.8357, + "theoretical_loss": 3.6243008549268163, + "tokens_seen": 1075090432 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034051153460381146, + "loss": 2.7245, + "theoretical_loss": 3.624280110753798, + "tokens_seen": 1075155968 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034050150451354064, + "loss": 2.9048, + "theoretical_loss": 3.6242593681992226, + "tokens_seen": 1075221504 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003404914744232698, + "loss": 2.8635, + "theoretical_loss": 3.6242386272628666, + "tokens_seen": 1075287040 + }, + { + "epoch": 13.01, + "learning_rate": 0.000340481444332999, + "loss": 2.8217, + "theoretical_loss": 3.624217887944505, + "tokens_seen": 1075352576 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003404714142427282, + "loss": 2.8448, + "theoretical_loss": 3.624197150243912, + "tokens_seen": 1075418112 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034046138415245737, + "loss": 2.7879, + "theoretical_loss": 3.6241764141608632, + "tokens_seen": 1075483648 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003404513540621866, + "loss": 2.9156, + "theoretical_loss": 3.624155679695135, + "tokens_seen": 1075549184 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034044132397191573, + "loss": 2.8458, + "theoretical_loss": 3.624134946846501, + "tokens_seen": 1075614720 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034043129388164497, + "loss": 2.8781, + "theoretical_loss": 3.6241142156147377, + "tokens_seen": 1075680256 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003404212637913741, + "loss": 2.8717, + "theoretical_loss": 3.6240934859996203, + "tokens_seen": 1075745792 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034041123370110333, + "loss": 2.7868, + "theoretical_loss": 3.6240727580009238, + "tokens_seen": 1075811328 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003404012036108325, + "loss": 2.8813, + "theoretical_loss": 3.624052031618424, + "tokens_seen": 1075876864 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003403911735205617, + "loss": 2.7019, + "theoretical_loss": 3.6240313068518972, + "tokens_seen": 1075942400 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034038114343029087, + "loss": 2.8238, + "theoretical_loss": 3.624010583701118, + "tokens_seen": 1076007936 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034037111334002005, + "loss": 2.7902, + "theoretical_loss": 3.6239898621658617, + "tokens_seen": 1076073472 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034036108324974923, + "loss": 2.8069, + "theoretical_loss": 3.623969142245905, + "tokens_seen": 1076139008 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034035105315947847, + "loss": 2.7995, + "theoretical_loss": 3.623948423941023, + "tokens_seen": 1076204544 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003403410230692076, + "loss": 2.8626, + "theoretical_loss": 3.623927707250992, + "tokens_seen": 1076270080 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2562084, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.657792091369629, + "objective/train/theoretical_loss": 3.6239069921755878, + "objective/train/tokens_used": 1096795616, + "theoretical_loss": 3.6239069921755878, + "tokens_seen": 1076335616 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034033099297893683, + "loss": 2.8248, + "theoretical_loss": 3.6239069921755878, + "tokens_seen": 1076335616 + }, + { + "epoch": 13.01, + "learning_rate": 0.000340320962888666, + "loss": 2.8414, + "theoretical_loss": 3.6238862787145854, + "tokens_seen": 1076401152 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003403109327983952, + "loss": 2.8756, + "theoretical_loss": 3.623865566867762, + "tokens_seen": 1076466688 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003403009027081244, + "loss": 2.8549, + "theoretical_loss": 3.623844856634892, + "tokens_seen": 1076532224 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034029087261785356, + "loss": 2.8593, + "theoretical_loss": 3.6238241480157525, + "tokens_seen": 1076597760 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034028084252758274, + "loss": 2.829, + "theoretical_loss": 3.6238034410101188, + "tokens_seen": 1076663296 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034027081243731197, + "loss": 2.733, + "theoretical_loss": 3.623782735617768, + "tokens_seen": 1076728832 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003402607823470411, + "loss": 2.8428, + "theoretical_loss": 3.623762031838476, + "tokens_seen": 1076794368 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034025075225677033, + "loss": 2.6857, + "theoretical_loss": 3.6237413296720176, + "tokens_seen": 1076859904 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034024072216649946, + "loss": 2.8336, + "theoretical_loss": 3.623720629118171, + "tokens_seen": 1076925440 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003402306920762287, + "loss": 2.7978, + "theoretical_loss": 3.623699930176711, + "tokens_seen": 1076990976 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003402206619859579, + "loss": 2.7682, + "theoretical_loss": 3.6236792328474143, + "tokens_seen": 1077056512 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034021063189568706, + "loss": 2.8833, + "theoretical_loss": 3.6236585371300576, + "tokens_seen": 1077122048 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034020060180541624, + "loss": 2.8195, + "theoretical_loss": 3.623637843024417, + "tokens_seen": 1077187584 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003401905717151454, + "loss": 2.7455, + "theoretical_loss": 3.623617150530269, + "tokens_seen": 1077253120 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003401805416248746, + "loss": 2.8909, + "theoretical_loss": 3.62359645964739, + "tokens_seen": 1077318656 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034017051153460384, + "loss": 2.8441, + "theoretical_loss": 3.623575770375557, + "tokens_seen": 1077384192 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034016048144433296, + "loss": 2.9433, + "theoretical_loss": 3.6235550827145455, + "tokens_seen": 1077449728 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003401504513540622, + "loss": 2.8739, + "theoretical_loss": 3.6235343966641334, + "tokens_seen": 1077515264 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003401404212637914, + "loss": 2.7624, + "theoretical_loss": 3.6235137122240966, + "tokens_seen": 1077580800 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034013039117352056, + "loss": 2.9622, + "theoretical_loss": 3.6234930293942114, + "tokens_seen": 1077646336 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034012036108324974, + "loss": 2.7783, + "theoretical_loss": 3.6234723481742557, + "tokens_seen": 1077711872 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003401103309929789, + "loss": 2.9289, + "theoretical_loss": 3.6234516685640057, + "tokens_seen": 1077777408 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003401003009027081, + "loss": 2.9069, + "theoretical_loss": 3.623430990563238, + "tokens_seen": 1077842944 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034009027081243734, + "loss": 2.8716, + "theoretical_loss": 3.62341031417173, + "tokens_seen": 1077908480 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2564982, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8452179431915283, + "objective/train/theoretical_loss": 3.6233896393892584, + "objective/train/tokens_used": 1098434016, + "theoretical_loss": 3.6233896393892584, + "tokens_seen": 1077974016 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003400802407221665, + "loss": 2.8852, + "theoretical_loss": 3.6233896393892584, + "tokens_seen": 1077974016 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003400702106318957, + "loss": 2.797, + "theoretical_loss": 3.6233689662156, + "tokens_seen": 1078039552 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003400601805416249, + "loss": 2.8457, + "theoretical_loss": 3.6233482946505315, + "tokens_seen": 1078105088 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034005015045135406, + "loss": 2.8909, + "theoretical_loss": 3.623327624693831, + "tokens_seen": 1078170624 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003400401203610833, + "loss": 2.8599, + "theoretical_loss": 3.6233069563452753, + "tokens_seen": 1078236160 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003400300902708124, + "loss": 2.8115, + "theoretical_loss": 3.623286289604641, + "tokens_seen": 1078301696 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034002006018054166, + "loss": 2.8675, + "theoretical_loss": 3.623265624471705, + "tokens_seen": 1078367232 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034001003009027084, + "loss": 2.898, + "theoretical_loss": 3.623244960946246, + "tokens_seen": 1078432768 + }, + { + "epoch": 13.01, + "learning_rate": 0.00034, + "loss": 2.8713, + "theoretical_loss": 3.6232242990280397, + "tokens_seen": 1078498304 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003399899699097292, + "loss": 2.8361, + "theoretical_loss": 3.6232036387168645, + "tokens_seen": 1078563840 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003399799398194584, + "loss": 2.87, + "theoretical_loss": 3.6231829800124973, + "tokens_seen": 1078629376 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033996990972918757, + "loss": 2.8883, + "theoretical_loss": 3.6231623229147156, + "tokens_seen": 1078694912 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003399598796389168, + "loss": 2.862, + "theoretical_loss": 3.623141667423297, + "tokens_seen": 1078760448 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033994984954864593, + "loss": 2.8901, + "theoretical_loss": 3.623121013538019, + "tokens_seen": 1078825984 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033993981945837517, + "loss": 2.7929, + "theoretical_loss": 3.6231003612586585, + "tokens_seen": 1078891520 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003399297893681043, + "loss": 2.7845, + "theoretical_loss": 3.623079710584994, + "tokens_seen": 1078957056 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033991975927783353, + "loss": 2.9135, + "theoretical_loss": 3.623059061516803, + "tokens_seen": 1079022592 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003399097291875627, + "loss": 2.8401, + "theoretical_loss": 3.6230384140538625, + "tokens_seen": 1079088128 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003398996990972919, + "loss": 2.8944, + "theoretical_loss": 3.6230177681959512, + "tokens_seen": 1079153664 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033988966900702107, + "loss": 2.9276, + "theoretical_loss": 3.622997123942846, + "tokens_seen": 1079219200 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033987963891675025, + "loss": 2.8931, + "theoretical_loss": 3.6229764812943253, + "tokens_seen": 1079284736 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033986960882647943, + "loss": 2.8034, + "theoretical_loss": 3.6229558402501665, + "tokens_seen": 1079350272 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033985957873620867, + "loss": 2.8389, + "theoretical_loss": 3.622935200810148, + "tokens_seen": 1079415808 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003398495486459378, + "loss": 2.925, + "theoretical_loss": 3.6229145629740476, + "tokens_seen": 1079481344 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033983951855566703, + "loss": 2.8546, + "theoretical_loss": 3.622893926741643, + "tokens_seen": 1079546880 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2568779, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8891396522521973, + "objective/train/theoretical_loss": 3.6228732921127125, + "objective/train/tokens_used": 1100072416, + "theoretical_loss": 3.6228732921127125, + "tokens_seen": 1079612416 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003398294884653962, + "loss": 2.9084, + "theoretical_loss": 3.6228732921127125, + "tokens_seen": 1079612416 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003398194583751254, + "loss": 2.8309, + "theoretical_loss": 3.622852659087034, + "tokens_seen": 1079677952 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003398094282848546, + "loss": 2.6931, + "theoretical_loss": 3.6228320276643857, + "tokens_seen": 1079743488 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033979939819458376, + "loss": 2.7397, + "theoretical_loss": 3.622811397844546, + "tokens_seen": 1079809024 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033978936810431294, + "loss": 2.9278, + "theoretical_loss": 3.6227907696272927, + "tokens_seen": 1079874560 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033977933801404217, + "loss": 2.87, + "theoretical_loss": 3.6227701430124046, + "tokens_seen": 1079940096 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003397693079237713, + "loss": 2.8967, + "theoretical_loss": 3.6227495179996594, + "tokens_seen": 1080005632 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033975927783350053, + "loss": 2.8533, + "theoretical_loss": 3.622728894588836, + "tokens_seen": 1080071168 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033974924774322966, + "loss": 2.842, + "theoretical_loss": 3.622708272779712, + "tokens_seen": 1080136704 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003397392176529589, + "loss": 2.8357, + "theoretical_loss": 3.622687652572067, + "tokens_seen": 1080202240 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003397291875626881, + "loss": 2.8592, + "theoretical_loss": 3.6226670339656786, + "tokens_seen": 1080267776 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033971915747241726, + "loss": 2.8483, + "theoretical_loss": 3.6226464169603254, + "tokens_seen": 1080333312 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033970912738214644, + "loss": 2.8886, + "theoretical_loss": 3.6226258015557864, + "tokens_seen": 1080398848 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003396990972918756, + "loss": 2.8067, + "theoretical_loss": 3.62260518775184, + "tokens_seen": 1080464384 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003396890672016048, + "loss": 2.8303, + "theoretical_loss": 3.6225845755482644, + "tokens_seen": 1080529920 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033967903711133404, + "loss": 2.8758, + "theoretical_loss": 3.622563964944839, + "tokens_seen": 1080595456 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033966900702106316, + "loss": 2.8551, + "theoretical_loss": 3.622543355941342, + "tokens_seen": 1080660992 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003396589769307924, + "loss": 2.8479, + "theoretical_loss": 3.622522748537553, + "tokens_seen": 1080726528 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003396489468405216, + "loss": 2.9276, + "theoretical_loss": 3.6225021427332496, + "tokens_seen": 1080792064 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033963891675025076, + "loss": 2.8683, + "theoretical_loss": 3.6224815385282114, + "tokens_seen": 1080857600 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033962888665997994, + "loss": 2.8121, + "theoretical_loss": 3.6224609359222177, + "tokens_seen": 1080923136 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003396188565697091, + "loss": 2.6929, + "theoretical_loss": 3.622440334915047, + "tokens_seen": 1080988672 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003396088264794383, + "loss": 2.7855, + "theoretical_loss": 3.622419735506478, + "tokens_seen": 1081054208 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033959879638916754, + "loss": 2.8204, + "theoretical_loss": 3.62239913769629, + "tokens_seen": 1081119744 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033958876629889667, + "loss": 2.9458, + "theoretical_loss": 3.622378541484262, + "tokens_seen": 1081185280 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2573574, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7223801612854004, + "objective/train/theoretical_loss": 3.6223579468701743, + "objective/train/tokens_used": 1101710816, + "theoretical_loss": 3.6223579468701743, + "tokens_seen": 1081250816 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003395787362086259, + "loss": 2.6798, + "theoretical_loss": 3.6223579468701743, + "tokens_seen": 1081250816 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033956870611835503, + "loss": 2.8118, + "theoretical_loss": 3.6223373538538044, + "tokens_seen": 1081316352 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033955867602808426, + "loss": 2.7199, + "theoretical_loss": 3.622316762434932, + "tokens_seen": 1081381888 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033954864593781345, + "loss": 2.8501, + "theoretical_loss": 3.6222961726133374, + "tokens_seen": 1081447424 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003395386158475426, + "loss": 2.8945, + "theoretical_loss": 3.6222755843887984, + "tokens_seen": 1081512960 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003395285857572718, + "loss": 2.8767, + "theoretical_loss": 3.622254997761095, + "tokens_seen": 1081578496 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033951855566700104, + "loss": 2.864, + "theoretical_loss": 3.6222344127300072, + "tokens_seen": 1081644032 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033950852557673017, + "loss": 2.7993, + "theoretical_loss": 3.6222138292953137, + "tokens_seen": 1081709568 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003394984954864594, + "loss": 2.8112, + "theoretical_loss": 3.6221932474567944, + "tokens_seen": 1081775104 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033948846539618853, + "loss": 2.8832, + "theoretical_loss": 3.6221726672142283, + "tokens_seen": 1081840640 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033947843530591777, + "loss": 2.8236, + "theoretical_loss": 3.6221520885673955, + "tokens_seen": 1081906176 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033946840521564695, + "loss": 2.7251, + "theoretical_loss": 3.6221315115160753, + "tokens_seen": 1081971712 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033945837512537613, + "loss": 2.7382, + "theoretical_loss": 3.622110936060048, + "tokens_seen": 1082037248 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003394483450351053, + "loss": 2.9082, + "theoretical_loss": 3.622090362199092, + "tokens_seen": 1082102784 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003394383149448345, + "loss": 2.7779, + "theoretical_loss": 3.6220697899329886, + "tokens_seen": 1082168320 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003394282848545637, + "loss": 2.826, + "theoretical_loss": 3.6220492192615166, + "tokens_seen": 1082233856 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003394182547642929, + "loss": 2.8749, + "theoretical_loss": 3.6220286501844563, + "tokens_seen": 1082299392 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033940822467402204, + "loss": 2.83, + "theoretical_loss": 3.622008082701587, + "tokens_seen": 1082364928 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033939819458375127, + "loss": 2.8353, + "theoretical_loss": 3.6219875168126894, + "tokens_seen": 1082430464 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003393881644934804, + "loss": 2.9746, + "theoretical_loss": 3.6219669525175426, + "tokens_seen": 1082496000 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033937813440320963, + "loss": 2.7231, + "theoretical_loss": 3.6219463898159274, + "tokens_seen": 1082561536 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003393681043129388, + "loss": 2.8871, + "theoretical_loss": 3.621925828707624, + "tokens_seen": 1082627072 + }, + { + "epoch": 13.01, + "learning_rate": 0.000339358074222668, + "loss": 2.9016, + "theoretical_loss": 3.6219052691924114, + "tokens_seen": 1082692608 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003393480441323972, + "loss": 2.8658, + "theoretical_loss": 3.621884711270071, + "tokens_seen": 1082758144 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003393380140421264, + "loss": 2.8532, + "theoretical_loss": 3.621864154940382, + "tokens_seen": 1082823680 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2576542, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.690615653991699, + "objective/train/theoretical_loss": 3.6218436002031256, + "objective/train/tokens_used": 1103349216, + "theoretical_loss": 3.6218436002031256, + "tokens_seen": 1082889216 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003393279839518556, + "loss": 2.8082, + "theoretical_loss": 3.6218436002031256, + "tokens_seen": 1082889216 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003393179538615848, + "loss": 2.9466, + "theoretical_loss": 3.621823047058081, + "tokens_seen": 1082954752 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033930792377131396, + "loss": 2.8484, + "theoretical_loss": 3.6218024955050288, + "tokens_seen": 1083020288 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033929789368104314, + "loss": 2.8163, + "theoretical_loss": 3.62178194554375, + "tokens_seen": 1083085824 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033928786359077237, + "loss": 2.7691, + "theoretical_loss": 3.6217613971740246, + "tokens_seen": 1083151360 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003392778335005015, + "loss": 2.896, + "theoretical_loss": 3.621740850395633, + "tokens_seen": 1083216896 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033926780341023073, + "loss": 2.8237, + "theoretical_loss": 3.6217203052083558, + "tokens_seen": 1083282432 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033925777331995986, + "loss": 2.8607, + "theoretical_loss": 3.621699761611974, + "tokens_seen": 1083347968 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003392477432296891, + "loss": 2.8634, + "theoretical_loss": 3.621679219606267, + "tokens_seen": 1083413504 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003392377131394183, + "loss": 2.9723, + "theoretical_loss": 3.6216586791910164, + "tokens_seen": 1083479040 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033922768304914746, + "loss": 2.8823, + "theoretical_loss": 3.6216381403660023, + "tokens_seen": 1083544576 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033921765295887664, + "loss": 2.859, + "theoretical_loss": 3.6216176031310057, + "tokens_seen": 1083610112 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003392076228686058, + "loss": 2.8711, + "theoretical_loss": 3.6215970674858076, + "tokens_seen": 1083675648 + }, + { + "epoch": 13.01, + "learning_rate": 0.000339197592778335, + "loss": 2.7896, + "theoretical_loss": 3.621576533430188, + "tokens_seen": 1083741184 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033918756268806424, + "loss": 2.9095, + "theoretical_loss": 3.6215560009639294, + "tokens_seen": 1083806720 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033917753259779336, + "loss": 2.8332, + "theoretical_loss": 3.6215354700868105, + "tokens_seen": 1083872256 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003391675025075226, + "loss": 2.8739, + "theoretical_loss": 3.6215149407986136, + "tokens_seen": 1083937792 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003391574724172518, + "loss": 2.8609, + "theoretical_loss": 3.6214944130991196, + "tokens_seen": 1084003328 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033914744232698096, + "loss": 2.8929, + "theoretical_loss": 3.6214738869881096, + "tokens_seen": 1084068864 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033913741223671014, + "loss": 2.8685, + "theoretical_loss": 3.6214533624653638, + "tokens_seen": 1084134400 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003391273821464393, + "loss": 2.933, + "theoretical_loss": 3.621432839530664, + "tokens_seen": 1084199936 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003391173520561685, + "loss": 2.8565, + "theoretical_loss": 3.621412318183791, + "tokens_seen": 1084265472 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033910732196589774, + "loss": 2.9172, + "theoretical_loss": 3.6213917984245265, + "tokens_seen": 1084331008 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033909729187562687, + "loss": 2.9004, + "theoretical_loss": 3.6213712802526516, + "tokens_seen": 1084396544 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003390872617853561, + "loss": 2.8173, + "theoretical_loss": 3.621350763667947, + "tokens_seen": 1084462080 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2581588, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.499856472015381, + "objective/train/theoretical_loss": 3.621330248670195, + "objective/train/tokens_used": 1104987616, + "theoretical_loss": 3.621330248670195, + "tokens_seen": 1084527616 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033907723169508523, + "loss": 2.7205, + "theoretical_loss": 3.621330248670195, + "tokens_seen": 1084527616 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033906720160481446, + "loss": 2.9306, + "theoretical_loss": 3.621309735259176, + "tokens_seen": 1084593152 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033905717151454365, + "loss": 2.9273, + "theoretical_loss": 3.6212892234346716, + "tokens_seen": 1084658688 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033904714142427283, + "loss": 2.8719, + "theoretical_loss": 3.621268713196464, + "tokens_seen": 1084724224 + }, + { + "epoch": 13.01, + "learning_rate": 0.000339037111334002, + "loss": 2.7776, + "theoretical_loss": 3.6212482045443335, + "tokens_seen": 1084789760 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033902708124373124, + "loss": 2.882, + "theoretical_loss": 3.621227697478063, + "tokens_seen": 1084855296 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033901705115346037, + "loss": 2.9033, + "theoretical_loss": 3.621207191997433, + "tokens_seen": 1084920832 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003390070210631896, + "loss": 2.8738, + "theoretical_loss": 3.621186688102225, + "tokens_seen": 1084986368 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033899699097291873, + "loss": 2.8632, + "theoretical_loss": 3.621166185792222, + "tokens_seen": 1085051904 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033898696088264797, + "loss": 2.7696, + "theoretical_loss": 3.6211456850672046, + "tokens_seen": 1085117440 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033897693079237715, + "loss": 2.9471, + "theoretical_loss": 3.621125185926955, + "tokens_seen": 1085182976 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033896690070210633, + "loss": 2.9154, + "theoretical_loss": 3.621104688371254, + "tokens_seen": 1085248512 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003389568706118355, + "loss": 2.918, + "theoretical_loss": 3.621084192399885, + "tokens_seen": 1085314048 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003389468405215647, + "loss": 2.8159, + "theoretical_loss": 3.6210636980126294, + "tokens_seen": 1085379584 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003389368104312939, + "loss": 2.901, + "theoretical_loss": 3.6210432052092685, + "tokens_seen": 1085445120 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003389267803410231, + "loss": 2.8396, + "theoretical_loss": 3.6210227139895848, + "tokens_seen": 1085510656 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033891675025075224, + "loss": 2.708, + "theoretical_loss": 3.6210022243533597, + "tokens_seen": 1085576192 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033890672016048147, + "loss": 2.8344, + "theoretical_loss": 3.620981736300376, + "tokens_seen": 1085641728 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003388966900702106, + "loss": 2.8815, + "theoretical_loss": 3.6209612498304153, + "tokens_seen": 1085707264 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033888665997993983, + "loss": 2.9401, + "theoretical_loss": 3.6209407649432603, + "tokens_seen": 1085772800 + }, + { + "epoch": 13.01, + "learning_rate": 0.000338876629889669, + "loss": 2.8015, + "theoretical_loss": 3.6209202816386927, + "tokens_seen": 1085838336 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003388665997993982, + "loss": 2.8861, + "theoretical_loss": 3.6208997999164945, + "tokens_seen": 1085903872 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003388565697091274, + "loss": 2.8282, + "theoretical_loss": 3.6208793197764493, + "tokens_seen": 1085969408 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003388465396188566, + "loss": 2.8804, + "theoretical_loss": 3.620858841218338, + "tokens_seen": 1086034944 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033883650952858574, + "loss": 2.9164, + "theoretical_loss": 3.6208383642419433, + "tokens_seen": 1086100480 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2584518, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.865025043487549, + "objective/train/theoretical_loss": 3.6208178888470473, + "objective/train/tokens_used": 1106626016, + "theoretical_loss": 3.6208178888470473, + "tokens_seen": 1086166016 + }, + { + "epoch": 13.01, + "learning_rate": 0.000338826479438315, + "loss": 2.8096, + "theoretical_loss": 3.6208178888470473, + "tokens_seen": 1086166016 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003388164493480441, + "loss": 2.8967, + "theoretical_loss": 3.6207974150334334, + "tokens_seen": 1086231552 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033880641925777334, + "loss": 2.9238, + "theoretical_loss": 3.6207769428008834, + "tokens_seen": 1086297088 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003387963891675025, + "loss": 2.8695, + "theoretical_loss": 3.62075647214918, + "tokens_seen": 1086362624 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003387863590772317, + "loss": 2.836, + "theoretical_loss": 3.6207360030781057, + "tokens_seen": 1086428160 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003387763289869609, + "loss": 2.8765, + "theoretical_loss": 3.6207155355874434, + "tokens_seen": 1086493696 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033876629889669006, + "loss": 2.8671, + "theoretical_loss": 3.620695069676975, + "tokens_seen": 1086559232 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033875626880641924, + "loss": 2.801, + "theoretical_loss": 3.620674605346484, + "tokens_seen": 1086624768 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003387462387161485, + "loss": 2.7398, + "theoretical_loss": 3.620654142595753, + "tokens_seen": 1086690304 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003387362086258776, + "loss": 2.7467, + "theoretical_loss": 3.6206336814245645, + "tokens_seen": 1086755840 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033872617853560684, + "loss": 2.8071, + "theoretical_loss": 3.6206132218327016, + "tokens_seen": 1086821376 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033871614844533597, + "loss": 2.8537, + "theoretical_loss": 3.620592763819947, + "tokens_seen": 1086886912 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003387061183550652, + "loss": 2.8921, + "theoretical_loss": 3.620572307386084, + "tokens_seen": 1086952448 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003386960882647944, + "loss": 2.8699, + "theoretical_loss": 3.6205518525308946, + "tokens_seen": 1087017984 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033868605817452356, + "loss": 2.78, + "theoretical_loss": 3.620531399254163, + "tokens_seen": 1087083520 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033867602808425275, + "loss": 2.9982, + "theoretical_loss": 3.6205109475556716, + "tokens_seen": 1087149056 + }, + { + "epoch": 13.01, + "learning_rate": 0.000338665997993982, + "loss": 2.8655, + "theoretical_loss": 3.6204904974352035, + "tokens_seen": 1087214592 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003386559679037111, + "loss": 2.7713, + "theoretical_loss": 3.6204700488925416, + "tokens_seen": 1087280128 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033864593781344034, + "loss": 2.9039, + "theoretical_loss": 3.6204496019274703, + "tokens_seen": 1087345664 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033863590772316947, + "loss": 2.8455, + "theoretical_loss": 3.620429156539771, + "tokens_seen": 1087411200 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003386258776328987, + "loss": 2.9128, + "theoretical_loss": 3.620408712729228, + "tokens_seen": 1087476736 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003386158475426279, + "loss": 2.9092, + "theoretical_loss": 3.6203882704956247, + "tokens_seen": 1087542272 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033860581745235707, + "loss": 2.9065, + "theoretical_loss": 3.6203678298387443, + "tokens_seen": 1087607808 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033859578736208625, + "loss": 2.8309, + "theoretical_loss": 3.62034739075837, + "tokens_seen": 1087673344 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033858575727181543, + "loss": 2.8255, + "theoretical_loss": 3.6203269532542848, + "tokens_seen": 1087738880 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2588015, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.999207019805908, + "objective/train/theoretical_loss": 3.6203065173262736, + "objective/train/tokens_used": 1108264416, + "theoretical_loss": 3.6203065173262736, + "tokens_seen": 1087804416 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033857572718154467, + "loss": 2.8524, + "theoretical_loss": 3.6203065173262736, + "tokens_seen": 1087804416 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033856569709127385, + "loss": 2.6203, + "theoretical_loss": 3.6202860829741184, + "tokens_seen": 1087869952 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033855566700100303, + "loss": 2.9081, + "theoretical_loss": 3.620265650197604, + "tokens_seen": 1087935488 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003385456369107322, + "loss": 2.7877, + "theoretical_loss": 3.620245218996513, + "tokens_seen": 1088001024 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033853560682046144, + "loss": 2.794, + "theoretical_loss": 3.620224789370629, + "tokens_seen": 1088066560 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033852557673019057, + "loss": 2.8318, + "theoretical_loss": 3.6202043613197366, + "tokens_seen": 1088132096 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003385155466399198, + "loss": 2.8609, + "theoretical_loss": 3.6201839348436193, + "tokens_seen": 1088197632 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033850551654964893, + "loss": 2.8441, + "theoretical_loss": 3.62016350994206, + "tokens_seen": 1088263168 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033849548645937817, + "loss": 2.8801, + "theoretical_loss": 3.620143086614844, + "tokens_seen": 1088328704 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033848545636910735, + "loss": 2.8678, + "theoretical_loss": 3.6201226648617535, + "tokens_seen": 1088394240 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033847542627883653, + "loss": 2.8918, + "theoretical_loss": 3.620102244682573, + "tokens_seen": 1088459776 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003384653961885657, + "loss": 2.8567, + "theoretical_loss": 3.6200818260770875, + "tokens_seen": 1088525312 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003384553660982949, + "loss": 2.7517, + "theoretical_loss": 3.62006140904508, + "tokens_seen": 1088590848 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003384453360080241, + "loss": 2.9056, + "theoretical_loss": 3.6200409935863345, + "tokens_seen": 1088656384 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003384353059177533, + "loss": 2.9367, + "theoretical_loss": 3.620020579700635, + "tokens_seen": 1088721920 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033842527582748244, + "loss": 2.9128, + "theoretical_loss": 3.620000167387766, + "tokens_seen": 1088787456 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033841524573721167, + "loss": 2.85, + "theoretical_loss": 3.6199797566475116, + "tokens_seen": 1088852992 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003384052156469408, + "loss": 2.9462, + "theoretical_loss": 3.6199593474796554, + "tokens_seen": 1088918528 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033839518555667003, + "loss": 2.8589, + "theoretical_loss": 3.6199389398839825, + "tokens_seen": 1088984064 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003383851554663992, + "loss": 2.9008, + "theoretical_loss": 3.619918533860277, + "tokens_seen": 1089049600 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003383751253761284, + "loss": 2.7837, + "theoretical_loss": 3.6198981294083232, + "tokens_seen": 1089115136 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003383650952858576, + "loss": 2.8057, + "theoretical_loss": 3.619877726527905, + "tokens_seen": 1089180672 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003383550651955868, + "loss": 2.8792, + "theoretical_loss": 3.6198573252188075, + "tokens_seen": 1089246208 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033834503510531594, + "loss": 2.8454, + "theoretical_loss": 3.6198369254808145, + "tokens_seen": 1089311744 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003383350050150452, + "loss": 2.8994, + "theoretical_loss": 3.619816527313711, + "tokens_seen": 1089377280 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2593270, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.594031810760498, + "objective/train/theoretical_loss": 3.619796130717281, + "objective/train/tokens_used": 1109902816, + "theoretical_loss": 3.619796130717281, + "tokens_seen": 1089442816 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003383249749247743, + "loss": 2.766, + "theoretical_loss": 3.619796130717281, + "tokens_seen": 1089442816 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033831494483450354, + "loss": 2.8275, + "theoretical_loss": 3.6197757356913094, + "tokens_seen": 1089508352 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003383049147442327, + "loss": 2.7488, + "theoretical_loss": 3.6197553422355813, + "tokens_seen": 1089573888 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003382948846539619, + "loss": 2.8247, + "theoretical_loss": 3.619734950349881, + "tokens_seen": 1089639424 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003382848545636911, + "loss": 2.8985, + "theoretical_loss": 3.6197145600339926, + "tokens_seen": 1089704960 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033827482447342026, + "loss": 2.9138, + "theoretical_loss": 3.6196941712877018, + "tokens_seen": 1089770496 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033826479438314944, + "loss": 2.8217, + "theoretical_loss": 3.619673784110793, + "tokens_seen": 1089836032 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003382547642928787, + "loss": 2.8076, + "theoretical_loss": 3.6196533985030506, + "tokens_seen": 1089901568 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003382447342026078, + "loss": 2.7742, + "theoretical_loss": 3.6196330144642603, + "tokens_seen": 1089967104 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033823470411233704, + "loss": 2.8509, + "theoretical_loss": 3.6196126319942064, + "tokens_seen": 1090032640 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033822467402206617, + "loss": 2.9579, + "theoretical_loss": 3.619592251092674, + "tokens_seen": 1090098176 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003382146439317954, + "loss": 2.8671, + "theoretical_loss": 3.6195718717594483, + "tokens_seen": 1090163712 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003382046138415246, + "loss": 2.9159, + "theoretical_loss": 3.6195514939943143, + "tokens_seen": 1090229248 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033819458375125376, + "loss": 2.9123, + "theoretical_loss": 3.619531117797057, + "tokens_seen": 1090294784 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033818455366098295, + "loss": 2.8759, + "theoretical_loss": 3.619510743167462, + "tokens_seen": 1090360320 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003381745235707122, + "loss": 2.8984, + "theoretical_loss": 3.6194903701053134, + "tokens_seen": 1090425856 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003381644934804413, + "loss": 2.7611, + "theoretical_loss": 3.619469998610397, + "tokens_seen": 1090491392 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033815446339017054, + "loss": 2.8981, + "theoretical_loss": 3.6194496286824984, + "tokens_seen": 1090556928 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033814443329989967, + "loss": 2.82, + "theoretical_loss": 3.619429260321403, + "tokens_seen": 1090622464 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003381344032096289, + "loss": 2.818, + "theoretical_loss": 3.619408893526895, + "tokens_seen": 1090688000 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003381243731193581, + "loss": 2.7882, + "theoretical_loss": 3.6193885282987615, + "tokens_seen": 1090753536 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033811434302908727, + "loss": 2.8428, + "theoretical_loss": 3.6193681646367866, + "tokens_seen": 1090819072 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033810431293881645, + "loss": 2.9275, + "theoretical_loss": 3.619347802540756, + "tokens_seen": 1090884608 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033809428284854563, + "loss": 2.8355, + "theoretical_loss": 3.619327442010455, + "tokens_seen": 1090950144 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003380842527582748, + "loss": 2.8739, + "theoretical_loss": 3.6193070830456704, + "tokens_seen": 1091015680 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2596133, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9173312187194824, + "objective/train/theoretical_loss": 3.6192867256461865, + "objective/train/tokens_used": 1111541216, + "theoretical_loss": 3.6192867256461865, + "tokens_seen": 1091081216 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033807422266800405, + "loss": 2.8383, + "theoretical_loss": 3.6192867256461865, + "tokens_seen": 1091081216 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003380641925777332, + "loss": 2.867, + "theoretical_loss": 3.6192663698117897, + "tokens_seen": 1091146752 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003380541624874624, + "loss": 2.7481, + "theoretical_loss": 3.619246015542265, + "tokens_seen": 1091212288 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033804413239719154, + "loss": 2.9164, + "theoretical_loss": 3.6192256628373984, + "tokens_seen": 1091277824 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033803410230692077, + "loss": 2.9088, + "theoretical_loss": 3.6192053116969767, + "tokens_seen": 1091343360 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033802407221664995, + "loss": 2.9311, + "theoretical_loss": 3.6191849621207837, + "tokens_seen": 1091408896 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033801404212637913, + "loss": 2.867, + "theoretical_loss": 3.619164614108607, + "tokens_seen": 1091474432 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003380040120361083, + "loss": 2.8866, + "theoretical_loss": 3.6191442676602312, + "tokens_seen": 1091539968 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033799398194583755, + "loss": 2.9189, + "theoretical_loss": 3.6191239227754433, + "tokens_seen": 1091605504 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003379839518555667, + "loss": 2.8147, + "theoretical_loss": 3.6191035794540287, + "tokens_seen": 1091671040 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003379739217652959, + "loss": 2.8829, + "theoretical_loss": 3.619083237695774, + "tokens_seen": 1091736576 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033796389167502504, + "loss": 2.8824, + "theoretical_loss": 3.619062897500464, + "tokens_seen": 1091802112 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003379538615847543, + "loss": 2.8967, + "theoretical_loss": 3.6190425588678865, + "tokens_seen": 1091867648 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033794383149448346, + "loss": 2.864, + "theoretical_loss": 3.6190222217978265, + "tokens_seen": 1091933184 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033793380140421264, + "loss": 2.9205, + "theoretical_loss": 3.61900188629007, + "tokens_seen": 1091998720 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003379237713139418, + "loss": 2.8217, + "theoretical_loss": 3.618981552344404, + "tokens_seen": 1092064256 + }, + { + "epoch": 13.01, + "learning_rate": 0.000337913741223671, + "loss": 2.8224, + "theoretical_loss": 3.6189612199606147, + "tokens_seen": 1092129792 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003379037111334002, + "loss": 2.8479, + "theoretical_loss": 3.6189408891384884, + "tokens_seen": 1092195328 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003378936810431294, + "loss": 2.8181, + "theoretical_loss": 3.618920559877811, + "tokens_seen": 1092260864 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033788365095285854, + "loss": 2.8567, + "theoretical_loss": 3.618900232178369, + "tokens_seen": 1092326400 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003378736208625878, + "loss": 2.8983, + "theoretical_loss": 3.6188799060399495, + "tokens_seen": 1092391936 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003378635907723169, + "loss": 2.9266, + "theoretical_loss": 3.6188595814623374, + "tokens_seen": 1092457472 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033785356068204614, + "loss": 2.7471, + "theoretical_loss": 3.6188392584453215, + "tokens_seen": 1092523008 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003378435305917753, + "loss": 2.9322, + "theoretical_loss": 3.6188189369886867, + "tokens_seen": 1092588544 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003378335005015045, + "loss": 2.8877, + "theoretical_loss": 3.61879861709222, + "tokens_seen": 1092654080 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2599073, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8063406944274902, + "objective/train/theoretical_loss": 3.6187782987557084, + "objective/train/tokens_used": 1113179616, + "theoretical_loss": 3.6187782987557084, + "tokens_seen": 1092719616 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033782347041123374, + "loss": 2.7017, + "theoretical_loss": 3.6187782987557084, + "tokens_seen": 1092719616 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003378134403209629, + "loss": 2.8049, + "theoretical_loss": 3.618757981978938, + "tokens_seen": 1092785152 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003378034102306921, + "loss": 2.8975, + "theoretical_loss": 3.6187376667616964, + "tokens_seen": 1092850688 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003377933801404213, + "loss": 2.9071, + "theoretical_loss": 3.6187173531037695, + "tokens_seen": 1092916224 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033778335005015046, + "loss": 2.7186, + "theoretical_loss": 3.6186970410049444, + "tokens_seen": 1092981760 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033777331995987964, + "loss": 2.8461, + "theoretical_loss": 3.6186767304650083, + "tokens_seen": 1093047296 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003377632898696089, + "loss": 2.8865, + "theoretical_loss": 3.618656421483748, + "tokens_seen": 1093112832 + }, + { + "epoch": 13.01, + "learning_rate": 0.000337753259779338, + "loss": 2.7677, + "theoretical_loss": 3.6186361140609495, + "tokens_seen": 1093178368 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033774322968906724, + "loss": 2.8252, + "theoretical_loss": 3.6186158081964015, + "tokens_seen": 1093243904 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033773319959879637, + "loss": 2.9112, + "theoretical_loss": 3.61859550388989, + "tokens_seen": 1093309440 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003377231695085256, + "loss": 2.9154, + "theoretical_loss": 3.6185752011412013, + "tokens_seen": 1093374976 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003377131394182548, + "loss": 2.8601, + "theoretical_loss": 3.6185548999501242, + "tokens_seen": 1093440512 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033770310932798396, + "loss": 2.932, + "theoretical_loss": 3.618534600316445, + "tokens_seen": 1093506048 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033769307923771315, + "loss": 2.8521, + "theoretical_loss": 3.6185143022399506, + "tokens_seen": 1093571584 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003376830491474424, + "loss": 2.7986, + "theoretical_loss": 3.6184940057204287, + "tokens_seen": 1093637120 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003376730190571715, + "loss": 2.8069, + "theoretical_loss": 3.6184737107576668, + "tokens_seen": 1093702656 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033766298896690074, + "loss": 2.8378, + "theoretical_loss": 3.6184534173514518, + "tokens_seen": 1093768192 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033765295887662987, + "loss": 2.8534, + "theoretical_loss": 3.618433125501571, + "tokens_seen": 1093833728 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003376429287863591, + "loss": 2.9217, + "theoretical_loss": 3.618412835207812, + "tokens_seen": 1093899264 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003376328986960883, + "loss": 2.868, + "theoretical_loss": 3.6183925464699622, + "tokens_seen": 1093964800 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033762286860581747, + "loss": 2.9202, + "theoretical_loss": 3.6183722592878094, + "tokens_seen": 1094030336 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033761283851554665, + "loss": 2.9472, + "theoretical_loss": 3.6183519736611407, + "tokens_seen": 1094095872 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033760280842527583, + "loss": 2.7921, + "theoretical_loss": 3.618331689589743, + "tokens_seen": 1094161408 + }, + { + "epoch": 13.01, + "learning_rate": 0.000337592778335005, + "loss": 2.7727, + "theoretical_loss": 3.6183114070734055, + "tokens_seen": 1094226944 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033758274824473425, + "loss": 2.7081, + "theoretical_loss": 3.6182911261119153, + "tokens_seen": 1094292480 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2602706, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.904588460922241, + "objective/train/theoretical_loss": 3.6182708467050597, + "objective/train/tokens_used": 1114818016, + "theoretical_loss": 3.6182708467050597, + "tokens_seen": 1094358016 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003375727181544634, + "loss": 2.8629, + "theoretical_loss": 3.6182708467050597, + "tokens_seen": 1094358016 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003375626880641926, + "loss": 2.8912, + "theoretical_loss": 3.618250568852626, + "tokens_seen": 1094423552 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033755265797392174, + "loss": 2.8944, + "theoretical_loss": 3.6182302925544034, + "tokens_seen": 1094489088 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033754262788365097, + "loss": 2.8375, + "theoretical_loss": 3.6182100178101786, + "tokens_seen": 1094554624 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033753259779338015, + "loss": 2.8685, + "theoretical_loss": 3.61818974461974, + "tokens_seen": 1094620160 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033752256770310933, + "loss": 2.8887, + "theoretical_loss": 3.618169472982875, + "tokens_seen": 1094685696 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003375125376128385, + "loss": 2.8903, + "theoretical_loss": 3.6181492028993723, + "tokens_seen": 1094751232 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033750250752256775, + "loss": 2.8194, + "theoretical_loss": 3.618128934369019, + "tokens_seen": 1094816768 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003374924774322969, + "loss": 2.8267, + "theoretical_loss": 3.6181086673916036, + "tokens_seen": 1094882304 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003374824473420261, + "loss": 2.8931, + "theoretical_loss": 3.618088401966914, + "tokens_seen": 1094947840 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033747241725175524, + "loss": 2.7933, + "theoretical_loss": 3.6180681380947393, + "tokens_seen": 1095013376 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003374623871614845, + "loss": 2.8747, + "theoretical_loss": 3.618047875774866, + "tokens_seen": 1095078912 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033745235707121366, + "loss": 2.975, + "theoretical_loss": 3.6180276150070836, + "tokens_seen": 1095144448 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033744232698094284, + "loss": 2.7934, + "theoretical_loss": 3.618007355791179, + "tokens_seen": 1095209984 + }, + { + "epoch": 13.01, + "learning_rate": 0.000337432296890672, + "loss": 2.7823, + "theoretical_loss": 3.6179870981269424, + "tokens_seen": 1095275520 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003374222668004012, + "loss": 2.8137, + "theoretical_loss": 3.6179668420141606, + "tokens_seen": 1095341056 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003374122367101304, + "loss": 2.8709, + "theoretical_loss": 3.6179465874526224, + "tokens_seen": 1095406592 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003374022066198596, + "loss": 2.9023, + "theoretical_loss": 3.6179263344421164, + "tokens_seen": 1095472128 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033739217652958874, + "loss": 2.8349, + "theoretical_loss": 3.6179060829824308, + "tokens_seen": 1095537664 + }, + { + "epoch": 13.01, + "learning_rate": 0.000337382146439318, + "loss": 2.8267, + "theoretical_loss": 3.617885833073354, + "tokens_seen": 1095603200 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003373721163490471, + "loss": 2.8947, + "theoretical_loss": 3.617865584714675, + "tokens_seen": 1095668736 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033736208625877634, + "loss": 2.9723, + "theoretical_loss": 3.617845337906182, + "tokens_seen": 1095734272 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003373520561685055, + "loss": 2.8616, + "theoretical_loss": 3.6178250926476636, + "tokens_seen": 1095799808 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003373420260782347, + "loss": 2.8168, + "theoretical_loss": 3.6178048489389085, + "tokens_seen": 1095865344 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003373319959879639, + "loss": 2.9291, + "theoretical_loss": 3.6177846067797055, + "tokens_seen": 1095930880 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2607646, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7668211460113525, + "objective/train/theoretical_loss": 3.617764366169843, + "objective/train/tokens_used": 1116456416, + "theoretical_loss": 3.617764366169843, + "tokens_seen": 1095996416 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003373219658976931, + "loss": 2.7649, + "theoretical_loss": 3.617764366169843, + "tokens_seen": 1095996416 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033731193580742225, + "loss": 2.8263, + "theoretical_loss": 3.617744127109111, + "tokens_seen": 1096061952 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003373019057171515, + "loss": 2.8133, + "theoretical_loss": 3.617723889597296, + "tokens_seen": 1096127488 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003372918756268806, + "loss": 2.8173, + "theoretical_loss": 3.617703653634189, + "tokens_seen": 1096193024 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033728184553660984, + "loss": 2.8917, + "theoretical_loss": 3.6176834192195786, + "tokens_seen": 1096258560 + }, + { + "epoch": 13.01, + "learning_rate": 0.000337271815446339, + "loss": 2.9117, + "theoretical_loss": 3.6176631863532522, + "tokens_seen": 1096324096 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003372617853560682, + "loss": 2.8689, + "theoretical_loss": 3.617642955035001, + "tokens_seen": 1096389632 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003372517552657974, + "loss": 2.953, + "theoretical_loss": 3.6176227252646123, + "tokens_seen": 1096455168 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033724172517552657, + "loss": 2.8732, + "theoretical_loss": 3.617602497041876, + "tokens_seen": 1096520704 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033723169508525575, + "loss": 2.864, + "theoretical_loss": 3.6175822703665808, + "tokens_seen": 1096586240 + }, + { + "epoch": 13.01, + "learning_rate": 0.000337221664994985, + "loss": 2.8084, + "theoretical_loss": 3.617562045238516, + "tokens_seen": 1096651776 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003372116349047141, + "loss": 2.7961, + "theoretical_loss": 3.617541821657471, + "tokens_seen": 1096717312 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033720160481444335, + "loss": 2.8839, + "theoretical_loss": 3.6175215996232346, + "tokens_seen": 1096782848 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003371915747241725, + "loss": 2.8808, + "theoretical_loss": 3.6175013791355966, + "tokens_seen": 1096848384 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003371815446339017, + "loss": 2.8709, + "theoretical_loss": 3.617481160194346, + "tokens_seen": 1096913920 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003371715145436309, + "loss": 2.8851, + "theoretical_loss": 3.617460942799272, + "tokens_seen": 1096979456 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033716148445336007, + "loss": 2.8171, + "theoretical_loss": 3.6174407269501643, + "tokens_seen": 1097044992 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033715145436308925, + "loss": 2.8869, + "theoretical_loss": 3.617420512646812, + "tokens_seen": 1097110528 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003371414242728185, + "loss": 2.8311, + "theoretical_loss": 3.617400299889006, + "tokens_seen": 1097176064 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003371313941825476, + "loss": 2.7729, + "theoretical_loss": 3.6173800886765335, + "tokens_seen": 1097241600 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033712136409227685, + "loss": 2.7726, + "theoretical_loss": 3.617359879009186, + "tokens_seen": 1097307136 + }, + { + "epoch": 13.01, + "learning_rate": 0.000337111334002006, + "loss": 3.0142, + "theoretical_loss": 3.617339670886752, + "tokens_seen": 1097372672 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003371013039117352, + "loss": 2.7973, + "theoretical_loss": 3.6173194643090207, + "tokens_seen": 1097438208 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003370912738214644, + "loss": 2.7465, + "theoretical_loss": 3.6172992592757836, + "tokens_seen": 1097503744 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003370812437311936, + "loss": 2.9498, + "theoretical_loss": 3.6172790557868293, + "tokens_seen": 1097569280 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2610700, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6463654041290283, + "objective/train/theoretical_loss": 3.617258853841947, + "objective/train/tokens_used": 1118094816, + "theoretical_loss": 3.617258853841947, + "tokens_seen": 1097634816 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003370712136409228, + "loss": 2.826, + "theoretical_loss": 3.617258853841947, + "tokens_seen": 1097634816 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033706118355065194, + "loss": 2.8953, + "theoretical_loss": 3.6172386534409275, + "tokens_seen": 1097700352 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033705115346038117, + "loss": 2.8334, + "theoretical_loss": 3.617218454583561, + "tokens_seen": 1097765888 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033704112337011035, + "loss": 2.8856, + "theoretical_loss": 3.617198257269636, + "tokens_seen": 1097831424 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033703109327983953, + "loss": 2.8731, + "theoretical_loss": 3.6171780614989433, + "tokens_seen": 1097896960 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003370210631895687, + "loss": 2.8105, + "theoretical_loss": 3.6171578672712728, + "tokens_seen": 1097962496 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033701103309929795, + "loss": 2.9455, + "theoretical_loss": 3.6171376745864148, + "tokens_seen": 1098028032 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003370010030090271, + "loss": 2.8547, + "theoretical_loss": 3.617117483444158, + "tokens_seen": 1098093568 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003369909729187563, + "loss": 2.94, + "theoretical_loss": 3.6170972938442945, + "tokens_seen": 1098159104 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033698094282848544, + "loss": 2.8988, + "theoretical_loss": 3.6170771057866133, + "tokens_seen": 1098224640 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003369709127382147, + "loss": 2.8911, + "theoretical_loss": 3.6170569192709046, + "tokens_seen": 1098290176 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033696088264794386, + "loss": 2.8168, + "theoretical_loss": 3.617036734296959, + "tokens_seen": 1098355712 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033695085255767304, + "loss": 2.8844, + "theoretical_loss": 3.617016550864566, + "tokens_seen": 1098421248 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003369408224674022, + "loss": 2.8806, + "theoretical_loss": 3.6169963689735165, + "tokens_seen": 1098486784 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003369307923771314, + "loss": 2.7678, + "theoretical_loss": 3.616976188623601, + "tokens_seen": 1098552320 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003369207622868606, + "loss": 2.9211, + "theoretical_loss": 3.6169560098146096, + "tokens_seen": 1098617856 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003369107321965898, + "loss": 2.8869, + "theoretical_loss": 3.616935832546333, + "tokens_seen": 1098683392 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033690070210631894, + "loss": 2.8443, + "theoretical_loss": 3.616915656818561, + "tokens_seen": 1098748928 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003368906720160482, + "loss": 2.8577, + "theoretical_loss": 3.616895482631085, + "tokens_seen": 1098814464 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003368806419257773, + "loss": 2.8296, + "theoretical_loss": 3.616875309983695, + "tokens_seen": 1098880000 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033687061183550654, + "loss": 2.8468, + "theoretical_loss": 3.616855138876182, + "tokens_seen": 1098945536 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003368605817452357, + "loss": 2.789, + "theoretical_loss": 3.6168349693083357, + "tokens_seen": 1099011072 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003368505516549649, + "loss": 2.768, + "theoretical_loss": 3.616814801279948, + "tokens_seen": 1099076608 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003368405215646941, + "loss": 2.9065, + "theoretical_loss": 3.616794634790809, + "tokens_seen": 1099142144 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003368304914744233, + "loss": 2.8495, + "theoretical_loss": 3.616774469840709, + "tokens_seen": 1099207680 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2615468, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.787325859069824, + "objective/train/theoretical_loss": 3.6167543064294394, + "objective/train/tokens_used": 1119733216, + "theoretical_loss": 3.6167543064294394, + "tokens_seen": 1099273216 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033682046138415245, + "loss": 2.9218, + "theoretical_loss": 3.6167543064294394, + "tokens_seen": 1099273216 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003368104312938817, + "loss": 2.8143, + "theoretical_loss": 3.6167341445567915, + "tokens_seen": 1099338752 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003368004012036108, + "loss": 2.9191, + "theoretical_loss": 3.6167139842225553, + "tokens_seen": 1099404288 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033679037111334004, + "loss": 2.8016, + "theoretical_loss": 3.616693825426522, + "tokens_seen": 1099469824 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003367803410230692, + "loss": 2.9435, + "theoretical_loss": 3.616673668168483, + "tokens_seen": 1099535360 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003367703109327984, + "loss": 2.8524, + "theoretical_loss": 3.616653512448228, + "tokens_seen": 1099600896 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003367602808425276, + "loss": 2.8935, + "theoretical_loss": 3.6166333582655494, + "tokens_seen": 1099666432 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033675025075225677, + "loss": 2.6832, + "theoretical_loss": 3.616613205620238, + "tokens_seen": 1099731968 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033674022066198595, + "loss": 2.8504, + "theoretical_loss": 3.616593054512084, + "tokens_seen": 1099797504 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003367301905717152, + "loss": 2.8525, + "theoretical_loss": 3.61657290494088, + "tokens_seen": 1099863040 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003367201604814443, + "loss": 2.8479, + "theoretical_loss": 3.6165527569064166, + "tokens_seen": 1099928576 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033671013039117355, + "loss": 2.9298, + "theoretical_loss": 3.616532610408485, + "tokens_seen": 1099994112 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033670010030090273, + "loss": 2.8578, + "theoretical_loss": 3.616512465446876, + "tokens_seen": 1100059648 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003366900702106319, + "loss": 2.8069, + "theoretical_loss": 3.616492322021381, + "tokens_seen": 1100125184 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003366800401203611, + "loss": 2.7823, + "theoretical_loss": 3.6164721801317925, + "tokens_seen": 1100190720 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033667001003009027, + "loss": 2.8148, + "theoretical_loss": 3.6164520397779008, + "tokens_seen": 1100256256 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033665997993981945, + "loss": 2.8237, + "theoretical_loss": 3.6164319009594976, + "tokens_seen": 1100321792 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003366499498495487, + "loss": 2.8048, + "theoretical_loss": 3.616411763676375, + "tokens_seen": 1100387328 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003366399197592778, + "loss": 2.8484, + "theoretical_loss": 3.6163916279283237, + "tokens_seen": 1100452864 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033662988966900705, + "loss": 2.8189, + "theoretical_loss": 3.6163714937151354, + "tokens_seen": 1100518400 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003366198595787362, + "loss": 2.9877, + "theoretical_loss": 3.6163513610366023, + "tokens_seen": 1100583936 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003366098294884654, + "loss": 2.8869, + "theoretical_loss": 3.6163312298925154, + "tokens_seen": 1100649472 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003365997993981946, + "loss": 2.8705, + "theoretical_loss": 3.616311100282667, + "tokens_seen": 1100715008 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003365897693079238, + "loss": 2.8688, + "theoretical_loss": 3.616290972206848, + "tokens_seen": 1100780544 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033657973921765295, + "loss": 2.8873, + "theoretical_loss": 3.616270845664851, + "tokens_seen": 1100846080 + }, + { + "epoch": 13.01, + "objective/train/docs_used": 2618414, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6833109855651855, + "objective/train/theoretical_loss": 3.6162507206564674, + "objective/train/tokens_used": 1121371616, + "theoretical_loss": 3.6162507206564674, + "tokens_seen": 1100911616 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033656970912738214, + "loss": 2.775, + "theoretical_loss": 3.6162507206564674, + "tokens_seen": 1100911616 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003365596790371113, + "loss": 2.861, + "theoretical_loss": 3.616230597181489, + "tokens_seen": 1100977152 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033654964894684055, + "loss": 2.8367, + "theoretical_loss": 3.6162104752397077, + "tokens_seen": 1101042688 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003365396188565697, + "loss": 2.9073, + "theoretical_loss": 3.616190354830916, + "tokens_seen": 1101108224 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003365295887662989, + "loss": 2.7477, + "theoretical_loss": 3.6161702359549053, + "tokens_seen": 1101173760 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003365195586760281, + "loss": 2.7928, + "theoretical_loss": 3.6161501186114675, + "tokens_seen": 1101239296 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003365095285857573, + "loss": 2.8904, + "theoretical_loss": 3.6161300028003955, + "tokens_seen": 1101304832 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033649949849548646, + "loss": 2.8173, + "theoretical_loss": 3.61610988852148, + "tokens_seen": 1101370368 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033648946840521564, + "loss": 2.8355, + "theoretical_loss": 3.616089775774515, + "tokens_seen": 1101435904 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003364794383149448, + "loss": 2.8769, + "theoretical_loss": 3.616069664559291, + "tokens_seen": 1101501440 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033646940822467406, + "loss": 2.7955, + "theoretical_loss": 3.616049554875601, + "tokens_seen": 1101566976 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003364593781344032, + "loss": 2.8976, + "theoretical_loss": 3.6160294467232372, + "tokens_seen": 1101632512 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003364493480441324, + "loss": 2.8134, + "theoretical_loss": 3.616009340101992, + "tokens_seen": 1101698048 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033643931795386154, + "loss": 2.9084, + "theoretical_loss": 3.615989235011657, + "tokens_seen": 1101763584 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003364292878635908, + "loss": 2.9233, + "theoretical_loss": 3.615969131452026, + "tokens_seen": 1101829120 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033641925777331996, + "loss": 2.8563, + "theoretical_loss": 3.6159490294228895, + "tokens_seen": 1101894656 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033640922768304914, + "loss": 2.8515, + "theoretical_loss": 3.615928928924042, + "tokens_seen": 1101960192 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003363991975927783, + "loss": 2.8276, + "theoretical_loss": 3.6159088299552753, + "tokens_seen": 1102025728 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003363891675025075, + "loss": 2.9288, + "theoretical_loss": 3.615888732516381, + "tokens_seen": 1102091264 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003363791374122367, + "loss": 2.9451, + "theoretical_loss": 3.6158686366071526, + "tokens_seen": 1102156800 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003363691073219659, + "loss": 2.9258, + "theoretical_loss": 3.6158485422273823, + "tokens_seen": 1102222336 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033635907723169505, + "loss": 2.8987, + "theoretical_loss": 3.6158284493768633, + "tokens_seen": 1102287872 + }, + { + "epoch": 13.01, + "learning_rate": 0.0003363490471414243, + "loss": 2.8818, + "theoretical_loss": 3.615808358055388, + "tokens_seen": 1102353408 + }, + { + "epoch": 13.01, + "learning_rate": 0.00033633901705115346, + "loss": 2.9344, + "theoretical_loss": 3.615788268262749, + "tokens_seen": 1102418944 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033632898696088265, + "loss": 2.8313, + "theoretical_loss": 3.6157681799987396, + "tokens_seen": 1102484480 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2622326, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.897630214691162, + "objective/train/theoretical_loss": 3.615748093263152, + "objective/train/tokens_used": 1123010016, + "theoretical_loss": 3.615748093263152, + "tokens_seen": 1102550016 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003363189568706119, + "loss": 2.8714, + "theoretical_loss": 3.615748093263152, + "tokens_seen": 1102550016 + }, + { + "epoch": 13.02, + "learning_rate": 0.000336308926780341, + "loss": 2.927, + "theoretical_loss": 3.615728008055779, + "tokens_seen": 1102615552 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033629889669007024, + "loss": 2.8427, + "theoretical_loss": 3.6157079243764145, + "tokens_seen": 1102681088 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003362888665997994, + "loss": 2.8345, + "theoretical_loss": 3.61568784222485, + "tokens_seen": 1102746624 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003362788365095286, + "loss": 2.7823, + "theoretical_loss": 3.6156677616008803, + "tokens_seen": 1102812160 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003362688064192578, + "loss": 2.7865, + "theoretical_loss": 3.6156476825042967, + "tokens_seen": 1102877696 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033625877632898697, + "loss": 2.795, + "theoretical_loss": 3.615627604934893, + "tokens_seen": 1102943232 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033624874623871615, + "loss": 2.9457, + "theoretical_loss": 3.615607528892463, + "tokens_seen": 1103008768 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003362387161484454, + "loss": 2.8773, + "theoretical_loss": 3.615587454376799, + "tokens_seen": 1103074304 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003362286860581745, + "loss": 2.7624, + "theoretical_loss": 3.615567381387694, + "tokens_seen": 1103139840 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033621865596790375, + "loss": 2.8774, + "theoretical_loss": 3.615547309924941, + "tokens_seen": 1103205376 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033620862587763293, + "loss": 2.8076, + "theoretical_loss": 3.615527239988335, + "tokens_seen": 1103270912 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003361985957873621, + "loss": 2.8743, + "theoretical_loss": 3.6155071715776677, + "tokens_seen": 1103336448 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003361885656970913, + "loss": 2.8443, + "theoretical_loss": 3.6154871046927326, + "tokens_seen": 1103401984 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033617853560682047, + "loss": 2.933, + "theoretical_loss": 3.615467039333324, + "tokens_seen": 1103467520 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033616850551654965, + "loss": 2.9807, + "theoretical_loss": 3.6154469754992347, + "tokens_seen": 1103533056 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003361584754262789, + "loss": 2.8665, + "theoretical_loss": 3.615426913190258, + "tokens_seen": 1103598592 + }, + { + "epoch": 13.02, + "learning_rate": 0.000336148445336008, + "loss": 2.8466, + "theoretical_loss": 3.6154068524061875, + "tokens_seen": 1103664128 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033613841524573725, + "loss": 2.806, + "theoretical_loss": 3.6153867931468175, + "tokens_seen": 1103729664 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003361283851554664, + "loss": 2.8403, + "theoretical_loss": 3.61536673541194, + "tokens_seen": 1103795200 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003361183550651956, + "loss": 2.9816, + "theoretical_loss": 3.615346679201351, + "tokens_seen": 1103860736 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003361083249749248, + "loss": 2.8102, + "theoretical_loss": 3.6153266245148417, + "tokens_seen": 1103926272 + }, + { + "epoch": 13.02, + "learning_rate": 0.000336098294884654, + "loss": 2.803, + "theoretical_loss": 3.6153065713522072, + "tokens_seen": 1103991808 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033608826479438316, + "loss": 2.8299, + "theoretical_loss": 3.6152865197132407, + "tokens_seen": 1104057344 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033607823470411234, + "loss": 2.8699, + "theoretical_loss": 3.615266469597737, + "tokens_seen": 1104122880 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2626835, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8161697387695312, + "objective/train/theoretical_loss": 3.6152464210054887, + "objective/train/tokens_used": 1124648416, + "theoretical_loss": 3.6152464210054887, + "tokens_seen": 1104188416 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003360682046138415, + "loss": 2.7524, + "theoretical_loss": 3.6152464210054887, + "tokens_seen": 1104188416 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033605817452357075, + "loss": 2.845, + "theoretical_loss": 3.6152263739362906, + "tokens_seen": 1104253952 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003360481444332999, + "loss": 2.7218, + "theoretical_loss": 3.6152063283899354, + "tokens_seen": 1104319488 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003360381143430291, + "loss": 2.8352, + "theoretical_loss": 3.615186284366219, + "tokens_seen": 1104385024 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003360280842527583, + "loss": 2.8422, + "theoretical_loss": 3.6151662418649333, + "tokens_seen": 1104450560 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003360180541624875, + "loss": 2.8654, + "theoretical_loss": 3.6151462008858735, + "tokens_seen": 1104516096 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033600802407221666, + "loss": 2.9415, + "theoretical_loss": 3.615126161428834, + "tokens_seen": 1104581632 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033599799398194584, + "loss": 2.8464, + "theoretical_loss": 3.6151061234936077, + "tokens_seen": 1104647168 + }, + { + "epoch": 13.02, + "learning_rate": 0.000335987963891675, + "loss": 2.8897, + "theoretical_loss": 3.61508608707999, + "tokens_seen": 1104712704 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033597793380140426, + "loss": 2.8215, + "theoretical_loss": 3.6150660521877738, + "tokens_seen": 1104778240 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003359679037111334, + "loss": 2.8135, + "theoretical_loss": 3.615046018816755, + "tokens_seen": 1104843776 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003359578736208626, + "loss": 2.7945, + "theoretical_loss": 3.6150259869667263, + "tokens_seen": 1104909312 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033594784353059174, + "loss": 2.8675, + "theoretical_loss": 3.615005956637483, + "tokens_seen": 1104974848 + }, + { + "epoch": 13.02, + "learning_rate": 0.000335937813440321, + "loss": 2.9238, + "theoretical_loss": 3.614985927828819, + "tokens_seen": 1105040384 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033592778335005016, + "loss": 2.8472, + "theoretical_loss": 3.6149659005405286, + "tokens_seen": 1105105920 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033591775325977934, + "loss": 2.9284, + "theoretical_loss": 3.614945874772407, + "tokens_seen": 1105171456 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003359077231695085, + "loss": 2.7605, + "theoretical_loss": 3.614925850524248, + "tokens_seen": 1105236992 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003358976930792377, + "loss": 2.8513, + "theoretical_loss": 3.6149058277958455, + "tokens_seen": 1105302528 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003358876629889669, + "loss": 2.7936, + "theoretical_loss": 3.6148858065869955, + "tokens_seen": 1105368064 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003358776328986961, + "loss": 2.8202, + "theoretical_loss": 3.614865786897492, + "tokens_seen": 1105433600 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033586760280842525, + "loss": 2.8341, + "theoretical_loss": 3.6148457687271294, + "tokens_seen": 1105499136 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003358575727181545, + "loss": 2.8608, + "theoretical_loss": 3.614825752075703, + "tokens_seen": 1105564672 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033584754262788366, + "loss": 2.9317, + "theoretical_loss": 3.6148057369430058, + "tokens_seen": 1105630208 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033583751253761285, + "loss": 2.8246, + "theoretical_loss": 3.6147857233288345, + "tokens_seen": 1105695744 + }, + { + "epoch": 13.02, + "learning_rate": 0.000335827482447342, + "loss": 2.7427, + "theoretical_loss": 3.614765711232984, + "tokens_seen": 1105761280 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2630103, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.715022087097168, + "objective/train/theoretical_loss": 3.614745700655247, + "objective/train/tokens_used": 1126286816, + "theoretical_loss": 3.614745700655247, + "tokens_seen": 1105826816 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003358174523570712, + "loss": 2.9249, + "theoretical_loss": 3.614745700655247, + "tokens_seen": 1105826816 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003358074222668004, + "loss": 2.9814, + "theoretical_loss": 3.6147256915954205, + "tokens_seen": 1105892352 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003357973921765296, + "loss": 2.8577, + "theoretical_loss": 3.6147056840532983, + "tokens_seen": 1105957888 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033578736208625875, + "loss": 2.8924, + "theoretical_loss": 3.6146856780286756, + "tokens_seen": 1106023424 + }, + { + "epoch": 13.02, + "learning_rate": 0.000335777331995988, + "loss": 2.9282, + "theoretical_loss": 3.6146656735213476, + "tokens_seen": 1106088960 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003357673019057171, + "loss": 2.8386, + "theoretical_loss": 3.6146456705311096, + "tokens_seen": 1106154496 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033575727181544635, + "loss": 2.8702, + "theoretical_loss": 3.614625669057756, + "tokens_seen": 1106220032 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033574724172517553, + "loss": 2.8154, + "theoretical_loss": 3.614605669101082, + "tokens_seen": 1106285568 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003357372116349047, + "loss": 2.9568, + "theoretical_loss": 3.614585670660883, + "tokens_seen": 1106351104 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003357271815446339, + "loss": 2.8098, + "theoretical_loss": 3.6145656737369545, + "tokens_seen": 1106416640 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033571715145436313, + "loss": 2.8507, + "theoretical_loss": 3.6145456783290912, + "tokens_seen": 1106482176 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033570712136409225, + "loss": 2.8847, + "theoretical_loss": 3.6145256844370888, + "tokens_seen": 1106547712 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003356970912738215, + "loss": 2.755, + "theoretical_loss": 3.6145056920607415, + "tokens_seen": 1106613248 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003356870611835506, + "loss": 2.878, + "theoretical_loss": 3.6144857011998464, + "tokens_seen": 1106678784 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033567703109327985, + "loss": 2.8633, + "theoretical_loss": 3.614465711854198, + "tokens_seen": 1106744320 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033566700100300903, + "loss": 2.8073, + "theoretical_loss": 3.614445724023591, + "tokens_seen": 1106809856 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003356569709127382, + "loss": 2.801, + "theoretical_loss": 3.6144257377078226, + "tokens_seen": 1106875392 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003356469408224674, + "loss": 2.8184, + "theoretical_loss": 3.614405752906687, + "tokens_seen": 1106940928 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003356369107321966, + "loss": 2.8768, + "theoretical_loss": 3.6143857696199797, + "tokens_seen": 1107006464 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033562688064192576, + "loss": 2.8025, + "theoretical_loss": 3.614365787847497, + "tokens_seen": 1107072000 + }, + { + "epoch": 13.02, + "learning_rate": 0.000335616850551655, + "loss": 2.824, + "theoretical_loss": 3.6143458075890345, + "tokens_seen": 1107137536 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003356068204613841, + "loss": 2.8592, + "theoretical_loss": 3.614325828844387, + "tokens_seen": 1107203072 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033559679037111336, + "loss": 2.7209, + "theoretical_loss": 3.6143058516133513, + "tokens_seen": 1107268608 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003355867602808425, + "loss": 2.9128, + "theoretical_loss": 3.6142858758957224, + "tokens_seen": 1107334144 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003355767301905717, + "loss": 2.8606, + "theoretical_loss": 3.6142659016912964, + "tokens_seen": 1107399680 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2634889, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8582513332366943, + "objective/train/theoretical_loss": 3.614245928999869, + "objective/train/tokens_used": 1127925216, + "theoretical_loss": 3.614245928999869, + "tokens_seen": 1107465216 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033556670010030095, + "loss": 2.8552, + "theoretical_loss": 3.614245928999869, + "tokens_seen": 1107465216 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003355566700100301, + "loss": 2.7733, + "theoretical_loss": 3.6142259578212363, + "tokens_seen": 1107530752 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003355466399197593, + "loss": 2.9275, + "theoretical_loss": 3.614205988155194, + "tokens_seen": 1107596288 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003355366098294885, + "loss": 2.783, + "theoretical_loss": 3.6141860200015383, + "tokens_seen": 1107661824 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003355265797392177, + "loss": 2.8988, + "theoretical_loss": 3.614166053360065, + "tokens_seen": 1107727360 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033551654964894686, + "loss": 2.8273, + "theoretical_loss": 3.6141460882305694, + "tokens_seen": 1107792896 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033550651955867604, + "loss": 2.9007, + "theoretical_loss": 3.6141261246128487, + "tokens_seen": 1107858432 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003354964894684052, + "loss": 2.953, + "theoretical_loss": 3.6141061625066992, + "tokens_seen": 1107923968 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033548645937813446, + "loss": 2.812, + "theoretical_loss": 3.614086201911916, + "tokens_seen": 1107989504 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003354764292878636, + "loss": 2.8073, + "theoretical_loss": 3.6140662428282955, + "tokens_seen": 1108055040 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003354663991975928, + "loss": 2.819, + "theoretical_loss": 3.6140462852556343, + "tokens_seen": 1108120576 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033545636910732195, + "loss": 2.9073, + "theoretical_loss": 3.6140263291937282, + "tokens_seen": 1108186112 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003354463390170512, + "loss": 2.951, + "theoretical_loss": 3.6140063746423747, + "tokens_seen": 1108251648 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033543630892678036, + "loss": 2.8826, + "theoretical_loss": 3.6139864216013686, + "tokens_seen": 1108317184 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033542627883650954, + "loss": 2.8918, + "theoretical_loss": 3.613966470070507, + "tokens_seen": 1108382720 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003354162487462387, + "loss": 2.922, + "theoretical_loss": 3.613946520049587, + "tokens_seen": 1108448256 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003354062186559679, + "loss": 2.8178, + "theoretical_loss": 3.6139265715384035, + "tokens_seen": 1108513792 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003353961885656971, + "loss": 2.8522, + "theoretical_loss": 3.613906624536754, + "tokens_seen": 1108579328 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003353861584754263, + "loss": 2.8716, + "theoretical_loss": 3.613886679044435, + "tokens_seen": 1108644864 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033537612838515545, + "loss": 2.938, + "theoretical_loss": 3.613866735061243, + "tokens_seen": 1108710400 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003353660982948847, + "loss": 2.8395, + "theoretical_loss": 3.6138467925869744, + "tokens_seen": 1108775936 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033535606820461386, + "loss": 2.808, + "theoretical_loss": 3.6138268516214263, + "tokens_seen": 1108841472 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033534603811434305, + "loss": 2.903, + "theoretical_loss": 3.613806912164395, + "tokens_seen": 1108907008 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003353360080240722, + "loss": 2.9039, + "theoretical_loss": 3.6137869742156776, + "tokens_seen": 1108972544 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003353259779338014, + "loss": 2.9, + "theoretical_loss": 3.6137670377750704, + "tokens_seen": 1109038080 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2637846, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9132325649261475, + "objective/train/theoretical_loss": 3.6137471028423707, + "objective/train/tokens_used": 1129563616, + "theoretical_loss": 3.6137471028423707, + "tokens_seen": 1109103616 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003353159478435306, + "loss": 2.8521, + "theoretical_loss": 3.6137471028423707, + "tokens_seen": 1109103616 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003353059177532598, + "loss": 2.8631, + "theoretical_loss": 3.613727169417375, + "tokens_seen": 1109169152 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033529588766298895, + "loss": 2.854, + "theoretical_loss": 3.613707237499881, + "tokens_seen": 1109234688 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003352858575727182, + "loss": 2.9178, + "theoretical_loss": 3.613687307089684, + "tokens_seen": 1109300224 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003352758274824473, + "loss": 2.8722, + "theoretical_loss": 3.6136673781865825, + "tokens_seen": 1109365760 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033526579739217655, + "loss": 2.911, + "theoretical_loss": 3.6136474507903724, + "tokens_seen": 1109431296 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033525576730190573, + "loss": 2.9086, + "theoretical_loss": 3.6136275249008523, + "tokens_seen": 1109496832 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003352457372116349, + "loss": 2.8044, + "theoretical_loss": 3.6136076005178177, + "tokens_seen": 1109562368 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003352357071213641, + "loss": 2.836, + "theoretical_loss": 3.6135876776410663, + "tokens_seen": 1109627904 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033522567703109333, + "loss": 2.8679, + "theoretical_loss": 3.6135677562703954, + "tokens_seen": 1109693440 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033521564694082245, + "loss": 2.7604, + "theoretical_loss": 3.613547836405602, + "tokens_seen": 1109758976 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003352056168505517, + "loss": 2.9259, + "theoretical_loss": 3.6135279180464837, + "tokens_seen": 1109824512 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003351955867602808, + "loss": 2.8931, + "theoretical_loss": 3.613508001192838, + "tokens_seen": 1109890048 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033518555667001005, + "loss": 2.8535, + "theoretical_loss": 3.6134880858444616, + "tokens_seen": 1109955584 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033517552657973923, + "loss": 3.0046, + "theoretical_loss": 3.6134681720011517, + "tokens_seen": 1110021120 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003351654964894684, + "loss": 2.7791, + "theoretical_loss": 3.613448259662706, + "tokens_seen": 1110086656 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003351554663991976, + "loss": 2.8689, + "theoretical_loss": 3.6134283488289225, + "tokens_seen": 1110152192 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003351454363089268, + "loss": 2.8484, + "theoretical_loss": 3.613408439499598, + "tokens_seen": 1110217728 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033513540621865596, + "loss": 2.7727, + "theoretical_loss": 3.61338853167453, + "tokens_seen": 1110283264 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003351253761283852, + "loss": 2.8747, + "theoretical_loss": 3.6133686253535164, + "tokens_seen": 1110348800 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003351153460381143, + "loss": 2.8754, + "theoretical_loss": 3.6133487205363553, + "tokens_seen": 1110414336 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033510531594784356, + "loss": 2.8619, + "theoretical_loss": 3.613328817222843, + "tokens_seen": 1110479872 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003350952858575727, + "loss": 2.8315, + "theoretical_loss": 3.6133089154127784, + "tokens_seen": 1110545408 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003350852557673019, + "loss": 2.8133, + "theoretical_loss": 3.6132890151059582, + "tokens_seen": 1110610944 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003350752256770311, + "loss": 2.9259, + "theoretical_loss": 3.613269116302181, + "tokens_seen": 1110676480 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2641541, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8927736282348633, + "objective/train/theoretical_loss": 3.6132492190012444, + "objective/train/tokens_used": 1131202016, + "theoretical_loss": 3.6132492190012444, + "tokens_seen": 1110742016 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003350651955867603, + "loss": 2.8799, + "theoretical_loss": 3.6132492190012444, + "tokens_seen": 1110742016 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033505516549648946, + "loss": 2.8835, + "theoretical_loss": 3.613229323202946, + "tokens_seen": 1110807552 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003350451354062187, + "loss": 2.9278, + "theoretical_loss": 3.6132094289070844, + "tokens_seen": 1110873088 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003350351053159478, + "loss": 2.9156, + "theoretical_loss": 3.613189536113456, + "tokens_seen": 1110938624 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033502507522567706, + "loss": 2.9256, + "theoretical_loss": 3.613169644821861, + "tokens_seen": 1111004160 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003350150451354062, + "loss": 2.8899, + "theoretical_loss": 3.613149755032095, + "tokens_seen": 1111069696 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003350050150451354, + "loss": 2.9089, + "theoretical_loss": 3.6131298667439573, + "tokens_seen": 1111135232 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003349949849548646, + "loss": 2.8488, + "theoretical_loss": 3.6131099799572457, + "tokens_seen": 1111200768 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003349849548645938, + "loss": 2.8367, + "theoretical_loss": 3.613090094671759, + "tokens_seen": 1111266304 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033497492477432296, + "loss": 2.9032, + "theoretical_loss": 3.6130702108872947, + "tokens_seen": 1111331840 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033496489468405215, + "loss": 2.7647, + "theoretical_loss": 3.6130503286036504, + "tokens_seen": 1111397376 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003349548645937813, + "loss": 2.9068, + "theoretical_loss": 3.6130304478206257, + "tokens_seen": 1111462912 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033494483450351056, + "loss": 2.819, + "theoretical_loss": 3.6130105685380176, + "tokens_seen": 1111528448 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003349348044132397, + "loss": 2.9413, + "theoretical_loss": 3.612990690755625, + "tokens_seen": 1111593984 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003349247743229689, + "loss": 2.8022, + "theoretical_loss": 3.612970814473247, + "tokens_seen": 1111659520 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033491474423269805, + "loss": 2.9016, + "theoretical_loss": 3.6129509396906805, + "tokens_seen": 1111725056 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003349047141424273, + "loss": 2.8179, + "theoretical_loss": 3.6129310664077243, + "tokens_seen": 1111790592 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033489468405215647, + "loss": 2.833, + "theoretical_loss": 3.612911194624178, + "tokens_seen": 1111856128 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033488465396188565, + "loss": 2.9021, + "theoretical_loss": 3.612891324339839, + "tokens_seen": 1111921664 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033487462387161483, + "loss": 2.9755, + "theoretical_loss": 3.6128714555545063, + "tokens_seen": 1111987200 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033486459378134406, + "loss": 2.8949, + "theoretical_loss": 3.6128515882679775, + "tokens_seen": 1112052736 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003348545636910732, + "loss": 2.9513, + "theoretical_loss": 3.612831722480053, + "tokens_seen": 1112118272 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003348445336008024, + "loss": 2.86, + "theoretical_loss": 3.61281185819053, + "tokens_seen": 1112183808 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003348345035105316, + "loss": 2.8804, + "theoretical_loss": 3.612791995399208, + "tokens_seen": 1112249344 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003348244734202608, + "loss": 2.8334, + "theoretical_loss": 3.612772134105885, + "tokens_seen": 1112314880 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2646277, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.758850336074829, + "objective/train/theoretical_loss": 3.6127522743103606, + "objective/train/tokens_used": 1132840416, + "theoretical_loss": 3.6127522743103606, + "tokens_seen": 1112380416 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033481444332999, + "loss": 2.8893, + "theoretical_loss": 3.6127522743103606, + "tokens_seen": 1112380416 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033480441323971915, + "loss": 2.9366, + "theoretical_loss": 3.612732416012433, + "tokens_seen": 1112445952 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003347943831494484, + "loss": 2.9734, + "theoretical_loss": 3.612712559211901, + "tokens_seen": 1112511488 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003347843530591775, + "loss": 2.7562, + "theoretical_loss": 3.612692703908564, + "tokens_seen": 1112577024 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033477432296890675, + "loss": 2.8185, + "theoretical_loss": 3.6126728501022205, + "tokens_seen": 1112642560 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033476429287863593, + "loss": 2.7418, + "theoretical_loss": 3.61265299779267, + "tokens_seen": 1112708096 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003347542627883651, + "loss": 2.7946, + "theoretical_loss": 3.612633146979711, + "tokens_seen": 1112773632 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003347442326980943, + "loss": 2.9489, + "theoretical_loss": 3.6126132976631427, + "tokens_seen": 1112839168 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033473420260782353, + "loss": 2.9077, + "theoretical_loss": 3.6125934498427643, + "tokens_seen": 1112904704 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033472417251755265, + "loss": 2.8281, + "theoretical_loss": 3.6125736035183746, + "tokens_seen": 1112970240 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003347141424272819, + "loss": 2.8927, + "theoretical_loss": 3.6125537586897734, + "tokens_seen": 1113035776 + }, + { + "epoch": 13.02, + "learning_rate": 0.000334704112337011, + "loss": 2.836, + "theoretical_loss": 3.6125339153567593, + "tokens_seen": 1113101312 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033469408224674025, + "loss": 3.0057, + "theoretical_loss": 3.6125140735191312, + "tokens_seen": 1113166848 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033468405215646943, + "loss": 2.9304, + "theoretical_loss": 3.6124942331766894, + "tokens_seen": 1113232384 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003346740220661986, + "loss": 2.953, + "theoretical_loss": 3.6124743943292326, + "tokens_seen": 1113297920 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003346639919759278, + "loss": 2.8692, + "theoretical_loss": 3.6124545569765605, + "tokens_seen": 1113363456 + }, + { + "epoch": 13.02, + "learning_rate": 0.000334653961885657, + "loss": 2.8111, + "theoretical_loss": 3.612434721118472, + "tokens_seen": 1113428992 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033464393179538616, + "loss": 2.8007, + "theoretical_loss": 3.6124148867547667, + "tokens_seen": 1113494528 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003346339017051154, + "loss": 2.8704, + "theoretical_loss": 3.6123950538852445, + "tokens_seen": 1113560064 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003346238716148445, + "loss": 2.9268, + "theoretical_loss": 3.612375222509704, + "tokens_seen": 1113625600 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033461384152457376, + "loss": 2.9154, + "theoretical_loss": 3.612355392627946, + "tokens_seen": 1113691136 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003346038114343029, + "loss": 2.7209, + "theoretical_loss": 3.6123355642397694, + "tokens_seen": 1113756672 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003345937813440321, + "loss": 2.8987, + "theoretical_loss": 3.6123157373449737, + "tokens_seen": 1113822208 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003345837512537613, + "loss": 2.8744, + "theoretical_loss": 3.6122959119433586, + "tokens_seen": 1113887744 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003345737211634905, + "loss": 2.7597, + "theoretical_loss": 3.612276088034724, + "tokens_seen": 1113953280 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7183420658111572, + "objective/train/theoretical_loss": 3.61225626561887, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.61225626561887, + "tokens_seen": 1114018816 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033456369107321966, + "loss": 2.7554, + "theoretical_loss": 3.61225626561887, + "tokens_seen": 1114018816 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003345536609829489, + "loss": 2.8547, + "theoretical_loss": 3.6122364446955952, + "tokens_seen": 1114084352 + }, + { + "epoch": 13.02, + "learning_rate": 0.000334543630892678, + "loss": 2.8163, + "theoretical_loss": 3.6122166252647006, + "tokens_seen": 1114149888 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033453360080240726, + "loss": 2.9069, + "theoretical_loss": 3.6121968073259856, + "tokens_seen": 1114215424 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003345235707121364, + "loss": 2.8461, + "theoretical_loss": 3.6121769908792505, + "tokens_seen": 1114280960 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003345135406218656, + "loss": 2.7204, + "theoretical_loss": 3.6121571759242945, + "tokens_seen": 1114346496 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003345035105315948, + "loss": 2.9034, + "theoretical_loss": 3.612137362460918, + "tokens_seen": 1114412032 + }, + { + "epoch": 13.02, + "learning_rate": 0.000334493480441324, + "loss": 2.8654, + "theoretical_loss": 3.6121175504889207, + "tokens_seen": 1114477568 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033448345035105316, + "loss": 2.9565, + "theoretical_loss": 3.612097740008103, + "tokens_seen": 1114543104 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033447342026078235, + "loss": 2.845, + "theoretical_loss": 3.6120779310182654, + "tokens_seen": 1114608640 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003344633901705115, + "loss": 2.7907, + "theoretical_loss": 3.612058123519207, + "tokens_seen": 1114674176 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033445336008024076, + "loss": 2.8268, + "theoretical_loss": 3.612038317510729, + "tokens_seen": 1114739712 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003344433299899699, + "loss": 2.8828, + "theoretical_loss": 3.6120185129926305, + "tokens_seen": 1114805248 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003344332998996991, + "loss": 2.9461, + "theoretical_loss": 3.6119987099647126, + "tokens_seen": 1114870784 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033442326980942825, + "loss": 2.8468, + "theoretical_loss": 3.611978908426776, + "tokens_seen": 1114936320 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003344132397191575, + "loss": 2.8503, + "theoretical_loss": 3.6119591083786196, + "tokens_seen": 1115001856 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033440320962888667, + "loss": 2.9115, + "theoretical_loss": 3.6119393098200447, + "tokens_seen": 1115067392 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033439317953861585, + "loss": 2.8867, + "theoretical_loss": 3.611919512750852, + "tokens_seen": 1115132928 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033438314944834503, + "loss": 2.7014, + "theoretical_loss": 3.6118997171708407, + "tokens_seen": 1115198464 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033437311935807427, + "loss": 2.9089, + "theoretical_loss": 3.6118799230798126, + "tokens_seen": 1115264000 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003343630892678034, + "loss": 2.8949, + "theoretical_loss": 3.6118601304775675, + "tokens_seen": 1115329536 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033435305917753263, + "loss": 2.8766, + "theoretical_loss": 3.611840339363906, + "tokens_seen": 1115395072 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033434302908726175, + "loss": 2.8902, + "theoretical_loss": 3.6118205497386286, + "tokens_seen": 1115460608 + }, + { + "epoch": 13.02, + "learning_rate": 0.000334332998996991, + "loss": 2.8486, + "theoretical_loss": 3.611800761601536, + "tokens_seen": 1115526144 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033432296890672017, + "loss": 2.9295, + "theoretical_loss": 3.61178097495243, + "tokens_seen": 1115591680 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1052446365356445, + "objective/train/theoretical_loss": 3.6117611897911095, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.6117611897911095, + "tokens_seen": 1115657216 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033431293881644935, + "loss": 2.9848, + "theoretical_loss": 3.6117611897911095, + "tokens_seen": 1115657216 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033430290872617853, + "loss": 2.8509, + "theoretical_loss": 3.6117414061173765, + "tokens_seen": 1115722752 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003342928786359077, + "loss": 2.921, + "theoretical_loss": 3.611721623931031, + "tokens_seen": 1115788288 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003342828485456369, + "loss": 2.9288, + "theoretical_loss": 3.611701843231874, + "tokens_seen": 1115853824 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033427281845536613, + "loss": 2.8969, + "theoretical_loss": 3.611682064019707, + "tokens_seen": 1115919360 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033426278836509526, + "loss": 2.7802, + "theoretical_loss": 3.6116622862943304, + "tokens_seen": 1115984896 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003342527582748245, + "loss": 2.8589, + "theoretical_loss": 3.6116425100555447, + "tokens_seen": 1116050432 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003342427281845536, + "loss": 2.8737, + "theoretical_loss": 3.6116227353031514, + "tokens_seen": 1116115968 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033423269809428285, + "loss": 2.9205, + "theoretical_loss": 3.611602962036952, + "tokens_seen": 1116181504 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033422266800401204, + "loss": 2.8758, + "theoretical_loss": 3.611583190256747, + "tokens_seen": 1116247040 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003342126379137412, + "loss": 2.9595, + "theoretical_loss": 3.6115634199623368, + "tokens_seen": 1116312576 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003342026078234704, + "loss": 2.9074, + "theoretical_loss": 3.6115436511535233, + "tokens_seen": 1116378112 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033419257773319963, + "loss": 2.8846, + "theoretical_loss": 3.611523883830108, + "tokens_seen": 1116443648 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033418254764292876, + "loss": 2.8813, + "theoretical_loss": 3.6115041179918914, + "tokens_seen": 1116509184 + }, + { + "epoch": 13.02, + "learning_rate": 0.000334172517552658, + "loss": 3.0758, + "theoretical_loss": 3.6114843536386756, + "tokens_seen": 1116574720 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003341624874623871, + "loss": 2.7559, + "theoretical_loss": 3.6114645907702605, + "tokens_seen": 1116640256 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033415245737211636, + "loss": 2.8376, + "theoretical_loss": 3.611444829386449, + "tokens_seen": 1116705792 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033414242728184554, + "loss": 2.8782, + "theoretical_loss": 3.6114250694870407, + "tokens_seen": 1116771328 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033414242728184554, + "loss": 2.8431, + "theoretical_loss": 3.611405311071839, + "tokens_seen": 1116836864 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003341323971915747, + "loss": 2.9065, + "theoretical_loss": 3.611385554140644, + "tokens_seen": 1116902400 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003341223671013039, + "loss": 2.9308, + "theoretical_loss": 3.6113657986932575, + "tokens_seen": 1116967936 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003341123370110331, + "loss": 2.8076, + "theoretical_loss": 3.6113460447294807, + "tokens_seen": 1117033472 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033410230692076226, + "loss": 2.9272, + "theoretical_loss": 3.6113262922491156, + "tokens_seen": 1117099008 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003340922768304915, + "loss": 2.806, + "theoretical_loss": 3.611306541251964, + "tokens_seen": 1117164544 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003340822467402207, + "loss": 2.8019, + "theoretical_loss": 3.611286791737827, + "tokens_seen": 1117230080 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7912299633026123, + "objective/train/theoretical_loss": 3.611267043706506, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.611267043706506, + "tokens_seen": 1117295616 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033407221664994986, + "loss": 2.8369, + "theoretical_loss": 3.611267043706506, + "tokens_seen": 1117295616 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003340621865596791, + "loss": 2.8983, + "theoretical_loss": 3.611247297157804, + "tokens_seen": 1117361152 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003340521564694082, + "loss": 2.9362, + "theoretical_loss": 3.6112275520915214, + "tokens_seen": 1117426688 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033404212637913746, + "loss": 2.8381, + "theoretical_loss": 3.6112078085074604, + "tokens_seen": 1117492224 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003340320962888666, + "loss": 2.7391, + "theoretical_loss": 3.611188066405423, + "tokens_seen": 1117557760 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003340220661985958, + "loss": 2.7543, + "theoretical_loss": 3.6111683257852105, + "tokens_seen": 1117623296 + }, + { + "epoch": 13.02, + "learning_rate": 0.000334012036108325, + "loss": 2.8864, + "theoretical_loss": 3.6111485866466255, + "tokens_seen": 1117688832 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003340020060180542, + "loss": 2.8386, + "theoretical_loss": 3.61112884898947, + "tokens_seen": 1117754368 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033399197592778336, + "loss": 2.8693, + "theoretical_loss": 3.6111091128135455, + "tokens_seen": 1117819904 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033398194583751255, + "loss": 2.8712, + "theoretical_loss": 3.611089378118654, + "tokens_seen": 1117885440 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003339719157472417, + "loss": 2.9225, + "theoretical_loss": 3.611069644904598, + "tokens_seen": 1117950976 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033396188565697096, + "loss": 2.8135, + "theoretical_loss": 3.6110499131711786, + "tokens_seen": 1118016512 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003339518555667001, + "loss": 2.8236, + "theoretical_loss": 3.611030182918199, + "tokens_seen": 1118082048 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003339418254764293, + "loss": 2.9508, + "theoretical_loss": 3.611010454145461, + "tokens_seen": 1118147584 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033393179538615845, + "loss": 2.8537, + "theoretical_loss": 3.6109907268527666, + "tokens_seen": 1118213120 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003339217652958877, + "loss": 2.8546, + "theoretical_loss": 3.610971001039918, + "tokens_seen": 1118278656 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033391173520561687, + "loss": 2.8215, + "theoretical_loss": 3.610951276706718, + "tokens_seen": 1118344192 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033390170511534605, + "loss": 2.8913, + "theoretical_loss": 3.6109315538529683, + "tokens_seen": 1118409728 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033389167502507523, + "loss": 2.9105, + "theoretical_loss": 3.6109118324784717, + "tokens_seen": 1118475264 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033388164493480447, + "loss": 2.8997, + "theoretical_loss": 3.6108921125830302, + "tokens_seen": 1118540800 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003338716148445336, + "loss": 2.7751, + "theoretical_loss": 3.6108723941664467, + "tokens_seen": 1118606336 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033386158475426283, + "loss": 2.9455, + "theoretical_loss": 3.610852677228523, + "tokens_seen": 1118671872 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033385155466399195, + "loss": 2.9468, + "theoretical_loss": 3.610832961769062, + "tokens_seen": 1118737408 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003338415245737212, + "loss": 2.9409, + "theoretical_loss": 3.610813247787867, + "tokens_seen": 1118802944 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033383149448345037, + "loss": 2.778, + "theoretical_loss": 3.6107935352847393, + "tokens_seen": 1118868480 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8813674449920654, + "objective/train/theoretical_loss": 3.6107738242594816, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.6107738242594816, + "tokens_seen": 1118934016 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033382146439317955, + "loss": 2.8543, + "theoretical_loss": 3.6107738242594816, + "tokens_seen": 1118934016 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033381143430290873, + "loss": 2.9284, + "theoretical_loss": 3.6107541147118973, + "tokens_seen": 1118999552 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003338014042126379, + "loss": 2.8719, + "theoretical_loss": 3.6107344066417886, + "tokens_seen": 1119065088 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003337913741223671, + "loss": 2.8622, + "theoretical_loss": 3.610714700048959, + "tokens_seen": 1119130624 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033378134403209633, + "loss": 2.8399, + "theoretical_loss": 3.61069499493321, + "tokens_seen": 1119196160 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033377131394182546, + "loss": 2.7978, + "theoretical_loss": 3.6106752912943456, + "tokens_seen": 1119261696 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003337612838515547, + "loss": 2.7345, + "theoretical_loss": 3.6106555891321674, + "tokens_seen": 1119327232 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003337512537612838, + "loss": 2.825, + "theoretical_loss": 3.6106358884464793, + "tokens_seen": 1119392768 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033374122367101306, + "loss": 2.9204, + "theoretical_loss": 3.610616189237084, + "tokens_seen": 1119458304 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033373119358074224, + "loss": 2.944, + "theoretical_loss": 3.6105964915037845, + "tokens_seen": 1119523840 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003337211634904714, + "loss": 2.9054, + "theoretical_loss": 3.6105767952463834, + "tokens_seen": 1119589376 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003337111334002006, + "loss": 2.8177, + "theoretical_loss": 3.6105571004646846, + "tokens_seen": 1119654912 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033370110330992983, + "loss": 2.8773, + "theoretical_loss": 3.61053740715849, + "tokens_seen": 1119720448 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033369107321965896, + "loss": 2.8745, + "theoretical_loss": 3.610517715327603, + "tokens_seen": 1119785984 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003336810431293882, + "loss": 2.9226, + "theoretical_loss": 3.6104980249718275, + "tokens_seen": 1119851520 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003336710130391173, + "loss": 2.7786, + "theoretical_loss": 3.610478336090966, + "tokens_seen": 1119917056 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033366098294884656, + "loss": 2.9191, + "theoretical_loss": 3.610458648684822, + "tokens_seen": 1119982592 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033365095285857574, + "loss": 2.8817, + "theoretical_loss": 3.6104389627531983, + "tokens_seen": 1120048128 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003336409227683049, + "loss": 2.9766, + "theoretical_loss": 3.610419278295899, + "tokens_seen": 1120113664 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003336308926780341, + "loss": 2.9041, + "theoretical_loss": 3.610399595312727, + "tokens_seen": 1120179200 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003336208625877633, + "loss": 2.919, + "theoretical_loss": 3.6103799138034853, + "tokens_seen": 1120244736 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033361083249749246, + "loss": 2.8773, + "theoretical_loss": 3.6103602337679783, + "tokens_seen": 1120310272 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003336008024072217, + "loss": 2.9691, + "theoretical_loss": 3.610340555206008, + "tokens_seen": 1120375808 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003335907723169508, + "loss": 2.9152, + "theoretical_loss": 3.6103208781173795, + "tokens_seen": 1120441344 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033358074222668006, + "loss": 2.8294, + "theoretical_loss": 3.610301202501895, + "tokens_seen": 1120506880 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7624967098236084, + "objective/train/theoretical_loss": 3.6102815283593586, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.6102815283593586, + "tokens_seen": 1120572416 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033357071213640924, + "loss": 2.7913, + "theoretical_loss": 3.6102815283593586, + "tokens_seen": 1120572416 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003335606820461384, + "loss": 2.9003, + "theoretical_loss": 3.610261855689574, + "tokens_seen": 1120637952 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003335506519558676, + "loss": 2.8883, + "theoretical_loss": 3.610242184492345, + "tokens_seen": 1120703488 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003335406218655968, + "loss": 2.79, + "theoretical_loss": 3.6102225147674747, + "tokens_seen": 1120769024 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033353059177532597, + "loss": 2.8553, + "theoretical_loss": 3.6102028465147673, + "tokens_seen": 1120834560 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003335205616850552, + "loss": 2.8104, + "theoretical_loss": 3.610183179734026, + "tokens_seen": 1120900096 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033351053159478433, + "loss": 2.8301, + "theoretical_loss": 3.6101635144250555, + "tokens_seen": 1120965632 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033350050150451356, + "loss": 2.892, + "theoretical_loss": 3.610143850587659, + "tokens_seen": 1121031168 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003334904714142427, + "loss": 2.8859, + "theoretical_loss": 3.61012418822164, + "tokens_seen": 1121096704 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003334804413239719, + "loss": 2.8908, + "theoretical_loss": 3.6101045273268033, + "tokens_seen": 1121162240 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003334704112337011, + "loss": 2.9386, + "theoretical_loss": 3.610084867902952, + "tokens_seen": 1121227776 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003334603811434303, + "loss": 2.8197, + "theoretical_loss": 3.610065209949891, + "tokens_seen": 1121293312 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033345035105315947, + "loss": 2.904, + "theoretical_loss": 3.610045553467423, + "tokens_seen": 1121358848 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033344032096288865, + "loss": 2.8605, + "theoretical_loss": 3.610025898455353, + "tokens_seen": 1121424384 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033343029087261783, + "loss": 2.91, + "theoretical_loss": 3.6100062449134853, + "tokens_seen": 1121489920 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033342026078234707, + "loss": 2.8844, + "theoretical_loss": 3.609986592841623, + "tokens_seen": 1121555456 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003334102306920762, + "loss": 2.7997, + "theoretical_loss": 3.6099669422395717, + "tokens_seen": 1121620992 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033340020060180543, + "loss": 2.9203, + "theoretical_loss": 3.609947293107134, + "tokens_seen": 1121686528 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003333901705115346, + "loss": 2.8265, + "theoretical_loss": 3.609927645444115, + "tokens_seen": 1121752064 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003333801404212638, + "loss": 2.8503, + "theoretical_loss": 3.6099079992503196, + "tokens_seen": 1121817600 + }, + { + "epoch": 13.02, + "learning_rate": 0.000333370110330993, + "loss": 2.8556, + "theoretical_loss": 3.609888354525551, + "tokens_seen": 1121883136 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033336008024072215, + "loss": 2.8564, + "theoretical_loss": 3.6098687112696135, + "tokens_seen": 1121948672 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033335005015045134, + "loss": 2.8976, + "theoretical_loss": 3.609849069482312, + "tokens_seen": 1122014208 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033334002006018057, + "loss": 2.9145, + "theoretical_loss": 3.6098294291634514, + "tokens_seen": 1122079744 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033332998996990975, + "loss": 2.9647, + "theoretical_loss": 3.6098097903128354, + "tokens_seen": 1122145280 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.977263927459717, + "objective/train/theoretical_loss": 3.6097901529302687, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.6097901529302687, + "tokens_seen": 1122210816 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033331995987963893, + "loss": 2.982, + "theoretical_loss": 3.6097901529302687, + "tokens_seen": 1122210816 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003333099297893681, + "loss": 2.7995, + "theoretical_loss": 3.609770517015556, + "tokens_seen": 1122276352 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003332998996990973, + "loss": 2.7935, + "theoretical_loss": 3.609750882568502, + "tokens_seen": 1122341888 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033328986960882653, + "loss": 2.8734, + "theoretical_loss": 3.6097312495889105, + "tokens_seen": 1122407424 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033327983951855566, + "loss": 2.9591, + "theoretical_loss": 3.6097116180765867, + "tokens_seen": 1122472960 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003332698094282849, + "loss": 2.8728, + "theoretical_loss": 3.6096919880313356, + "tokens_seen": 1122538496 + }, + { + "epoch": 13.02, + "learning_rate": 0.000333259779338014, + "loss": 2.8389, + "theoretical_loss": 3.6096723594529614, + "tokens_seen": 1122604032 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033324974924774326, + "loss": 2.9071, + "theoretical_loss": 3.609652732341269, + "tokens_seen": 1122669568 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033323971915747244, + "loss": 2.8986, + "theoretical_loss": 3.609633106696063, + "tokens_seen": 1122735104 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003332296890672016, + "loss": 2.815, + "theoretical_loss": 3.6096134825171493, + "tokens_seen": 1122800640 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003332196589769308, + "loss": 2.9003, + "theoretical_loss": 3.6095938598043316, + "tokens_seen": 1122866176 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033320962888666003, + "loss": 2.924, + "theoretical_loss": 3.609574238557415, + "tokens_seen": 1122931712 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033319959879638916, + "loss": 2.8176, + "theoretical_loss": 3.6095546187762046, + "tokens_seen": 1122997248 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003331895687061184, + "loss": 2.8935, + "theoretical_loss": 3.609535000460506, + "tokens_seen": 1123062784 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003331795386158475, + "loss": 2.8977, + "theoretical_loss": 3.6095153836101233, + "tokens_seen": 1123128320 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033316950852557676, + "loss": 2.9081, + "theoretical_loss": 3.6094957682248614, + "tokens_seen": 1123193856 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033315947843530594, + "loss": 2.7892, + "theoretical_loss": 3.6094761543045264, + "tokens_seen": 1123259392 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003331494483450351, + "loss": 2.8645, + "theoretical_loss": 3.609456541848923, + "tokens_seen": 1123324928 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003331394182547643, + "loss": 3.027, + "theoretical_loss": 3.609436930857856, + "tokens_seen": 1123390464 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003331293881644935, + "loss": 2.7724, + "theoretical_loss": 3.6094173213311316, + "tokens_seen": 1123456000 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033311935807422266, + "loss": 2.893, + "theoretical_loss": 3.609397713268554, + "tokens_seen": 1123521536 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003331093279839519, + "loss": 2.9148, + "theoretical_loss": 3.6093781066699284, + "tokens_seen": 1123587072 + }, + { + "epoch": 13.02, + "learning_rate": 0.000333099297893681, + "loss": 2.9157, + "theoretical_loss": 3.609358501535061, + "tokens_seen": 1123652608 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033308926780341026, + "loss": 2.8869, + "theoretical_loss": 3.609338897863757, + "tokens_seen": 1123718144 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033307923771313944, + "loss": 2.8449, + "theoretical_loss": 3.609319295655821, + "tokens_seen": 1123783680 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.892554998397827, + "objective/train/theoretical_loss": 3.609299694911059, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.609299694911059, + "tokens_seen": 1123849216 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003330692076228686, + "loss": 2.8503, + "theoretical_loss": 3.609299694911059, + "tokens_seen": 1123849216 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003330591775325978, + "loss": 2.972, + "theoretical_loss": 3.6092800956292765, + "tokens_seen": 1123914752 + }, + { + "epoch": 13.02, + "learning_rate": 0.000333049147442327, + "loss": 2.8133, + "theoretical_loss": 3.609260497810279, + "tokens_seen": 1123980288 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033303911735205617, + "loss": 2.7459, + "theoretical_loss": 3.609240901453872, + "tokens_seen": 1124045824 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003330290872617854, + "loss": 2.8959, + "theoretical_loss": 3.609221306559861, + "tokens_seen": 1124111360 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033301905717151453, + "loss": 2.9094, + "theoretical_loss": 3.609201713128052, + "tokens_seen": 1124176896 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033300902708124376, + "loss": 2.8963, + "theoretical_loss": 3.6091821211582498, + "tokens_seen": 1124242432 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003329989969909729, + "loss": 2.9072, + "theoretical_loss": 3.609162530650261, + "tokens_seen": 1124307968 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003329889669007021, + "loss": 2.9039, + "theoretical_loss": 3.6091429416038907, + "tokens_seen": 1124373504 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003329789368104313, + "loss": 2.8599, + "theoretical_loss": 3.609123354018945, + "tokens_seen": 1124439040 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003329689067201605, + "loss": 2.8797, + "theoretical_loss": 3.60910376789523, + "tokens_seen": 1124504576 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033295887662988967, + "loss": 2.7943, + "theoretical_loss": 3.6090841832325506, + "tokens_seen": 1124570112 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033294884653961885, + "loss": 2.8837, + "theoretical_loss": 3.609064600030713, + "tokens_seen": 1124635648 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033293881644934803, + "loss": 2.9151, + "theoretical_loss": 3.6090450182895246, + "tokens_seen": 1124701184 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033292878635907727, + "loss": 2.8444, + "theoretical_loss": 3.609025438008789, + "tokens_seen": 1124766720 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003329187562688064, + "loss": 2.9049, + "theoretical_loss": 3.609005859188314, + "tokens_seen": 1124832256 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033290872617853563, + "loss": 2.8395, + "theoretical_loss": 3.6089862818279044, + "tokens_seen": 1124897792 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003328986960882648, + "loss": 2.8611, + "theoretical_loss": 3.608966705927367, + "tokens_seen": 1124963328 + }, + { + "epoch": 13.02, + "learning_rate": 0.000332888665997994, + "loss": 2.8512, + "theoretical_loss": 3.6089471314865076, + "tokens_seen": 1125028864 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003328786359077232, + "loss": 2.8695, + "theoretical_loss": 3.608927558505133, + "tokens_seen": 1125094400 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033286860581745235, + "loss": 2.8791, + "theoretical_loss": 3.6089079869830485, + "tokens_seen": 1125159936 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033285857572718154, + "loss": 2.8901, + "theoretical_loss": 3.60888841692006, + "tokens_seen": 1125225472 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033284854563691077, + "loss": 2.7589, + "theoretical_loss": 3.608868848315975, + "tokens_seen": 1125291008 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003328385155466399, + "loss": 2.8854, + "theoretical_loss": 3.608849281170599, + "tokens_seen": 1125356544 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033282848545636913, + "loss": 2.9292, + "theoretical_loss": 3.608829715483739, + "tokens_seen": 1125422080 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8235976696014404, + "objective/train/theoretical_loss": 3.6088101512552004, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.6088101512552004, + "tokens_seen": 1125487616 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033281845536609826, + "loss": 2.8519, + "theoretical_loss": 3.6088101512552004, + "tokens_seen": 1125487616 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003328084252758275, + "loss": 2.8811, + "theoretical_loss": 3.6087905884847897, + "tokens_seen": 1125553152 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003327983951855567, + "loss": 2.8693, + "theoretical_loss": 3.608771027172314, + "tokens_seen": 1125618688 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033278836509528586, + "loss": 2.9215, + "theoretical_loss": 3.60875146731758, + "tokens_seen": 1125684224 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033277833500501504, + "loss": 2.9054, + "theoretical_loss": 3.608731908920393, + "tokens_seen": 1125749760 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003327683049147442, + "loss": 2.8474, + "theoretical_loss": 3.60871235198056, + "tokens_seen": 1125815296 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003327582748244734, + "loss": 2.8189, + "theoretical_loss": 3.608692796497888, + "tokens_seen": 1125880832 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033274824473420264, + "loss": 2.8291, + "theoretical_loss": 3.608673242472184, + "tokens_seen": 1125946368 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033273821464393176, + "loss": 2.8483, + "theoretical_loss": 3.6086536899032535, + "tokens_seen": 1126011904 + }, + { + "epoch": 13.02, + "learning_rate": 0.000332728184553661, + "loss": 2.8661, + "theoretical_loss": 3.6086341387909036, + "tokens_seen": 1126077440 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003327181544633902, + "loss": 2.9572, + "theoretical_loss": 3.608614589134942, + "tokens_seen": 1126142976 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033270812437311936, + "loss": 2.8402, + "theoretical_loss": 3.6085950409351737, + "tokens_seen": 1126208512 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033269809428284854, + "loss": 2.909, + "theoretical_loss": 3.608575494191407, + "tokens_seen": 1126274048 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003326880641925777, + "loss": 2.744, + "theoretical_loss": 3.6085559489034478, + "tokens_seen": 1126339584 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003326780341023069, + "loss": 2.8622, + "theoretical_loss": 3.6085364050711037, + "tokens_seen": 1126405120 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033266800401203614, + "loss": 2.8358, + "theoretical_loss": 3.608516862694181, + "tokens_seen": 1126470656 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033265797392176527, + "loss": 2.7738, + "theoretical_loss": 3.6084973217724876, + "tokens_seen": 1126536192 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003326479438314945, + "loss": 2.8924, + "theoretical_loss": 3.608477782305829, + "tokens_seen": 1126601728 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033263791374122363, + "loss": 2.7384, + "theoretical_loss": 3.6084582442940136, + "tokens_seen": 1126667264 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033262788365095286, + "loss": 2.9432, + "theoretical_loss": 3.6084387077368474, + "tokens_seen": 1126732800 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033261785356068205, + "loss": 2.8291, + "theoretical_loss": 3.6084191726341386, + "tokens_seen": 1126798336 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003326078234704112, + "loss": 2.9175, + "theoretical_loss": 3.6083996389856936, + "tokens_seen": 1126863872 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003325977933801404, + "loss": 2.875, + "theoretical_loss": 3.6083801067913193, + "tokens_seen": 1126929408 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033258776328986964, + "loss": 2.8681, + "theoretical_loss": 3.6083605760508237, + "tokens_seen": 1126994944 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003325777331995988, + "loss": 2.94, + "theoretical_loss": 3.6083410467640133, + "tokens_seen": 1127060480 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.061140775680542, + "objective/train/theoretical_loss": 3.6083215189306963, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.6083215189306963, + "tokens_seen": 1127126016 + }, + { + "epoch": 13.02, + "learning_rate": 0.000332567703109328, + "loss": 2.9893, + "theoretical_loss": 3.6083215189306963, + "tokens_seen": 1127126016 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003325576730190572, + "loss": 2.9419, + "theoretical_loss": 3.608301992550679, + "tokens_seen": 1127191552 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033254764292878637, + "loss": 2.937, + "theoretical_loss": 3.60828246762377, + "tokens_seen": 1127257088 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003325376128385156, + "loss": 2.9308, + "theoretical_loss": 3.608262944149775, + "tokens_seen": 1127322624 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033252758274824473, + "loss": 2.8679, + "theoretical_loss": 3.6082434221285027, + "tokens_seen": 1127388160 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033251755265797396, + "loss": 2.7931, + "theoretical_loss": 3.6082239015597604, + "tokens_seen": 1127453696 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003325075225677031, + "loss": 2.8987, + "theoretical_loss": 3.608204382443355, + "tokens_seen": 1127519232 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033249749247743233, + "loss": 2.866, + "theoretical_loss": 3.608184864779095, + "tokens_seen": 1127584768 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003324874623871615, + "loss": 2.9105, + "theoretical_loss": 3.608165348566787, + "tokens_seen": 1127650304 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003324774322968907, + "loss": 2.9637, + "theoretical_loss": 3.608145833806239, + "tokens_seen": 1127715840 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033246740220661987, + "loss": 2.9134, + "theoretical_loss": 3.6081263204972593, + "tokens_seen": 1127781376 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033245737211634905, + "loss": 2.9321, + "theoretical_loss": 3.608106808639654, + "tokens_seen": 1127846912 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033244734202607823, + "loss": 2.7593, + "theoretical_loss": 3.608087298233233, + "tokens_seen": 1127912448 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033243731193580747, + "loss": 2.8865, + "theoretical_loss": 3.6080677892778024, + "tokens_seen": 1127977984 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003324272818455366, + "loss": 2.8145, + "theoretical_loss": 3.60804828177317, + "tokens_seen": 1128043520 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033241725175526583, + "loss": 2.8335, + "theoretical_loss": 3.608028775719144, + "tokens_seen": 1128109056 + }, + { + "epoch": 13.02, + "learning_rate": 0.000332407221664995, + "loss": 2.8762, + "theoretical_loss": 3.6080092711155327, + "tokens_seen": 1128174592 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003323971915747242, + "loss": 2.8159, + "theoretical_loss": 3.6079897679621435, + "tokens_seen": 1128240128 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003323871614844534, + "loss": 2.7666, + "theoretical_loss": 3.607970266258785, + "tokens_seen": 1128305664 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033237713139418255, + "loss": 2.8832, + "theoretical_loss": 3.6079507660052643, + "tokens_seen": 1128371200 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033236710130391174, + "loss": 2.9401, + "theoretical_loss": 3.60793126720139, + "tokens_seen": 1128436736 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033235707121364097, + "loss": 2.8236, + "theoretical_loss": 3.60791176984697, + "tokens_seen": 1128502272 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003323470411233701, + "loss": 2.7638, + "theoretical_loss": 3.607892273941812, + "tokens_seen": 1128567808 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033233701103309933, + "loss": 2.9635, + "theoretical_loss": 3.607872779485725, + "tokens_seen": 1128633344 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033232698094282846, + "loss": 2.9357, + "theoretical_loss": 3.607853286478516, + "tokens_seen": 1128698880 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.705819845199585, + "objective/train/theoretical_loss": 3.607833794919994, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.607833794919994, + "tokens_seen": 1128764416 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003323169508525577, + "loss": 2.8211, + "theoretical_loss": 3.607833794919994, + "tokens_seen": 1128764416 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003323069207622869, + "loss": 2.7432, + "theoretical_loss": 3.6078143048099673, + "tokens_seen": 1128829952 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033229689067201606, + "loss": 2.9838, + "theoretical_loss": 3.607794816148244, + "tokens_seen": 1128895488 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033228686058174524, + "loss": 2.8927, + "theoretical_loss": 3.607775328934632, + "tokens_seen": 1128961024 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003322768304914744, + "loss": 2.8489, + "theoretical_loss": 3.60775584316894, + "tokens_seen": 1129026560 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003322668004012036, + "loss": 2.844, + "theoretical_loss": 3.6077363588509765, + "tokens_seen": 1129092096 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033225677031093284, + "loss": 2.8242, + "theoretical_loss": 3.6077168759805502, + "tokens_seen": 1129157632 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033224674022066196, + "loss": 2.8402, + "theoretical_loss": 3.6076973945574684, + "tokens_seen": 1129223168 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003322367101303912, + "loss": 2.8559, + "theoretical_loss": 3.607677914581541, + "tokens_seen": 1129288704 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003322266800401204, + "loss": 2.9241, + "theoretical_loss": 3.607658436052575, + "tokens_seen": 1129354240 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033221664994984956, + "loss": 2.9357, + "theoretical_loss": 3.607638958970381, + "tokens_seen": 1129419776 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033220661985957874, + "loss": 2.9659, + "theoretical_loss": 3.6076194833347657, + "tokens_seen": 1129485312 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003321965897693079, + "loss": 2.8252, + "theoretical_loss": 3.6076000091455382, + "tokens_seen": 1129550848 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003321865596790371, + "loss": 2.8559, + "theoretical_loss": 3.6075805364025078, + "tokens_seen": 1129616384 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033217652958876634, + "loss": 2.7528, + "theoretical_loss": 3.607561065105483, + "tokens_seen": 1129681920 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033216649949849547, + "loss": 2.8378, + "theoretical_loss": 3.607541595254272, + "tokens_seen": 1129747456 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003321564694082247, + "loss": 2.8766, + "theoretical_loss": 3.607522126848684, + "tokens_seen": 1129812992 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033214643931795383, + "loss": 2.9261, + "theoretical_loss": 3.607502659888528, + "tokens_seen": 1129878528 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033213640922768306, + "loss": 2.908, + "theoretical_loss": 3.6074831943736125, + "tokens_seen": 1129944064 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033212637913741225, + "loss": 2.8387, + "theoretical_loss": 3.6074637303037465, + "tokens_seen": 1130009600 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003321163490471414, + "loss": 2.8932, + "theoretical_loss": 3.607444267678739, + "tokens_seen": 1130075136 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003321063189568706, + "loss": 2.8334, + "theoretical_loss": 3.6074248064983987, + "tokens_seen": 1130140672 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033209628886659984, + "loss": 2.878, + "theoretical_loss": 3.607405346762535, + "tokens_seen": 1130206208 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033208625877632897, + "loss": 2.8537, + "theoretical_loss": 3.607385888470957, + "tokens_seen": 1130271744 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003320762286860582, + "loss": 2.8432, + "theoretical_loss": 3.6073664316234733, + "tokens_seen": 1130337280 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9068455696105957, + "objective/train/theoretical_loss": 3.607346976219893, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.607346976219893, + "tokens_seen": 1130402816 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033206619859578733, + "loss": 2.9255, + "theoretical_loss": 3.607346976219893, + "tokens_seen": 1130402816 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033205616850551657, + "loss": 2.894, + "theoretical_loss": 3.607327522260026, + "tokens_seen": 1130468352 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033204613841524575, + "loss": 2.8644, + "theoretical_loss": 3.607308069743681, + "tokens_seen": 1130533888 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033203610832497493, + "loss": 2.7195, + "theoretical_loss": 3.6072886186706663, + "tokens_seen": 1130599424 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003320260782347041, + "loss": 2.8425, + "theoretical_loss": 3.6072691690407925, + "tokens_seen": 1130664960 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003320160481444333, + "loss": 2.885, + "theoretical_loss": 3.6072497208538685, + "tokens_seen": 1130730496 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003320060180541625, + "loss": 2.8976, + "theoretical_loss": 3.6072302741097038, + "tokens_seen": 1130796032 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003319959879638917, + "loss": 2.8671, + "theoretical_loss": 3.607210828808107, + "tokens_seen": 1130861568 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033198595787362084, + "loss": 2.827, + "theoretical_loss": 3.607191384948888, + "tokens_seen": 1130927104 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033197592778335007, + "loss": 2.8513, + "theoretical_loss": 3.607171942531857, + "tokens_seen": 1130992640 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003319658976930792, + "loss": 2.8407, + "theoretical_loss": 3.607152501556822, + "tokens_seen": 1131058176 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033195586760280843, + "loss": 2.9013, + "theoretical_loss": 3.6071330620235935, + "tokens_seen": 1131123712 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003319458375125376, + "loss": 2.8724, + "theoretical_loss": 3.6071136239319808, + "tokens_seen": 1131189248 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003319358074222668, + "loss": 2.8705, + "theoretical_loss": 3.6070941872817937, + "tokens_seen": 1131254784 + }, + { + "epoch": 13.02, + "learning_rate": 0.000331925777331996, + "loss": 2.8742, + "theoretical_loss": 3.607074752072841, + "tokens_seen": 1131320320 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003319157472417252, + "loss": 2.855, + "theoretical_loss": 3.6070553183049325, + "tokens_seen": 1131385856 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033190571715145434, + "loss": 2.8385, + "theoretical_loss": 3.6070358859778793, + "tokens_seen": 1131451392 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003318956870611836, + "loss": 2.7857, + "theoretical_loss": 3.6070164550914896, + "tokens_seen": 1131516928 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003318856569709127, + "loss": 2.8682, + "theoretical_loss": 3.606997025645574, + "tokens_seen": 1131582464 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033187562688064194, + "loss": 2.8882, + "theoretical_loss": 3.6069775976399416, + "tokens_seen": 1131648000 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003318655967903711, + "loss": 2.8379, + "theoretical_loss": 3.606958171074403, + "tokens_seen": 1131713536 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003318555667001003, + "loss": 2.8395, + "theoretical_loss": 3.6069387459487676, + "tokens_seen": 1131779072 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003318455366098295, + "loss": 2.7739, + "theoretical_loss": 3.6069193222628453, + "tokens_seen": 1131844608 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033183550651955866, + "loss": 2.862, + "theoretical_loss": 3.606899900016446, + "tokens_seen": 1131910144 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003318254764292879, + "loss": 2.8694, + "theoretical_loss": 3.60688047920938, + "tokens_seen": 1131975680 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8403475284576416, + "objective/train/theoretical_loss": 3.606861059841457, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.606861059841457, + "tokens_seen": 1132041216 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003318154463390171, + "loss": 2.819, + "theoretical_loss": 3.606861059841457, + "tokens_seen": 1132041216 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033180541624874626, + "loss": 2.9727, + "theoretical_loss": 3.6068416419124874, + "tokens_seen": 1132106752 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033179538615847544, + "loss": 2.9129, + "theoretical_loss": 3.606822225422281, + "tokens_seen": 1132172288 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003317853560682046, + "loss": 2.9048, + "theoretical_loss": 3.606802810370648, + "tokens_seen": 1132237824 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003317753259779338, + "loss": 2.8384, + "theoretical_loss": 3.606783396757398, + "tokens_seen": 1132303360 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033176529588766304, + "loss": 2.8709, + "theoretical_loss": 3.606763984582342, + "tokens_seen": 1132368896 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033175526579739216, + "loss": 2.8119, + "theoretical_loss": 3.6067445738452903, + "tokens_seen": 1132434432 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003317452357071214, + "loss": 2.8523, + "theoretical_loss": 3.6067251645460523, + "tokens_seen": 1132499968 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003317352056168506, + "loss": 2.9157, + "theoretical_loss": 3.6067057566844394, + "tokens_seen": 1132565504 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033172517552657976, + "loss": 2.8883, + "theoretical_loss": 3.606686350260261, + "tokens_seen": 1132631040 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033171514543630894, + "loss": 2.9081, + "theoretical_loss": 3.606666945273328, + "tokens_seen": 1132696576 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003317051153460381, + "loss": 2.9566, + "theoretical_loss": 3.6066475417234507, + "tokens_seen": 1132762112 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003316950852557673, + "loss": 2.9575, + "theoretical_loss": 3.6066281396104394, + "tokens_seen": 1132827648 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033168505516549654, + "loss": 2.8956, + "theoretical_loss": 3.6066087389341046, + "tokens_seen": 1132893184 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033167502507522567, + "loss": 2.8179, + "theoretical_loss": 3.606589339694257, + "tokens_seen": 1132958720 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003316649949849549, + "loss": 2.8659, + "theoretical_loss": 3.606569941890707, + "tokens_seen": 1133024256 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033165496489468403, + "loss": 2.9047, + "theoretical_loss": 3.606550545523265, + "tokens_seen": 1133089792 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033164493480441326, + "loss": 2.8074, + "theoretical_loss": 3.606531150591742, + "tokens_seen": 1133155328 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033163490471414245, + "loss": 2.984, + "theoretical_loss": 3.606511757095949, + "tokens_seen": 1133220864 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003316248746238716, + "loss": 2.9229, + "theoretical_loss": 3.6064923650356957, + "tokens_seen": 1133286400 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003316148445336008, + "loss": 2.8084, + "theoretical_loss": 3.6064729744107935, + "tokens_seen": 1133351936 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033160481444333004, + "loss": 2.9856, + "theoretical_loss": 3.606453585221053, + "tokens_seen": 1133417472 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033159478435305917, + "loss": 2.9475, + "theoretical_loss": 3.6064341974662852, + "tokens_seen": 1133483008 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003315847542627884, + "loss": 2.8621, + "theoretical_loss": 3.6064148111463004, + "tokens_seen": 1133548544 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033157472417251753, + "loss": 2.8295, + "theoretical_loss": 3.60639542626091, + "tokens_seen": 1133614080 + }, + { + "epoch": 13.02, + "objective/train/docs_used": 2647232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.950747013092041, + "objective/train/theoretical_loss": 3.606376042809925, + "objective/train/tokens_used": 1133914592, + "theoretical_loss": 3.606376042809925, + "tokens_seen": 1133679616 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033156469408224677, + "loss": 2.836, + "theoretical_loss": 3.606376042809925, + "tokens_seen": 1133679616 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033155466399197595, + "loss": 2.8151, + "theoretical_loss": 3.606356660793156, + "tokens_seen": 1133745152 + }, + { + "epoch": 13.02, + "learning_rate": 0.00033154463390170513, + "loss": 2.7995, + "theoretical_loss": 3.606337280210414, + "tokens_seen": 1133810688 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003315346038114343, + "loss": 2.7259, + "theoretical_loss": 3.60631790106151, + "tokens_seen": 1133876224 + }, + { + "epoch": 13.02, + "learning_rate": 0.0003315245737211635, + "loss": 2.8929, + "theoretical_loss": 3.60630064271404, + "tokens_seen": 1133934592 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003315145436308927, + "loss": 2.7853, + "theoretical_loss": 3.60628126627547, + "tokens_seen": 1134000128 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003315045135406219, + "loss": 2.7816, + "theoretical_loss": 3.606261891270192, + "tokens_seen": 1134065664 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033149448345035104, + "loss": 2.8076, + "theoretical_loss": 3.6062425176980173, + "tokens_seen": 1134131200 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033148445336008027, + "loss": 2.8369, + "theoretical_loss": 3.606223145558757, + "tokens_seen": 1134196736 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003314744232698094, + "loss": 2.8744, + "theoretical_loss": 3.606203774852222, + "tokens_seen": 1134262272 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033146439317953863, + "loss": 2.7333, + "theoretical_loss": 3.606184405578224, + "tokens_seen": 1134327808 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003314543630892678, + "loss": 2.8247, + "theoretical_loss": 3.6061650377365746, + "tokens_seen": 1134393344 + }, + { + "epoch": 14.0, + "learning_rate": 0.000331444332998997, + "loss": 2.7387, + "theoretical_loss": 3.606145671327085, + "tokens_seen": 1134458880 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003314343029087262, + "loss": 2.8188, + "theoretical_loss": 3.606126306349566, + "tokens_seen": 1134524416 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003314242728184554, + "loss": 2.8059, + "theoretical_loss": 3.606106942803829, + "tokens_seen": 1134589952 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033141424272818454, + "loss": 2.7765, + "theoretical_loss": 3.6060875806896857, + "tokens_seen": 1134655488 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003314042126379138, + "loss": 2.7127, + "theoretical_loss": 3.6060682200069483, + "tokens_seen": 1134721024 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003313941825476429, + "loss": 2.8028, + "theoretical_loss": 3.6060488607554273, + "tokens_seen": 1134786560 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033138415245737214, + "loss": 2.8621, + "theoretical_loss": 3.606029502934934, + "tokens_seen": 1134852096 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003313741223671013, + "loss": 2.6766, + "theoretical_loss": 3.6060101465452816, + "tokens_seen": 1134917632 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003313640922768305, + "loss": 2.7847, + "theoretical_loss": 3.6059907915862803, + "tokens_seen": 1134983168 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003313540621865597, + "loss": 2.7595, + "theoretical_loss": 3.605971438057742, + "tokens_seen": 1135048704 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033134403209628886, + "loss": 2.6902, + "theoretical_loss": 3.605952085959479, + "tokens_seen": 1135114240 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033133400200601804, + "loss": 2.8908, + "theoretical_loss": 3.605932735291302, + "tokens_seen": 1135179776 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003313239719157473, + "loss": 2.8394, + "theoretical_loss": 3.6059133860530244, + "tokens_seen": 1135245312 + }, + { + "epoch": 14.0, + "objective/train/docs_used": 2699562, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.791576862335205, + "objective/train/theoretical_loss": 3.6058940382444558, + "objective/train/tokens_used": 1155770848, + "theoretical_loss": 3.6058940382444558, + "tokens_seen": 1135310848 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003313139418254764, + "loss": 2.7638, + "theoretical_loss": 3.6058940382444558, + "tokens_seen": 1135310848 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033130391173520564, + "loss": 2.7942, + "theoretical_loss": 3.60587469186541, + "tokens_seen": 1135376384 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033129388164493477, + "loss": 2.728, + "theoretical_loss": 3.605855346915698, + "tokens_seen": 1135441920 + }, + { + "epoch": 14.0, + "learning_rate": 0.000331283851554664, + "loss": 2.7161, + "theoretical_loss": 3.6058360033951313, + "tokens_seen": 1135507456 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003312738214643932, + "loss": 2.7978, + "theoretical_loss": 3.605816661303523, + "tokens_seen": 1135572992 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033126379137412236, + "loss": 2.7179, + "theoretical_loss": 3.605797320640684, + "tokens_seen": 1135638528 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033125376128385155, + "loss": 2.8494, + "theoretical_loss": 3.605777981406427, + "tokens_seen": 1135704064 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003312437311935808, + "loss": 2.6919, + "theoretical_loss": 3.6057586436005638, + "tokens_seen": 1135769600 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003312337011033099, + "loss": 2.7641, + "theoretical_loss": 3.6057393072229065, + "tokens_seen": 1135835136 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033122367101303914, + "loss": 2.7148, + "theoretical_loss": 3.605719972273267, + "tokens_seen": 1135900672 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033121364092276827, + "loss": 2.6568, + "theoretical_loss": 3.6057006387514576, + "tokens_seen": 1135966208 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003312036108324975, + "loss": 2.7895, + "theoretical_loss": 3.605681306657291, + "tokens_seen": 1136031744 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003311935807422267, + "loss": 2.8166, + "theoretical_loss": 3.6056619759905786, + "tokens_seen": 1136097280 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033118355065195587, + "loss": 2.7856, + "theoretical_loss": 3.605642646751133, + "tokens_seen": 1136162816 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033117352056168505, + "loss": 2.8426, + "theoretical_loss": 3.6056233189387665, + "tokens_seen": 1136228352 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033116349047141423, + "loss": 2.7658, + "theoretical_loss": 3.6056039925532923, + "tokens_seen": 1136293888 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003311534603811434, + "loss": 2.8076, + "theoretical_loss": 3.6055846675945213, + "tokens_seen": 1136359424 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033114343029087265, + "loss": 2.7295, + "theoretical_loss": 3.605565344062266, + "tokens_seen": 1136424960 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033113340020060177, + "loss": 2.7109, + "theoretical_loss": 3.6055460219563407, + "tokens_seen": 1136490496 + }, + { + "epoch": 14.0, + "learning_rate": 0.000331123370110331, + "loss": 2.7216, + "theoretical_loss": 3.6055267012765557, + "tokens_seen": 1136556032 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033111334002006013, + "loss": 2.8235, + "theoretical_loss": 3.605507382022725, + "tokens_seen": 1136621568 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033110330992978937, + "loss": 2.9245, + "theoretical_loss": 3.60548806419466, + "tokens_seen": 1136687104 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033109327983951855, + "loss": 2.9315, + "theoretical_loss": 3.605468747792174, + "tokens_seen": 1136752640 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033108324974924773, + "loss": 2.7771, + "theoretical_loss": 3.60544943281508, + "tokens_seen": 1136818176 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033107321965897697, + "loss": 2.7882, + "theoretical_loss": 3.6054301192631892, + "tokens_seen": 1136883712 + }, + { + "epoch": 14.0, + "objective/train/docs_used": 2702641, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.889044761657715, + "objective/train/theoretical_loss": 3.6054108071363156, + "objective/train/tokens_used": 1157409248, + "theoretical_loss": 3.6054108071363156, + "tokens_seen": 1136949248 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033106318956870615, + "loss": 2.8081, + "theoretical_loss": 3.6054108071363156, + "tokens_seen": 1136949248 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033105315947843533, + "loss": 2.8499, + "theoretical_loss": 3.6053914964342715, + "tokens_seen": 1137014784 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003310431293881645, + "loss": 2.8801, + "theoretical_loss": 3.6053721871568696, + "tokens_seen": 1137080320 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003310330992978937, + "loss": 2.8087, + "theoretical_loss": 3.605352879303923, + "tokens_seen": 1137145856 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003310230692076229, + "loss": 2.768, + "theoretical_loss": 3.605333572875244, + "tokens_seen": 1137211392 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003310130391173521, + "loss": 2.7323, + "theoretical_loss": 3.6053142678706465, + "tokens_seen": 1137276928 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033100300902708124, + "loss": 2.8069, + "theoretical_loss": 3.605294964289942, + "tokens_seen": 1137342464 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033099297893681047, + "loss": 2.7615, + "theoretical_loss": 3.6052756621329443, + "tokens_seen": 1137408000 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003309829488465396, + "loss": 2.7549, + "theoretical_loss": 3.605256361399466, + "tokens_seen": 1137473536 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033097291875626883, + "loss": 2.75, + "theoretical_loss": 3.605237062089321, + "tokens_seen": 1137539072 + }, + { + "epoch": 14.0, + "learning_rate": 0.000330962888665998, + "loss": 2.8218, + "theoretical_loss": 3.6052177642023207, + "tokens_seen": 1137604608 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003309528585757272, + "loss": 2.8192, + "theoretical_loss": 3.60519846773828, + "tokens_seen": 1137670144 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003309428284854564, + "loss": 2.8186, + "theoretical_loss": 3.6051791726970106, + "tokens_seen": 1137735680 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003309327983951856, + "loss": 2.7616, + "theoretical_loss": 3.605159879078326, + "tokens_seen": 1137801216 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033092276830491474, + "loss": 2.7942, + "theoretical_loss": 3.6051405868820403, + "tokens_seen": 1137866752 + }, + { + "epoch": 14.0, + "learning_rate": 0.000330912738214644, + "loss": 2.7884, + "theoretical_loss": 3.6051212961079653, + "tokens_seen": 1137932288 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003309027081243731, + "loss": 2.6584, + "theoretical_loss": 3.6051020067559154, + "tokens_seen": 1137997824 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033089267803410234, + "loss": 2.7444, + "theoretical_loss": 3.6050827188257033, + "tokens_seen": 1138063360 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003308826479438315, + "loss": 2.7225, + "theoretical_loss": 3.605063432317143, + "tokens_seen": 1138128896 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003308726178535607, + "loss": 2.7788, + "theoretical_loss": 3.6050441472300463, + "tokens_seen": 1138194432 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003308625877632899, + "loss": 2.8115, + "theoretical_loss": 3.6050248635642284, + "tokens_seen": 1138259968 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033085255767301906, + "loss": 2.7997, + "theoretical_loss": 3.605005581319502, + "tokens_seen": 1138325504 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033084252758274824, + "loss": 2.8172, + "theoretical_loss": 3.60498630049568, + "tokens_seen": 1138391040 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003308324974924775, + "loss": 2.719, + "theoretical_loss": 3.604967021092577, + "tokens_seen": 1138456576 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003308224674022066, + "loss": 2.8416, + "theoretical_loss": 3.6049477431100057, + "tokens_seen": 1138522112 + }, + { + "epoch": 14.0, + "objective/train/docs_used": 2705759, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.688344955444336, + "objective/train/theoretical_loss": 3.60492846654778, + "objective/train/tokens_used": 1159047648, + "theoretical_loss": 3.60492846654778, + "tokens_seen": 1138587648 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033081243731193584, + "loss": 2.7859, + "theoretical_loss": 3.60492846654778, + "tokens_seen": 1138587648 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033080240722166497, + "loss": 2.7943, + "theoretical_loss": 3.6049091914057136, + "tokens_seen": 1138653184 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003307923771313942, + "loss": 2.715, + "theoretical_loss": 3.6048899176836198, + "tokens_seen": 1138718720 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003307823470411234, + "loss": 2.7558, + "theoretical_loss": 3.6048706453813124, + "tokens_seen": 1138784256 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033077231695085256, + "loss": 2.8148, + "theoretical_loss": 3.6048513744986055, + "tokens_seen": 1138849792 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033076228686058175, + "loss": 2.7844, + "theoretical_loss": 3.604832105035312, + "tokens_seen": 1138915328 + }, + { + "epoch": 14.0, + "learning_rate": 0.000330752256770311, + "loss": 2.7947, + "theoretical_loss": 3.604812836991247, + "tokens_seen": 1138980864 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003307422266800401, + "loss": 2.8368, + "theoretical_loss": 3.6047935703662226, + "tokens_seen": 1139046400 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033073219658976934, + "loss": 2.9107, + "theoretical_loss": 3.6047743051600545, + "tokens_seen": 1139111936 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033072216649949847, + "loss": 2.6656, + "theoretical_loss": 3.604755041372555, + "tokens_seen": 1139177472 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003307121364092277, + "loss": 2.6863, + "theoretical_loss": 3.6047357790035393, + "tokens_seen": 1139243008 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003307021063189569, + "loss": 2.7432, + "theoretical_loss": 3.6047165180528205, + "tokens_seen": 1139308544 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033069207622868607, + "loss": 2.7281, + "theoretical_loss": 3.604697258520213, + "tokens_seen": 1139374080 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033068204613841525, + "loss": 2.7496, + "theoretical_loss": 3.6046780004055305, + "tokens_seen": 1139439616 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033067201604814443, + "loss": 2.7865, + "theoretical_loss": 3.6046587437085873, + "tokens_seen": 1139505152 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003306619859578736, + "loss": 2.7881, + "theoretical_loss": 3.6046394884291972, + "tokens_seen": 1139570688 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033065195586760285, + "loss": 2.782, + "theoretical_loss": 3.604620234567175, + "tokens_seen": 1139636224 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033064192577733197, + "loss": 2.8093, + "theoretical_loss": 3.6046009821223346, + "tokens_seen": 1139701760 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003306318956870612, + "loss": 2.7906, + "theoretical_loss": 3.60458173109449, + "tokens_seen": 1139767296 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033062186559679034, + "loss": 2.7922, + "theoretical_loss": 3.6045624814834554, + "tokens_seen": 1139832832 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033061183550651957, + "loss": 2.7753, + "theoretical_loss": 3.604543233289045, + "tokens_seen": 1139898368 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033060180541624875, + "loss": 2.917, + "theoretical_loss": 3.604523986511073, + "tokens_seen": 1139963904 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033059177532597793, + "loss": 2.7941, + "theoretical_loss": 3.604504741149355, + "tokens_seen": 1140029440 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003305817452357071, + "loss": 2.7308, + "theoretical_loss": 3.6044854972037044, + "tokens_seen": 1140094976 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033057171514543635, + "loss": 2.738, + "theoretical_loss": 3.6044662546739348, + "tokens_seen": 1140160512 + }, + { + "epoch": 14.0, + "objective/train/docs_used": 2710547, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.021167278289795, + "objective/train/theoretical_loss": 3.604447013559862, + "objective/train/tokens_used": 1160686048, + "theoretical_loss": 3.604447013559862, + "tokens_seen": 1140226048 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003305616850551655, + "loss": 2.8862, + "theoretical_loss": 3.604447013559862, + "tokens_seen": 1140226048 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003305516549648947, + "loss": 2.8128, + "theoretical_loss": 3.6044277738612998, + "tokens_seen": 1140291584 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033054162487462384, + "loss": 2.7703, + "theoretical_loss": 3.604408535578063, + "tokens_seen": 1140357120 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003305315947843531, + "loss": 2.797, + "theoretical_loss": 3.6043892987099664, + "tokens_seen": 1140422656 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033052156469408225, + "loss": 2.7626, + "theoretical_loss": 3.6043700632568236, + "tokens_seen": 1140488192 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033051153460381144, + "loss": 2.8412, + "theoretical_loss": 3.60435082921845, + "tokens_seen": 1140553728 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003305015045135406, + "loss": 2.8155, + "theoretical_loss": 3.6043315965946605, + "tokens_seen": 1140619264 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003304914744232698, + "loss": 2.8445, + "theoretical_loss": 3.6043123653852693, + "tokens_seen": 1140684800 + }, + { + "epoch": 14.0, + "learning_rate": 0.000330481444332999, + "loss": 2.7523, + "theoretical_loss": 3.6042931355900913, + "tokens_seen": 1140750336 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003304714142427282, + "loss": 2.7744, + "theoretical_loss": 3.604273907208941, + "tokens_seen": 1140815872 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033046138415245734, + "loss": 2.8131, + "theoretical_loss": 3.6042546802416333, + "tokens_seen": 1140881408 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003304513540621866, + "loss": 2.7808, + "theoretical_loss": 3.6042354546879833, + "tokens_seen": 1140946944 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003304413239719157, + "loss": 2.858, + "theoretical_loss": 3.604216230547806, + "tokens_seen": 1141012480 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033043129388164494, + "loss": 2.8078, + "theoretical_loss": 3.604197007820916, + "tokens_seen": 1141078016 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003304212637913741, + "loss": 2.7709, + "theoretical_loss": 3.604177786507128, + "tokens_seen": 1141143552 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003304112337011033, + "loss": 2.7752, + "theoretical_loss": 3.604158566606257, + "tokens_seen": 1141209088 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003304012036108325, + "loss": 2.6847, + "theoretical_loss": 3.604139348118119, + "tokens_seen": 1141274624 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003303911735205617, + "loss": 2.7815, + "theoretical_loss": 3.604120131042528, + "tokens_seen": 1141340160 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033038114343029084, + "loss": 2.8722, + "theoretical_loss": 3.6041009153792993, + "tokens_seen": 1141405696 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003303711133400201, + "loss": 2.6993, + "theoretical_loss": 3.6040817011282478, + "tokens_seen": 1141471232 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003303610832497492, + "loss": 2.8746, + "theoretical_loss": 3.6040624882891894, + "tokens_seen": 1141536768 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033035105315947844, + "loss": 2.7934, + "theoretical_loss": 3.6040432768619386, + "tokens_seen": 1141602304 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003303410230692076, + "loss": 2.7144, + "theoretical_loss": 3.604024066846311, + "tokens_seen": 1141667840 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003303309929789368, + "loss": 2.822, + "theoretical_loss": 3.6040048582421216, + "tokens_seen": 1141733376 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033032096288866604, + "loss": 2.9183, + "theoretical_loss": 3.603985651049186, + "tokens_seen": 1141798912 + }, + { + "epoch": 14.0, + "objective/train/docs_used": 2714215, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.849510431289673, + "objective/train/theoretical_loss": 3.603966445267319, + "objective/train/tokens_used": 1162324448, + "theoretical_loss": 3.603966445267319, + "tokens_seen": 1141864448 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033031093279839517, + "loss": 2.821, + "theoretical_loss": 3.603966445267319, + "tokens_seen": 1141864448 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003303009027081244, + "loss": 2.7412, + "theoretical_loss": 3.6039472408963364, + "tokens_seen": 1141929984 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003302908726178536, + "loss": 2.8906, + "theoretical_loss": 3.603928037936053, + "tokens_seen": 1141995520 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033028084252758276, + "loss": 2.9208, + "theoretical_loss": 3.6039088363862852, + "tokens_seen": 1142061056 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033027081243731195, + "loss": 2.7762, + "theoretical_loss": 3.6038896362468478, + "tokens_seen": 1142126592 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003302607823470412, + "loss": 2.6997, + "theoretical_loss": 3.603870437517557, + "tokens_seen": 1142192128 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003302507522567703, + "loss": 2.8345, + "theoretical_loss": 3.603851240198227, + "tokens_seen": 1142257664 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033024072216649954, + "loss": 2.8077, + "theoretical_loss": 3.603832044288674, + "tokens_seen": 1142323200 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033023069207622867, + "loss": 2.8712, + "theoretical_loss": 3.6038128497887145, + "tokens_seen": 1142388736 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003302206619859579, + "loss": 2.8116, + "theoretical_loss": 3.603793656698163, + "tokens_seen": 1142454272 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003302106318956871, + "loss": 2.7927, + "theoretical_loss": 3.6037744650168353, + "tokens_seen": 1142519808 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033020060180541627, + "loss": 2.7642, + "theoretical_loss": 3.6037552747445476, + "tokens_seen": 1142585344 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033019057171514545, + "loss": 2.8124, + "theoretical_loss": 3.603736085881115, + "tokens_seen": 1142650880 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033018054162487463, + "loss": 2.8523, + "theoretical_loss": 3.603716898426354, + "tokens_seen": 1142716416 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003301705115346038, + "loss": 2.8066, + "theoretical_loss": 3.60369771238008, + "tokens_seen": 1142781952 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033016048144433305, + "loss": 2.7291, + "theoretical_loss": 3.603678527742109, + "tokens_seen": 1142847488 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003301504513540622, + "loss": 2.8042, + "theoretical_loss": 3.603659344512257, + "tokens_seen": 1142913024 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003301404212637914, + "loss": 2.8518, + "theoretical_loss": 3.6036401626903394, + "tokens_seen": 1142978560 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033013039117352054, + "loss": 2.7594, + "theoretical_loss": 3.6036209822761727, + "tokens_seen": 1143044096 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033012036108324977, + "loss": 2.739, + "theoretical_loss": 3.603601803269572, + "tokens_seen": 1143109632 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033011033099297895, + "loss": 2.8304, + "theoretical_loss": 3.603582625670354, + "tokens_seen": 1143175168 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033010030090270813, + "loss": 2.9243, + "theoretical_loss": 3.6035634494783357, + "tokens_seen": 1143240704 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003300902708124373, + "loss": 2.9039, + "theoretical_loss": 3.6035442746933315, + "tokens_seen": 1143306240 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033008024072216655, + "loss": 2.7804, + "theoretical_loss": 3.603525101315158, + "tokens_seen": 1143371776 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003300702106318957, + "loss": 2.8122, + "theoretical_loss": 3.6035059293436316, + "tokens_seen": 1143437312 + }, + { + "epoch": 14.0, + "objective/train/docs_used": 2717400, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.664616584777832, + "objective/train/theoretical_loss": 3.603486758778569, + "objective/train/tokens_used": 1163962848, + "theoretical_loss": 3.603486758778569, + "tokens_seen": 1143502848 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003300601805416249, + "loss": 2.6604, + "theoretical_loss": 3.603486758778569, + "tokens_seen": 1143502848 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033005015045135404, + "loss": 2.8706, + "theoretical_loss": 3.603467589619785, + "tokens_seen": 1143568384 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003300401203610833, + "loss": 2.7951, + "theoretical_loss": 3.603448421867097, + "tokens_seen": 1143633920 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033003009027081245, + "loss": 2.6564, + "theoretical_loss": 3.6034292555203216, + "tokens_seen": 1143699456 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033002006018054164, + "loss": 2.8542, + "theoretical_loss": 3.603410090579274, + "tokens_seen": 1143764992 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003300100300902708, + "loss": 2.8307, + "theoretical_loss": 3.6033909270437716, + "tokens_seen": 1143830528 + }, + { + "epoch": 14.0, + "learning_rate": 0.00033, + "loss": 2.703, + "theoretical_loss": 3.6033717649136294, + "tokens_seen": 1143896064 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003299899699097292, + "loss": 2.7944, + "theoretical_loss": 3.6033526041886654, + "tokens_seen": 1143961600 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003299799398194584, + "loss": 2.8923, + "theoretical_loss": 3.603333444868696, + "tokens_seen": 1144027136 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032996990972918754, + "loss": 2.8349, + "theoretical_loss": 3.6033142869535357, + "tokens_seen": 1144092672 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003299598796389168, + "loss": 2.8527, + "theoretical_loss": 3.603295130443003, + "tokens_seen": 1144158208 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003299498495486459, + "loss": 2.732, + "theoretical_loss": 3.6032759753369143, + "tokens_seen": 1144223744 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032993981945837514, + "loss": 2.8565, + "theoretical_loss": 3.6032568216350853, + "tokens_seen": 1144289280 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003299297893681043, + "loss": 2.7263, + "theoretical_loss": 3.603237669337333, + "tokens_seen": 1144354816 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003299197592778335, + "loss": 2.815, + "theoretical_loss": 3.603218518443475, + "tokens_seen": 1144420352 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003299097291875627, + "loss": 2.7948, + "theoretical_loss": 3.6031993689533266, + "tokens_seen": 1144485888 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003298996990972919, + "loss": 2.7417, + "theoretical_loss": 3.6031802208667054, + "tokens_seen": 1144551424 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032988966900702104, + "loss": 2.8392, + "theoretical_loss": 3.6031610741834275, + "tokens_seen": 1144616960 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003298796389167503, + "loss": 2.7175, + "theoretical_loss": 3.6031419289033106, + "tokens_seen": 1144682496 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003298696088264794, + "loss": 2.7721, + "theoretical_loss": 3.603122785026171, + "tokens_seen": 1144748032 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032985957873620864, + "loss": 2.8613, + "theoretical_loss": 3.603103642551826, + "tokens_seen": 1144813568 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003298495486459378, + "loss": 2.8688, + "theoretical_loss": 3.603084501480091, + "tokens_seen": 1144879104 + }, + { + "epoch": 14.0, + "learning_rate": 0.000329839518555667, + "loss": 2.8051, + "theoretical_loss": 3.6030653618107853, + "tokens_seen": 1144944640 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003298294884653962, + "loss": 2.7562, + "theoretical_loss": 3.6030462235437244, + "tokens_seen": 1145010176 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032981945837512537, + "loss": 2.7632, + "theoretical_loss": 3.6030270866787255, + "tokens_seen": 1145075712 + }, + { + "epoch": 14.0, + "objective/train/docs_used": 2722091, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8158671855926514, + "objective/train/theoretical_loss": 3.6030079512156057, + "objective/train/tokens_used": 1165601248, + "theoretical_loss": 3.6030079512156057, + "tokens_seen": 1145141248 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032980942828485455, + "loss": 2.8143, + "theoretical_loss": 3.6030079512156057, + "tokens_seen": 1145141248 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003297993981945838, + "loss": 2.7521, + "theoretical_loss": 3.6029888171541824, + "tokens_seen": 1145206784 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003297893681043129, + "loss": 2.786, + "theoretical_loss": 3.6029696844942727, + "tokens_seen": 1145272320 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032977933801404215, + "loss": 2.8675, + "theoretical_loss": 3.6029505532356927, + "tokens_seen": 1145337856 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003297693079237713, + "loss": 2.5816, + "theoretical_loss": 3.602931423378261, + "tokens_seen": 1145403392 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003297592778335005, + "loss": 2.7268, + "theoretical_loss": 3.6029122949217944, + "tokens_seen": 1145468928 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003297492477432297, + "loss": 2.9166, + "theoretical_loss": 3.6028931678661094, + "tokens_seen": 1145534464 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032973921765295887, + "loss": 2.8283, + "theoretical_loss": 3.6028740422110244, + "tokens_seen": 1145600000 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032972918756268805, + "loss": 2.6801, + "theoretical_loss": 3.602854917956356, + "tokens_seen": 1145665536 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003297191574724173, + "loss": 2.8232, + "theoretical_loss": 3.602835795101922, + "tokens_seen": 1145731072 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003297091273821464, + "loss": 2.6834, + "theoretical_loss": 3.602816673647539, + "tokens_seen": 1145796608 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032969909729187565, + "loss": 2.8112, + "theoretical_loss": 3.6027975535930254, + "tokens_seen": 1145862144 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003296890672016048, + "loss": 2.7004, + "theoretical_loss": 3.602778434938198, + "tokens_seen": 1145927680 + }, + { + "epoch": 14.0, + "learning_rate": 0.000329679037111334, + "loss": 2.8773, + "theoretical_loss": 3.602759317682875, + "tokens_seen": 1145993216 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003296690070210632, + "loss": 2.7568, + "theoretical_loss": 3.6027402018268733, + "tokens_seen": 1146058752 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003296589769307924, + "loss": 2.8257, + "theoretical_loss": 3.6027210873700106, + "tokens_seen": 1146124288 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032964894684052155, + "loss": 2.7216, + "theoretical_loss": 3.6027019743121045, + "tokens_seen": 1146189824 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032963891675025074, + "loss": 2.8341, + "theoretical_loss": 3.6026828626529723, + "tokens_seen": 1146255360 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003296288866599799, + "loss": 2.6801, + "theoretical_loss": 3.6026637523924325, + "tokens_seen": 1146320896 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032961885656970915, + "loss": 2.7355, + "theoretical_loss": 3.6026446435303026, + "tokens_seen": 1146386432 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003296088264794383, + "loss": 2.7931, + "theoretical_loss": 3.602625536066399, + "tokens_seen": 1146451968 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003295987963891675, + "loss": 2.8059, + "theoretical_loss": 3.6026064300005416, + "tokens_seen": 1146517504 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003295887662988967, + "loss": 2.8967, + "theoretical_loss": 3.6025873253325464, + "tokens_seen": 1146583040 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003295787362086259, + "loss": 2.7731, + "theoretical_loss": 3.6025682220622324, + "tokens_seen": 1146648576 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003295687061183551, + "loss": 2.8877, + "theoretical_loss": 3.6025491201894173, + "tokens_seen": 1146714112 + }, + { + "debugging/Self-BLEU-5": 0.6622903474980145, + "debugging/distinct-1-grams": 0.7511255856654362, + "debugging/distinct-2-grams": 0.9468447045084527, + "debugging/entropy-1-grams": 6.506173973052927, + "debugging/entropy-2-grams": 7.779109056482787, + "debugging/length": 524.2162162162163, + "debugging/num_segments": 37, + "epoch": 14.0, + "objective/train/docs_used": 2725047, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.811790704727173, + "objective/train/theoretical_loss": 3.6025300197139183, + "objective/train/tokens_used": 1167239648, + "theoretical_loss": 3.6025300197139183, + "tokens_seen": 1146779648 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032955867602808424, + "loss": 2.8273, + "theoretical_loss": 3.6025300197139183, + "tokens_seen": 1146779648 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003295486459378135, + "loss": 2.8489, + "theoretical_loss": 3.602510920635554, + "tokens_seen": 1146845184 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032953861584754265, + "loss": 2.9297, + "theoretical_loss": 3.6024918229541427, + "tokens_seen": 1146910720 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032952858575727184, + "loss": 2.8205, + "theoretical_loss": 3.602472726669501, + "tokens_seen": 1146976256 + }, + { + "epoch": 14.0, + "learning_rate": 0.000329518555667001, + "loss": 2.8176, + "theoretical_loss": 3.602453631781448, + "tokens_seen": 1147041792 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003295085255767302, + "loss": 2.9106, + "theoretical_loss": 3.6024345382898018, + "tokens_seen": 1147107328 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003294984954864594, + "loss": 2.8153, + "theoretical_loss": 3.6024154461943807, + "tokens_seen": 1147172864 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003294884653961886, + "loss": 2.7212, + "theoretical_loss": 3.602396355495002, + "tokens_seen": 1147238400 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032947843530591774, + "loss": 2.7924, + "theoretical_loss": 3.6023772661914846, + "tokens_seen": 1147303936 + }, + { + "epoch": 14.0, + "learning_rate": 0.000329468405215647, + "loss": 2.8576, + "theoretical_loss": 3.6023581782836462, + "tokens_seen": 1147369472 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003294583751253761, + "loss": 2.7957, + "theoretical_loss": 3.602339091771306, + "tokens_seen": 1147435008 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032944834503510534, + "loss": 2.8281, + "theoretical_loss": 3.6023200066542804, + "tokens_seen": 1147500544 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003294383149448345, + "loss": 2.8328, + "theoretical_loss": 3.6023009229323897, + "tokens_seen": 1147566080 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003294282848545637, + "loss": 2.8983, + "theoretical_loss": 3.602281840605451, + "tokens_seen": 1147631616 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003294182547642929, + "loss": 2.7487, + "theoretical_loss": 3.6022627596732835, + "tokens_seen": 1147697152 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003294082246740221, + "loss": 2.8367, + "theoretical_loss": 3.602243680135705, + "tokens_seen": 1147762688 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032939819458375124, + "loss": 2.7247, + "theoretical_loss": 3.6022246019925346, + "tokens_seen": 1147828224 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003293881644934805, + "loss": 2.8183, + "theoretical_loss": 3.6022055252435896, + "tokens_seen": 1147893760 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003293781344032096, + "loss": 2.7924, + "theoretical_loss": 3.60218644988869, + "tokens_seen": 1147959296 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032936810431293884, + "loss": 2.9414, + "theoretical_loss": 3.6021673759276536, + "tokens_seen": 1148024832 + }, + { + "epoch": 14.0, + "learning_rate": 0.000329358074222668, + "loss": 2.7141, + "theoretical_loss": 3.602148303360299, + "tokens_seen": 1148090368 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003293480441323972, + "loss": 2.7553, + "theoretical_loss": 3.602129232186445, + "tokens_seen": 1148155904 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003293380140421264, + "loss": 2.8105, + "theoretical_loss": 3.6021101624059098, + "tokens_seen": 1148221440 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032932798395185557, + "loss": 2.7398, + "theoretical_loss": 3.6020910940185122, + "tokens_seen": 1148286976 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032931795386158475, + "loss": 2.8892, + "theoretical_loss": 3.6020720270240716, + "tokens_seen": 1148352512 + }, + { + "epoch": 14.0, + "objective/train/docs_used": 2728750, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.748448371887207, + "objective/train/theoretical_loss": 3.602052961422406, + "objective/train/tokens_used": 1168878048, + "theoretical_loss": 3.602052961422406, + "tokens_seen": 1148418048 + }, + { + "epoch": 14.0, + "learning_rate": 0.000329307923771314, + "loss": 2.7547, + "theoretical_loss": 3.602052961422406, + "tokens_seen": 1148418048 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003292978936810431, + "loss": 2.707, + "theoretical_loss": 3.602033897213335, + "tokens_seen": 1148483584 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032928786359077235, + "loss": 2.8276, + "theoretical_loss": 3.6020148343966762, + "tokens_seen": 1148549120 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003292778335005015, + "loss": 2.7404, + "theoretical_loss": 3.6019957729722494, + "tokens_seen": 1148614656 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003292678034102307, + "loss": 2.8222, + "theoretical_loss": 3.601976712939874, + "tokens_seen": 1148680192 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003292577733199599, + "loss": 2.7613, + "theoretical_loss": 3.601957654299367, + "tokens_seen": 1148745728 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032924774322968907, + "loss": 2.8629, + "theoretical_loss": 3.601938597050549, + "tokens_seen": 1148811264 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032923771313941825, + "loss": 2.8466, + "theoretical_loss": 3.6019195411932388, + "tokens_seen": 1148876800 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003292276830491475, + "loss": 2.7949, + "theoretical_loss": 3.601900486727255, + "tokens_seen": 1148942336 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003292176529588766, + "loss": 2.7875, + "theoretical_loss": 3.6018814336524168, + "tokens_seen": 1149007872 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032920762286860585, + "loss": 2.7923, + "theoretical_loss": 3.6018623819685436, + "tokens_seen": 1149073408 + }, + { + "epoch": 14.0, + "learning_rate": 0.000329197592778335, + "loss": 2.7497, + "theoretical_loss": 3.601843331675454, + "tokens_seen": 1149138944 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003291875626880642, + "loss": 2.7984, + "theoretical_loss": 3.6018242827729674, + "tokens_seen": 1149204480 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003291775325977934, + "loss": 2.7985, + "theoretical_loss": 3.601805235260903, + "tokens_seen": 1149270016 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003291675025075226, + "loss": 2.7889, + "theoretical_loss": 3.60178618913908, + "tokens_seen": 1149335552 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032915747241725175, + "loss": 2.817, + "theoretical_loss": 3.601767144407318, + "tokens_seen": 1149401088 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032914744232698094, + "loss": 2.7737, + "theoretical_loss": 3.6017481010654357, + "tokens_seen": 1149466624 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003291374122367101, + "loss": 2.8627, + "theoretical_loss": 3.6017290591132527, + "tokens_seen": 1149532160 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032912738214643935, + "loss": 2.8547, + "theoretical_loss": 3.6017100185505884, + "tokens_seen": 1149597696 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003291173520561685, + "loss": 2.834, + "theoretical_loss": 3.6016909793772625, + "tokens_seen": 1149663232 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003291073219658977, + "loss": 2.8183, + "theoretical_loss": 3.601671941593094, + "tokens_seen": 1149728768 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003290972918756269, + "loss": 2.795, + "theoretical_loss": 3.6016529051979025, + "tokens_seen": 1149794304 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003290872617853561, + "loss": 2.9431, + "theoretical_loss": 3.601633870191508, + "tokens_seen": 1149859840 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032907723169508526, + "loss": 2.8255, + "theoretical_loss": 3.6016148365737286, + "tokens_seen": 1149925376 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032906720160481444, + "loss": 2.6655, + "theoretical_loss": 3.6015958043443854, + "tokens_seen": 1149990912 + }, + { + "epoch": 14.0, + "objective/train/docs_used": 2733610, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8504388332366943, + "objective/train/theoretical_loss": 3.6015767735032975, + "objective/train/tokens_used": 1170516448, + "theoretical_loss": 3.6015767735032975, + "tokens_seen": 1150056448 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003290571715145436, + "loss": 2.8155, + "theoretical_loss": 3.6015767735032975, + "tokens_seen": 1150056448 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032904714142427286, + "loss": 2.6673, + "theoretical_loss": 3.601557744050284, + "tokens_seen": 1150121984 + }, + { + "epoch": 14.0, + "learning_rate": 0.000329037111334002, + "loss": 2.9081, + "theoretical_loss": 3.6015387159851655, + "tokens_seen": 1150187520 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003290270812437312, + "loss": 2.8831, + "theoretical_loss": 3.601519689307761, + "tokens_seen": 1150253056 + }, + { + "epoch": 14.0, + "learning_rate": 0.00032901705115346034, + "loss": 2.8976, + "theoretical_loss": 3.6015006640178906, + "tokens_seen": 1150318592 + }, + { + "epoch": 14.0, + "learning_rate": 0.0003290070210631896, + "loss": 2.7688, + "theoretical_loss": 3.6014816401153738, + "tokens_seen": 1150384128 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032899699097291876, + "loss": 2.8148, + "theoretical_loss": 3.6014626176000304, + "tokens_seen": 1150449664 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032898696088264794, + "loss": 2.7976, + "theoretical_loss": 3.6014435964716807, + "tokens_seen": 1150515200 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003289769307923771, + "loss": 2.8361, + "theoretical_loss": 3.6014245767301443, + "tokens_seen": 1150580736 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003289669007021063, + "loss": 2.7676, + "theoretical_loss": 3.601405558375241, + "tokens_seen": 1150646272 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003289568706118355, + "loss": 2.887, + "theoretical_loss": 3.6013865414067903, + "tokens_seen": 1150711808 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003289468405215647, + "loss": 2.771, + "theoretical_loss": 3.601367525824613, + "tokens_seen": 1150777344 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032893681043129385, + "loss": 2.7456, + "theoretical_loss": 3.601348511628529, + "tokens_seen": 1150842880 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003289267803410231, + "loss": 2.8971, + "theoretical_loss": 3.601329498818358, + "tokens_seen": 1150908416 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032891675025075226, + "loss": 2.7728, + "theoretical_loss": 3.60131048739392, + "tokens_seen": 1150973952 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032890672016048145, + "loss": 2.8723, + "theoretical_loss": 3.6012914773550357, + "tokens_seen": 1151039488 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003288966900702106, + "loss": 2.7392, + "theoretical_loss": 3.6012724687015245, + "tokens_seen": 1151105024 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003288866599799398, + "loss": 2.8143, + "theoretical_loss": 3.601253461433207, + "tokens_seen": 1151170560 + }, + { + "epoch": 14.01, + "learning_rate": 0.000328876629889669, + "loss": 2.8217, + "theoretical_loss": 3.6012344555499034, + "tokens_seen": 1151236096 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003288665997993982, + "loss": 2.8363, + "theoretical_loss": 3.601215451051434, + "tokens_seen": 1151301632 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032885656970912735, + "loss": 2.7947, + "theoretical_loss": 3.6011964479376184, + "tokens_seen": 1151367168 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003288465396188566, + "loss": 2.9071, + "theoretical_loss": 3.601177446208278, + "tokens_seen": 1151432704 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003288365095285857, + "loss": 2.8206, + "theoretical_loss": 3.6011584458632324, + "tokens_seen": 1151498240 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032882647943831495, + "loss": 2.7524, + "theoretical_loss": 3.6011394469023017, + "tokens_seen": 1151563776 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003288164493480442, + "loss": 2.7779, + "theoretical_loss": 3.6011204493253075, + "tokens_seen": 1151629312 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2736529, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4398891925811768, + "objective/train/theoretical_loss": 3.6011014531320695, + "objective/train/tokens_used": 1172154848, + "theoretical_loss": 3.6011014531320695, + "tokens_seen": 1151694848 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003288064192577733, + "loss": 2.6201, + "theoretical_loss": 3.6011014531320695, + "tokens_seen": 1151694848 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032879638916750255, + "loss": 2.8548, + "theoretical_loss": 3.6010824583224075, + "tokens_seen": 1151760384 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003287863590772317, + "loss": 2.7658, + "theoretical_loss": 3.6010634648961437, + "tokens_seen": 1151825920 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003287763289869609, + "loss": 2.8789, + "theoretical_loss": 3.601044472853097, + "tokens_seen": 1151891456 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003287662988966901, + "loss": 2.9017, + "theoretical_loss": 3.6010254821930885, + "tokens_seen": 1151956992 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032875626880641927, + "loss": 2.829, + "theoretical_loss": 3.601006492915939, + "tokens_seen": 1152022528 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032874623871614845, + "loss": 2.8411, + "theoretical_loss": 3.6009875050214695, + "tokens_seen": 1152088064 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003287362086258777, + "loss": 2.7853, + "theoretical_loss": 3.6009685185095, + "tokens_seen": 1152153600 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003287261785356068, + "loss": 2.7966, + "theoretical_loss": 3.6009495333798514, + "tokens_seen": 1152219136 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032871614844533605, + "loss": 2.6562, + "theoretical_loss": 3.6009305496323445, + "tokens_seen": 1152284672 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003287061183550652, + "loss": 2.8257, + "theoretical_loss": 3.6009115672668006, + "tokens_seen": 1152350208 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003286960882647944, + "loss": 2.7456, + "theoretical_loss": 3.6008925862830394, + "tokens_seen": 1152415744 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003286860581745236, + "loss": 2.7825, + "theoretical_loss": 3.6008736066808824, + "tokens_seen": 1152481280 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003286760280842528, + "loss": 2.9094, + "theoretical_loss": 3.6008546284601506, + "tokens_seen": 1152546816 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032866599799398195, + "loss": 2.7743, + "theoretical_loss": 3.600835651620665, + "tokens_seen": 1152612352 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032865596790371114, + "loss": 2.8211, + "theoretical_loss": 3.600816676162246, + "tokens_seen": 1152677888 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003286459378134403, + "loss": 2.8122, + "theoretical_loss": 3.600797702084715, + "tokens_seen": 1152743424 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032863590772316955, + "loss": 2.7954, + "theoretical_loss": 3.6007787293878923, + "tokens_seen": 1152808960 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003286258776328987, + "loss": 2.7084, + "theoretical_loss": 3.6007597580716, + "tokens_seen": 1152874496 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003286158475426279, + "loss": 2.7349, + "theoretical_loss": 3.600740788135659, + "tokens_seen": 1152940032 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003286058174523571, + "loss": 2.8336, + "theoretical_loss": 3.6007218195798893, + "tokens_seen": 1153005568 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003285957873620863, + "loss": 2.7176, + "theoretical_loss": 3.6007028524041136, + "tokens_seen": 1153071104 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032858575727181546, + "loss": 2.8102, + "theoretical_loss": 3.6006838866081523, + "tokens_seen": 1153136640 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032857572718154464, + "loss": 2.8813, + "theoretical_loss": 3.600664922191826, + "tokens_seen": 1153202176 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003285656970912738, + "loss": 2.8434, + "theoretical_loss": 3.6006459591549573, + "tokens_seen": 1153267712 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2741360, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9436845779418945, + "objective/train/theoretical_loss": 3.6006269974973666, + "objective/train/tokens_used": 1173793248, + "theoretical_loss": 3.6006269974973666, + "tokens_seen": 1153333248 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032855566700100306, + "loss": 2.8938, + "theoretical_loss": 3.6006269974973666, + "tokens_seen": 1153333248 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003285456369107322, + "loss": 2.807, + "theoretical_loss": 3.6006080372188753, + "tokens_seen": 1153398784 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003285356068204614, + "loss": 2.8987, + "theoretical_loss": 3.600589078319304, + "tokens_seen": 1153464320 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032852557673019054, + "loss": 2.8204, + "theoretical_loss": 3.6005701207984764, + "tokens_seen": 1153529856 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003285155466399198, + "loss": 2.8495, + "theoretical_loss": 3.6005511646562116, + "tokens_seen": 1153595392 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032850551654964896, + "loss": 2.8046, + "theoretical_loss": 3.600532209892332, + "tokens_seen": 1153660928 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032849548645937814, + "loss": 2.8354, + "theoretical_loss": 3.600513256506659, + "tokens_seen": 1153726464 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003284854563691073, + "loss": 2.7966, + "theoretical_loss": 3.6004943044990134, + "tokens_seen": 1153792000 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003284754262788365, + "loss": 2.8766, + "theoretical_loss": 3.600475353869218, + "tokens_seen": 1153857536 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003284653961885657, + "loss": 2.7346, + "theoretical_loss": 3.600456404617094, + "tokens_seen": 1153923072 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003284553660982949, + "loss": 2.8444, + "theoretical_loss": 3.600437456742462, + "tokens_seen": 1153988608 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032844533600802405, + "loss": 2.7496, + "theoretical_loss": 3.600418510245145, + "tokens_seen": 1154054144 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003284353059177533, + "loss": 2.8671, + "theoretical_loss": 3.600399565124964, + "tokens_seen": 1154119680 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032842527582748246, + "loss": 2.8199, + "theoretical_loss": 3.600380621381741, + "tokens_seen": 1154185216 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032841524573721165, + "loss": 2.803, + "theoretical_loss": 3.6003616790152977, + "tokens_seen": 1154250752 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003284052156469408, + "loss": 2.7989, + "theoretical_loss": 3.6003427380254553, + "tokens_seen": 1154316288 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032839518555667, + "loss": 2.7575, + "theoretical_loss": 3.600323798412036, + "tokens_seen": 1154381824 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003283851554663992, + "loss": 2.7899, + "theoretical_loss": 3.600304860174862, + "tokens_seen": 1154447360 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003283751253761284, + "loss": 2.8283, + "theoretical_loss": 3.6002859233137547, + "tokens_seen": 1154512896 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032836509528585755, + "loss": 2.834, + "theoretical_loss": 3.600266987828536, + "tokens_seen": 1154578432 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003283550651955868, + "loss": 2.8291, + "theoretical_loss": 3.6002480537190285, + "tokens_seen": 1154643968 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003283450351053159, + "loss": 2.9507, + "theoretical_loss": 3.6002291209850537, + "tokens_seen": 1154709504 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032833500501504515, + "loss": 2.7911, + "theoretical_loss": 3.600210189626433, + "tokens_seen": 1154775040 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032832497492477433, + "loss": 2.8632, + "theoretical_loss": 3.6001912596429895, + "tokens_seen": 1154840576 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003283149448345035, + "loss": 2.8136, + "theoretical_loss": 3.6001723310345444, + "tokens_seen": 1154906112 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2744349, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8593995571136475, + "objective/train/theoretical_loss": 3.600153403800921, + "objective/train/tokens_used": 1175431648, + "theoretical_loss": 3.600153403800921, + "tokens_seen": 1154971648 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003283049147442327, + "loss": 2.8526, + "theoretical_loss": 3.600153403800921, + "tokens_seen": 1154971648 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003282948846539619, + "loss": 2.8677, + "theoretical_loss": 3.60013447794194, + "tokens_seen": 1155037184 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032828485456369105, + "loss": 2.7692, + "theoretical_loss": 3.6001155534574245, + "tokens_seen": 1155102720 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003282748244734203, + "loss": 2.8046, + "theoretical_loss": 3.6000966303471964, + "tokens_seen": 1155168256 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003282647943831494, + "loss": 2.789, + "theoretical_loss": 3.600077708611078, + "tokens_seen": 1155233792 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032825476429287865, + "loss": 2.7823, + "theoretical_loss": 3.6000587882488917, + "tokens_seen": 1155299328 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032824473420260783, + "loss": 2.8164, + "theoretical_loss": 3.6000398692604594, + "tokens_seen": 1155364864 + }, + { + "epoch": 14.01, + "learning_rate": 0.000328234704112337, + "loss": 2.8507, + "theoretical_loss": 3.600020951645604, + "tokens_seen": 1155430400 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003282246740220662, + "loss": 2.7619, + "theoretical_loss": 3.600002035404147, + "tokens_seen": 1155495936 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003282146439317954, + "loss": 2.8525, + "theoretical_loss": 3.5999831205359123, + "tokens_seen": 1155561472 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032820461384152456, + "loss": 2.7875, + "theoretical_loss": 3.5999642070407214, + "tokens_seen": 1155627008 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003281945837512538, + "loss": 2.8001, + "theoretical_loss": 3.5999452949183963, + "tokens_seen": 1155692544 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003281845536609829, + "loss": 2.6964, + "theoretical_loss": 3.59992638416876, + "tokens_seen": 1155758080 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032817452357071215, + "loss": 2.8063, + "theoretical_loss": 3.5999074747916353, + "tokens_seen": 1155823616 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003281644934804413, + "loss": 2.765, + "theoretical_loss": 3.5998885667868445, + "tokens_seen": 1155889152 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003281544633901705, + "loss": 2.8095, + "theoretical_loss": 3.5998696601542104, + "tokens_seen": 1155954688 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003281444332998997, + "loss": 2.8176, + "theoretical_loss": 3.5998507548935548, + "tokens_seen": 1156020224 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003281344032096289, + "loss": 2.8285, + "theoretical_loss": 3.5998318510047014, + "tokens_seen": 1156085760 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032812437311935806, + "loss": 2.8133, + "theoretical_loss": 3.5998129484874726, + "tokens_seen": 1156151296 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003281143430290873, + "loss": 2.8197, + "theoretical_loss": 3.5997940473416916, + "tokens_seen": 1156216832 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003281043129388164, + "loss": 2.8436, + "theoretical_loss": 3.59977514756718, + "tokens_seen": 1156282368 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032809428284854566, + "loss": 2.8174, + "theoretical_loss": 3.5997562491637614, + "tokens_seen": 1156347904 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003280842527582748, + "loss": 2.7983, + "theoretical_loss": 3.5997373521312586, + "tokens_seen": 1156413440 + }, + { + "epoch": 14.01, + "learning_rate": 0.000328074222668004, + "loss": 2.8914, + "theoretical_loss": 3.5997184564694944, + "tokens_seen": 1156478976 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032806419257773326, + "loss": 2.7931, + "theoretical_loss": 3.5996995621782912, + "tokens_seen": 1156544512 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2748195, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.917259454727173, + "objective/train/theoretical_loss": 3.5996806692574728, + "objective/train/tokens_used": 1177070048, + "theoretical_loss": 3.5996806692574728, + "tokens_seen": 1156610048 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003280541624874624, + "loss": 2.8925, + "theoretical_loss": 3.5996806692574728, + "tokens_seen": 1156610048 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003280441323971916, + "loss": 2.8019, + "theoretical_loss": 3.599661777706862, + "tokens_seen": 1156675584 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032803410230692074, + "loss": 2.8705, + "theoretical_loss": 3.599642887526281, + "tokens_seen": 1156741120 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032802407221665, + "loss": 2.7268, + "theoretical_loss": 3.5996239987155536, + "tokens_seen": 1156806656 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032801404212637916, + "loss": 2.9637, + "theoretical_loss": 3.599605111274503, + "tokens_seen": 1156872192 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032800401203610834, + "loss": 2.7323, + "theoretical_loss": 3.599586225202951, + "tokens_seen": 1156937728 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003279939819458375, + "loss": 2.7951, + "theoretical_loss": 3.5995673405007222, + "tokens_seen": 1157003264 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003279839518555667, + "loss": 2.8007, + "theoretical_loss": 3.5995484571676397, + "tokens_seen": 1157068800 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003279739217652959, + "loss": 2.7765, + "theoretical_loss": 3.599529575203526, + "tokens_seen": 1157134336 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003279638916750251, + "loss": 2.8367, + "theoretical_loss": 3.5995106946082043, + "tokens_seen": 1157199872 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032795386158475425, + "loss": 2.8337, + "theoretical_loss": 3.5994918153814983, + "tokens_seen": 1157265408 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003279438314944835, + "loss": 2.8245, + "theoretical_loss": 3.599472937523231, + "tokens_seen": 1157330944 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032793380140421266, + "loss": 2.8712, + "theoretical_loss": 3.599454061033226, + "tokens_seen": 1157396480 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032792377131394185, + "loss": 2.8157, + "theoretical_loss": 3.599435185911307, + "tokens_seen": 1157462016 + }, + { + "epoch": 14.01, + "learning_rate": 0.000327913741223671, + "loss": 2.7819, + "theoretical_loss": 3.599416312157296, + "tokens_seen": 1157527552 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003279037111334002, + "loss": 2.8456, + "theoretical_loss": 3.599397439771018, + "tokens_seen": 1157593088 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003278936810431294, + "loss": 2.7288, + "theoretical_loss": 3.5993785687522957, + "tokens_seen": 1157658624 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003278836509528586, + "loss": 2.8354, + "theoretical_loss": 3.599359699100953, + "tokens_seen": 1157724160 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032787362086258775, + "loss": 2.8587, + "theoretical_loss": 3.5993408308168124, + "tokens_seen": 1157789696 + }, + { + "epoch": 14.01, + "learning_rate": 0.000327863590772317, + "loss": 2.8105, + "theoretical_loss": 3.5993219638996985, + "tokens_seen": 1157855232 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003278535606820461, + "loss": 2.8737, + "theoretical_loss": 3.5993030983494343, + "tokens_seen": 1157920768 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032784353059177535, + "loss": 2.745, + "theoretical_loss": 3.599284234165844, + "tokens_seen": 1157986304 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032783350050150453, + "loss": 2.8984, + "theoretical_loss": 3.599265371348751, + "tokens_seen": 1158051840 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003278234704112337, + "loss": 2.8144, + "theoretical_loss": 3.599246509897979, + "tokens_seen": 1158117376 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003278134403209629, + "loss": 2.7144, + "theoretical_loss": 3.5992276498133515, + "tokens_seen": 1158182912 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2752948, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8936147689819336, + "objective/train/theoretical_loss": 3.5992087910946924, + "objective/train/tokens_used": 1178708448, + "theoretical_loss": 3.5992087910946924, + "tokens_seen": 1158248448 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032780341023069213, + "loss": 2.8562, + "theoretical_loss": 3.5992087910946924, + "tokens_seen": 1158248448 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032779338014042125, + "loss": 2.8041, + "theoretical_loss": 3.599189933741825, + "tokens_seen": 1158313984 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003277833500501505, + "loss": 2.8668, + "theoretical_loss": 3.5991710777545745, + "tokens_seen": 1158379520 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003277733199598796, + "loss": 2.8574, + "theoretical_loss": 3.599152223132764, + "tokens_seen": 1158445056 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032776328986960885, + "loss": 2.8032, + "theoretical_loss": 3.5991333698762165, + "tokens_seen": 1158510592 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032775325977933803, + "loss": 2.9085, + "theoretical_loss": 3.599114517984757, + "tokens_seen": 1158576128 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003277432296890672, + "loss": 2.818, + "theoretical_loss": 3.599095667458209, + "tokens_seen": 1158641664 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003277331995987964, + "loss": 2.7075, + "theoretical_loss": 3.5990768182963966, + "tokens_seen": 1158707200 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003277231695085256, + "loss": 2.8892, + "theoretical_loss": 3.5990579704991443, + "tokens_seen": 1158772736 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032771313941825476, + "loss": 2.7906, + "theoretical_loss": 3.599039124066275, + "tokens_seen": 1158838272 + }, + { + "epoch": 14.01, + "learning_rate": 0.000327703109327984, + "loss": 2.9319, + "theoretical_loss": 3.599020278997614, + "tokens_seen": 1158903808 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003276930792377131, + "loss": 2.7634, + "theoretical_loss": 3.599001435292984, + "tokens_seen": 1158969344 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032768304914744235, + "loss": 2.8502, + "theoretical_loss": 3.5989825929522112, + "tokens_seen": 1159034880 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003276730190571715, + "loss": 2.8375, + "theoretical_loss": 3.598963751975118, + "tokens_seen": 1159100416 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003276629889669007, + "loss": 2.7746, + "theoretical_loss": 3.598944912361529, + "tokens_seen": 1159165952 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003276529588766299, + "loss": 2.8203, + "theoretical_loss": 3.5989260741112687, + "tokens_seen": 1159231488 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003276429287863591, + "loss": 2.8212, + "theoretical_loss": 3.5989072372241617, + "tokens_seen": 1159297024 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032763289869608826, + "loss": 2.866, + "theoretical_loss": 3.598888401700032, + "tokens_seen": 1159362560 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003276228686058175, + "loss": 2.7504, + "theoretical_loss": 3.5988695675387037, + "tokens_seen": 1159428096 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003276128385155466, + "loss": 2.7398, + "theoretical_loss": 3.598850734740001, + "tokens_seen": 1159493632 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032760280842527586, + "loss": 2.693, + "theoretical_loss": 3.598831903303749, + "tokens_seen": 1159559168 + }, + { + "epoch": 14.01, + "learning_rate": 0.000327592778335005, + "loss": 2.8128, + "theoretical_loss": 3.598813073229772, + "tokens_seen": 1159624704 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003275827482447342, + "loss": 2.8133, + "theoretical_loss": 3.5987942445178938, + "tokens_seen": 1159690240 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003275727181544634, + "loss": 2.7953, + "theoretical_loss": 3.598775417167939, + "tokens_seen": 1159755776 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003275626880641926, + "loss": 2.8123, + "theoretical_loss": 3.5987565911797335, + "tokens_seen": 1159821312 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2756028, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.599485158920288, + "objective/train/theoretical_loss": 3.598737766553101, + "objective/train/tokens_used": 1180346848, + "theoretical_loss": 3.598737766553101, + "tokens_seen": 1159886848 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032755265797392176, + "loss": 2.6861, + "theoretical_loss": 3.598737766553101, + "tokens_seen": 1159886848 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032754262788365094, + "loss": 2.7667, + "theoretical_loss": 3.5987189432878646, + "tokens_seen": 1159952384 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003275325977933801, + "loss": 2.9004, + "theoretical_loss": 3.5987001213838514, + "tokens_seen": 1160017920 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032752256770310936, + "loss": 2.837, + "theoretical_loss": 3.5986813008408847, + "tokens_seen": 1160083456 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003275125376128385, + "loss": 2.9084, + "theoretical_loss": 3.5986624816587898, + "tokens_seen": 1160148992 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003275025075225677, + "loss": 2.8407, + "theoretical_loss": 3.5986436638373904, + "tokens_seen": 1160214528 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032749247743229685, + "loss": 2.8673, + "theoretical_loss": 3.598624847376513, + "tokens_seen": 1160280064 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003274824473420261, + "loss": 2.8342, + "theoretical_loss": 3.5986060322759803, + "tokens_seen": 1160345600 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032747241725175527, + "loss": 2.8815, + "theoretical_loss": 3.598587218535619, + "tokens_seen": 1160411136 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032746238716148445, + "loss": 2.7341, + "theoretical_loss": 3.5985684061552528, + "tokens_seen": 1160476672 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032745235707121363, + "loss": 2.8863, + "theoretical_loss": 3.598549595134707, + "tokens_seen": 1160542208 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032744232698094286, + "loss": 2.8767, + "theoretical_loss": 3.5985307854738067, + "tokens_seen": 1160607744 + }, + { + "epoch": 14.01, + "learning_rate": 0.000327432296890672, + "loss": 2.8243, + "theoretical_loss": 3.598511977172376, + "tokens_seen": 1160673280 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003274222668004012, + "loss": 2.8375, + "theoretical_loss": 3.5984931702302414, + "tokens_seen": 1160738816 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032741223671013035, + "loss": 2.8359, + "theoretical_loss": 3.5984743646472266, + "tokens_seen": 1160804352 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003274022066198596, + "loss": 2.7931, + "theoretical_loss": 3.598455560423157, + "tokens_seen": 1160869888 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032739217652958877, + "loss": 2.8272, + "theoretical_loss": 3.598436757557858, + "tokens_seen": 1160935424 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032738214643931795, + "loss": 2.8048, + "theoretical_loss": 3.598417956051155, + "tokens_seen": 1161000960 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032737211634904713, + "loss": 2.8819, + "theoretical_loss": 3.598399155902872, + "tokens_seen": 1161066496 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003273620862587763, + "loss": 2.863, + "theoretical_loss": 3.598380357112835, + "tokens_seen": 1161132032 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003273520561685055, + "loss": 2.8433, + "theoretical_loss": 3.5983615596808693, + "tokens_seen": 1161197568 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032734202607823473, + "loss": 2.8759, + "theoretical_loss": 3.5983427636067997, + "tokens_seen": 1161263104 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032733199598796386, + "loss": 2.8729, + "theoretical_loss": 3.598323968890452, + "tokens_seen": 1161328640 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003273219658976931, + "loss": 2.8048, + "theoretical_loss": 3.5983051755316513, + "tokens_seen": 1161394176 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032731193580742233, + "loss": 2.7299, + "theoretical_loss": 3.5982863835302226, + "tokens_seen": 1161459712 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2760830, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.833972930908203, + "objective/train/theoretical_loss": 3.5982675928859917, + "objective/train/tokens_used": 1181985248, + "theoretical_loss": 3.5982675928859917, + "tokens_seen": 1161525248 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032730190571715145, + "loss": 2.8362, + "theoretical_loss": 3.5982675928859917, + "tokens_seen": 1161525248 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003272918756268807, + "loss": 2.8394, + "theoretical_loss": 3.598248803598784, + "tokens_seen": 1161590784 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003272818455366098, + "loss": 2.9216, + "theoretical_loss": 3.5982300156684244, + "tokens_seen": 1161656320 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032727181544633905, + "loss": 2.8586, + "theoretical_loss": 3.598211229094739, + "tokens_seen": 1161721856 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032726178535606823, + "loss": 2.8136, + "theoretical_loss": 3.5981924438775534, + "tokens_seen": 1161787392 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003272517552657974, + "loss": 2.8497, + "theoretical_loss": 3.598173660016693, + "tokens_seen": 1161852928 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003272417251755266, + "loss": 2.7509, + "theoretical_loss": 3.5981548775119823, + "tokens_seen": 1161918464 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003272316950852558, + "loss": 2.907, + "theoretical_loss": 3.5981360963632483, + "tokens_seen": 1161984000 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032722166499498496, + "loss": 2.826, + "theoretical_loss": 3.5981173165703164, + "tokens_seen": 1162049536 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003272116349047142, + "loss": 2.8249, + "theoretical_loss": 3.598098538133012, + "tokens_seen": 1162115072 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003272016048144433, + "loss": 2.8126, + "theoretical_loss": 3.5980797610511606, + "tokens_seen": 1162180608 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032719157472417256, + "loss": 2.8592, + "theoretical_loss": 3.5980609853245884, + "tokens_seen": 1162246144 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003271815446339017, + "loss": 2.8555, + "theoretical_loss": 3.5980422109531207, + "tokens_seen": 1162311680 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003271715145436309, + "loss": 2.8544, + "theoretical_loss": 3.598023437936584, + "tokens_seen": 1162377216 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003271614844533601, + "loss": 2.8894, + "theoretical_loss": 3.598004666274803, + "tokens_seen": 1162442752 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003271514543630893, + "loss": 2.8435, + "theoretical_loss": 3.5979858959676045, + "tokens_seen": 1162508288 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032714142427281846, + "loss": 2.779, + "theoretical_loss": 3.5979671270148144, + "tokens_seen": 1162573824 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003271313941825477, + "loss": 2.7724, + "theoretical_loss": 3.5979483594162582, + "tokens_seen": 1162639360 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003271213640922768, + "loss": 2.9, + "theoretical_loss": 3.597929593171762, + "tokens_seen": 1162704896 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032711133400200606, + "loss": 2.8609, + "theoretical_loss": 3.597910828281152, + "tokens_seen": 1162770432 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003271013039117352, + "loss": 2.7305, + "theoretical_loss": 3.597892064744254, + "tokens_seen": 1162835968 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003270912738214644, + "loss": 2.7806, + "theoretical_loss": 3.597873302560894, + "tokens_seen": 1162901504 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003270812437311936, + "loss": 2.8695, + "theoretical_loss": 3.5978545417308982, + "tokens_seen": 1162967040 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003270712136409228, + "loss": 2.907, + "theoretical_loss": 3.5978357822540925, + "tokens_seen": 1163032576 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032706118355065196, + "loss": 2.7384, + "theoretical_loss": 3.5978170241303036, + "tokens_seen": 1163098112 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2763803, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.087282180786133, + "objective/train/theoretical_loss": 3.5977982673593574, + "objective/train/tokens_used": 1183623648, + "theoretical_loss": 3.5977982673593574, + "tokens_seen": 1163163648 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032705115346038114, + "loss": 2.8391, + "theoretical_loss": 3.5977982673593574, + "tokens_seen": 1163163648 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003270411233701103, + "loss": 2.8082, + "theoretical_loss": 3.59777951194108, + "tokens_seen": 1163229184 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032703109327983956, + "loss": 2.8165, + "theoretical_loss": 3.5977607578752973, + "tokens_seen": 1163294720 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003270210631895687, + "loss": 2.8018, + "theoretical_loss": 3.5977420051618365, + "tokens_seen": 1163360256 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003270110330992979, + "loss": 2.6192, + "theoretical_loss": 3.597723253800523, + "tokens_seen": 1163425792 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032700100300902705, + "loss": 2.8717, + "theoretical_loss": 3.5977045037911837, + "tokens_seen": 1163491328 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003269909729187563, + "loss": 2.6935, + "theoretical_loss": 3.5976857551336447, + "tokens_seen": 1163556864 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032698094282848547, + "loss": 2.7384, + "theoretical_loss": 3.5976670078277326, + "tokens_seen": 1163622400 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032697091273821465, + "loss": 2.8303, + "theoretical_loss": 3.5976482618732746, + "tokens_seen": 1163687936 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032696088264794383, + "loss": 2.8908, + "theoretical_loss": 3.5976295172700956, + "tokens_seen": 1163753472 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032695085255767306, + "loss": 2.8474, + "theoretical_loss": 3.5976107740180225, + "tokens_seen": 1163819008 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003269408224674022, + "loss": 2.7809, + "theoretical_loss": 3.597592032116883, + "tokens_seen": 1163884544 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003269307923771314, + "loss": 2.7609, + "theoretical_loss": 3.5975732915665026, + "tokens_seen": 1163950080 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032692076228686055, + "loss": 2.8719, + "theoretical_loss": 3.597554552366708, + "tokens_seen": 1164015616 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003269107321965898, + "loss": 2.8733, + "theoretical_loss": 3.597535814517326, + "tokens_seen": 1164081152 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032690070210631897, + "loss": 2.8768, + "theoretical_loss": 3.5975170780181838, + "tokens_seen": 1164146688 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032689067201604815, + "loss": 2.8013, + "theoretical_loss": 3.597498342869107, + "tokens_seen": 1164212224 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032688064192577733, + "loss": 2.7728, + "theoretical_loss": 3.5974796090699233, + "tokens_seen": 1164277760 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003268706118355065, + "loss": 2.7979, + "theoretical_loss": 3.5974608766204588, + "tokens_seen": 1164343296 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003268605817452357, + "loss": 2.8549, + "theoretical_loss": 3.5974421455205405, + "tokens_seen": 1164408832 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032685055165496493, + "loss": 2.8008, + "theoretical_loss": 3.5974234157699954, + "tokens_seen": 1164474368 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032684052156469406, + "loss": 2.7629, + "theoretical_loss": 3.597404687368651, + "tokens_seen": 1164539904 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003268304914744233, + "loss": 2.8038, + "theoretical_loss": 3.5973859603163323, + "tokens_seen": 1164605440 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003268204613841524, + "loss": 2.8993, + "theoretical_loss": 3.597367234612868, + "tokens_seen": 1164670976 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032681043129388165, + "loss": 2.7889, + "theoretical_loss": 3.597348510258084, + "tokens_seen": 1164736512 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2767472, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8048131465911865, + "objective/train/theoretical_loss": 3.597329787251808, + "objective/train/tokens_used": 1185262048, + "theoretical_loss": 3.597329787251808, + "tokens_seen": 1164802048 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032680040120361084, + "loss": 2.7352, + "theoretical_loss": 3.597329787251808, + "tokens_seen": 1164802048 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032679037111334, + "loss": 2.8307, + "theoretical_loss": 3.5973110655938667, + "tokens_seen": 1164867584 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003267803410230692, + "loss": 2.8374, + "theoretical_loss": 3.597292345284087, + "tokens_seen": 1164933120 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032677031093279843, + "loss": 2.7748, + "theoretical_loss": 3.5972736263222966, + "tokens_seen": 1164998656 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032676028084252756, + "loss": 2.8794, + "theoretical_loss": 3.597254908708322, + "tokens_seen": 1165064192 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003267502507522568, + "loss": 2.8168, + "theoretical_loss": 3.5972361924419904, + "tokens_seen": 1165129728 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003267402206619859, + "loss": 2.8588, + "theoretical_loss": 3.597217477523129, + "tokens_seen": 1165195264 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032673019057171516, + "loss": 2.8657, + "theoretical_loss": 3.5971987639515652, + "tokens_seen": 1165260800 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032672016048144434, + "loss": 2.82, + "theoretical_loss": 3.5971800517271264, + "tokens_seen": 1165326336 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003267101303911735, + "loss": 2.8443, + "theoretical_loss": 3.597161340849639, + "tokens_seen": 1165391872 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003267001003009027, + "loss": 2.8242, + "theoretical_loss": 3.5971426313189316, + "tokens_seen": 1165457408 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003266900702106319, + "loss": 2.7774, + "theoretical_loss": 3.5971239231348306, + "tokens_seen": 1165522944 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032668004012036106, + "loss": 2.887, + "theoretical_loss": 3.5971052162971637, + "tokens_seen": 1165588480 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003266700100300903, + "loss": 2.8662, + "theoretical_loss": 3.5970865108057577, + "tokens_seen": 1165654016 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003266599799398194, + "loss": 2.7509, + "theoretical_loss": 3.5970678066604416, + "tokens_seen": 1165719552 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032664994984954866, + "loss": 2.8387, + "theoretical_loss": 3.597049103861041, + "tokens_seen": 1165785088 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032663991975927784, + "loss": 2.9186, + "theoretical_loss": 3.5970304024073845, + "tokens_seen": 1165850624 + }, + { + "epoch": 14.01, + "learning_rate": 0.000326629889669007, + "loss": 2.8782, + "theoretical_loss": 3.597011702299299, + "tokens_seen": 1165916160 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003266198595787362, + "loss": 2.6724, + "theoretical_loss": 3.5969930035366127, + "tokens_seen": 1165981696 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003266098294884654, + "loss": 2.9051, + "theoretical_loss": 3.596974306119153, + "tokens_seen": 1166047232 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032659979939819457, + "loss": 2.8353, + "theoretical_loss": 3.5969556100467477, + "tokens_seen": 1166112768 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003265897693079238, + "loss": 2.8321, + "theoretical_loss": 3.5969369153192234, + "tokens_seen": 1166178304 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032657973921765293, + "loss": 2.7341, + "theoretical_loss": 3.5969182219364093, + "tokens_seen": 1166243840 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032656970912738216, + "loss": 2.8117, + "theoretical_loss": 3.596899529898132, + "tokens_seen": 1166309376 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032655967903711135, + "loss": 2.8564, + "theoretical_loss": 3.5968808392042195, + "tokens_seen": 1166374912 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2772620, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8558552265167236, + "objective/train/theoretical_loss": 3.5968621498545, + "objective/train/tokens_used": 1186900448, + "theoretical_loss": 3.5968621498545, + "tokens_seen": 1166440448 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003265496489468405, + "loss": 2.7834, + "theoretical_loss": 3.5968621498545, + "tokens_seen": 1166440448 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032653961885656976, + "loss": 2.8386, + "theoretical_loss": 3.5968434618488008, + "tokens_seen": 1166505984 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003265295887662989, + "loss": 2.7803, + "theoretical_loss": 3.5968247751869504, + "tokens_seen": 1166571520 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003265195586760281, + "loss": 2.9021, + "theoretical_loss": 3.5968060898687764, + "tokens_seen": 1166637056 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032650952858575725, + "loss": 2.8316, + "theoretical_loss": 3.596787405894106, + "tokens_seen": 1166702592 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003264994984954865, + "loss": 2.7915, + "theoretical_loss": 3.596768723262768, + "tokens_seen": 1166768128 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032648946840521567, + "loss": 2.7302, + "theoretical_loss": 3.5967500419745897, + "tokens_seen": 1166833664 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032647943831494485, + "loss": 2.8798, + "theoretical_loss": 3.5967313620293995, + "tokens_seen": 1166899200 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032646940822467403, + "loss": 2.956, + "theoretical_loss": 3.5967126834270258, + "tokens_seen": 1166964736 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032645937813440326, + "loss": 2.827, + "theoretical_loss": 3.5966940061672963, + "tokens_seen": 1167030272 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003264493480441324, + "loss": 2.8383, + "theoretical_loss": 3.5966753302500387, + "tokens_seen": 1167095808 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003264393179538616, + "loss": 2.8629, + "theoretical_loss": 3.596656655675082, + "tokens_seen": 1167161344 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032642928786359075, + "loss": 2.7849, + "theoretical_loss": 3.5966379824422536, + "tokens_seen": 1167226880 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032641925777332, + "loss": 2.7076, + "theoretical_loss": 3.596619310551382, + "tokens_seen": 1167292416 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032640922768304917, + "loss": 2.7923, + "theoretical_loss": 3.596600640002295, + "tokens_seen": 1167357952 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032639919759277835, + "loss": 2.8685, + "theoretical_loss": 3.596581970794822, + "tokens_seen": 1167423488 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032638916750250753, + "loss": 2.8426, + "theoretical_loss": 3.5965633029287902, + "tokens_seen": 1167489024 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003263791374122367, + "loss": 2.8945, + "theoretical_loss": 3.5965446364040283, + "tokens_seen": 1167554560 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003263691073219659, + "loss": 2.8927, + "theoretical_loss": 3.5965259712203648, + "tokens_seen": 1167620096 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032635907723169513, + "loss": 2.8397, + "theoretical_loss": 3.5965073073776272, + "tokens_seen": 1167685632 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032634904714142426, + "loss": 2.9508, + "theoretical_loss": 3.5964886448756452, + "tokens_seen": 1167751168 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003263390170511535, + "loss": 2.8002, + "theoretical_loss": 3.596469983714247, + "tokens_seen": 1167816704 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003263289869608826, + "loss": 2.8512, + "theoretical_loss": 3.59645132389326, + "tokens_seen": 1167882240 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032631895687061185, + "loss": 2.8776, + "theoretical_loss": 3.5964326654125136, + "tokens_seen": 1167947776 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032630892678034104, + "loss": 2.6804, + "theoretical_loss": 3.5964140082718368, + "tokens_seen": 1168013312 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2775305, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.53837251663208, + "objective/train/theoretical_loss": 3.596395352471057, + "objective/train/tokens_used": 1188538848, + "theoretical_loss": 3.596395352471057, + "tokens_seen": 1168078848 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003262988966900702, + "loss": 2.6919, + "theoretical_loss": 3.596395352471057, + "tokens_seen": 1168078848 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003262888665997994, + "loss": 2.8951, + "theoretical_loss": 3.596376698010004, + "tokens_seen": 1168144384 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032627883650952863, + "loss": 2.8308, + "theoretical_loss": 3.596358044888505, + "tokens_seen": 1168209920 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032626880641925776, + "loss": 2.7974, + "theoretical_loss": 3.5963393931063896, + "tokens_seen": 1168275456 + }, + { + "epoch": 14.01, + "learning_rate": 0.000326258776328987, + "loss": 2.8302, + "theoretical_loss": 3.5963207426634867, + "tokens_seen": 1168340992 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003262487462387161, + "loss": 2.7984, + "theoretical_loss": 3.5963020935596246, + "tokens_seen": 1168406528 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032623871614844536, + "loss": 2.6857, + "theoretical_loss": 3.596283445794632, + "tokens_seen": 1168472064 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032622868605817454, + "loss": 2.8544, + "theoretical_loss": 3.5962647993683383, + "tokens_seen": 1168537600 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003262186559679037, + "loss": 2.8488, + "theoretical_loss": 3.5962461542805713, + "tokens_seen": 1168603136 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003262086258776329, + "loss": 2.8338, + "theoretical_loss": 3.596227510531161, + "tokens_seen": 1168668672 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003261985957873621, + "loss": 2.7784, + "theoretical_loss": 3.596208868119936, + "tokens_seen": 1168734208 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032618856569709126, + "loss": 2.8757, + "theoretical_loss": 3.5961902270467236, + "tokens_seen": 1168799744 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003261785356068205, + "loss": 2.8002, + "theoretical_loss": 3.596171587311355, + "tokens_seen": 1168865280 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003261685055165496, + "loss": 2.8069, + "theoretical_loss": 3.5961529489136583, + "tokens_seen": 1168930816 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032615847542627886, + "loss": 2.826, + "theoretical_loss": 3.5961343118534623, + "tokens_seen": 1168996352 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032614844533600804, + "loss": 2.8085, + "theoretical_loss": 3.5961156761305966, + "tokens_seen": 1169061888 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003261384152457372, + "loss": 2.828, + "theoretical_loss": 3.5960970417448896, + "tokens_seen": 1169127424 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003261283851554664, + "loss": 2.8237, + "theoretical_loss": 3.596078408696171, + "tokens_seen": 1169192960 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003261183550651956, + "loss": 2.7751, + "theoretical_loss": 3.5960597769842693, + "tokens_seen": 1169258496 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032610832497492477, + "loss": 2.8366, + "theoretical_loss": 3.5960411466090143, + "tokens_seen": 1169324032 + }, + { + "epoch": 14.01, + "learning_rate": 0.000326098294884654, + "loss": 2.7382, + "theoretical_loss": 3.596022517570235, + "tokens_seen": 1169389568 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032608826479438313, + "loss": 2.8016, + "theoretical_loss": 3.5960038898677604, + "tokens_seen": 1169455104 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032607823470411236, + "loss": 2.7703, + "theoretical_loss": 3.5959852635014204, + "tokens_seen": 1169520640 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003260682046138415, + "loss": 2.6621, + "theoretical_loss": 3.5959666384710434, + "tokens_seen": 1169586176 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003260581745235707, + "loss": 2.8382, + "theoretical_loss": 3.595948014776459, + "tokens_seen": 1169651712 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2780511, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8173227310180664, + "objective/train/theoretical_loss": 3.5959293924174975, + "objective/train/tokens_used": 1190177248, + "theoretical_loss": 3.5959293924174975, + "tokens_seen": 1169717248 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003260481444332999, + "loss": 2.8954, + "theoretical_loss": 3.5959293924174975, + "tokens_seen": 1169717248 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003260381143430291, + "loss": 2.8695, + "theoretical_loss": 3.595910771393987, + "tokens_seen": 1169782784 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032602808425275827, + "loss": 2.7297, + "theoretical_loss": 3.5958921517057574, + "tokens_seen": 1169848320 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032601805416248745, + "loss": 2.8018, + "theoretical_loss": 3.5958735333526386, + "tokens_seen": 1169913856 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032600802407221663, + "loss": 2.8817, + "theoretical_loss": 3.595854916334459, + "tokens_seen": 1169979392 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032599799398194587, + "loss": 2.7144, + "theoretical_loss": 3.595836300651049, + "tokens_seen": 1170044928 + }, + { + "epoch": 14.01, + "learning_rate": 0.000325987963891675, + "loss": 2.8782, + "theoretical_loss": 3.5958176863022384, + "tokens_seen": 1170110464 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032597793380140423, + "loss": 2.831, + "theoretical_loss": 3.595799073287856, + "tokens_seen": 1170176000 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003259679037111334, + "loss": 2.7836, + "theoretical_loss": 3.5957804616077316, + "tokens_seen": 1170241536 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003259578736208626, + "loss": 2.8625, + "theoretical_loss": 3.5957618512616953, + "tokens_seen": 1170307072 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003259478435305918, + "loss": 2.897, + "theoretical_loss": 3.595743242249576, + "tokens_seen": 1170372608 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032593781344032095, + "loss": 2.7103, + "theoretical_loss": 3.595724634571204, + "tokens_seen": 1170438144 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032592778335005014, + "loss": 2.7488, + "theoretical_loss": 3.5957060282264095, + "tokens_seen": 1170503680 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032591775325977937, + "loss": 2.9354, + "theoretical_loss": 3.595687423215021, + "tokens_seen": 1170569216 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003259077231695085, + "loss": 2.8869, + "theoretical_loss": 3.5956688195368693, + "tokens_seen": 1170634752 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032589769307923773, + "loss": 2.8762, + "theoretical_loss": 3.5956502171917837, + "tokens_seen": 1170700288 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032588766298896686, + "loss": 2.9036, + "theoretical_loss": 3.5956316161795945, + "tokens_seen": 1170765824 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003258776328986961, + "loss": 2.8259, + "theoretical_loss": 3.595613016500131, + "tokens_seen": 1170831360 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003258676028084253, + "loss": 2.743, + "theoretical_loss": 3.595594418153224, + "tokens_seen": 1170896896 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032585757271815446, + "loss": 2.8592, + "theoretical_loss": 3.595575821138702, + "tokens_seen": 1170962432 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032584754262788364, + "loss": 2.8623, + "theoretical_loss": 3.5955572254563966, + "tokens_seen": 1171027968 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003258375125376128, + "loss": 2.7946, + "theoretical_loss": 3.5955386311061366, + "tokens_seen": 1171093504 + }, + { + "epoch": 14.01, + "learning_rate": 0.000325827482447342, + "loss": 2.7677, + "theoretical_loss": 3.595520038087753, + "tokens_seen": 1171159040 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032581745235707124, + "loss": 2.7976, + "theoretical_loss": 3.5955014464010757, + "tokens_seen": 1171224576 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003258074222668004, + "loss": 2.8047, + "theoretical_loss": 3.595482856045934, + "tokens_seen": 1171290112 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2783389, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8318915367126465, + "objective/train/theoretical_loss": 3.595464267022159, + "objective/train/tokens_used": 1191815648, + "theoretical_loss": 3.595464267022159, + "tokens_seen": 1171355648 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003257973921765296, + "loss": 2.8632, + "theoretical_loss": 3.595464267022159, + "tokens_seen": 1171355648 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032578736208625883, + "loss": 2.8914, + "theoretical_loss": 3.5954456793295804, + "tokens_seen": 1171421184 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032577733199598796, + "loss": 2.824, + "theoretical_loss": 3.595427092968028, + "tokens_seen": 1171486720 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003257673019057172, + "loss": 2.8538, + "theoretical_loss": 3.595408507937333, + "tokens_seen": 1171552256 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003257572718154463, + "loss": 2.862, + "theoretical_loss": 3.595389924237325, + "tokens_seen": 1171617792 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032574724172517556, + "loss": 2.8108, + "theoretical_loss": 3.5953713418678346, + "tokens_seen": 1171683328 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032573721163490474, + "loss": 2.8206, + "theoretical_loss": 3.5953527608286917, + "tokens_seen": 1171748864 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003257271815446339, + "loss": 2.8631, + "theoretical_loss": 3.5953341811197275, + "tokens_seen": 1171814400 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003257171514543631, + "loss": 2.7989, + "theoretical_loss": 3.5953156027407713, + "tokens_seen": 1171879936 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003257071213640923, + "loss": 2.8464, + "theoretical_loss": 3.5952970256916545, + "tokens_seen": 1171945472 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032569709127382146, + "loss": 2.772, + "theoretical_loss": 3.5952784499722066, + "tokens_seen": 1172011008 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003256870611835507, + "loss": 2.8887, + "theoretical_loss": 3.5952598755822587, + "tokens_seen": 1172076544 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003256770310932798, + "loss": 2.7433, + "theoretical_loss": 3.5952413025216416, + "tokens_seen": 1172142080 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032566700100300906, + "loss": 2.7239, + "theoretical_loss": 3.5952227307901854, + "tokens_seen": 1172207616 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032565697091273824, + "loss": 2.8232, + "theoretical_loss": 3.5952041603877207, + "tokens_seen": 1172273152 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003256469408224674, + "loss": 2.9229, + "theoretical_loss": 3.595185591314078, + "tokens_seen": 1172338688 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003256369107321966, + "loss": 2.7252, + "theoretical_loss": 3.595167023569088, + "tokens_seen": 1172404224 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003256268806419258, + "loss": 2.7994, + "theoretical_loss": 3.5951484571525816, + "tokens_seen": 1172469760 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032561685055165497, + "loss": 2.9025, + "theoretical_loss": 3.5951298920643895, + "tokens_seen": 1172535296 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003256068204613842, + "loss": 2.836, + "theoretical_loss": 3.595111328304342, + "tokens_seen": 1172600832 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032559679037111333, + "loss": 2.8348, + "theoretical_loss": 3.5950927658722707, + "tokens_seen": 1172666368 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032558676028084256, + "loss": 2.8458, + "theoretical_loss": 3.5950742047680055, + "tokens_seen": 1172731904 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003255767301905717, + "loss": 2.8324, + "theoretical_loss": 3.5950556449913775, + "tokens_seen": 1172797440 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003255667001003009, + "loss": 2.889, + "theoretical_loss": 3.5950370865422174, + "tokens_seen": 1172862976 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003255566700100301, + "loss": 2.8157, + "theoretical_loss": 3.595018529420357, + "tokens_seen": 1172928512 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2786861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6826415061950684, + "objective/train/theoretical_loss": 3.594999973625626, + "objective/train/tokens_used": 1193454048, + "theoretical_loss": 3.594999973625626, + "tokens_seen": 1172994048 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003255466399197593, + "loss": 2.7865, + "theoretical_loss": 3.594999973625626, + "tokens_seen": 1172994048 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032553660982948847, + "loss": 2.8248, + "theoretical_loss": 3.5949814191578557, + "tokens_seen": 1173059584 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032552657973921765, + "loss": 2.8226, + "theoretical_loss": 3.5949628660168784, + "tokens_seen": 1173125120 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032551654964894683, + "loss": 2.8583, + "theoretical_loss": 3.594944314202523, + "tokens_seen": 1173190656 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032550651955867607, + "loss": 2.7447, + "theoretical_loss": 3.5949257637146217, + "tokens_seen": 1173256192 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003254964894684052, + "loss": 2.8574, + "theoretical_loss": 3.594907214553005, + "tokens_seen": 1173321728 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032548645937813443, + "loss": 2.9148, + "theoretical_loss": 3.5948886667175053, + "tokens_seen": 1173387264 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003254764292878636, + "loss": 2.8228, + "theoretical_loss": 3.5948701202079523, + "tokens_seen": 1173452800 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003254663991975928, + "loss": 2.8333, + "theoretical_loss": 3.594851575024178, + "tokens_seen": 1173518336 + }, + { + "epoch": 14.01, + "learning_rate": 0.000325456369107322, + "loss": 2.8343, + "theoretical_loss": 3.594833031166013, + "tokens_seen": 1173583872 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032544633901705115, + "loss": 2.8657, + "theoretical_loss": 3.594814488633289, + "tokens_seen": 1173649408 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032543630892678034, + "loss": 2.7023, + "theoretical_loss": 3.594795947425837, + "tokens_seen": 1173714944 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032542627883650957, + "loss": 2.9054, + "theoretical_loss": 3.5947774075434884, + "tokens_seen": 1173780480 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003254162487462387, + "loss": 2.8704, + "theoretical_loss": 3.594758868986074, + "tokens_seen": 1173846016 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032540621865596793, + "loss": 2.8163, + "theoretical_loss": 3.5947403317534263, + "tokens_seen": 1173911552 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032539618856569706, + "loss": 2.8391, + "theoretical_loss": 3.5947217958453757, + "tokens_seen": 1173977088 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003253861584754263, + "loss": 2.9262, + "theoretical_loss": 3.594703261261754, + "tokens_seen": 1174042624 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003253761283851555, + "loss": 2.9082, + "theoretical_loss": 3.5946847280023926, + "tokens_seen": 1174108160 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032536609829488466, + "loss": 2.9065, + "theoretical_loss": 3.594666196067123, + "tokens_seen": 1174173696 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032535606820461384, + "loss": 2.8108, + "theoretical_loss": 3.594647665455777, + "tokens_seen": 1174239232 + }, + { + "epoch": 14.01, + "learning_rate": 0.000325346038114343, + "loss": 2.8559, + "theoretical_loss": 3.5946291361681855, + "tokens_seen": 1174304768 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003253360080240722, + "loss": 2.8446, + "theoretical_loss": 3.5946106082041798, + "tokens_seen": 1174370304 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032532597793380144, + "loss": 2.8859, + "theoretical_loss": 3.594592081563593, + "tokens_seen": 1174435840 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032531594784353056, + "loss": 2.8435, + "theoretical_loss": 3.5945735562462557, + "tokens_seen": 1174501376 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003253059177532598, + "loss": 2.827, + "theoretical_loss": 3.594555032251999, + "tokens_seen": 1174566912 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2791794, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.842240810394287, + "objective/train/theoretical_loss": 3.5945365095806556, + "objective/train/tokens_used": 1195092448, + "theoretical_loss": 3.5945365095806556, + "tokens_seen": 1174632448 + }, + { + "epoch": 14.01, + "learning_rate": 0.000325295887662989, + "loss": 2.8521, + "theoretical_loss": 3.5945365095806556, + "tokens_seen": 1174632448 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032528585757271816, + "loss": 2.8642, + "theoretical_loss": 3.594517988232057, + "tokens_seen": 1174697984 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032527582748244734, + "loss": 2.8967, + "theoretical_loss": 3.5944994682060347, + "tokens_seen": 1174763520 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003252657973921765, + "loss": 2.8672, + "theoretical_loss": 3.5944809495024206, + "tokens_seen": 1174829056 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003252557673019057, + "loss": 2.8855, + "theoretical_loss": 3.5944624321210465, + "tokens_seen": 1174894592 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032524573721163494, + "loss": 2.8767, + "theoretical_loss": 3.5944439160617447, + "tokens_seen": 1174960128 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032523570712136407, + "loss": 2.8942, + "theoretical_loss": 3.5944254013243464, + "tokens_seen": 1175025664 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003252256770310933, + "loss": 2.7522, + "theoretical_loss": 3.5944068879086832, + "tokens_seen": 1175091200 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032521564694082243, + "loss": 2.9017, + "theoretical_loss": 3.594388375814588, + "tokens_seen": 1175156736 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032520561685055166, + "loss": 2.8965, + "theoretical_loss": 3.5943698650418927, + "tokens_seen": 1175222272 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032519558676028084, + "loss": 2.8143, + "theoretical_loss": 3.5943513555904287, + "tokens_seen": 1175287808 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032518555667001, + "loss": 2.9288, + "theoretical_loss": 3.594332847460028, + "tokens_seen": 1175353344 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003251755265797392, + "loss": 2.9366, + "theoretical_loss": 3.5943143406505227, + "tokens_seen": 1175418880 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032516549648946844, + "loss": 2.8007, + "theoretical_loss": 3.5942958351617458, + "tokens_seen": 1175484416 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032515546639919757, + "loss": 2.8173, + "theoretical_loss": 3.5942773309935285, + "tokens_seen": 1175549952 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003251454363089268, + "loss": 2.9155, + "theoretical_loss": 3.594258828145703, + "tokens_seen": 1175615488 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032513540621865593, + "loss": 2.8389, + "theoretical_loss": 3.594240326618102, + "tokens_seen": 1175681024 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032512537612838517, + "loss": 2.7877, + "theoretical_loss": 3.594221826410557, + "tokens_seen": 1175746560 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032511534603811435, + "loss": 2.8506, + "theoretical_loss": 3.5942033275229015, + "tokens_seen": 1175812096 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032510531594784353, + "loss": 2.8348, + "theoretical_loss": 3.594184829954966, + "tokens_seen": 1175877632 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003250952858575727, + "loss": 2.7769, + "theoretical_loss": 3.594166333706584, + "tokens_seen": 1175943168 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003250852557673019, + "loss": 2.8587, + "theoretical_loss": 3.5941478387775874, + "tokens_seen": 1176008704 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032507522567703107, + "loss": 2.8433, + "theoretical_loss": 3.5941293451678096, + "tokens_seen": 1176074240 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003250651955867603, + "loss": 2.7598, + "theoretical_loss": 3.594110852877081, + "tokens_seen": 1176139776 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003250551654964895, + "loss": 2.8621, + "theoretical_loss": 3.5940923619052354, + "tokens_seen": 1176205312 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2794837, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8877530097961426, + "objective/train/theoretical_loss": 3.5940738722521055, + "objective/train/tokens_used": 1196730848, + "theoretical_loss": 3.5940738722521055, + "tokens_seen": 1176270848 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032504513540621867, + "loss": 2.8975, + "theoretical_loss": 3.5940738722521055, + "tokens_seen": 1176270848 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032503510531594785, + "loss": 2.8881, + "theoretical_loss": 3.5940553839175227, + "tokens_seen": 1176336384 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032502507522567703, + "loss": 2.8426, + "theoretical_loss": 3.5940368969013208, + "tokens_seen": 1176401920 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032501504513540627, + "loss": 2.8952, + "theoretical_loss": 3.594018411203331, + "tokens_seen": 1176467456 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003250050150451354, + "loss": 2.7927, + "theoretical_loss": 3.593999926823387, + "tokens_seen": 1176532992 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032499498495486463, + "loss": 2.7759, + "theoretical_loss": 3.5939814437613213, + "tokens_seen": 1176598528 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003249849548645938, + "loss": 2.927, + "theoretical_loss": 3.593962962016965, + "tokens_seen": 1176664064 + }, + { + "epoch": 14.01, + "learning_rate": 0.000324974924774323, + "loss": 2.9143, + "theoretical_loss": 3.593944481590153, + "tokens_seen": 1176729600 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003249648946840522, + "loss": 2.7345, + "theoretical_loss": 3.5939260024807167, + "tokens_seen": 1176795136 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032495486459378135, + "loss": 2.7657, + "theoretical_loss": 3.593907524688489, + "tokens_seen": 1176860672 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032494483450351054, + "loss": 2.8821, + "theoretical_loss": 3.5938890482133026, + "tokens_seen": 1176926208 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032493480441323977, + "loss": 2.786, + "theoretical_loss": 3.593870573054991, + "tokens_seen": 1176991744 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003249247743229689, + "loss": 2.6709, + "theoretical_loss": 3.5938520992133864, + "tokens_seen": 1177057280 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032491474423269813, + "loss": 2.6863, + "theoretical_loss": 3.5938336266883217, + "tokens_seen": 1177122816 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032490471414242726, + "loss": 2.9461, + "theoretical_loss": 3.5938151554796294, + "tokens_seen": 1177188352 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003248946840521565, + "loss": 2.7837, + "theoretical_loss": 3.5937966855871433, + "tokens_seen": 1177253888 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003248846539618857, + "loss": 2.8122, + "theoretical_loss": 3.593778217010696, + "tokens_seen": 1177319424 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032487462387161486, + "loss": 2.7635, + "theoretical_loss": 3.59375974975012, + "tokens_seen": 1177384960 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032486459378134404, + "loss": 2.7756, + "theoretical_loss": 3.5937412838052487, + "tokens_seen": 1177450496 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003248545636910732, + "loss": 2.7676, + "theoretical_loss": 3.593722819175915, + "tokens_seen": 1177516032 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003248445336008024, + "loss": 2.7594, + "theoretical_loss": 3.5937043558619526, + "tokens_seen": 1177581568 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032483450351053164, + "loss": 2.8558, + "theoretical_loss": 3.593685893863194, + "tokens_seen": 1177647104 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032482447342026076, + "loss": 2.8045, + "theoretical_loss": 3.5936674331794722, + "tokens_seen": 1177712640 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032481444332999, + "loss": 2.8159, + "theoretical_loss": 3.5936489738106205, + "tokens_seen": 1177778176 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003248044132397192, + "loss": 2.6888, + "theoretical_loss": 3.5936305157564727, + "tokens_seen": 1177843712 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2797840, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.778233766555786, + "objective/train/theoretical_loss": 3.593612059016861, + "objective/train/tokens_used": 1198369248, + "theoretical_loss": 3.593612059016861, + "tokens_seen": 1177909248 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032479438314944836, + "loss": 2.7838, + "theoretical_loss": 3.593612059016861, + "tokens_seen": 1177909248 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032478435305917754, + "loss": 2.7752, + "theoretical_loss": 3.5935936035916196, + "tokens_seen": 1177974784 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003247743229689067, + "loss": 2.8149, + "theoretical_loss": 3.5935751494805808, + "tokens_seen": 1178040320 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003247642928786359, + "loss": 2.8749, + "theoretical_loss": 3.593556696683579, + "tokens_seen": 1178105856 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032475426278836514, + "loss": 2.8297, + "theoretical_loss": 3.5935382452004463, + "tokens_seen": 1178171392 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032474423269809427, + "loss": 2.8361, + "theoretical_loss": 3.5935197950310176, + "tokens_seen": 1178236928 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003247342026078235, + "loss": 2.8052, + "theoretical_loss": 3.593501346175125, + "tokens_seen": 1178302464 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032472417251755263, + "loss": 2.7769, + "theoretical_loss": 3.593482898632602, + "tokens_seen": 1178368000 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032471414242728186, + "loss": 2.8153, + "theoretical_loss": 3.5934644524032833, + "tokens_seen": 1178433536 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032470411233701104, + "loss": 2.904, + "theoretical_loss": 3.5934460074870014, + "tokens_seen": 1178499072 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003246940822467402, + "loss": 2.8469, + "theoretical_loss": 3.5934275638835897, + "tokens_seen": 1178564608 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003246840521564694, + "loss": 2.867, + "theoretical_loss": 3.593409121592882, + "tokens_seen": 1178630144 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032467402206619864, + "loss": 2.8131, + "theoretical_loss": 3.593390680614712, + "tokens_seen": 1178695680 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032466399197592777, + "loss": 2.785, + "theoretical_loss": 3.5933722409489137, + "tokens_seen": 1178761216 + }, + { + "epoch": 14.01, + "learning_rate": 0.000324653961885657, + "loss": 2.7637, + "theoretical_loss": 3.5933538025953196, + "tokens_seen": 1178826752 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032464393179538613, + "loss": 2.7858, + "theoretical_loss": 3.5933353655537648, + "tokens_seen": 1178892288 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032463390170511537, + "loss": 2.7434, + "theoretical_loss": 3.5933169298240815, + "tokens_seen": 1178957824 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032462387161484455, + "loss": 2.8671, + "theoretical_loss": 3.5932984954061045, + "tokens_seen": 1179023360 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032461384152457373, + "loss": 2.8029, + "theoretical_loss": 3.5932800622996677, + "tokens_seen": 1179088896 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003246038114343029, + "loss": 2.8582, + "theoretical_loss": 3.593261630504604, + "tokens_seen": 1179154432 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003245937813440321, + "loss": 2.795, + "theoretical_loss": 3.5932432000207477, + "tokens_seen": 1179219968 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032458375125376127, + "loss": 2.8197, + "theoretical_loss": 3.5932247708479332, + "tokens_seen": 1179285504 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003245737211634905, + "loss": 2.9215, + "theoretical_loss": 3.5932063429859937, + "tokens_seen": 1179351040 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032456369107321963, + "loss": 2.848, + "theoretical_loss": 3.593187916434763, + "tokens_seen": 1179416576 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032455366098294887, + "loss": 2.7723, + "theoretical_loss": 3.5931694911940752, + "tokens_seen": 1179482112 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2801732, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0486302375793457, + "objective/train/theoretical_loss": 3.593151067263765, + "objective/train/tokens_used": 1200007648, + "theoretical_loss": 3.593151067263765, + "tokens_seen": 1179547648 + }, + { + "epoch": 14.01, + "learning_rate": 0.000324543630892678, + "loss": 2.7804, + "theoretical_loss": 3.593151067263765, + "tokens_seen": 1179547648 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032453360080240723, + "loss": 2.6905, + "theoretical_loss": 3.5931326446436653, + "tokens_seen": 1179613184 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003245235707121364, + "loss": 2.8134, + "theoretical_loss": 3.593114223333611, + "tokens_seen": 1179678720 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003245135406218656, + "loss": 2.8555, + "theoretical_loss": 3.593095803333436, + "tokens_seen": 1179744256 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003245035105315948, + "loss": 2.8846, + "theoretical_loss": 3.5930773846429735, + "tokens_seen": 1179809792 + }, + { + "epoch": 14.01, + "learning_rate": 0.000324493480441324, + "loss": 2.7858, + "theoretical_loss": 3.593058967262059, + "tokens_seen": 1179875328 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032448345035105314, + "loss": 2.8273, + "theoretical_loss": 3.5930405511905263, + "tokens_seen": 1179940864 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003244734202607824, + "loss": 2.7483, + "theoretical_loss": 3.5930221364282087, + "tokens_seen": 1180006400 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003244633901705115, + "loss": 2.7841, + "theoretical_loss": 3.5930037229749416, + "tokens_seen": 1180071936 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032445336008024074, + "loss": 2.826, + "theoretical_loss": 3.5929853108305583, + "tokens_seen": 1180137472 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003244433299899699, + "loss": 2.9507, + "theoretical_loss": 3.5929668999948934, + "tokens_seen": 1180203008 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003244332998996991, + "loss": 2.7758, + "theoretical_loss": 3.5929484904677818, + "tokens_seen": 1180268544 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003244232698094283, + "loss": 2.8131, + "theoretical_loss": 3.5929300822490564, + "tokens_seen": 1180334080 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032441323971915746, + "loss": 2.8682, + "theoretical_loss": 3.592911675338553, + "tokens_seen": 1180399616 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032440320962888664, + "loss": 2.8649, + "theoretical_loss": 3.592893269736106, + "tokens_seen": 1180465152 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003243931795386159, + "loss": 2.824, + "theoretical_loss": 3.592874865441549, + "tokens_seen": 1180530688 + }, + { + "epoch": 14.01, + "learning_rate": 0.000324383149448345, + "loss": 2.7406, + "theoretical_loss": 3.5928564624547166, + "tokens_seen": 1180596224 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032437311935807424, + "loss": 2.8086, + "theoretical_loss": 3.5928380607754438, + "tokens_seen": 1180661760 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032436308926780337, + "loss": 2.9041, + "theoretical_loss": 3.592819660403565, + "tokens_seen": 1180727296 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003243530591775326, + "loss": 2.8375, + "theoretical_loss": 3.5928012613389146, + "tokens_seen": 1180792832 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003243430290872618, + "loss": 2.8197, + "theoretical_loss": 3.592782863581327, + "tokens_seen": 1180858368 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032433299899699096, + "loss": 2.8445, + "theoretical_loss": 3.5927644671306367, + "tokens_seen": 1180923904 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003243229689067202, + "loss": 2.8716, + "theoretical_loss": 3.592746071986679, + "tokens_seen": 1180989440 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003243129388164494, + "loss": 2.7704, + "theoretical_loss": 3.592727678149288, + "tokens_seen": 1181054976 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032430290872617856, + "loss": 2.9344, + "theoretical_loss": 3.5927092856182985, + "tokens_seen": 1181120512 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2806426, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.599151849746704, + "objective/train/theoretical_loss": 3.592690894393545, + "objective/train/tokens_used": 1201646048, + "theoretical_loss": 3.592690894393545, + "tokens_seen": 1181186048 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032429287863590774, + "loss": 2.701, + "theoretical_loss": 3.592690894393545, + "tokens_seen": 1181186048 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003242828485456369, + "loss": 2.7772, + "theoretical_loss": 3.5926725044748635, + "tokens_seen": 1181251584 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003242728184553661, + "loss": 2.8701, + "theoretical_loss": 3.5926541158620875, + "tokens_seen": 1181317120 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032426278836509534, + "loss": 2.6748, + "theoretical_loss": 3.5926357285550523, + "tokens_seen": 1181382656 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032425275827482447, + "loss": 2.7798, + "theoretical_loss": 3.5926173425535923, + "tokens_seen": 1181448192 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003242427281845537, + "loss": 2.7743, + "theoretical_loss": 3.5925989578575432, + "tokens_seen": 1181513728 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032423269809428283, + "loss": 2.8645, + "theoretical_loss": 3.59258057446674, + "tokens_seen": 1181579264 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032422266800401206, + "loss": 2.814, + "theoretical_loss": 3.592562192381016, + "tokens_seen": 1181644800 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032421263791374125, + "loss": 2.7826, + "theoretical_loss": 3.5925438116002075, + "tokens_seen": 1181710336 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003242026078234704, + "loss": 2.8288, + "theoretical_loss": 3.59252543212415, + "tokens_seen": 1181775872 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003241925777331996, + "loss": 2.9087, + "theoretical_loss": 3.5925070539526773, + "tokens_seen": 1181841408 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032418254764292884, + "loss": 2.7983, + "theoretical_loss": 3.5924886770856252, + "tokens_seen": 1181906944 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032417251755265797, + "loss": 2.7123, + "theoretical_loss": 3.5924703015228285, + "tokens_seen": 1181972480 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003241624874623872, + "loss": 2.8003, + "theoretical_loss": 3.592451927264123, + "tokens_seen": 1182038016 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032415245737211633, + "loss": 2.8791, + "theoretical_loss": 3.592433554309342, + "tokens_seen": 1182103552 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032414242728184557, + "loss": 2.9028, + "theoretical_loss": 3.592415182658323, + "tokens_seen": 1182169088 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032413239719157475, + "loss": 2.7847, + "theoretical_loss": 3.5923968123109002, + "tokens_seen": 1182234624 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032412236710130393, + "loss": 2.8396, + "theoretical_loss": 3.5923784432669086, + "tokens_seen": 1182300160 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003241123370110331, + "loss": 2.7084, + "theoretical_loss": 3.5923600755261833, + "tokens_seen": 1182365696 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003241023069207623, + "loss": 2.7842, + "theoretical_loss": 3.5923417090885605, + "tokens_seen": 1182431232 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032409227683049147, + "loss": 2.8796, + "theoretical_loss": 3.5923233439538746, + "tokens_seen": 1182496768 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003240822467402207, + "loss": 2.8214, + "theoretical_loss": 3.5923049801219618, + "tokens_seen": 1182562304 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032407221664994984, + "loss": 2.8519, + "theoretical_loss": 3.5922866175926575, + "tokens_seen": 1182627840 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032406218655967907, + "loss": 2.8367, + "theoretical_loss": 3.592268256365796, + "tokens_seen": 1182693376 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003240521564694082, + "loss": 2.873, + "theoretical_loss": 3.5922498964412135, + "tokens_seen": 1182758912 + }, + { + "epoch": 14.01, + "objective/train/docs_used": 2809462, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0026941299438477, + "objective/train/theoretical_loss": 3.592231537818746, + "objective/train/tokens_used": 1203284448, + "theoretical_loss": 3.592231537818746, + "tokens_seen": 1182824448 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032404212637913743, + "loss": 2.9499, + "theoretical_loss": 3.592231537818746, + "tokens_seen": 1182824448 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003240320962888666, + "loss": 2.8896, + "theoretical_loss": 3.592213180498228, + "tokens_seen": 1182889984 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003240220661985958, + "loss": 2.7852, + "theoretical_loss": 3.5921948244794955, + "tokens_seen": 1182955520 + }, + { + "epoch": 14.01, + "learning_rate": 0.000324012036108325, + "loss": 2.8912, + "theoretical_loss": 3.592176469762385, + "tokens_seen": 1183021056 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003240020060180542, + "loss": 2.8553, + "theoretical_loss": 3.59215811634673, + "tokens_seen": 1183086592 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032399197592778334, + "loss": 2.7966, + "theoretical_loss": 3.5921397642323676, + "tokens_seen": 1183152128 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003239819458375126, + "loss": 2.8625, + "theoretical_loss": 3.5921214134191333, + "tokens_seen": 1183217664 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003239719157472417, + "loss": 2.8498, + "theoretical_loss": 3.592103063906863, + "tokens_seen": 1183283200 + }, + { + "epoch": 14.01, + "learning_rate": 0.00032396188565697094, + "loss": 2.8853, + "theoretical_loss": 3.5920847156953917, + "tokens_seen": 1183348736 + }, + { + "epoch": 14.01, + "learning_rate": 0.0003239518555667001, + "loss": 2.8279, + "theoretical_loss": 3.5920663687845558, + "tokens_seen": 1183414272 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003239418254764293, + "loss": 2.8091, + "theoretical_loss": 3.592048023174191, + "tokens_seen": 1183479808 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003239317953861585, + "loss": 2.8417, + "theoretical_loss": 3.592029678864133, + "tokens_seen": 1183545344 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032392176529588766, + "loss": 2.9008, + "theoretical_loss": 3.5920113358542176, + "tokens_seen": 1183610880 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032391173520561684, + "loss": 2.8929, + "theoretical_loss": 3.59199299414428, + "tokens_seen": 1183676416 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003239017051153461, + "loss": 2.7911, + "theoretical_loss": 3.5919746537341575, + "tokens_seen": 1183741952 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003238916750250752, + "loss": 2.8867, + "theoretical_loss": 3.591956314623685, + "tokens_seen": 1183807488 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032388164493480444, + "loss": 2.8503, + "theoretical_loss": 3.591937976812699, + "tokens_seen": 1183873024 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032387161484453357, + "loss": 2.8693, + "theoretical_loss": 3.591919640301035, + "tokens_seen": 1183938560 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003238615847542628, + "loss": 2.7122, + "theoretical_loss": 3.5919013050885296, + "tokens_seen": 1184004096 + }, + { + "epoch": 14.02, + "learning_rate": 0.000323851554663992, + "loss": 2.9031, + "theoretical_loss": 3.5918829711750186, + "tokens_seen": 1184069632 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032384152457372116, + "loss": 2.7236, + "theoretical_loss": 3.5918646385603377, + "tokens_seen": 1184135168 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032383149448345034, + "loss": 2.8095, + "theoretical_loss": 3.5918463072443236, + "tokens_seen": 1184200704 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003238214643931796, + "loss": 2.8577, + "theoretical_loss": 3.591827977226812, + "tokens_seen": 1184266240 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003238114343029087, + "loss": 2.7857, + "theoretical_loss": 3.591809648507639, + "tokens_seen": 1184331776 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032380140421263794, + "loss": 2.8018, + "theoretical_loss": 3.5917913210866415, + "tokens_seen": 1184397312 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2814170, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.679577589035034, + "objective/train/theoretical_loss": 3.591772994963655, + "objective/train/tokens_used": 1204922848, + "theoretical_loss": 3.591772994963655, + "tokens_seen": 1184462848 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032379137412236707, + "loss": 2.8711, + "theoretical_loss": 3.591772994963655, + "tokens_seen": 1184462848 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003237813440320963, + "loss": 2.8165, + "theoretical_loss": 3.591754670138516, + "tokens_seen": 1184528384 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003237713139418255, + "loss": 2.8633, + "theoretical_loss": 3.591736346611061, + "tokens_seen": 1184593920 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032376128385155467, + "loss": 2.8471, + "theoretical_loss": 3.5917180243811258, + "tokens_seen": 1184659456 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032375125376128385, + "loss": 2.9163, + "theoretical_loss": 3.5916997034485476, + "tokens_seen": 1184724992 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032374122367101303, + "loss": 2.8764, + "theoretical_loss": 3.591681383813162, + "tokens_seen": 1184790528 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003237311935807422, + "loss": 2.876, + "theoretical_loss": 3.5916630654748056, + "tokens_seen": 1184856064 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032372116349047145, + "loss": 2.8834, + "theoretical_loss": 3.591644748433315, + "tokens_seen": 1184921600 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032371113340020057, + "loss": 2.9108, + "theoretical_loss": 3.5916264326885265, + "tokens_seen": 1184987136 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003237011033099298, + "loss": 2.8623, + "theoretical_loss": 3.591608118240277, + "tokens_seen": 1185052672 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032369107321965893, + "loss": 2.7655, + "theoretical_loss": 3.591589805088402, + "tokens_seen": 1185118208 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032368104312938817, + "loss": 2.837, + "theoretical_loss": 3.591571493232739, + "tokens_seen": 1185183744 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032367101303911735, + "loss": 2.8045, + "theoretical_loss": 3.591553182673125, + "tokens_seen": 1185249280 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032366098294884653, + "loss": 2.8505, + "theoretical_loss": 3.5915348734093957, + "tokens_seen": 1185314816 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003236509528585757, + "loss": 2.7937, + "theoretical_loss": 3.5915165654413874, + "tokens_seen": 1185380352 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032364092276830495, + "loss": 2.8739, + "theoretical_loss": 3.591498258768938, + "tokens_seen": 1185445888 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003236308926780341, + "loss": 2.7967, + "theoretical_loss": 3.5914799533918833, + "tokens_seen": 1185511424 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003236208625877633, + "loss": 2.7457, + "theoretical_loss": 3.5914616493100606, + "tokens_seen": 1185576960 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032361083249749244, + "loss": 2.6685, + "theoretical_loss": 3.591443346523306, + "tokens_seen": 1185642496 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003236008024072217, + "loss": 2.8109, + "theoretical_loss": 3.591425045031457, + "tokens_seen": 1185708032 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032359077231695085, + "loss": 2.8187, + "theoretical_loss": 3.59140674483435, + "tokens_seen": 1185773568 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032358074222668004, + "loss": 2.8155, + "theoretical_loss": 3.5913884459318215, + "tokens_seen": 1185839104 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032357071213640927, + "loss": 2.8453, + "theoretical_loss": 3.5913701483237093, + "tokens_seen": 1185904640 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003235606820461384, + "loss": 2.8642, + "theoretical_loss": 3.5913518520098497, + "tokens_seen": 1185970176 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032355065195586763, + "loss": 2.8647, + "theoretical_loss": 3.5913335569900804, + "tokens_seen": 1186035712 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2817204, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.965890407562256, + "objective/train/theoretical_loss": 3.591315263264237, + "objective/train/tokens_used": 1206561248, + "theoretical_loss": 3.591315263264237, + "tokens_seen": 1186101248 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003235406218655968, + "loss": 2.9142, + "theoretical_loss": 3.591315263264237, + "tokens_seen": 1186101248 + }, + { + "epoch": 14.02, + "learning_rate": 0.000323530591775326, + "loss": 2.8414, + "theoretical_loss": 3.5912969708321576, + "tokens_seen": 1186166784 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003235205616850552, + "loss": 2.8125, + "theoretical_loss": 3.5912786796936786, + "tokens_seen": 1186232320 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003235105315947844, + "loss": 2.7599, + "theoretical_loss": 3.5912603898486375, + "tokens_seen": 1186297856 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032350050150451354, + "loss": 2.7897, + "theoretical_loss": 3.5912421012968716, + "tokens_seen": 1186363392 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003234904714142428, + "loss": 2.8056, + "theoretical_loss": 3.5912238140382167, + "tokens_seen": 1186428928 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003234804413239719, + "loss": 2.882, + "theoretical_loss": 3.5912055280725115, + "tokens_seen": 1186494464 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032347041123370114, + "loss": 2.8992, + "theoretical_loss": 3.5911872433995926, + "tokens_seen": 1186560000 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003234603811434303, + "loss": 2.8553, + "theoretical_loss": 3.5911689600192975, + "tokens_seen": 1186625536 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003234503510531595, + "loss": 2.7959, + "theoretical_loss": 3.591150677931463, + "tokens_seen": 1186691072 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003234403209628887, + "loss": 2.8156, + "theoretical_loss": 3.591132397135926, + "tokens_seen": 1186756608 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032343029087261786, + "loss": 2.8377, + "theoretical_loss": 3.5911141176325243, + "tokens_seen": 1186822144 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032342026078234704, + "loss": 2.9284, + "theoretical_loss": 3.5910958394210954, + "tokens_seen": 1186887680 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003234102306920763, + "loss": 2.7802, + "theoretical_loss": 3.591077562501477, + "tokens_seen": 1186953216 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003234002006018054, + "loss": 2.7944, + "theoretical_loss": 3.591059286873505, + "tokens_seen": 1187018752 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032339017051153464, + "loss": 2.832, + "theoretical_loss": 3.5910410125370182, + "tokens_seen": 1187084288 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032338014042126377, + "loss": 2.9266, + "theoretical_loss": 3.5910227394918537, + "tokens_seen": 1187149824 + }, + { + "epoch": 14.02, + "learning_rate": 0.000323370110330993, + "loss": 2.8222, + "theoretical_loss": 3.5910044677378483, + "tokens_seen": 1187215360 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003233600802407222, + "loss": 2.868, + "theoretical_loss": 3.590986197274841, + "tokens_seen": 1187280896 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032335005015045136, + "loss": 2.9277, + "theoretical_loss": 3.5909679281026676, + "tokens_seen": 1187346432 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032334002006018054, + "loss": 2.8325, + "theoretical_loss": 3.590949660221167, + "tokens_seen": 1187411968 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003233299899699098, + "loss": 2.7744, + "theoretical_loss": 3.590931393630176, + "tokens_seen": 1187477504 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003233299899699098, + "loss": 2.8532, + "theoretical_loss": 3.5909131283295324, + "tokens_seen": 1187543040 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003233199598796389, + "loss": 2.9011, + "theoretical_loss": 3.590894864319074, + "tokens_seen": 1187608576 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032330992978936814, + "loss": 2.8111, + "theoretical_loss": 3.590876601598638, + "tokens_seen": 1187674112 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2821179, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8013556003570557, + "objective/train/theoretical_loss": 3.5908583401680634, + "objective/train/tokens_used": 1208199648, + "theoretical_loss": 3.5908583401680634, + "tokens_seen": 1187739648 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032329989969909727, + "loss": 2.814, + "theoretical_loss": 3.5908583401680634, + "tokens_seen": 1187739648 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003232898696088265, + "loss": 2.7595, + "theoretical_loss": 3.5908400800271867, + "tokens_seen": 1187805184 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003232798395185557, + "loss": 2.926, + "theoretical_loss": 3.5908218211758456, + "tokens_seen": 1187870720 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032326980942828487, + "loss": 2.8393, + "theoretical_loss": 3.590803563613879, + "tokens_seen": 1187936256 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032325977933801405, + "loss": 2.8835, + "theoretical_loss": 3.5907853073411236, + "tokens_seen": 1188001792 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032324974924774323, + "loss": 2.7928, + "theoretical_loss": 3.5907670523574176, + "tokens_seen": 1188067328 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003232397191574724, + "loss": 2.8369, + "theoretical_loss": 3.5907487986625988, + "tokens_seen": 1188132864 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032322968906720165, + "loss": 2.8189, + "theoretical_loss": 3.5907305462565056, + "tokens_seen": 1188198400 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032321965897693077, + "loss": 2.8084, + "theoretical_loss": 3.5907122951389754, + "tokens_seen": 1188263936 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032320962888666, + "loss": 2.8654, + "theoretical_loss": 3.5906940453098466, + "tokens_seen": 1188329472 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032319959879638913, + "loss": 2.9013, + "theoretical_loss": 3.5906757967689567, + "tokens_seen": 1188395008 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032318956870611837, + "loss": 2.8281, + "theoretical_loss": 3.590657549516145, + "tokens_seen": 1188460544 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032317953861584755, + "loss": 2.9075, + "theoretical_loss": 3.5906393035512476, + "tokens_seen": 1188526080 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032316950852557673, + "loss": 2.7996, + "theoretical_loss": 3.5906210588741034, + "tokens_seen": 1188591616 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003231594784353059, + "loss": 2.8479, + "theoretical_loss": 3.590602815484551, + "tokens_seen": 1188657152 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032314944834503515, + "loss": 2.8676, + "theoretical_loss": 3.5905845733824284, + "tokens_seen": 1188722688 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003231394182547643, + "loss": 2.861, + "theoretical_loss": 3.5905663325675734, + "tokens_seen": 1188788224 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003231293881644935, + "loss": 2.8699, + "theoretical_loss": 3.590548093039825, + "tokens_seen": 1188853760 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032311935807422264, + "loss": 2.7708, + "theoretical_loss": 3.59052985479902, + "tokens_seen": 1188919296 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003231093279839519, + "loss": 2.8875, + "theoretical_loss": 3.5905116178449976, + "tokens_seen": 1188984832 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032309929789368105, + "loss": 2.8513, + "theoretical_loss": 3.5904933821775966, + "tokens_seen": 1189050368 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032308926780341024, + "loss": 2.8275, + "theoretical_loss": 3.5904751477966546, + "tokens_seen": 1189115904 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003230792377131394, + "loss": 2.9283, + "theoretical_loss": 3.5904569147020093, + "tokens_seen": 1189181440 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003230692076228686, + "loss": 2.8444, + "theoretical_loss": 3.5904386828935007, + "tokens_seen": 1189246976 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003230591775325978, + "loss": 2.7215, + "theoretical_loss": 3.590420452370966, + "tokens_seen": 1189312512 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2825974, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.653865098953247, + "objective/train/theoretical_loss": 3.590402223134244, + "objective/train/tokens_used": 1209838048, + "theoretical_loss": 3.590402223134244, + "tokens_seen": 1189378048 + }, + { + "epoch": 14.02, + "learning_rate": 0.000323049147442327, + "loss": 2.8304, + "theoretical_loss": 3.590402223134244, + "tokens_seen": 1189378048 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032303911735205614, + "loss": 2.7989, + "theoretical_loss": 3.5903839951831733, + "tokens_seen": 1189443584 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003230290872617854, + "loss": 2.7885, + "theoretical_loss": 3.590365768517592, + "tokens_seen": 1189509120 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003230190571715145, + "loss": 2.814, + "theoretical_loss": 3.590347543137339, + "tokens_seen": 1189574656 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032300902708124374, + "loss": 2.8444, + "theoretical_loss": 3.590329319042253, + "tokens_seen": 1189640192 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003229989969909729, + "loss": 2.7733, + "theoretical_loss": 3.5903110962321723, + "tokens_seen": 1189705728 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003229889669007021, + "loss": 2.8189, + "theoretical_loss": 3.5902928747069356, + "tokens_seen": 1189771264 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003229789368104313, + "loss": 2.8151, + "theoretical_loss": 3.5902746544663815, + "tokens_seen": 1189836800 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003229689067201605, + "loss": 2.7552, + "theoretical_loss": 3.590256435510349, + "tokens_seen": 1189902336 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032295887662988964, + "loss": 2.8197, + "theoretical_loss": 3.5902382178386754, + "tokens_seen": 1189967872 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003229488465396189, + "loss": 2.8737, + "theoretical_loss": 3.5902200014512013, + "tokens_seen": 1190033408 + }, + { + "epoch": 14.02, + "learning_rate": 0.000322938816449348, + "loss": 2.7744, + "theoretical_loss": 3.590201786347764, + "tokens_seen": 1190098944 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032292878635907724, + "loss": 2.92, + "theoretical_loss": 3.590183572528204, + "tokens_seen": 1190164480 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003229187562688064, + "loss": 2.7973, + "theoretical_loss": 3.5901653599923584, + "tokens_seen": 1190230016 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003229087261785356, + "loss": 2.6891, + "theoretical_loss": 3.5901471487400674, + "tokens_seen": 1190295552 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003228986960882648, + "loss": 2.8808, + "theoretical_loss": 3.5901289387711683, + "tokens_seen": 1190361088 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032288866599799397, + "loss": 2.8089, + "theoretical_loss": 3.5901107300855015, + "tokens_seen": 1190426624 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032287863590772315, + "loss": 2.7186, + "theoretical_loss": 3.590092522682905, + "tokens_seen": 1190492160 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003228686058174524, + "loss": 2.8713, + "theoretical_loss": 3.590074316563218, + "tokens_seen": 1190557696 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003228585757271815, + "loss": 2.9312, + "theoretical_loss": 3.59005611172628, + "tokens_seen": 1190623232 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032284854563691074, + "loss": 2.8473, + "theoretical_loss": 3.5900379081719294, + "tokens_seen": 1190688768 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003228385155466399, + "loss": 2.9835, + "theoretical_loss": 3.5900197059000054, + "tokens_seen": 1190754304 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003228284854563691, + "loss": 2.7207, + "theoretical_loss": 3.5900015049103473, + "tokens_seen": 1190819840 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032281845536609834, + "loss": 2.8353, + "theoretical_loss": 3.589983305202794, + "tokens_seen": 1190885376 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032280842527582747, + "loss": 2.812, + "theoretical_loss": 3.5899651067771847, + "tokens_seen": 1190950912 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2828773, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8685131072998047, + "objective/train/theoretical_loss": 3.5899469096333583, + "objective/train/tokens_used": 1211476448, + "theoretical_loss": 3.5899469096333583, + "tokens_seen": 1191016448 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003227983951855567, + "loss": 2.7842, + "theoretical_loss": 3.5899469096333583, + "tokens_seen": 1191016448 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003227883650952859, + "loss": 2.7977, + "theoretical_loss": 3.5899287137711546, + "tokens_seen": 1191081984 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032277833500501507, + "loss": 2.8262, + "theoretical_loss": 3.589910519190412, + "tokens_seen": 1191147520 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032276830491474425, + "loss": 2.8393, + "theoretical_loss": 3.589892325890971, + "tokens_seen": 1191213056 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032275827482447343, + "loss": 2.7352, + "theoretical_loss": 3.58987413387267, + "tokens_seen": 1191278592 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003227482447342026, + "loss": 2.8606, + "theoretical_loss": 3.589855943135348, + "tokens_seen": 1191344128 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032273821464393185, + "loss": 2.8287, + "theoretical_loss": 3.589837753678845, + "tokens_seen": 1191409664 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032272818455366097, + "loss": 2.8241, + "theoretical_loss": 3.5898195655030003, + "tokens_seen": 1191475200 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003227181544633902, + "loss": 2.782, + "theoretical_loss": 3.5898013786076524, + "tokens_seen": 1191540736 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032270812437311933, + "loss": 2.8625, + "theoretical_loss": 3.589783192992642, + "tokens_seen": 1191606272 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032269809428284857, + "loss": 2.8235, + "theoretical_loss": 3.5897650086578086, + "tokens_seen": 1191671808 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032268806419257775, + "loss": 2.7877, + "theoretical_loss": 3.5897468256029903, + "tokens_seen": 1191737344 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032267803410230693, + "loss": 2.8741, + "theoretical_loss": 3.5897286438280274, + "tokens_seen": 1191802880 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003226680040120361, + "loss": 2.8327, + "theoretical_loss": 3.58971046333276, + "tokens_seen": 1191868416 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032265797392176535, + "loss": 2.7357, + "theoretical_loss": 3.589692284117027, + "tokens_seen": 1191933952 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003226479438314945, + "loss": 2.8642, + "theoretical_loss": 3.589674106180668, + "tokens_seen": 1191999488 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003226379137412237, + "loss": 2.7859, + "theoretical_loss": 3.5896559295235226, + "tokens_seen": 1192065024 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032262788365095284, + "loss": 2.9663, + "theoretical_loss": 3.589637754145431, + "tokens_seen": 1192130560 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003226178535606821, + "loss": 2.8246, + "theoretical_loss": 3.5896195800462323, + "tokens_seen": 1192196096 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032260782347041125, + "loss": 2.7879, + "theoretical_loss": 3.5896014072257665, + "tokens_seen": 1192261632 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032259779338014044, + "loss": 2.7984, + "theoretical_loss": 3.5895832356838735, + "tokens_seen": 1192327168 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003225877632898696, + "loss": 2.7702, + "theoretical_loss": 3.5895650654203926, + "tokens_seen": 1192392704 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003225777331995988, + "loss": 2.8252, + "theoretical_loss": 3.589546896435164, + "tokens_seen": 1192458240 + }, + { + "epoch": 14.02, + "learning_rate": 0.000322567703109328, + "loss": 2.9238, + "theoretical_loss": 3.5895287287280273, + "tokens_seen": 1192523776 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003225576730190572, + "loss": 2.9045, + "theoretical_loss": 3.5895105622988224, + "tokens_seen": 1192589312 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2833395, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8013877868652344, + "objective/train/theoretical_loss": 3.5894923971473895, + "objective/train/tokens_used": 1213114848, + "theoretical_loss": 3.5894923971473895, + "tokens_seen": 1192654848 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032254764292878634, + "loss": 2.7521, + "theoretical_loss": 3.5894923971473895, + "tokens_seen": 1192654848 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003225376128385156, + "loss": 2.7433, + "theoretical_loss": 3.5894742332735685, + "tokens_seen": 1192720384 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003225275827482447, + "loss": 2.8609, + "theoretical_loss": 3.5894560706771985, + "tokens_seen": 1192785920 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032251755265797394, + "loss": 2.8617, + "theoretical_loss": 3.5894379093581206, + "tokens_seen": 1192851456 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003225075225677031, + "loss": 2.8265, + "theoretical_loss": 3.589419749316174, + "tokens_seen": 1192916992 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003224974924774323, + "loss": 2.9037, + "theoretical_loss": 3.589401590551199, + "tokens_seen": 1192982528 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003224874623871615, + "loss": 2.9683, + "theoretical_loss": 3.589383433063036, + "tokens_seen": 1193048064 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003224774322968907, + "loss": 2.7031, + "theoretical_loss": 3.5893652768515247, + "tokens_seen": 1193113600 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032246740220661984, + "loss": 2.8704, + "theoretical_loss": 3.5893471219165054, + "tokens_seen": 1193179136 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003224573721163491, + "loss": 2.8717, + "theoretical_loss": 3.589328968257818, + "tokens_seen": 1193244672 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003224473420260782, + "loss": 2.8625, + "theoretical_loss": 3.5893108158753027, + "tokens_seen": 1193310208 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032243731193580744, + "loss": 2.8162, + "theoretical_loss": 3.5892926647688004, + "tokens_seen": 1193375744 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003224272818455366, + "loss": 2.8384, + "theoretical_loss": 3.5892745149381504, + "tokens_seen": 1193441280 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003224172517552658, + "loss": 2.8923, + "theoretical_loss": 3.5892563663831933, + "tokens_seen": 1193506816 + }, + { + "epoch": 14.02, + "learning_rate": 0.000322407221664995, + "loss": 2.8899, + "theoretical_loss": 3.5892382191037697, + "tokens_seen": 1193572352 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032239719157472417, + "loss": 2.7623, + "theoretical_loss": 3.5892200730997192, + "tokens_seen": 1193637888 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032238716148445335, + "loss": 2.8411, + "theoretical_loss": 3.5892019283708834, + "tokens_seen": 1193703424 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003223771313941826, + "loss": 2.8386, + "theoretical_loss": 3.5891837849171013, + "tokens_seen": 1193768960 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003223671013039117, + "loss": 2.9057, + "theoretical_loss": 3.589165642738214, + "tokens_seen": 1193834496 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032235707121364095, + "loss": 2.8811, + "theoretical_loss": 3.589147501834062, + "tokens_seen": 1193900032 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003223470411233701, + "loss": 2.8881, + "theoretical_loss": 3.5891293622044858, + "tokens_seen": 1193965568 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003223370110330993, + "loss": 2.8937, + "theoretical_loss": 3.5891112238493257, + "tokens_seen": 1194031104 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003223269809428285, + "loss": 2.8645, + "theoretical_loss": 3.589093086768422, + "tokens_seen": 1194096640 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032231695085255767, + "loss": 2.8365, + "theoretical_loss": 3.5890749509616153, + "tokens_seen": 1194162176 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032230692076228685, + "loss": 2.8282, + "theoretical_loss": 3.5890568164287466, + "tokens_seen": 1194227712 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.704463005065918, + "objective/train/theoretical_loss": 3.5890386831696564, + "objective/train/tokens_used": 1214753248, + "theoretical_loss": 3.5890386831696564, + "tokens_seen": 1194293248 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003222968906720161, + "loss": 2.7978, + "theoretical_loss": 3.5890386831696564, + "tokens_seen": 1194293248 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003222868605817452, + "loss": 2.8452, + "theoretical_loss": 3.589020551184185, + "tokens_seen": 1194358784 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032227683049147445, + "loss": 2.868, + "theoretical_loss": 3.5890024204721733, + "tokens_seen": 1194424320 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003222668004012036, + "loss": 2.859, + "theoretical_loss": 3.588984291033462, + "tokens_seen": 1194489856 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003222567703109328, + "loss": 2.9105, + "theoretical_loss": 3.588966162867892, + "tokens_seen": 1194555392 + }, + { + "epoch": 14.02, + "learning_rate": 0.000322246740220662, + "loss": 2.8516, + "theoretical_loss": 3.588948035975304, + "tokens_seen": 1194620928 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032223671013039117, + "loss": 2.7591, + "theoretical_loss": 3.5889299103555388, + "tokens_seen": 1194686464 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032222668004012035, + "loss": 2.8305, + "theoretical_loss": 3.5889117860084365, + "tokens_seen": 1194752000 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032221664994984953, + "loss": 2.8116, + "theoretical_loss": 3.5888936629338386, + "tokens_seen": 1194817536 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003222066198595787, + "loss": 2.8058, + "theoretical_loss": 3.588875541131586, + "tokens_seen": 1194883072 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032219658976930795, + "loss": 2.8294, + "theoretical_loss": 3.58885742060152, + "tokens_seen": 1194948608 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003221865596790371, + "loss": 2.8971, + "theoretical_loss": 3.58883930134348, + "tokens_seen": 1195014144 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003221765295887663, + "loss": 2.9008, + "theoretical_loss": 3.5888211833573083, + "tokens_seen": 1195079680 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003221664994984955, + "loss": 2.8156, + "theoretical_loss": 3.588803066642846, + "tokens_seen": 1195145216 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003221564694082247, + "loss": 2.8321, + "theoretical_loss": 3.588784951199933, + "tokens_seen": 1195210752 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032214643931795386, + "loss": 2.7785, + "theoretical_loss": 3.5887668370284116, + "tokens_seen": 1195276288 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032213640922768304, + "loss": 2.8661, + "theoretical_loss": 3.5887487241281217, + "tokens_seen": 1195341824 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003221263791374122, + "loss": 2.9117, + "theoretical_loss": 3.588730612498905, + "tokens_seen": 1195407360 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032211634904714145, + "loss": 2.8436, + "theoretical_loss": 3.5887125021406026, + "tokens_seen": 1195472896 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003221063189568706, + "loss": 2.795, + "theoretical_loss": 3.588694393053056, + "tokens_seen": 1195538432 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003220962888665998, + "loss": 2.8443, + "theoretical_loss": 3.5886762852361054, + "tokens_seen": 1195603968 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032208625877632894, + "loss": 2.7703, + "theoretical_loss": 3.588658178689593, + "tokens_seen": 1195669504 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003220762286860582, + "loss": 2.8248, + "theoretical_loss": 3.5886400734133597, + "tokens_seen": 1195735040 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003220661985957874, + "loss": 2.7481, + "theoretical_loss": 3.5886219694072468, + "tokens_seen": 1195800576 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032205616850551654, + "loss": 2.7986, + "theoretical_loss": 3.5886038666710953, + "tokens_seen": 1195866112 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9018940925598145, + "objective/train/theoretical_loss": 3.588585765204747, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.588585765204747, + "tokens_seen": 1195931648 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003220461384152458, + "loss": 2.9042, + "theoretical_loss": 3.588585765204747, + "tokens_seen": 1195931648 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003220361083249749, + "loss": 2.8445, + "theoretical_loss": 3.5885676650080427, + "tokens_seen": 1195997184 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032202607823470414, + "loss": 2.8317, + "theoretical_loss": 3.588549566080824, + "tokens_seen": 1196062720 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003220160481444333, + "loss": 2.9313, + "theoretical_loss": 3.5885314684229326, + "tokens_seen": 1196128256 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003220060180541625, + "loss": 2.8806, + "theoretical_loss": 3.5885133720342095, + "tokens_seen": 1196193792 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003219959879638917, + "loss": 2.7765, + "theoretical_loss": 3.5884952769144967, + "tokens_seen": 1196259328 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003219859578736209, + "loss": 2.8185, + "theoretical_loss": 3.5884771830636355, + "tokens_seen": 1196324864 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032197592778335004, + "loss": 2.7117, + "theoretical_loss": 3.588459090481467, + "tokens_seen": 1196390400 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003219658976930793, + "loss": 2.7757, + "theoretical_loss": 3.588440999167833, + "tokens_seen": 1196455936 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003219558676028084, + "loss": 2.7648, + "theoretical_loss": 3.588422909122576, + "tokens_seen": 1196521472 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032194583751253764, + "loss": 2.8233, + "theoretical_loss": 3.588404820345536, + "tokens_seen": 1196587008 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003219358074222668, + "loss": 2.8743, + "theoretical_loss": 3.5883867328365557, + "tokens_seen": 1196652544 + }, + { + "epoch": 14.02, + "learning_rate": 0.000321925777331996, + "loss": 2.8089, + "theoretical_loss": 3.588368646595476, + "tokens_seen": 1196718080 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003219157472417252, + "loss": 2.9058, + "theoretical_loss": 3.588350561622139, + "tokens_seen": 1196783616 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032190571715145437, + "loss": 2.8785, + "theoretical_loss": 3.5883324779163877, + "tokens_seen": 1196849152 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032189568706118355, + "loss": 2.7645, + "theoretical_loss": 3.5883143954780614, + "tokens_seen": 1196914688 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003218856569709128, + "loss": 2.9517, + "theoretical_loss": 3.5882963143070037, + "tokens_seen": 1196980224 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003218756268806419, + "loss": 2.8219, + "theoretical_loss": 3.588278234403056, + "tokens_seen": 1197045760 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032186559679037115, + "loss": 2.7996, + "theoretical_loss": 3.5882601557660596, + "tokens_seen": 1197111296 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003218555667001003, + "loss": 2.941, + "theoretical_loss": 3.5882420783958566, + "tokens_seen": 1197176832 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003218455366098295, + "loss": 2.9045, + "theoretical_loss": 3.5882240022922893, + "tokens_seen": 1197242368 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003218355065195587, + "loss": 2.9493, + "theoretical_loss": 3.588205927455199, + "tokens_seen": 1197307904 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032182547642928787, + "loss": 2.8898, + "theoretical_loss": 3.5881878538844285, + "tokens_seen": 1197373440 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032181544633901705, + "loss": 2.7976, + "theoretical_loss": 3.5881697815798193, + "tokens_seen": 1197438976 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003218054162487463, + "loss": 2.8096, + "theoretical_loss": 3.5881517105412133, + "tokens_seen": 1197504512 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9191172122955322, + "objective/train/theoretical_loss": 3.5881336407684525, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.5881336407684525, + "tokens_seen": 1197570048 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003217953861584754, + "loss": 2.9392, + "theoretical_loss": 3.5881336407684525, + "tokens_seen": 1197570048 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032178535606820465, + "loss": 2.9166, + "theoretical_loss": 3.588115572261379, + "tokens_seen": 1197635584 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003217753259779338, + "loss": 2.8602, + "theoretical_loss": 3.588097505019835, + "tokens_seen": 1197701120 + }, + { + "epoch": 14.02, + "learning_rate": 0.000321765295887663, + "loss": 2.7534, + "theoretical_loss": 3.5880794390436628, + "tokens_seen": 1197766656 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003217552657973922, + "loss": 2.7839, + "theoretical_loss": 3.588061374332704, + "tokens_seen": 1197832192 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032174523570712137, + "loss": 2.8493, + "theoretical_loss": 3.588043310886802, + "tokens_seen": 1197897728 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032173520561685055, + "loss": 2.8284, + "theoretical_loss": 3.5880252487057973, + "tokens_seen": 1197963264 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032172517552657974, + "loss": 2.8164, + "theoretical_loss": 3.5880071877895334, + "tokens_seen": 1198028800 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003217151454363089, + "loss": 2.7821, + "theoretical_loss": 3.5879891281378518, + "tokens_seen": 1198094336 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032170511534603815, + "loss": 2.707, + "theoretical_loss": 3.5879710697505955, + "tokens_seen": 1198159872 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003216950852557673, + "loss": 2.7299, + "theoretical_loss": 3.5879530126276062, + "tokens_seen": 1198225408 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003216850551654965, + "loss": 2.7839, + "theoretical_loss": 3.5879349567687266, + "tokens_seen": 1198290944 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003216750250752257, + "loss": 2.7671, + "theoretical_loss": 3.5879169021737987, + "tokens_seen": 1198356480 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003216649949849549, + "loss": 2.8965, + "theoretical_loss": 3.5878988488426655, + "tokens_seen": 1198422016 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032165496489468406, + "loss": 2.9114, + "theoretical_loss": 3.5878807967751696, + "tokens_seen": 1198487552 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032164493480441324, + "loss": 2.8071, + "theoretical_loss": 3.5878627459711523, + "tokens_seen": 1198553088 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003216349047141424, + "loss": 2.9071, + "theoretical_loss": 3.587844696430457, + "tokens_seen": 1198618624 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032162487462387165, + "loss": 2.8955, + "theoretical_loss": 3.5878266481529257, + "tokens_seen": 1198684160 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003216148445336008, + "loss": 2.9325, + "theoretical_loss": 3.587808601138401, + "tokens_seen": 1198749696 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032160481444333, + "loss": 2.8722, + "theoretical_loss": 3.587790555386727, + "tokens_seen": 1198815232 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032159478435305914, + "loss": 2.8289, + "theoretical_loss": 3.587772510897744, + "tokens_seen": 1198880768 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003215847542627884, + "loss": 2.932, + "theoretical_loss": 3.587754467671296, + "tokens_seen": 1198946304 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032157472417251756, + "loss": 2.7537, + "theoretical_loss": 3.5877364257072246, + "tokens_seen": 1199011840 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032156469408224674, + "loss": 2.7929, + "theoretical_loss": 3.5877183850053735, + "tokens_seen": 1199077376 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003215546639919759, + "loss": 2.8197, + "theoretical_loss": 3.5877003455655854, + "tokens_seen": 1199142912 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.878288507461548, + "objective/train/theoretical_loss": 3.5876823073877024, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.5876823073877024, + "tokens_seen": 1199208448 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003215446339017051, + "loss": 2.8088, + "theoretical_loss": 3.5876823073877024, + "tokens_seen": 1199208448 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003215346038114343, + "loss": 2.8354, + "theoretical_loss": 3.587664270471568, + "tokens_seen": 1199273984 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003215245737211635, + "loss": 2.7871, + "theoretical_loss": 3.587646234817024, + "tokens_seen": 1199339520 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032151454363089265, + "loss": 2.8545, + "theoretical_loss": 3.5876282004239144, + "tokens_seen": 1199405056 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003215045135406219, + "loss": 2.8946, + "theoretical_loss": 3.5876101672920813, + "tokens_seen": 1199470592 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032149448345035106, + "loss": 2.9226, + "theoretical_loss": 3.5875921354213682, + "tokens_seen": 1199536128 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032148445336008024, + "loss": 2.8492, + "theoretical_loss": 3.587574104811617, + "tokens_seen": 1199601664 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003214744232698094, + "loss": 2.8201, + "theoretical_loss": 3.587556075462671, + "tokens_seen": 1199667200 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003214643931795386, + "loss": 2.7375, + "theoretical_loss": 3.587538047374374, + "tokens_seen": 1199732736 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003214543630892678, + "loss": 2.8259, + "theoretical_loss": 3.5875200205465685, + "tokens_seen": 1199798272 + }, + { + "epoch": 14.02, + "learning_rate": 0.000321444332998997, + "loss": 2.849, + "theoretical_loss": 3.587501994979097, + "tokens_seen": 1199863808 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032143430290872615, + "loss": 2.9424, + "theoretical_loss": 3.587483970671803, + "tokens_seen": 1199929344 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003214242728184554, + "loss": 2.9107, + "theoretical_loss": 3.58746594762453, + "tokens_seen": 1199994880 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003214142427281845, + "loss": 2.8235, + "theoretical_loss": 3.58744792583712, + "tokens_seen": 1200060416 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032140421263791375, + "loss": 2.7813, + "theoretical_loss": 3.587429905309417, + "tokens_seen": 1200125952 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032139418254764293, + "loss": 2.8888, + "theoretical_loss": 3.5874118860412643, + "tokens_seen": 1200191488 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003213841524573721, + "loss": 2.8058, + "theoretical_loss": 3.5873938680325046, + "tokens_seen": 1200257024 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003213741223671013, + "loss": 2.8521, + "theoretical_loss": 3.5873758512829808, + "tokens_seen": 1200322560 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003213640922768305, + "loss": 2.8616, + "theoretical_loss": 3.5873578357925373, + "tokens_seen": 1200388096 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032135406218655965, + "loss": 2.8573, + "theoretical_loss": 3.5873398215610166, + "tokens_seen": 1200453632 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003213440320962889, + "loss": 2.7686, + "theoretical_loss": 3.587321808588262, + "tokens_seen": 1200519168 + }, + { + "epoch": 14.02, + "learning_rate": 0.000321334002006018, + "loss": 2.87, + "theoretical_loss": 3.587303796874117, + "tokens_seen": 1200584704 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032132397191574725, + "loss": 2.9294, + "theoretical_loss": 3.5872857864184247, + "tokens_seen": 1200650240 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003213139418254765, + "loss": 2.8262, + "theoretical_loss": 3.5872677772210286, + "tokens_seen": 1200715776 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003213039117352056, + "loss": 2.7782, + "theoretical_loss": 3.5872497692817724, + "tokens_seen": 1200781312 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.007293224334717, + "objective/train/theoretical_loss": 3.5872317626004993, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.5872317626004993, + "tokens_seen": 1200846848 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032129388164493485, + "loss": 2.8958, + "theoretical_loss": 3.5872317626004993, + "tokens_seen": 1200846848 + }, + { + "epoch": 14.02, + "learning_rate": 0.000321283851554664, + "loss": 2.8524, + "theoretical_loss": 3.587213757177053, + "tokens_seen": 1200912384 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003212738214643932, + "loss": 2.9125, + "theoretical_loss": 3.587195753011277, + "tokens_seen": 1200977920 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003212637913741224, + "loss": 2.9305, + "theoretical_loss": 3.587177750103014, + "tokens_seen": 1201043456 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003212537612838516, + "loss": 2.849, + "theoretical_loss": 3.587159748452109, + "tokens_seen": 1201108992 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032124373119358075, + "loss": 2.8228, + "theoretical_loss": 3.5871417480584045, + "tokens_seen": 1201174528 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032123370110330994, + "loss": 2.8789, + "theoretical_loss": 3.5871237489217442, + "tokens_seen": 1201240064 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003212236710130391, + "loss": 2.9106, + "theoretical_loss": 3.587105751041972, + "tokens_seen": 1201305600 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032121364092276835, + "loss": 2.8793, + "theoretical_loss": 3.587087754418932, + "tokens_seen": 1201371136 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003212036108324975, + "loss": 2.9062, + "theoretical_loss": 3.5870697590524667, + "tokens_seen": 1201436672 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003211935807422267, + "loss": 2.8215, + "theoretical_loss": 3.587051764942421, + "tokens_seen": 1201502208 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003211835506519559, + "loss": 2.8579, + "theoretical_loss": 3.5870337720886383, + "tokens_seen": 1201567744 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003211735205616851, + "loss": 2.8226, + "theoretical_loss": 3.5870157804909626, + "tokens_seen": 1201633280 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032116349047141426, + "loss": 2.8257, + "theoretical_loss": 3.586997790149237, + "tokens_seen": 1201698816 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032115346038114344, + "loss": 2.8359, + "theoretical_loss": 3.586979801063306, + "tokens_seen": 1201764352 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003211434302908726, + "loss": 2.8034, + "theoretical_loss": 3.586961813233012, + "tokens_seen": 1201829888 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032113340020060185, + "loss": 2.8494, + "theoretical_loss": 3.5869438266582017, + "tokens_seen": 1201895424 + }, + { + "epoch": 14.02, + "learning_rate": 0.000321123370110331, + "loss": 2.9493, + "theoretical_loss": 3.586925841338717, + "tokens_seen": 1201960960 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003211133400200602, + "loss": 2.9176, + "theoretical_loss": 3.5869078572744018, + "tokens_seen": 1202026496 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032110330992978934, + "loss": 2.9087, + "theoretical_loss": 3.5868898744651005, + "tokens_seen": 1202092032 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003210932798395186, + "loss": 2.9124, + "theoretical_loss": 3.5868718929106578, + "tokens_seen": 1202157568 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032108324974924776, + "loss": 2.8283, + "theoretical_loss": 3.5868539126109162, + "tokens_seen": 1202223104 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032107321965897694, + "loss": 2.8395, + "theoretical_loss": 3.5868359335657214, + "tokens_seen": 1202288640 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003210631895687061, + "loss": 2.856, + "theoretical_loss": 3.5868179557749165, + "tokens_seen": 1202354176 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003210531594784353, + "loss": 2.8462, + "theoretical_loss": 3.5867999792383456, + "tokens_seen": 1202419712 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.552184581756592, + "objective/train/theoretical_loss": 3.586782003955854, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.586782003955854, + "tokens_seen": 1202485248 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003210431293881645, + "loss": 2.6476, + "theoretical_loss": 3.586782003955854, + "tokens_seen": 1202485248 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003210330992978937, + "loss": 2.7858, + "theoretical_loss": 3.5867640299272843, + "tokens_seen": 1202550784 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032102306920762285, + "loss": 2.9495, + "theoretical_loss": 3.586746057152481, + "tokens_seen": 1202616320 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003210130391173521, + "loss": 2.8853, + "theoretical_loss": 3.586728085631289, + "tokens_seen": 1202681856 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032100300902708126, + "loss": 2.9107, + "theoretical_loss": 3.5867101153635526, + "tokens_seen": 1202747392 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032099297893681044, + "loss": 2.8609, + "theoretical_loss": 3.5866921463491153, + "tokens_seen": 1202812928 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003209829488465396, + "loss": 2.7369, + "theoretical_loss": 3.5866741785878222, + "tokens_seen": 1202878464 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003209729187562688, + "loss": 2.8531, + "theoretical_loss": 3.586656212079517, + "tokens_seen": 1202944000 + }, + { + "epoch": 14.02, + "learning_rate": 0.000320962888665998, + "loss": 2.7718, + "theoretical_loss": 3.5866382468240445, + "tokens_seen": 1203009536 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003209528585757272, + "loss": 2.8632, + "theoretical_loss": 3.586620282821249, + "tokens_seen": 1203075072 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032094282848545635, + "loss": 2.8243, + "theoretical_loss": 3.5866023200709747, + "tokens_seen": 1203140608 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003209327983951856, + "loss": 2.8007, + "theoretical_loss": 3.5865843585730666, + "tokens_seen": 1203206144 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003209227683049147, + "loss": 2.8798, + "theoretical_loss": 3.5865663983273683, + "tokens_seen": 1203271680 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032091273821464395, + "loss": 2.9046, + "theoretical_loss": 3.586548439333725, + "tokens_seen": 1203337216 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032090270812437313, + "loss": 2.897, + "theoretical_loss": 3.586530481591981, + "tokens_seen": 1203402752 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003208926780341023, + "loss": 2.7888, + "theoretical_loss": 3.5865125251019814, + "tokens_seen": 1203468288 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003208826479438315, + "loss": 2.8492, + "theoretical_loss": 3.5864945698635697, + "tokens_seen": 1203533824 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003208726178535607, + "loss": 2.8406, + "theoretical_loss": 3.5864766158765913, + "tokens_seen": 1203599360 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032086258776328985, + "loss": 2.9142, + "theoretical_loss": 3.5864586631408906, + "tokens_seen": 1203664896 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003208525576730191, + "loss": 2.8295, + "theoretical_loss": 3.586440711656312, + "tokens_seen": 1203730432 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003208425275827482, + "loss": 2.8538, + "theoretical_loss": 3.586422761422701, + "tokens_seen": 1203795968 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032083249749247745, + "loss": 2.848, + "theoretical_loss": 3.586404812439902, + "tokens_seen": 1203861504 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032082246740220663, + "loss": 2.8429, + "theoretical_loss": 3.5863868647077592, + "tokens_seen": 1203927040 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003208124373119358, + "loss": 2.8787, + "theoretical_loss": 3.586368918226118, + "tokens_seen": 1203992576 + }, + { + "epoch": 14.02, + "learning_rate": 0.000320802407221665, + "loss": 2.8176, + "theoretical_loss": 3.586350972994823, + "tokens_seen": 1204058112 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9742016792297363, + "objective/train/theoretical_loss": 3.586333029013719, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.586333029013719, + "tokens_seen": 1204123648 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003207923771313942, + "loss": 2.7784, + "theoretical_loss": 3.586333029013719, + "tokens_seen": 1204123648 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032078234704112336, + "loss": 2.7668, + "theoretical_loss": 3.586315086282651, + "tokens_seen": 1204189184 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003207723169508526, + "loss": 2.8585, + "theoretical_loss": 3.5862971448014638, + "tokens_seen": 1204254720 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003207622868605817, + "loss": 2.814, + "theoretical_loss": 3.586279204570002, + "tokens_seen": 1204320256 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032075225677031095, + "loss": 2.8372, + "theoretical_loss": 3.5862612655881114, + "tokens_seen": 1204385792 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003207422266800401, + "loss": 2.8196, + "theoretical_loss": 3.5862433278556365, + "tokens_seen": 1204451328 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003207321965897693, + "loss": 2.8584, + "theoretical_loss": 3.5862253913724214, + "tokens_seen": 1204516864 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003207221664994985, + "loss": 2.8897, + "theoretical_loss": 3.586207456138313, + "tokens_seen": 1204582400 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003207121364092277, + "loss": 2.8235, + "theoretical_loss": 3.5861895221531546, + "tokens_seen": 1204647936 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032070210631895686, + "loss": 2.9145, + "theoretical_loss": 3.5861715894167925, + "tokens_seen": 1204713472 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003206920762286861, + "loss": 2.8911, + "theoretical_loss": 3.5861536579290716, + "tokens_seen": 1204779008 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003206820461384152, + "loss": 2.8699, + "theoretical_loss": 3.586135727689836, + "tokens_seen": 1204844544 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032067201604814446, + "loss": 2.9936, + "theoretical_loss": 3.5861177986989325, + "tokens_seen": 1204910080 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003206619859578736, + "loss": 2.7951, + "theoretical_loss": 3.586099870956205, + "tokens_seen": 1204975616 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003206519558676028, + "loss": 2.8626, + "theoretical_loss": 3.5860819444614993, + "tokens_seen": 1205041152 + }, + { + "epoch": 14.02, + "learning_rate": 0.000320641925777332, + "loss": 2.9203, + "theoretical_loss": 3.5860640192146604, + "tokens_seen": 1205106688 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003206318956870612, + "loss": 2.8485, + "theoretical_loss": 3.5860460952155337, + "tokens_seen": 1205172224 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032062186559679036, + "loss": 2.8406, + "theoretical_loss": 3.5860281724639647, + "tokens_seen": 1205237760 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032061183550651954, + "loss": 2.8863, + "theoretical_loss": 3.5860102509597986, + "tokens_seen": 1205303296 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003206018054162487, + "loss": 2.9348, + "theoretical_loss": 3.585992330702881, + "tokens_seen": 1205368832 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032059177532597796, + "loss": 2.8769, + "theoretical_loss": 3.5859744116930568, + "tokens_seen": 1205434368 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003205817452357071, + "loss": 2.8253, + "theoretical_loss": 3.585956493930172, + "tokens_seen": 1205499904 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003205717151454363, + "loss": 2.9253, + "theoretical_loss": 3.585938577414071, + "tokens_seen": 1205565440 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003205616850551655, + "loss": 2.8869, + "theoretical_loss": 3.5859206621446003, + "tokens_seen": 1205630976 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003205516549648947, + "loss": 2.841, + "theoretical_loss": 3.585902748121605, + "tokens_seen": 1205696512 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.864086627960205, + "objective/train/theoretical_loss": 3.585884835344931, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.585884835344931, + "tokens_seen": 1205762048 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003205416248746239, + "loss": 2.789, + "theoretical_loss": 3.585884835344931, + "tokens_seen": 1205762048 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032053159478435305, + "loss": 2.8825, + "theoretical_loss": 3.585866923814423, + "tokens_seen": 1205827584 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003205215646940823, + "loss": 2.8497, + "theoretical_loss": 3.585849013529928, + "tokens_seen": 1205893120 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032051153460381146, + "loss": 2.9417, + "theoretical_loss": 3.58583110449129, + "tokens_seen": 1205958656 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032050150451354064, + "loss": 2.7978, + "theoretical_loss": 3.5858131966983553, + "tokens_seen": 1206024192 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003204914744232698, + "loss": 2.7631, + "theoretical_loss": 3.5857952901509704, + "tokens_seen": 1206089728 + }, + { + "epoch": 14.02, + "learning_rate": 0.000320481444332999, + "loss": 2.7711, + "theoretical_loss": 3.5857773848489796, + "tokens_seen": 1206155264 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003204714142427282, + "loss": 2.8988, + "theoretical_loss": 3.5857594807922295, + "tokens_seen": 1206220800 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003204613841524574, + "loss": 2.8928, + "theoretical_loss": 3.585741577980566, + "tokens_seen": 1206286336 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032045135406218655, + "loss": 2.8394, + "theoretical_loss": 3.585723676413834, + "tokens_seen": 1206351872 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003204413239719158, + "loss": 2.8588, + "theoretical_loss": 3.5857057760918805, + "tokens_seen": 1206417408 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003204312938816449, + "loss": 2.8934, + "theoretical_loss": 3.58568787701455, + "tokens_seen": 1206482944 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032042126379137415, + "loss": 2.7572, + "theoretical_loss": 3.5856699791816893, + "tokens_seen": 1206548480 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032041123370110333, + "loss": 2.8714, + "theoretical_loss": 3.585652082593144, + "tokens_seen": 1206614016 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003204012036108325, + "loss": 2.8496, + "theoretical_loss": 3.58563418724876, + "tokens_seen": 1206679552 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003203911735205617, + "loss": 2.7157, + "theoretical_loss": 3.585616293148383, + "tokens_seen": 1206745088 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003203811434302909, + "loss": 2.8356, + "theoretical_loss": 3.5855984002918593, + "tokens_seen": 1206810624 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032037111334002005, + "loss": 2.799, + "theoretical_loss": 3.585580508679035, + "tokens_seen": 1206876160 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003203610832497493, + "loss": 2.934, + "theoretical_loss": 3.5855626183097558, + "tokens_seen": 1206941696 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003203510531594784, + "loss": 2.9379, + "theoretical_loss": 3.585544729183868, + "tokens_seen": 1207007232 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032034102306920765, + "loss": 2.7979, + "theoretical_loss": 3.5855268413012173, + "tokens_seen": 1207072768 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032033099297893683, + "loss": 2.8148, + "theoretical_loss": 3.5855089546616505, + "tokens_seen": 1207138304 + }, + { + "epoch": 14.02, + "learning_rate": 0.000320320962888666, + "loss": 2.8862, + "theoretical_loss": 3.5854910692650135, + "tokens_seen": 1207203840 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003203109327983952, + "loss": 2.7368, + "theoretical_loss": 3.5854731851111517, + "tokens_seen": 1207269376 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003203009027081244, + "loss": 2.9399, + "theoretical_loss": 3.585455302199912, + "tokens_seen": 1207334912 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.799795389175415, + "objective/train/theoretical_loss": 3.5854374205311403, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.5854374205311403, + "tokens_seen": 1207400448 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032029087261785356, + "loss": 2.8793, + "theoretical_loss": 3.5854374205311403, + "tokens_seen": 1207400448 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003202808425275828, + "loss": 2.7672, + "theoretical_loss": 3.5854195401046836, + "tokens_seen": 1207465984 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003202708124373119, + "loss": 2.7271, + "theoretical_loss": 3.585401660920387, + "tokens_seen": 1207531520 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032026078234704115, + "loss": 2.8735, + "theoretical_loss": 3.5853837829780977, + "tokens_seen": 1207597056 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003202507522567703, + "loss": 2.9063, + "theoretical_loss": 3.5853659062776617, + "tokens_seen": 1207662592 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003202407221664995, + "loss": 2.8057, + "theoretical_loss": 3.585348030818925, + "tokens_seen": 1207728128 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003202306920762287, + "loss": 2.8546, + "theoretical_loss": 3.585330156601735, + "tokens_seen": 1207793664 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003202206619859579, + "loss": 2.9399, + "theoretical_loss": 3.585312283625937, + "tokens_seen": 1207859200 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032021063189568706, + "loss": 2.8369, + "theoretical_loss": 3.585294411891378, + "tokens_seen": 1207924736 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003202006018054163, + "loss": 2.9236, + "theoretical_loss": 3.5852765413979046, + "tokens_seen": 1207990272 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003201905717151454, + "loss": 2.7879, + "theoretical_loss": 3.585258672145363, + "tokens_seen": 1208055808 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032018054162487466, + "loss": 2.8403, + "theoretical_loss": 3.5852408041335995, + "tokens_seen": 1208121344 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003201705115346038, + "loss": 2.8483, + "theoretical_loss": 3.585222937362461, + "tokens_seen": 1208186880 + }, + { + "epoch": 14.02, + "learning_rate": 0.000320160481444333, + "loss": 2.8856, + "theoretical_loss": 3.585205071831794, + "tokens_seen": 1208252416 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003201504513540622, + "loss": 2.8694, + "theoretical_loss": 3.585187207541445, + "tokens_seen": 1208317952 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003201404212637914, + "loss": 2.9416, + "theoretical_loss": 3.585169344491261, + "tokens_seen": 1208383488 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032013039117352056, + "loss": 2.9226, + "theoretical_loss": 3.5851514826810877, + "tokens_seen": 1208449024 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032012036108324974, + "loss": 2.8653, + "theoretical_loss": 3.585133622110773, + "tokens_seen": 1208514560 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003201103309929789, + "loss": 2.8128, + "theoretical_loss": 3.5851157627801626, + "tokens_seen": 1208580096 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032010030090270816, + "loss": 2.7926, + "theoretical_loss": 3.5850979046891043, + "tokens_seen": 1208645632 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003200902708124373, + "loss": 2.8595, + "theoretical_loss": 3.585080047837444, + "tokens_seen": 1208711168 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003200802407221665, + "loss": 2.8642, + "theoretical_loss": 3.5850621922250285, + "tokens_seen": 1208776704 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032007021063189565, + "loss": 2.8512, + "theoretical_loss": 3.5850443378517047, + "tokens_seen": 1208842240 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003200601805416249, + "loss": 2.947, + "theoretical_loss": 3.58502648471732, + "tokens_seen": 1208907776 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032005015045135407, + "loss": 2.8075, + "theoretical_loss": 3.585008632821721, + "tokens_seen": 1208973312 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.841606616973877, + "objective/train/theoretical_loss": 3.584990782164754, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.584990782164754, + "tokens_seen": 1209038848 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032004012036108325, + "loss": 2.7541, + "theoretical_loss": 3.584990782164754, + "tokens_seen": 1209038848 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032003009027081243, + "loss": 2.8449, + "theoretical_loss": 3.5849729327462665, + "tokens_seen": 1209104384 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032002006018054166, + "loss": 2.7472, + "theoretical_loss": 3.5849550845661056, + "tokens_seen": 1209169920 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003200100300902708, + "loss": 2.93, + "theoretical_loss": 3.5849372376241178, + "tokens_seen": 1209235456 + }, + { + "epoch": 14.02, + "learning_rate": 0.00032, + "loss": 2.8145, + "theoretical_loss": 3.5849193919201507, + "tokens_seen": 1209300992 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031998996990972915, + "loss": 2.8758, + "theoretical_loss": 3.5849015474540504, + "tokens_seen": 1209366528 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003199799398194584, + "loss": 2.8204, + "theoretical_loss": 3.584883704225665, + "tokens_seen": 1209432064 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031996990972918757, + "loss": 2.857, + "theoretical_loss": 3.5848658622348406, + "tokens_seen": 1209497600 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031995987963891675, + "loss": 2.832, + "theoretical_loss": 3.5848480214814256, + "tokens_seen": 1209563136 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031994984954864593, + "loss": 2.8631, + "theoretical_loss": 3.5848301819652657, + "tokens_seen": 1209628672 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003199398194583751, + "loss": 2.8123, + "theoretical_loss": 3.5848123436862087, + "tokens_seen": 1209694208 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003199297893681043, + "loss": 2.9197, + "theoretical_loss": 3.5847945066441023, + "tokens_seen": 1209759744 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031991975927783353, + "loss": 2.8516, + "theoretical_loss": 3.584776670838793, + "tokens_seen": 1209825280 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031990972918756266, + "loss": 2.8448, + "theoretical_loss": 3.5847588362701286, + "tokens_seen": 1209890816 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003198996990972919, + "loss": 2.8516, + "theoretical_loss": 3.584741002937956, + "tokens_seen": 1209956352 + }, + { + "epoch": 14.02, + "learning_rate": 0.000319889669007021, + "loss": 2.8684, + "theoretical_loss": 3.584723170842122, + "tokens_seen": 1210021888 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031987963891675025, + "loss": 2.8898, + "theoretical_loss": 3.5847053399824755, + "tokens_seen": 1210087424 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031986960882647943, + "loss": 2.9276, + "theoretical_loss": 3.5846875103588625, + "tokens_seen": 1210152960 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003198595787362086, + "loss": 2.9284, + "theoretical_loss": 3.58466968197113, + "tokens_seen": 1210218496 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003198495486459378, + "loss": 2.8888, + "theoretical_loss": 3.5846518548191275, + "tokens_seen": 1210284032 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031983951855566703, + "loss": 2.8509, + "theoretical_loss": 3.5846340289027, + "tokens_seen": 1210349568 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031982948846539616, + "loss": 2.9071, + "theoretical_loss": 3.584616204221697, + "tokens_seen": 1210415104 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003198194583751254, + "loss": 2.7324, + "theoretical_loss": 3.584598380775965, + "tokens_seen": 1210480640 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003198094282848546, + "loss": 2.9202, + "theoretical_loss": 3.584580558565351, + "tokens_seen": 1210546176 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031979939819458376, + "loss": 2.87, + "theoretical_loss": 3.584562737589704, + "tokens_seen": 1210611712 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.683354139328003, + "objective/train/theoretical_loss": 3.58454491784887, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.58454491784887, + "tokens_seen": 1210677248 + }, + { + "epoch": 14.02, + "learning_rate": 0.000319789368104313, + "loss": 2.8071, + "theoretical_loss": 3.58454491784887, + "tokens_seen": 1210677248 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003197793380140421, + "loss": 2.8522, + "theoretical_loss": 3.584527099342698, + "tokens_seen": 1210742784 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031976930792377135, + "loss": 2.7757, + "theoretical_loss": 3.5845092820710347, + "tokens_seen": 1210808320 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003197592778335005, + "loss": 2.8984, + "theoretical_loss": 3.584491466033728, + "tokens_seen": 1210873856 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003197492477432297, + "loss": 2.8113, + "theoretical_loss": 3.584473651230626, + "tokens_seen": 1210939392 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003197392176529589, + "loss": 2.7777, + "theoretical_loss": 3.5844558376615754, + "tokens_seen": 1211004928 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003197291875626881, + "loss": 2.8984, + "theoretical_loss": 3.584438025326425, + "tokens_seen": 1211070464 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031971915747241726, + "loss": 2.7904, + "theoretical_loss": 3.5844202142250223, + "tokens_seen": 1211136000 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003197091273821465, + "loss": 2.818, + "theoretical_loss": 3.5844024043572142, + "tokens_seen": 1211201536 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003196990972918756, + "loss": 2.8274, + "theoretical_loss": 3.58438459572285, + "tokens_seen": 1211267072 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031968906720160486, + "loss": 2.8889, + "theoretical_loss": 3.584366788321777, + "tokens_seen": 1211332608 + }, + { + "epoch": 14.02, + "learning_rate": 0.000319679037111334, + "loss": 2.8764, + "theoretical_loss": 3.584348982153842, + "tokens_seen": 1211398144 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003196690070210632, + "loss": 2.8964, + "theoretical_loss": 3.5843311772188944, + "tokens_seen": 1211463680 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003196589769307924, + "loss": 2.7691, + "theoretical_loss": 3.584313373516782, + "tokens_seen": 1211529216 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003196489468405216, + "loss": 2.7326, + "theoretical_loss": 3.584295571047351, + "tokens_seen": 1211594752 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031963891675025076, + "loss": 2.8421, + "theoretical_loss": 3.584277769810452, + "tokens_seen": 1211660288 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031962888665997994, + "loss": 2.8501, + "theoretical_loss": 3.584259969805931, + "tokens_seen": 1211725824 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003196188565697091, + "loss": 2.8034, + "theoretical_loss": 3.5842421710336367, + "tokens_seen": 1211791360 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031960882647943836, + "loss": 2.8476, + "theoretical_loss": 3.584224373493417, + "tokens_seen": 1211856896 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003195987963891675, + "loss": 2.7886, + "theoretical_loss": 3.58420657718512, + "tokens_seen": 1211922432 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003195887662988967, + "loss": 2.9561, + "theoretical_loss": 3.584188782108595, + "tokens_seen": 1211987968 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031957873620862585, + "loss": 2.847, + "theoretical_loss": 3.584170988263688, + "tokens_seen": 1212053504 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003195687061183551, + "loss": 2.83, + "theoretical_loss": 3.5841531956502486, + "tokens_seen": 1212119040 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031955867602808427, + "loss": 2.8331, + "theoretical_loss": 3.5841354042681255, + "tokens_seen": 1212184576 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031954864593781345, + "loss": 2.965, + "theoretical_loss": 3.584117614117165, + "tokens_seen": 1212250112 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0070066452026367, + "objective/train/theoretical_loss": 3.5840998251972174, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.5840998251972174, + "tokens_seen": 1212315648 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031953861584754263, + "loss": 2.8924, + "theoretical_loss": 3.5840998251972174, + "tokens_seen": 1212315648 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031952858575727186, + "loss": 2.7514, + "theoretical_loss": 3.5840820375081295, + "tokens_seen": 1212381184 + }, + { + "epoch": 14.02, + "learning_rate": 0.000319518555667001, + "loss": 2.8489, + "theoretical_loss": 3.58406425104975, + "tokens_seen": 1212446720 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003195085255767302, + "loss": 2.8616, + "theoretical_loss": 3.5840464658219275, + "tokens_seen": 1212512256 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031949849548645935, + "loss": 2.9292, + "theoretical_loss": 3.5840286818245106, + "tokens_seen": 1212577792 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003194884653961886, + "loss": 2.8795, + "theoretical_loss": 3.584010899057347, + "tokens_seen": 1212643328 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031947843530591777, + "loss": 2.8781, + "theoretical_loss": 3.5839931175202855, + "tokens_seen": 1212708864 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031946840521564695, + "loss": 2.6902, + "theoretical_loss": 3.5839753372131744, + "tokens_seen": 1212774400 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031945837512537613, + "loss": 2.8404, + "theoretical_loss": 3.5839575581358627, + "tokens_seen": 1212839936 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003194483450351053, + "loss": 2.8614, + "theoretical_loss": 3.583939780288198, + "tokens_seen": 1212905472 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003194383149448345, + "loss": 2.7761, + "theoretical_loss": 3.583922003670029, + "tokens_seen": 1212971008 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031942828485456373, + "loss": 2.7913, + "theoretical_loss": 3.583904228281205, + "tokens_seen": 1213036544 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031941825476429286, + "loss": 2.8222, + "theoretical_loss": 3.583886454121574, + "tokens_seen": 1213102080 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003194082246740221, + "loss": 2.9239, + "theoretical_loss": 3.583868681190985, + "tokens_seen": 1213167616 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003193981945837512, + "loss": 2.8853, + "theoretical_loss": 3.5838509094892856, + "tokens_seen": 1213233152 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031938816449348045, + "loss": 2.8576, + "theoretical_loss": 3.583833139016326, + "tokens_seen": 1213298688 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031937813440320964, + "loss": 2.7508, + "theoretical_loss": 3.583815369771953, + "tokens_seen": 1213364224 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003193681043129388, + "loss": 2.8589, + "theoretical_loss": 3.583797601756017, + "tokens_seen": 1213429760 + }, + { + "epoch": 14.02, + "learning_rate": 0.000319358074222668, + "loss": 2.7912, + "theoretical_loss": 3.5837798349683663, + "tokens_seen": 1213495296 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031934804413239723, + "loss": 2.8864, + "theoretical_loss": 3.5837620694088494, + "tokens_seen": 1213560832 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031933801404212636, + "loss": 2.8982, + "theoretical_loss": 3.583744305077315, + "tokens_seen": 1213626368 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003193279839518556, + "loss": 2.9117, + "theoretical_loss": 3.5837265419736117, + "tokens_seen": 1213691904 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003193179538615847, + "loss": 2.8522, + "theoretical_loss": 3.583708780097589, + "tokens_seen": 1213757440 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031930792377131396, + "loss": 2.8617, + "theoretical_loss": 3.5836910194490956, + "tokens_seen": 1213822976 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031929789368104314, + "loss": 2.7699, + "theoretical_loss": 3.58367326002798, + "tokens_seen": 1213888512 + }, + { + "epoch": 14.02, + "objective/train/docs_used": 2836320, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.956008195877075, + "objective/train/theoretical_loss": 3.583655501834092, + "objective/train/tokens_used": 1214909920, + "theoretical_loss": 3.583655501834092, + "tokens_seen": 1213954048 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003192878635907723, + "loss": 2.9515, + "theoretical_loss": 3.583655501834092, + "tokens_seen": 1213954048 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003192778335005015, + "loss": 2.9011, + "theoretical_loss": 3.5836377448672794, + "tokens_seen": 1214019584 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003192678034102307, + "loss": 2.8229, + "theoretical_loss": 3.5836199891273917, + "tokens_seen": 1214085120 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031925777331995986, + "loss": 2.8718, + "theoretical_loss": 3.583602234614278, + "tokens_seen": 1214150656 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003192477432296891, + "loss": 2.7937, + "theoretical_loss": 3.5835844813277875, + "tokens_seen": 1214216192 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003192377131394182, + "loss": 2.7859, + "theoretical_loss": 3.5835667292677686, + "tokens_seen": 1214281728 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031922768304914746, + "loss": 2.8244, + "theoretical_loss": 3.583548978434071, + "tokens_seen": 1214347264 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003192176529588766, + "loss": 2.9229, + "theoretical_loss": 3.5835312288265437, + "tokens_seen": 1214412800 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003192076228686058, + "loss": 2.9022, + "theoretical_loss": 3.5835134804450357, + "tokens_seen": 1214478336 + }, + { + "epoch": 14.02, + "learning_rate": 0.000319197592778335, + "loss": 2.9113, + "theoretical_loss": 3.583495733289396, + "tokens_seen": 1214543872 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003191875626880642, + "loss": 2.8345, + "theoretical_loss": 3.583477987359474, + "tokens_seen": 1214609408 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031917753259779337, + "loss": 2.9197, + "theoretical_loss": 3.583460242655119, + "tokens_seen": 1214674944 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003191675025075226, + "loss": 2.7778, + "theoretical_loss": 3.58344249917618, + "tokens_seen": 1214740480 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031915747241725173, + "loss": 2.912, + "theoretical_loss": 3.583424756922507, + "tokens_seen": 1214806016 + }, + { + "epoch": 14.02, + "learning_rate": 0.00031914744232698096, + "loss": 2.8039, + "theoretical_loss": 3.583407015893948, + "tokens_seen": 1214871552 + }, + { + "epoch": 14.02, + "learning_rate": 0.0003191374122367101, + "loss": 2.9355, + "theoretical_loss": 3.583391216321713, + "tokens_seen": 1214929920 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003191273821464393, + "loss": 2.8786, + "theoretical_loss": 3.5833734776089754, + "tokens_seen": 1214995456 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003191173520561685, + "loss": 2.7935, + "theoretical_loss": 3.5833557401209166, + "tokens_seen": 1215060992 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003191073219658977, + "loss": 2.8311, + "theoretical_loss": 3.5833380038573868, + "tokens_seen": 1215126528 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031909729187562687, + "loss": 2.7468, + "theoretical_loss": 3.583320268818235, + "tokens_seen": 1215192064 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031908726178535605, + "loss": 2.7957, + "theoretical_loss": 3.5833025350033103, + "tokens_seen": 1215257600 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031907723169508523, + "loss": 2.8238, + "theoretical_loss": 3.5832848024124626, + "tokens_seen": 1215323136 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031906720160481447, + "loss": 2.7806, + "theoretical_loss": 3.5832670710455417, + "tokens_seen": 1215388672 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031905717151454365, + "loss": 2.7972, + "theoretical_loss": 3.583249340902397, + "tokens_seen": 1215454208 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031904714142427283, + "loss": 2.8412, + "theoretical_loss": 3.5832316119828773, + "tokens_seen": 1215519744 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2886764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.650057554244995, + "objective/train/theoretical_loss": 3.5832138842868337, + "objective/train/tokens_used": 1236045280, + "theoretical_loss": 3.5832138842868337, + "tokens_seen": 1215585280 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031903711133400206, + "loss": 2.6765, + "theoretical_loss": 3.5832138842868337, + "tokens_seen": 1215585280 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003190270812437312, + "loss": 2.8035, + "theoretical_loss": 3.583196157814114, + "tokens_seen": 1215650816 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003190170511534604, + "loss": 2.7384, + "theoretical_loss": 3.5831784325645692, + "tokens_seen": 1215716352 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031900702106318955, + "loss": 2.7931, + "theoretical_loss": 3.5831607085380486, + "tokens_seen": 1215781888 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003189969909729188, + "loss": 2.7905, + "theoretical_loss": 3.5831429857344013, + "tokens_seen": 1215847424 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031898696088264797, + "loss": 2.8597, + "theoretical_loss": 3.583125264153478, + "tokens_seen": 1215912960 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031897693079237715, + "loss": 2.8391, + "theoretical_loss": 3.583107543795128, + "tokens_seen": 1215978496 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031896690070210633, + "loss": 2.8278, + "theoretical_loss": 3.5830898246592007, + "tokens_seen": 1216044032 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003189568706118355, + "loss": 2.676, + "theoretical_loss": 3.583072106745546, + "tokens_seen": 1216109568 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003189468405215647, + "loss": 2.7484, + "theoretical_loss": 3.5830543900540146, + "tokens_seen": 1216175104 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031893681043129393, + "loss": 2.8411, + "theoretical_loss": 3.5830366745844557, + "tokens_seen": 1216240640 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031892678034102306, + "loss": 2.8754, + "theoretical_loss": 3.5830189603367186, + "tokens_seen": 1216306176 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003189167502507523, + "loss": 2.7956, + "theoretical_loss": 3.5830012473106545, + "tokens_seen": 1216371712 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003189067201604814, + "loss": 2.7575, + "theoretical_loss": 3.5829835355061124, + "tokens_seen": 1216437248 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031889669007021065, + "loss": 2.7915, + "theoretical_loss": 3.582965824922942, + "tokens_seen": 1216502784 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031888665997993984, + "loss": 2.7745, + "theoretical_loss": 3.5829481155609946, + "tokens_seen": 1216568320 + }, + { + "epoch": 15.0, + "learning_rate": 0.000318876629889669, + "loss": 2.8167, + "theoretical_loss": 3.5829304074201187, + "tokens_seen": 1216633856 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003188665997993982, + "loss": 2.7474, + "theoretical_loss": 3.5829127005001653, + "tokens_seen": 1216699392 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031885656970912743, + "loss": 2.8037, + "theoretical_loss": 3.5828949948009843, + "tokens_seen": 1216764928 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031884653961885656, + "loss": 2.5999, + "theoretical_loss": 3.5828772903224255, + "tokens_seen": 1216830464 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003188365095285858, + "loss": 2.725, + "theoretical_loss": 3.5828595870643394, + "tokens_seen": 1216896000 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003188264794383149, + "loss": 2.8085, + "theoretical_loss": 3.5828418850265757, + "tokens_seen": 1216961536 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031881644934804416, + "loss": 2.8752, + "theoretical_loss": 3.5828241842089845, + "tokens_seen": 1217027072 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031880641925777334, + "loss": 2.774, + "theoretical_loss": 3.582806484611417, + "tokens_seen": 1217092608 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003187963891675025, + "loss": 2.7094, + "theoretical_loss": 3.5827887862337224, + "tokens_seen": 1217158144 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2889910, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8892416954040527, + "objective/train/theoretical_loss": 3.5827710890757514, + "objective/train/tokens_used": 1237683680, + "theoretical_loss": 3.5827710890757514, + "tokens_seen": 1217223680 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003187863590772317, + "loss": 2.7158, + "theoretical_loss": 3.5827710890757514, + "tokens_seen": 1217223680 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003187763289869609, + "loss": 2.6699, + "theoretical_loss": 3.582753393137354, + "tokens_seen": 1217289216 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031876629889669006, + "loss": 2.7847, + "theoretical_loss": 3.5827356984183805, + "tokens_seen": 1217354752 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003187562688064193, + "loss": 2.7157, + "theoretical_loss": 3.582718004918682, + "tokens_seen": 1217420288 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003187462387161484, + "loss": 2.7144, + "theoretical_loss": 3.582700312638108, + "tokens_seen": 1217485824 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031873620862587766, + "loss": 2.783, + "theoretical_loss": 3.5826826215765086, + "tokens_seen": 1217551360 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003187261785356068, + "loss": 2.7598, + "theoretical_loss": 3.5826649317337353, + "tokens_seen": 1217616896 + }, + { + "epoch": 15.0, + "learning_rate": 0.000318716148445336, + "loss": 2.6426, + "theoretical_loss": 3.582647243109638, + "tokens_seen": 1217682432 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003187061183550652, + "loss": 2.8599, + "theoretical_loss": 3.5826295557040666, + "tokens_seen": 1217747968 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003186960882647944, + "loss": 2.8165, + "theoretical_loss": 3.5826118695168727, + "tokens_seen": 1217813504 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031868605817452357, + "loss": 2.7252, + "theoretical_loss": 3.582594184547906, + "tokens_seen": 1217879040 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003186760280842528, + "loss": 2.7732, + "theoretical_loss": 3.5825765007970167, + "tokens_seen": 1217944576 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031866599799398193, + "loss": 2.7157, + "theoretical_loss": 3.582558818264057, + "tokens_seen": 1218010112 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031865596790371116, + "loss": 2.9005, + "theoretical_loss": 3.5825411369488758, + "tokens_seen": 1218075648 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003186459378134403, + "loss": 2.7209, + "theoretical_loss": 3.5825234568513245, + "tokens_seen": 1218141184 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003186359077231695, + "loss": 2.8365, + "theoretical_loss": 3.582505777971254, + "tokens_seen": 1218206720 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003186258776328987, + "loss": 2.8134, + "theoretical_loss": 3.5824881003085136, + "tokens_seen": 1218272256 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003186158475426279, + "loss": 2.6293, + "theoretical_loss": 3.5824704238629552, + "tokens_seen": 1218337792 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031860581745235707, + "loss": 2.7581, + "theoretical_loss": 3.58245274863443, + "tokens_seen": 1218403328 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031859578736208625, + "loss": 2.837, + "theoretical_loss": 3.5824350746227873, + "tokens_seen": 1218468864 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031858575727181543, + "loss": 2.7148, + "theoretical_loss": 3.5824174018278794, + "tokens_seen": 1218534400 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031857572718154467, + "loss": 2.7352, + "theoretical_loss": 3.5823997302495556, + "tokens_seen": 1218599936 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003185656970912738, + "loss": 2.7724, + "theoretical_loss": 3.582382059887668, + "tokens_seen": 1218665472 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031855566700100303, + "loss": 2.7065, + "theoretical_loss": 3.5823643907420664, + "tokens_seen": 1218731008 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003185456369107322, + "loss": 2.8011, + "theoretical_loss": 3.582346722812603, + "tokens_seen": 1218796544 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2893583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6894311904907227, + "objective/train/theoretical_loss": 3.5823290560991268, + "objective/train/tokens_used": 1239322080, + "theoretical_loss": 3.5823290560991268, + "tokens_seen": 1218862080 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003185356068204614, + "loss": 2.6579, + "theoretical_loss": 3.5823290560991268, + "tokens_seen": 1218862080 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031852557673019057, + "loss": 2.6635, + "theoretical_loss": 3.5823113906014905, + "tokens_seen": 1218927616 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031851554663991975, + "loss": 2.6743, + "theoretical_loss": 3.5822937263195445, + "tokens_seen": 1218993152 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031850551654964893, + "loss": 2.6998, + "theoretical_loss": 3.5822760632531394, + "tokens_seen": 1219058688 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031849548645937817, + "loss": 2.8115, + "theoretical_loss": 3.5822584014021266, + "tokens_seen": 1219124224 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003184854563691073, + "loss": 2.7137, + "theoretical_loss": 3.5822407407663572, + "tokens_seen": 1219189760 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031847542627883653, + "loss": 2.7423, + "theoretical_loss": 3.582223081345682, + "tokens_seen": 1219255296 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031846539618856566, + "loss": 2.8459, + "theoretical_loss": 3.5822054231399516, + "tokens_seen": 1219320832 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003184553660982949, + "loss": 2.8282, + "theoretical_loss": 3.5821877661490187, + "tokens_seen": 1219386368 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003184453360080241, + "loss": 2.7693, + "theoretical_loss": 3.582170110372733, + "tokens_seen": 1219451904 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031843530591775326, + "loss": 2.7028, + "theoretical_loss": 3.582152455810946, + "tokens_seen": 1219517440 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031842527582748244, + "loss": 2.7251, + "theoretical_loss": 3.5821348024635093, + "tokens_seen": 1219582976 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003184152457372116, + "loss": 2.7478, + "theoretical_loss": 3.582117150330274, + "tokens_seen": 1219648512 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003184052156469408, + "loss": 2.774, + "theoretical_loss": 3.5820994994110906, + "tokens_seen": 1219714048 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031839518555667004, + "loss": 2.6716, + "theoretical_loss": 3.582081849705812, + "tokens_seen": 1219779584 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031838515546639916, + "loss": 2.8115, + "theoretical_loss": 3.582064201214288, + "tokens_seen": 1219845120 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003183751253761284, + "loss": 2.8088, + "theoretical_loss": 3.5820465539363697, + "tokens_seen": 1219910656 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003183650952858576, + "loss": 2.7298, + "theoretical_loss": 3.58202890787191, + "tokens_seen": 1219976192 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031835506519558676, + "loss": 2.788, + "theoretical_loss": 3.5820112630207595, + "tokens_seen": 1220041728 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031834503510531594, + "loss": 2.7267, + "theoretical_loss": 3.581993619382769, + "tokens_seen": 1220107264 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003183350050150451, + "loss": 2.741, + "theoretical_loss": 3.581975976957791, + "tokens_seen": 1220172800 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003183249749247743, + "loss": 2.7451, + "theoretical_loss": 3.581958335745676, + "tokens_seen": 1220238336 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031831494483450354, + "loss": 2.7503, + "theoretical_loss": 3.581940695746276, + "tokens_seen": 1220303872 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003183049147442327, + "loss": 2.7383, + "theoretical_loss": 3.581923056959443, + "tokens_seen": 1220369408 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003182948846539619, + "loss": 2.7919, + "theoretical_loss": 3.581905419385028, + "tokens_seen": 1220434944 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2898730, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8024709224700928, + "objective/train/theoretical_loss": 3.5818877830228817, + "objective/train/tokens_used": 1240960480, + "theoretical_loss": 3.5818877830228817, + "tokens_seen": 1220500480 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003182848545636911, + "loss": 2.8053, + "theoretical_loss": 3.5818877830228817, + "tokens_seen": 1220500480 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031827482447342026, + "loss": 2.9326, + "theoretical_loss": 3.5818701478728574, + "tokens_seen": 1220566016 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003182647943831495, + "loss": 2.7357, + "theoretical_loss": 3.5818525139348054, + "tokens_seen": 1220631552 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003182547642928786, + "loss": 2.7093, + "theoretical_loss": 3.581834881208578, + "tokens_seen": 1220697088 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031824473420260786, + "loss": 2.7388, + "theoretical_loss": 3.5818172496940264, + "tokens_seen": 1220762624 + }, + { + "epoch": 15.0, + "learning_rate": 0.000318234704112337, + "loss": 2.6982, + "theoretical_loss": 3.5817996193910027, + "tokens_seen": 1220828160 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003182246740220662, + "loss": 2.8648, + "theoretical_loss": 3.581781990299359, + "tokens_seen": 1220893696 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003182146439317954, + "loss": 2.743, + "theoretical_loss": 3.581764362418946, + "tokens_seen": 1220959232 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003182046138415246, + "loss": 2.6904, + "theoretical_loss": 3.5817467357496158, + "tokens_seen": 1221024768 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031819458375125377, + "loss": 2.6952, + "theoretical_loss": 3.581729110291221, + "tokens_seen": 1221090304 + }, + { + "epoch": 15.0, + "learning_rate": 0.000318184553660983, + "loss": 2.7629, + "theoretical_loss": 3.581711486043613, + "tokens_seen": 1221155840 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031817452357071213, + "loss": 2.6655, + "theoretical_loss": 3.5816938630066426, + "tokens_seen": 1221221376 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031816449348044136, + "loss": 2.6632, + "theoretical_loss": 3.5816762411801633, + "tokens_seen": 1221286912 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003181544633901705, + "loss": 2.7183, + "theoretical_loss": 3.581658620564026, + "tokens_seen": 1221352448 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003181444332998997, + "loss": 2.7935, + "theoretical_loss": 3.581641001158083, + "tokens_seen": 1221417984 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003181344032096289, + "loss": 2.7511, + "theoretical_loss": 3.581623382962186, + "tokens_seen": 1221483520 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003181243731193581, + "loss": 2.769, + "theoretical_loss": 3.5816057659761875, + "tokens_seen": 1221549056 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031811434302908727, + "loss": 2.7444, + "theoretical_loss": 3.5815881501999387, + "tokens_seen": 1221614592 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031810431293881645, + "loss": 2.6659, + "theoretical_loss": 3.5815705356332925, + "tokens_seen": 1221680128 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031809428284854563, + "loss": 2.7068, + "theoretical_loss": 3.5815529222761002, + "tokens_seen": 1221745664 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031808425275827487, + "loss": 2.8445, + "theoretical_loss": 3.5815353101282144, + "tokens_seen": 1221811200 + }, + { + "epoch": 15.0, + "learning_rate": 0.000318074222668004, + "loss": 2.8044, + "theoretical_loss": 3.581517699189487, + "tokens_seen": 1221876736 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031806419257773323, + "loss": 2.7631, + "theoretical_loss": 3.5815000894597704, + "tokens_seen": 1221942272 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003180541624874624, + "loss": 2.7246, + "theoretical_loss": 3.5814824809389165, + "tokens_seen": 1222007808 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003180441323971916, + "loss": 2.8118, + "theoretical_loss": 3.581464873626777, + "tokens_seen": 1222073344 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2901427, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7269246578216553, + "objective/train/theoretical_loss": 3.581447267523205, + "objective/train/tokens_used": 1242598880, + "theoretical_loss": 3.581447267523205, + "tokens_seen": 1222138880 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031803410230692077, + "loss": 2.7226, + "theoretical_loss": 3.581447267523205, + "tokens_seen": 1222138880 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031802407221664995, + "loss": 2.7596, + "theoretical_loss": 3.581429662628053, + "tokens_seen": 1222204416 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031801404212637913, + "loss": 2.7261, + "theoretical_loss": 3.581412058941172, + "tokens_seen": 1222269952 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031800401203610837, + "loss": 2.813, + "theoretical_loss": 3.581394456462415, + "tokens_seen": 1222335488 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003179939819458375, + "loss": 2.8664, + "theoretical_loss": 3.5813768551916345, + "tokens_seen": 1222401024 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031798395185556673, + "loss": 2.7051, + "theoretical_loss": 3.581359255128682, + "tokens_seen": 1222466560 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031797392176529586, + "loss": 2.8121, + "theoretical_loss": 3.5813416562734113, + "tokens_seen": 1222532096 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003179638916750251, + "loss": 2.7572, + "theoretical_loss": 3.5813240586256736, + "tokens_seen": 1222597632 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003179538615847543, + "loss": 2.7529, + "theoretical_loss": 3.5813064621853217, + "tokens_seen": 1222663168 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031794383149448346, + "loss": 2.8138, + "theoretical_loss": 3.5812888669522085, + "tokens_seen": 1222728704 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031793380140421264, + "loss": 2.8099, + "theoretical_loss": 3.5812712729261857, + "tokens_seen": 1222794240 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003179237713139418, + "loss": 2.8705, + "theoretical_loss": 3.581253680107106, + "tokens_seen": 1222859776 + }, + { + "epoch": 15.0, + "learning_rate": 0.000317913741223671, + "loss": 2.8112, + "theoretical_loss": 3.5812360884948222, + "tokens_seen": 1222925312 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031790371113340024, + "loss": 2.7839, + "theoretical_loss": 3.5812184980891866, + "tokens_seen": 1222990848 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031789368104312936, + "loss": 2.767, + "theoretical_loss": 3.581200908890052, + "tokens_seen": 1223056384 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003178836509528586, + "loss": 2.8239, + "theoretical_loss": 3.581183320897271, + "tokens_seen": 1223121920 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003178736208625878, + "loss": 2.8019, + "theoretical_loss": 3.5811657341106966, + "tokens_seen": 1223187456 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031786359077231696, + "loss": 2.7963, + "theoretical_loss": 3.5811481485301804, + "tokens_seen": 1223252992 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031785356068204614, + "loss": 2.849, + "theoretical_loss": 3.581130564155576, + "tokens_seen": 1223318528 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003178435305917753, + "loss": 2.7333, + "theoretical_loss": 3.5811129809867355, + "tokens_seen": 1223384064 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003178335005015045, + "loss": 2.844, + "theoretical_loss": 3.5810953990235124, + "tokens_seen": 1223449600 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031782347041123374, + "loss": 2.7333, + "theoretical_loss": 3.5810778182657583, + "tokens_seen": 1223515136 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031781344032096287, + "loss": 2.8008, + "theoretical_loss": 3.5810602387133272, + "tokens_seen": 1223580672 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003178034102306921, + "loss": 2.8232, + "theoretical_loss": 3.581042660366071, + "tokens_seen": 1223646208 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031779338014042123, + "loss": 2.6601, + "theoretical_loss": 3.5810250832238433, + "tokens_seen": 1223711744 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2906488, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.746250867843628, + "objective/train/theoretical_loss": 3.5810075072864964, + "objective/train/tokens_used": 1244237280, + "theoretical_loss": 3.5810075072864964, + "tokens_seen": 1223777280 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031778335005015046, + "loss": 2.745, + "theoretical_loss": 3.5810075072864964, + "tokens_seen": 1223777280 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031777331995987964, + "loss": 2.7695, + "theoretical_loss": 3.5809899325538836, + "tokens_seen": 1223842816 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003177632898696088, + "loss": 2.7932, + "theoretical_loss": 3.580972359025857, + "tokens_seen": 1223908352 + }, + { + "epoch": 15.0, + "learning_rate": 0.000317753259779338, + "loss": 2.8664, + "theoretical_loss": 3.5809547867022706, + "tokens_seen": 1223973888 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003177432296890672, + "loss": 2.6802, + "theoretical_loss": 3.580937215582977, + "tokens_seen": 1224039424 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031773319959879637, + "loss": 2.734, + "theoretical_loss": 3.580919645667829, + "tokens_seen": 1224104960 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003177231695085256, + "loss": 2.8031, + "theoretical_loss": 3.58090207695668, + "tokens_seen": 1224170496 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031771313941825473, + "loss": 2.6495, + "theoretical_loss": 3.5808845094493824, + "tokens_seen": 1224236032 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031770310932798397, + "loss": 2.7, + "theoretical_loss": 3.58086694314579, + "tokens_seen": 1224301568 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031769307923771315, + "loss": 2.6653, + "theoretical_loss": 3.5808493780457553, + "tokens_seen": 1224367104 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031768304914744233, + "loss": 2.7562, + "theoretical_loss": 3.580831814149131, + "tokens_seen": 1224432640 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003176730190571715, + "loss": 2.8121, + "theoretical_loss": 3.580814251455772, + "tokens_seen": 1224498176 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003176629889669007, + "loss": 2.7275, + "theoretical_loss": 3.5807966899655295, + "tokens_seen": 1224563712 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031765295887662987, + "loss": 2.714, + "theoretical_loss": 3.580779129678258, + "tokens_seen": 1224629248 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003176429287863591, + "loss": 2.7434, + "theoretical_loss": 3.5807615705938103, + "tokens_seen": 1224694784 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031763289869608823, + "loss": 2.7227, + "theoretical_loss": 3.5807440127120396, + "tokens_seen": 1224760320 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031762286860581747, + "loss": 2.6718, + "theoretical_loss": 3.5807264560327994, + "tokens_seen": 1224825856 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003176128385155466, + "loss": 2.8614, + "theoretical_loss": 3.580708900555943, + "tokens_seen": 1224891392 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031760280842527583, + "loss": 2.7667, + "theoretical_loss": 3.5806913462813235, + "tokens_seen": 1224956928 + }, + { + "epoch": 15.0, + "learning_rate": 0.000317592778335005, + "loss": 2.7408, + "theoretical_loss": 3.5806737932087938, + "tokens_seen": 1225022464 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003175827482447342, + "loss": 2.7387, + "theoretical_loss": 3.5806562413382084, + "tokens_seen": 1225088000 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003175727181544634, + "loss": 2.7164, + "theoretical_loss": 3.58063869066942, + "tokens_seen": 1225153536 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003175626880641926, + "loss": 2.686, + "theoretical_loss": 3.5806211412022817, + "tokens_seen": 1225219072 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003175526579739218, + "loss": 2.7446, + "theoretical_loss": 3.5806035929366473, + "tokens_seen": 1225284608 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031754262788365097, + "loss": 2.8398, + "theoretical_loss": 3.580586045872371, + "tokens_seen": 1225350144 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2909325, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.765523910522461, + "objective/train/theoretical_loss": 3.5805685000093055, + "objective/train/tokens_used": 1245875680, + "theoretical_loss": 3.5805685000093055, + "tokens_seen": 1225415680 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031753259779338015, + "loss": 2.8285, + "theoretical_loss": 3.5805685000093055, + "tokens_seen": 1225415680 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031752256770310933, + "loss": 2.7395, + "theoretical_loss": 3.580550955347304, + "tokens_seen": 1225481216 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031751253761283857, + "loss": 2.7096, + "theoretical_loss": 3.580533411886221, + "tokens_seen": 1225546752 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003175025075225677, + "loss": 2.7767, + "theoretical_loss": 3.58051586962591, + "tokens_seen": 1225612288 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031749247743229693, + "loss": 2.6913, + "theoretical_loss": 3.580498328566224, + "tokens_seen": 1225677824 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031748244734202606, + "loss": 2.7573, + "theoretical_loss": 3.5804807887070167, + "tokens_seen": 1225743360 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003174724172517553, + "loss": 2.8394, + "theoretical_loss": 3.580463250048142, + "tokens_seen": 1225808896 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003174623871614845, + "loss": 2.7018, + "theoretical_loss": 3.580445712589454, + "tokens_seen": 1225874432 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031745235707121366, + "loss": 2.7769, + "theoretical_loss": 3.5804281763308055, + "tokens_seen": 1225939968 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031744232698094284, + "loss": 2.8397, + "theoretical_loss": 3.580410641272051, + "tokens_seen": 1226005504 + }, + { + "epoch": 15.0, + "learning_rate": 0.000317432296890672, + "loss": 2.765, + "theoretical_loss": 3.580393107413044, + "tokens_seen": 1226071040 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003174222668004012, + "loss": 2.8798, + "theoretical_loss": 3.5803755747536385, + "tokens_seen": 1226136576 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031741223671013044, + "loss": 2.8629, + "theoretical_loss": 3.5803580432936877, + "tokens_seen": 1226202112 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031740220661985956, + "loss": 2.8465, + "theoretical_loss": 3.580340513033046, + "tokens_seen": 1226267648 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003173921765295888, + "loss": 2.7505, + "theoretical_loss": 3.580322983971567, + "tokens_seen": 1226333184 + }, + { + "epoch": 15.0, + "learning_rate": 0.000317382146439318, + "loss": 2.7972, + "theoretical_loss": 3.5803054561091043, + "tokens_seen": 1226398720 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031737211634904716, + "loss": 2.6317, + "theoretical_loss": 3.5802879294455128, + "tokens_seen": 1226464256 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031736208625877634, + "loss": 2.7943, + "theoretical_loss": 3.580270403980646, + "tokens_seen": 1226529792 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003173520561685055, + "loss": 2.808, + "theoretical_loss": 3.580252879714357, + "tokens_seen": 1226595328 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003173420260782347, + "loss": 2.7407, + "theoretical_loss": 3.5802353566465013, + "tokens_seen": 1226660864 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031733199598796394, + "loss": 2.7523, + "theoretical_loss": 3.580217834776932, + "tokens_seen": 1226726400 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031732196589769307, + "loss": 2.851, + "theoretical_loss": 3.580200314105503, + "tokens_seen": 1226791936 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003173119358074223, + "loss": 2.7872, + "theoretical_loss": 3.580182794632069, + "tokens_seen": 1226857472 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031730190571715143, + "loss": 2.7218, + "theoretical_loss": 3.5801652763564835, + "tokens_seen": 1226923008 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031729187562688066, + "loss": 2.8545, + "theoretical_loss": 3.5801477592786006, + "tokens_seen": 1226988544 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2912979, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.672762870788574, + "objective/train/theoretical_loss": 3.5801302433982753, + "objective/train/tokens_used": 1247514080, + "theoretical_loss": 3.5801302433982753, + "tokens_seen": 1227054080 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031728184553660984, + "loss": 2.7511, + "theoretical_loss": 3.5801302433982753, + "tokens_seen": 1227054080 + }, + { + "epoch": 15.0, + "learning_rate": 0.000317271815446339, + "loss": 2.7268, + "theoretical_loss": 3.5801127287153607, + "tokens_seen": 1227119616 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003172617853560682, + "loss": 2.7576, + "theoretical_loss": 3.580095215229712, + "tokens_seen": 1227185152 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003172517552657974, + "loss": 2.7814, + "theoretical_loss": 3.5800777029411828, + "tokens_seen": 1227250688 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031724172517552657, + "loss": 2.819, + "theoretical_loss": 3.5800601918496273, + "tokens_seen": 1227316224 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003172316950852558, + "loss": 2.7195, + "theoretical_loss": 3.5800426819549003, + "tokens_seen": 1227381760 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031722166499498493, + "loss": 2.8128, + "theoretical_loss": 3.580025173256855, + "tokens_seen": 1227447296 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031721163490471417, + "loss": 2.6868, + "theoretical_loss": 3.580007665755347, + "tokens_seen": 1227512832 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031720160481444335, + "loss": 2.8072, + "theoretical_loss": 3.5799901594502304, + "tokens_seen": 1227578368 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031719157472417253, + "loss": 2.7392, + "theoretical_loss": 3.5799726543413595, + "tokens_seen": 1227643904 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003171815446339017, + "loss": 2.7995, + "theoretical_loss": 3.579955150428588, + "tokens_seen": 1227709440 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003171715145436309, + "loss": 2.8245, + "theoretical_loss": 3.579937647711771, + "tokens_seen": 1227774976 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031716148445336007, + "loss": 2.7263, + "theoretical_loss": 3.579920146190763, + "tokens_seen": 1227840512 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003171514543630893, + "loss": 2.628, + "theoretical_loss": 3.579902645865418, + "tokens_seen": 1227906048 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031714142427281843, + "loss": 2.7229, + "theoretical_loss": 3.5798851467355908, + "tokens_seen": 1227971584 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031713139418254767, + "loss": 2.7358, + "theoretical_loss": 3.5798676488011365, + "tokens_seen": 1228037120 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003171213640922768, + "loss": 2.9347, + "theoretical_loss": 3.579850152061909, + "tokens_seen": 1228102656 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031711133400200603, + "loss": 2.8527, + "theoretical_loss": 3.5798326565177625, + "tokens_seen": 1228168192 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003171013039117352, + "loss": 2.8086, + "theoretical_loss": 3.5798151621685523, + "tokens_seen": 1228233728 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003170912738214644, + "loss": 2.7769, + "theoretical_loss": 3.5797976690141327, + "tokens_seen": 1228299264 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003170812437311936, + "loss": 2.9025, + "theoretical_loss": 3.5797801770543587, + "tokens_seen": 1228364800 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003170712136409228, + "loss": 2.8148, + "theoretical_loss": 3.579762686289085, + "tokens_seen": 1228430336 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031706118355065194, + "loss": 2.8041, + "theoretical_loss": 3.5797451967181653, + "tokens_seen": 1228495872 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003170511534603812, + "loss": 2.7648, + "theoretical_loss": 3.579727708341456, + "tokens_seen": 1228561408 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003170411233701103, + "loss": 2.7776, + "theoretical_loss": 3.57971022115881, + "tokens_seen": 1228626944 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2917838, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5257983207702637, + "objective/train/theoretical_loss": 3.5796927351700836, + "objective/train/tokens_used": 1249152480, + "theoretical_loss": 3.5796927351700836, + "tokens_seen": 1228692480 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031703109327983954, + "loss": 2.7584, + "theoretical_loss": 3.5796927351700836, + "tokens_seen": 1228692480 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003170210631895687, + "loss": 2.7184, + "theoretical_loss": 3.5796752503751312, + "tokens_seen": 1228758016 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003170110330992979, + "loss": 2.7598, + "theoretical_loss": 3.579657766773807, + "tokens_seen": 1228823552 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003170010030090271, + "loss": 2.817, + "theoretical_loss": 3.579640284365967, + "tokens_seen": 1228889088 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031699097291875626, + "loss": 2.8255, + "theoretical_loss": 3.5796228031514654, + "tokens_seen": 1228954624 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031698094282848544, + "loss": 2.7343, + "theoretical_loss": 3.5796053231301572, + "tokens_seen": 1229020160 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003169709127382147, + "loss": 2.7642, + "theoretical_loss": 3.579587844301897, + "tokens_seen": 1229085696 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003169608826479438, + "loss": 2.7888, + "theoretical_loss": 3.57957036666654, + "tokens_seen": 1229151232 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031695085255767304, + "loss": 2.8722, + "theoretical_loss": 3.5795528902239413, + "tokens_seen": 1229216768 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031694082246740217, + "loss": 2.8095, + "theoretical_loss": 3.5795354149739564, + "tokens_seen": 1229282304 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003169307923771314, + "loss": 2.6675, + "theoretical_loss": 3.579517940916439, + "tokens_seen": 1229347840 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003169207622868606, + "loss": 2.6973, + "theoretical_loss": 3.5795004680512457, + "tokens_seen": 1229413376 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031691073219658976, + "loss": 2.714, + "theoretical_loss": 3.57948299637823, + "tokens_seen": 1229478912 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031690070210631894, + "loss": 2.7987, + "theoretical_loss": 3.579465525897249, + "tokens_seen": 1229544448 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003168906720160482, + "loss": 2.7771, + "theoretical_loss": 3.5794480566081566, + "tokens_seen": 1229609984 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003168806419257773, + "loss": 2.8287, + "theoretical_loss": 3.5794305885108075, + "tokens_seen": 1229675520 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031687061183550654, + "loss": 2.7871, + "theoretical_loss": 3.5794131216050573, + "tokens_seen": 1229741056 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031686058174523567, + "loss": 2.6763, + "theoretical_loss": 3.579395655890762, + "tokens_seen": 1229806592 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003168505516549649, + "loss": 2.758, + "theoretical_loss": 3.579378191367776, + "tokens_seen": 1229872128 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003168405215646941, + "loss": 2.8154, + "theoretical_loss": 3.5793607280359554, + "tokens_seen": 1229937664 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031683049147442327, + "loss": 2.8176, + "theoretical_loss": 3.579343265895154, + "tokens_seen": 1230003200 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031682046138415245, + "loss": 2.808, + "theoretical_loss": 3.5793258049452286, + "tokens_seen": 1230068736 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031681043129388163, + "loss": 2.6673, + "theoretical_loss": 3.5793083451860337, + "tokens_seen": 1230134272 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031680040120361086, + "loss": 2.8534, + "theoretical_loss": 3.579290886617425, + "tokens_seen": 1230199808 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031679037111334004, + "loss": 2.7086, + "theoretical_loss": 3.579273429239258, + "tokens_seen": 1230265344 + }, + { + "epoch": 15.0, + "objective/train/docs_used": 2920742, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7392444610595703, + "objective/train/theoretical_loss": 3.5792559730513878, + "objective/train/tokens_used": 1250790880, + "theoretical_loss": 3.5792559730513878, + "tokens_seen": 1230330880 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003167803410230692, + "loss": 2.6168, + "theoretical_loss": 3.5792559730513878, + "tokens_seen": 1230330880 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003167703109327984, + "loss": 2.8448, + "theoretical_loss": 3.5792385180536703, + "tokens_seen": 1230396416 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003167602808425276, + "loss": 2.9597, + "theoretical_loss": 3.57922106424596, + "tokens_seen": 1230461952 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031675025075225677, + "loss": 2.896, + "theoretical_loss": 3.5792036116281136, + "tokens_seen": 1230527488 + }, + { + "epoch": 15.0, + "learning_rate": 0.000316740220661986, + "loss": 2.7453, + "theoretical_loss": 3.579186160199986, + "tokens_seen": 1230593024 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031673019057171513, + "loss": 2.8139, + "theoretical_loss": 3.579168709961433, + "tokens_seen": 1230658560 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031672016048144437, + "loss": 2.8162, + "theoretical_loss": 3.57915126091231, + "tokens_seen": 1230724096 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031671013039117355, + "loss": 2.7666, + "theoretical_loss": 3.579133813052472, + "tokens_seen": 1230789632 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031670010030090273, + "loss": 2.747, + "theoretical_loss": 3.579116366381776, + "tokens_seen": 1230855168 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003166900702106319, + "loss": 2.7625, + "theoretical_loss": 3.5790989209000763, + "tokens_seen": 1230920704 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003166800401203611, + "loss": 2.7646, + "theoretical_loss": 3.5790814766072296, + "tokens_seen": 1230986240 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031667001003009027, + "loss": 2.8212, + "theoretical_loss": 3.5790640335030908, + "tokens_seen": 1231051776 + }, + { + "epoch": 15.0, + "learning_rate": 0.0003166599799398195, + "loss": 2.8031, + "theoretical_loss": 3.5790465915875163, + "tokens_seen": 1231117312 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031664994984954863, + "loss": 2.7049, + "theoretical_loss": 3.579029150860361, + "tokens_seen": 1231182848 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031663991975927787, + "loss": 2.7017, + "theoretical_loss": 3.5790117113214817, + "tokens_seen": 1231248384 + }, + { + "epoch": 15.0, + "learning_rate": 0.000316629889669007, + "loss": 2.8556, + "theoretical_loss": 3.5789942729707334, + "tokens_seen": 1231313920 + }, + { + "epoch": 15.0, + "learning_rate": 0.00031661985957873623, + "loss": 2.7635, + "theoretical_loss": 3.578976835807972, + "tokens_seen": 1231379456 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003166098294884654, + "loss": 2.8211, + "theoretical_loss": 3.578959399833054, + "tokens_seen": 1231444992 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003165997993981946, + "loss": 2.726, + "theoretical_loss": 3.5789419650458347, + "tokens_seen": 1231510528 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003165897693079238, + "loss": 2.8513, + "theoretical_loss": 3.5789245314461704, + "tokens_seen": 1231576064 + }, + { + "epoch": 15.01, + "learning_rate": 0.000316579739217653, + "loss": 2.7756, + "theoretical_loss": 3.5789070990339162, + "tokens_seen": 1231641600 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031656970912738214, + "loss": 2.7868, + "theoretical_loss": 3.5788896678089284, + "tokens_seen": 1231707136 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003165596790371114, + "loss": 2.8361, + "theoretical_loss": 3.578872237771064, + "tokens_seen": 1231772672 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003165496489468405, + "loss": 2.7569, + "theoretical_loss": 3.5788548089201777, + "tokens_seen": 1231838208 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031653961885656974, + "loss": 2.7373, + "theoretical_loss": 3.5788373812561263, + "tokens_seen": 1231903744 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2925617, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8712079524993896, + "objective/train/theoretical_loss": 3.5788199547787647, + "objective/train/tokens_used": 1252429280, + "theoretical_loss": 3.5788199547787647, + "tokens_seen": 1231969280 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003165295887662989, + "loss": 2.7828, + "theoretical_loss": 3.5788199547787647, + "tokens_seen": 1231969280 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003165195586760281, + "loss": 2.6708, + "theoretical_loss": 3.578802529487951, + "tokens_seen": 1232034816 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003165095285857573, + "loss": 2.8263, + "theoretical_loss": 3.578785105383539, + "tokens_seen": 1232100352 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031649949849548646, + "loss": 2.6609, + "theoretical_loss": 3.5787676824653865, + "tokens_seen": 1232165888 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031648946840521564, + "loss": 2.8563, + "theoretical_loss": 3.5787502607333495, + "tokens_seen": 1232231424 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003164794383149449, + "loss": 2.811, + "theoretical_loss": 3.578732840187283, + "tokens_seen": 1232296960 + }, + { + "epoch": 15.01, + "learning_rate": 0.000316469408224674, + "loss": 2.8232, + "theoretical_loss": 3.578715420827044, + "tokens_seen": 1232362496 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031645937813440324, + "loss": 2.7555, + "theoretical_loss": 3.578698002652489, + "tokens_seen": 1232428032 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031644934804413237, + "loss": 2.7473, + "theoretical_loss": 3.5786805856634745, + "tokens_seen": 1232493568 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003164393179538616, + "loss": 2.7704, + "theoretical_loss": 3.5786631698598557, + "tokens_seen": 1232559104 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003164292878635908, + "loss": 2.7892, + "theoretical_loss": 3.578645755241489, + "tokens_seen": 1232624640 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031641925777331996, + "loss": 2.8243, + "theoretical_loss": 3.5786283418082316, + "tokens_seen": 1232690176 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031640922768304914, + "loss": 2.7162, + "theoretical_loss": 3.5786109295599395, + "tokens_seen": 1232755712 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003163991975927784, + "loss": 2.7728, + "theoretical_loss": 3.5785935184964686, + "tokens_seen": 1232821248 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003163891675025075, + "loss": 2.5998, + "theoretical_loss": 3.578576108617676, + "tokens_seen": 1232886784 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031637913741223674, + "loss": 2.7755, + "theoretical_loss": 3.578558699923418, + "tokens_seen": 1232952320 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031636910732196587, + "loss": 2.6016, + "theoretical_loss": 3.5785412924135502, + "tokens_seen": 1233017856 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003163590772316951, + "loss": 2.6438, + "theoretical_loss": 3.57852388608793, + "tokens_seen": 1233083392 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003163490471414243, + "loss": 2.7157, + "theoretical_loss": 3.5785064809464138, + "tokens_seen": 1233148928 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031633901705115347, + "loss": 2.8376, + "theoretical_loss": 3.5784890769888578, + "tokens_seen": 1233214464 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031632898696088265, + "loss": 2.7622, + "theoretical_loss": 3.578471674215119, + "tokens_seen": 1233280000 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031631895687061183, + "loss": 2.7314, + "theoretical_loss": 3.578454272625053, + "tokens_seen": 1233345536 + }, + { + "epoch": 15.01, + "learning_rate": 0.000316308926780341, + "loss": 2.8445, + "theoretical_loss": 3.578436872218518, + "tokens_seen": 1233411072 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031629889669007024, + "loss": 2.7493, + "theoretical_loss": 3.5784194729953684, + "tokens_seen": 1233476608 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031628886659979937, + "loss": 2.8499, + "theoretical_loss": 3.578402074955463, + "tokens_seen": 1233542144 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2928557, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.734081745147705, + "objective/train/theoretical_loss": 3.578384678098658, + "objective/train/tokens_used": 1254067680, + "theoretical_loss": 3.578384678098658, + "tokens_seen": 1233607680 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003162788365095286, + "loss": 2.7834, + "theoretical_loss": 3.578384678098658, + "tokens_seen": 1233607680 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031626880641925773, + "loss": 2.7806, + "theoretical_loss": 3.578367282424809, + "tokens_seen": 1233673216 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031625877632898697, + "loss": 2.746, + "theoretical_loss": 3.578349887933774, + "tokens_seen": 1233738752 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031624874623871615, + "loss": 2.768, + "theoretical_loss": 3.5783324946254087, + "tokens_seen": 1233804288 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031623871614844533, + "loss": 2.7469, + "theoretical_loss": 3.5783151024995705, + "tokens_seen": 1233869824 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003162286860581745, + "loss": 2.7231, + "theoretical_loss": 3.5782977115561163, + "tokens_seen": 1233935360 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031621865596790375, + "loss": 2.8366, + "theoretical_loss": 3.5782803217949026, + "tokens_seen": 1234000896 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003162086258776329, + "loss": 2.6934, + "theoretical_loss": 3.578262933215786, + "tokens_seen": 1234066432 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003161985957873621, + "loss": 2.8034, + "theoretical_loss": 3.578245545818624, + "tokens_seen": 1234131968 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031618856569709124, + "loss": 2.7482, + "theoretical_loss": 3.578228159603274, + "tokens_seen": 1234197504 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031617853560682047, + "loss": 2.7815, + "theoretical_loss": 3.578210774569591, + "tokens_seen": 1234263040 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031616850551654965, + "loss": 2.7832, + "theoretical_loss": 3.5781933907174333, + "tokens_seen": 1234328576 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031615847542627883, + "loss": 2.6589, + "theoretical_loss": 3.5781760080466585, + "tokens_seen": 1234394112 + }, + { + "epoch": 15.01, + "learning_rate": 0.000316148445336008, + "loss": 2.7929, + "theoretical_loss": 3.578158626557122, + "tokens_seen": 1234459648 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003161384152457372, + "loss": 2.758, + "theoretical_loss": 3.578141246248682, + "tokens_seen": 1234525184 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003161283851554664, + "loss": 2.7756, + "theoretical_loss": 3.578123867121195, + "tokens_seen": 1234590720 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003161183550651956, + "loss": 2.8442, + "theoretical_loss": 3.5781064891745182, + "tokens_seen": 1234656256 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031610832497492474, + "loss": 2.6755, + "theoretical_loss": 3.5780891124085086, + "tokens_seen": 1234721792 + }, + { + "epoch": 15.01, + "learning_rate": 0.000316098294884654, + "loss": 2.871, + "theoretical_loss": 3.5780717368230235, + "tokens_seen": 1234787328 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003160882647943831, + "loss": 2.6613, + "theoretical_loss": 3.5780543624179204, + "tokens_seen": 1234852864 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031607823470411234, + "loss": 2.7855, + "theoretical_loss": 3.5780369891930555, + "tokens_seen": 1234918400 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003160682046138415, + "loss": 2.7855, + "theoretical_loss": 3.578019617148287, + "tokens_seen": 1234983936 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003160581745235707, + "loss": 2.807, + "theoretical_loss": 3.5780022462834715, + "tokens_seen": 1235049472 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031604814443329994, + "loss": 2.7282, + "theoretical_loss": 3.5779848765984665, + "tokens_seen": 1235115008 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003160381143430291, + "loss": 2.8523, + "theoretical_loss": 3.5779675080931295, + "tokens_seen": 1235180544 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2932368, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8745710849761963, + "objective/train/theoretical_loss": 3.5779501407673173, + "objective/train/tokens_used": 1255706080, + "theoretical_loss": 3.5779501407673173, + "tokens_seen": 1235246080 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003160280842527583, + "loss": 2.8644, + "theoretical_loss": 3.5779501407673173, + "tokens_seen": 1235246080 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003160180541624875, + "loss": 2.8495, + "theoretical_loss": 3.577932774620887, + "tokens_seen": 1235311616 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031600802407221666, + "loss": 2.7892, + "theoretical_loss": 3.577915409653697, + "tokens_seen": 1235377152 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031599799398194584, + "loss": 2.88, + "theoretical_loss": 3.577898045865604, + "tokens_seen": 1235442688 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003159879638916751, + "loss": 2.7516, + "theoretical_loss": 3.577880683256465, + "tokens_seen": 1235508224 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003159779338014042, + "loss": 2.8167, + "theoretical_loss": 3.5778633218261384, + "tokens_seen": 1235573760 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031596790371113344, + "loss": 2.7474, + "theoretical_loss": 3.5778459615744813, + "tokens_seen": 1235639296 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031595787362086257, + "loss": 2.7864, + "theoretical_loss": 3.5778286025013504, + "tokens_seen": 1235704832 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003159478435305918, + "loss": 2.7829, + "theoretical_loss": 3.577811244606604, + "tokens_seen": 1235770368 + }, + { + "epoch": 15.01, + "learning_rate": 0.000315937813440321, + "loss": 2.8821, + "theoretical_loss": 3.5777938878900994, + "tokens_seen": 1235835904 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031592778335005016, + "loss": 2.7883, + "theoretical_loss": 3.5777765323516943, + "tokens_seen": 1235901440 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031591775325977934, + "loss": 2.926, + "theoretical_loss": 3.577759177991246, + "tokens_seen": 1235966976 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003159077231695086, + "loss": 2.7954, + "theoretical_loss": 3.577741824808612, + "tokens_seen": 1236032512 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003158976930792377, + "loss": 2.7926, + "theoretical_loss": 3.577724472803651, + "tokens_seen": 1236098048 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031588766298896694, + "loss": 2.7203, + "theoretical_loss": 3.577707121976219, + "tokens_seen": 1236163584 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031587763289869607, + "loss": 2.8431, + "theoretical_loss": 3.577689772326175, + "tokens_seen": 1236229120 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003158676028084253, + "loss": 2.7893, + "theoretical_loss": 3.5776724238533757, + "tokens_seen": 1236294656 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003158575727181545, + "loss": 2.5414, + "theoretical_loss": 3.5776550765576793, + "tokens_seen": 1236360192 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031584754262788367, + "loss": 2.8214, + "theoretical_loss": 3.5776377304389437, + "tokens_seen": 1236425728 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031583751253761285, + "loss": 2.7911, + "theoretical_loss": 3.5776203854970268, + "tokens_seen": 1236491264 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031582748244734203, + "loss": 2.7944, + "theoretical_loss": 3.5776030417317854, + "tokens_seen": 1236556800 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003158174523570712, + "loss": 2.8878, + "theoretical_loss": 3.5775856991430786, + "tokens_seen": 1236622336 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031580742226680044, + "loss": 2.8032, + "theoretical_loss": 3.5775683577307635, + "tokens_seen": 1236687872 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031579739217652957, + "loss": 2.714, + "theoretical_loss": 3.577551017494698, + "tokens_seen": 1236753408 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003157873620862588, + "loss": 2.8573, + "theoretical_loss": 3.5775336784347402, + "tokens_seen": 1236818944 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2937283, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7865958213806152, + "objective/train/theoretical_loss": 3.5775163405507477, + "objective/train/tokens_used": 1257344480, + "theoretical_loss": 3.5775163405507477, + "tokens_seen": 1236884480 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031577733199598793, + "loss": 2.7465, + "theoretical_loss": 3.5775163405507477, + "tokens_seen": 1236884480 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031576730190571717, + "loss": 2.7671, + "theoretical_loss": 3.577499003842579, + "tokens_seen": 1236950016 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031575727181544635, + "loss": 2.8159, + "theoretical_loss": 3.577481668310092, + "tokens_seen": 1237015552 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031574724172517553, + "loss": 2.8008, + "theoretical_loss": 3.577464333953144, + "tokens_seen": 1237081088 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003157372116349047, + "loss": 2.744, + "theoretical_loss": 3.5774470007715933, + "tokens_seen": 1237146624 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031572718154463395, + "loss": 2.7883, + "theoretical_loss": 3.5774296687652987, + "tokens_seen": 1237212160 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003157171514543631, + "loss": 2.8674, + "theoretical_loss": 3.577412337934117, + "tokens_seen": 1237277696 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003157071213640923, + "loss": 2.7697, + "theoretical_loss": 3.5773950082779074, + "tokens_seen": 1237343232 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031569709127382144, + "loss": 2.8426, + "theoretical_loss": 3.577377679796527, + "tokens_seen": 1237408768 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031568706118355067, + "loss": 2.7496, + "theoretical_loss": 3.577360352489835, + "tokens_seen": 1237474304 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031567703109327985, + "loss": 2.7968, + "theoretical_loss": 3.577343026357689, + "tokens_seen": 1237539840 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031566700100300903, + "loss": 2.7512, + "theoretical_loss": 3.577325701399947, + "tokens_seen": 1237605376 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003156569709127382, + "loss": 2.8705, + "theoretical_loss": 3.577308377616468, + "tokens_seen": 1237670912 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003156469408224674, + "loss": 2.8548, + "theoretical_loss": 3.5772910550071093, + "tokens_seen": 1237736448 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003156369107321966, + "loss": 2.6074, + "theoretical_loss": 3.5772737335717295, + "tokens_seen": 1237801984 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003156268806419258, + "loss": 2.8274, + "theoretical_loss": 3.577256413310187, + "tokens_seen": 1237867520 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031561685055165494, + "loss": 2.9372, + "theoretical_loss": 3.57723909422234, + "tokens_seen": 1237933056 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003156068204613842, + "loss": 2.7325, + "theoretical_loss": 3.5772217763080474, + "tokens_seen": 1237998592 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003155967903711133, + "loss": 2.8303, + "theoretical_loss": 3.5772044595671666, + "tokens_seen": 1238064128 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031558676028084254, + "loss": 2.8695, + "theoretical_loss": 3.5771871439995566, + "tokens_seen": 1238129664 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003155767301905717, + "loss": 2.8889, + "theoretical_loss": 3.5771698296050753, + "tokens_seen": 1238195200 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003155667001003009, + "loss": 2.832, + "theoretical_loss": 3.577152516383582, + "tokens_seen": 1238260736 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003155566700100301, + "loss": 2.7407, + "theoretical_loss": 3.5771352043349345, + "tokens_seen": 1238326272 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003155466399197593, + "loss": 2.7235, + "theoretical_loss": 3.5771178934589916, + "tokens_seen": 1238391808 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031553660982948844, + "loss": 2.7902, + "theoretical_loss": 3.577100583755611, + "tokens_seen": 1238457344 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2940260, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6799509525299072, + "objective/train/theoretical_loss": 3.577083275224653, + "objective/train/tokens_used": 1258982880, + "theoretical_loss": 3.577083275224653, + "tokens_seen": 1238522880 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003155265797392177, + "loss": 2.794, + "theoretical_loss": 3.577083275224653, + "tokens_seen": 1238522880 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003155165496489468, + "loss": 2.9355, + "theoretical_loss": 3.5770659678659738, + "tokens_seen": 1238588416 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031550651955867604, + "loss": 2.8189, + "theoretical_loss": 3.5770486616794335, + "tokens_seen": 1238653952 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003154964894684052, + "loss": 2.7667, + "theoretical_loss": 3.577031356664891, + "tokens_seen": 1238719488 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003154864593781344, + "loss": 2.6906, + "theoretical_loss": 3.577014052822204, + "tokens_seen": 1238785024 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003154764292878636, + "loss": 2.6975, + "theoretical_loss": 3.576996750151231, + "tokens_seen": 1238850560 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031546639919759277, + "loss": 2.7754, + "theoretical_loss": 3.5769794486518323, + "tokens_seen": 1238916096 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031545636910732195, + "loss": 2.793, + "theoretical_loss": 3.576962148323865, + "tokens_seen": 1238981632 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003154463390170512, + "loss": 2.791, + "theoretical_loss": 3.5769448491671882, + "tokens_seen": 1239047168 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003154363089267803, + "loss": 2.944, + "theoretical_loss": 3.5769275511816607, + "tokens_seen": 1239112704 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031542627883650954, + "loss": 2.8204, + "theoretical_loss": 3.5769102543671414, + "tokens_seen": 1239178240 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003154162487462387, + "loss": 2.7717, + "theoretical_loss": 3.576892958723489, + "tokens_seen": 1239243776 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003154062186559679, + "loss": 2.8897, + "theoretical_loss": 3.576875664250563, + "tokens_seen": 1239309312 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003153961885656971, + "loss": 2.8877, + "theoretical_loss": 3.576858370948221, + "tokens_seen": 1239374848 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031538615847542627, + "loss": 2.7723, + "theoretical_loss": 3.5768410788163227, + "tokens_seen": 1239440384 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031537612838515545, + "loss": 2.8873, + "theoretical_loss": 3.576823787854727, + "tokens_seen": 1239505920 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003153660982948847, + "loss": 2.7235, + "theoretical_loss": 3.5768064980632923, + "tokens_seen": 1239571456 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003153560682046138, + "loss": 2.8255, + "theoretical_loss": 3.576789209441878, + "tokens_seen": 1239636992 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031534603811434305, + "loss": 2.7907, + "theoretical_loss": 3.5767719219903427, + "tokens_seen": 1239702528 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003153360080240722, + "loss": 2.7266, + "theoretical_loss": 3.5767546357085465, + "tokens_seen": 1239768064 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003153259779338014, + "loss": 2.7496, + "theoretical_loss": 3.5767373505963467, + "tokens_seen": 1239833600 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003153159478435306, + "loss": 2.7425, + "theoretical_loss": 3.5767200666536034, + "tokens_seen": 1239899136 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031530591775325977, + "loss": 2.7865, + "theoretical_loss": 3.576702783880176, + "tokens_seen": 1239964672 + }, + { + "epoch": 15.01, + "learning_rate": 0.000315295887662989, + "loss": 2.8634, + "theoretical_loss": 3.5766855022759225, + "tokens_seen": 1240030208 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031528585757271813, + "loss": 2.8832, + "theoretical_loss": 3.576668221840703, + "tokens_seen": 1240095744 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2945116, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7853896617889404, + "objective/train/theoretical_loss": 3.576650942574376, + "objective/train/tokens_used": 1260621280, + "theoretical_loss": 3.576650942574376, + "tokens_seen": 1240161280 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031527582748244737, + "loss": 2.8053, + "theoretical_loss": 3.576650942574376, + "tokens_seen": 1240161280 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031526579739217655, + "loss": 2.8409, + "theoretical_loss": 3.5766336644768013, + "tokens_seen": 1240226816 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031525576730190573, + "loss": 2.7832, + "theoretical_loss": 3.5766163875478374, + "tokens_seen": 1240292352 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003152457372116349, + "loss": 2.7313, + "theoretical_loss": 3.576599111787344, + "tokens_seen": 1240357888 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031523570712136415, + "loss": 2.707, + "theoretical_loss": 3.5765818371951803, + "tokens_seen": 1240423424 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003152256770310933, + "loss": 2.8119, + "theoretical_loss": 3.576564563771205, + "tokens_seen": 1240488960 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003152156469408225, + "loss": 2.7538, + "theoretical_loss": 3.5765472915152783, + "tokens_seen": 1240554496 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031520561685055164, + "loss": 2.78, + "theoretical_loss": 3.576530020427259, + "tokens_seen": 1240620032 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031519558676028087, + "loss": 2.7786, + "theoretical_loss": 3.576512750507007, + "tokens_seen": 1240685568 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031518555667001005, + "loss": 2.7582, + "theoretical_loss": 3.57649548175438, + "tokens_seen": 1240751104 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031517552657973924, + "loss": 2.7363, + "theoretical_loss": 3.5764782141692395, + "tokens_seen": 1240816640 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003151654964894684, + "loss": 2.8064, + "theoretical_loss": 3.5764609477514435, + "tokens_seen": 1240882176 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003151554663991976, + "loss": 2.7999, + "theoretical_loss": 3.576443682500852, + "tokens_seen": 1240947712 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003151454363089268, + "loss": 2.6888, + "theoretical_loss": 3.5764264184173244, + "tokens_seen": 1241013248 + }, + { + "epoch": 15.01, + "learning_rate": 0.000315135406218656, + "loss": 2.8927, + "theoretical_loss": 3.5764091555007202, + "tokens_seen": 1241078784 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031512537612838514, + "loss": 2.8079, + "theoretical_loss": 3.576391893750899, + "tokens_seen": 1241144320 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003151153460381144, + "loss": 2.7412, + "theoretical_loss": 3.57637463316772, + "tokens_seen": 1241209856 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003151053159478435, + "loss": 2.8257, + "theoretical_loss": 3.5763573737510432, + "tokens_seen": 1241275392 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031509528585757274, + "loss": 2.889, + "theoretical_loss": 3.5763401155007273, + "tokens_seen": 1241340928 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003150852557673019, + "loss": 2.7266, + "theoretical_loss": 3.576322858416633, + "tokens_seen": 1241406464 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003150752256770311, + "loss": 2.8276, + "theoretical_loss": 3.5763056024986195, + "tokens_seen": 1241472000 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003150651955867603, + "loss": 2.6825, + "theoretical_loss": 3.576288347746547, + "tokens_seen": 1241537536 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003150551654964895, + "loss": 2.7022, + "theoretical_loss": 3.576271094160274, + "tokens_seen": 1241603072 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031504513540621864, + "loss": 2.8161, + "theoretical_loss": 3.5762538417396605, + "tokens_seen": 1241668608 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003150351053159479, + "loss": 2.7287, + "theoretical_loss": 3.5762365904845677, + "tokens_seen": 1241734144 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2948002, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7739388942718506, + "objective/train/theoretical_loss": 3.576219340394853, + "objective/train/tokens_used": 1262259680, + "theoretical_loss": 3.576219340394853, + "tokens_seen": 1241799680 + }, + { + "epoch": 15.01, + "learning_rate": 0.000315025075225677, + "loss": 2.7701, + "theoretical_loss": 3.576219340394853, + "tokens_seen": 1241799680 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031501504513540624, + "loss": 2.7628, + "theoretical_loss": 3.5762020914703783, + "tokens_seen": 1241865216 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003150050150451354, + "loss": 2.7541, + "theoretical_loss": 3.576184843711002, + "tokens_seen": 1241930752 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003149949849548646, + "loss": 2.8038, + "theoretical_loss": 3.5761675971165845, + "tokens_seen": 1241996288 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003149849548645938, + "loss": 2.8623, + "theoretical_loss": 3.5761503516869855, + "tokens_seen": 1242061824 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031497492477432297, + "loss": 2.826, + "theoretical_loss": 3.576133107422065, + "tokens_seen": 1242127360 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031496489468405215, + "loss": 2.7969, + "theoretical_loss": 3.576115864321683, + "tokens_seen": 1242192896 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003149548645937814, + "loss": 2.7678, + "theoretical_loss": 3.576098622385699, + "tokens_seen": 1242258432 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003149448345035105, + "loss": 2.729, + "theoretical_loss": 3.5760813816139736, + "tokens_seen": 1242323968 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031493480441323974, + "loss": 2.7524, + "theoretical_loss": 3.576064142006366, + "tokens_seen": 1242389504 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003149247743229689, + "loss": 2.7034, + "theoretical_loss": 3.5760469035627365, + "tokens_seen": 1242455040 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003149147442326981, + "loss": 2.8234, + "theoretical_loss": 3.5760296662829454, + "tokens_seen": 1242520576 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003149047141424273, + "loss": 2.865, + "theoretical_loss": 3.5760124301668528, + "tokens_seen": 1242586112 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031489468405215647, + "loss": 2.8447, + "theoretical_loss": 3.5759951952143183, + "tokens_seen": 1242651648 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031488465396188565, + "loss": 2.8529, + "theoretical_loss": 3.5759779614252025, + "tokens_seen": 1242717184 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003148746238716149, + "loss": 2.8377, + "theoretical_loss": 3.575960728799365, + "tokens_seen": 1242782720 + }, + { + "epoch": 15.01, + "learning_rate": 0.000314864593781344, + "loss": 2.8298, + "theoretical_loss": 3.575943497336666, + "tokens_seen": 1242848256 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031485456369107325, + "loss": 2.7822, + "theoretical_loss": 3.575926267036966, + "tokens_seen": 1242913792 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003148445336008024, + "loss": 2.672, + "theoretical_loss": 3.575909037900125, + "tokens_seen": 1242979328 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003148345035105316, + "loss": 2.8491, + "theoretical_loss": 3.575891809926003, + "tokens_seen": 1243044864 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003148244734202608, + "loss": 2.8095, + "theoretical_loss": 3.575874583114461, + "tokens_seen": 1243110400 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031481444332998997, + "loss": 2.9198, + "theoretical_loss": 3.5758573574653587, + "tokens_seen": 1243175936 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031480441323971915, + "loss": 2.8375, + "theoretical_loss": 3.575840132978556, + "tokens_seen": 1243241472 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031479438314944833, + "loss": 2.794, + "theoretical_loss": 3.575822909653914, + "tokens_seen": 1243307008 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003147843530591775, + "loss": 2.7717, + "theoretical_loss": 3.5758056874912922, + "tokens_seen": 1243372544 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2951750, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6513702869415283, + "objective/train/theoretical_loss": 3.575788466490552, + "objective/train/tokens_used": 1263898080, + "theoretical_loss": 3.575788466490552, + "tokens_seen": 1243438080 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031477432296890675, + "loss": 2.7688, + "theoretical_loss": 3.575788466490552, + "tokens_seen": 1243438080 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003147642928786359, + "loss": 2.8778, + "theoretical_loss": 3.575771246651553, + "tokens_seen": 1243503616 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003147542627883651, + "loss": 2.7435, + "theoretical_loss": 3.5757540279741558, + "tokens_seen": 1243569152 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003147442326980943, + "loss": 2.841, + "theoretical_loss": 3.5757368104582206, + "tokens_seen": 1243634688 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003147342026078235, + "loss": 2.7731, + "theoretical_loss": 3.575719594103609, + "tokens_seen": 1243700224 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031472417251755266, + "loss": 2.8013, + "theoretical_loss": 3.5757023789101794, + "tokens_seen": 1243765760 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031471414242728184, + "loss": 2.8168, + "theoretical_loss": 3.575685164877794, + "tokens_seen": 1243831296 + }, + { + "epoch": 15.01, + "learning_rate": 0.000314704112337011, + "loss": 2.8526, + "theoretical_loss": 3.575667952006313, + "tokens_seen": 1243896832 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031469408224674025, + "loss": 2.8317, + "theoretical_loss": 3.575650740295597, + "tokens_seen": 1243962368 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003146840521564694, + "loss": 2.9369, + "theoretical_loss": 3.575633529745506, + "tokens_seen": 1244027904 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003146740220661986, + "loss": 2.8501, + "theoretical_loss": 3.575616320355901, + "tokens_seen": 1244093440 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031466399197592774, + "loss": 2.8577, + "theoretical_loss": 3.5755991121266426, + "tokens_seen": 1244158976 + }, + { + "epoch": 15.01, + "learning_rate": 0.000314653961885657, + "loss": 2.7596, + "theoretical_loss": 3.575581905057591, + "tokens_seen": 1244224512 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031464393179538616, + "loss": 2.8628, + "theoretical_loss": 3.5755646991486083, + "tokens_seen": 1244290048 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031463390170511534, + "loss": 2.7863, + "theoretical_loss": 3.5755474943995535, + "tokens_seen": 1244355584 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003146238716148445, + "loss": 2.8228, + "theoretical_loss": 3.575530290810288, + "tokens_seen": 1244421120 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003146138415245737, + "loss": 2.8146, + "theoretical_loss": 3.5755130883806725, + "tokens_seen": 1244486656 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003146038114343029, + "loss": 2.6823, + "theoretical_loss": 3.575495887110568, + "tokens_seen": 1244552192 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003145937813440321, + "loss": 2.7936, + "theoretical_loss": 3.575478686999835, + "tokens_seen": 1244617728 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031458375125376125, + "loss": 2.8206, + "theoretical_loss": 3.575461488048335, + "tokens_seen": 1244683264 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003145737211634905, + "loss": 2.7759, + "theoretical_loss": 3.575444290255928, + "tokens_seen": 1244748800 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003145636910732197, + "loss": 2.9048, + "theoretical_loss": 3.5754270936224746, + "tokens_seen": 1244814336 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031455366098294884, + "loss": 2.8142, + "theoretical_loss": 3.5754098981478366, + "tokens_seen": 1244879872 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003145436308926781, + "loss": 2.8708, + "theoretical_loss": 3.5753927038318745, + "tokens_seen": 1244945408 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003145336008024072, + "loss": 2.6992, + "theoretical_loss": 3.5753755106744496, + "tokens_seen": 1245010944 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2956560, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7485125064849854, + "objective/train/theoretical_loss": 3.575358318675422, + "objective/train/tokens_used": 1265536480, + "theoretical_loss": 3.575358318675422, + "tokens_seen": 1245076480 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031452357071213644, + "loss": 2.8076, + "theoretical_loss": 3.575358318675422, + "tokens_seen": 1245076480 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003145135406218656, + "loss": 2.8473, + "theoretical_loss": 3.575341127834654, + "tokens_seen": 1245142016 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003145035105315948, + "loss": 2.8042, + "theoretical_loss": 3.575323938152005, + "tokens_seen": 1245207552 + }, + { + "epoch": 15.01, + "learning_rate": 0.000314493480441324, + "loss": 2.8543, + "theoretical_loss": 3.575306749627337, + "tokens_seen": 1245273088 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031448345035105317, + "loss": 2.8182, + "theoretical_loss": 3.575289562260511, + "tokens_seen": 1245338624 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031447342026078235, + "loss": 2.7652, + "theoretical_loss": 3.5752723760513883, + "tokens_seen": 1245404160 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003144633901705116, + "loss": 2.7869, + "theoretical_loss": 3.5752551909998296, + "tokens_seen": 1245469696 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003144533600802407, + "loss": 2.7933, + "theoretical_loss": 3.575238007105696, + "tokens_seen": 1245535232 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031444332998996994, + "loss": 2.7467, + "theoretical_loss": 3.575220824368849, + "tokens_seen": 1245600768 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003144332998996991, + "loss": 2.7062, + "theoretical_loss": 3.575203642789149, + "tokens_seen": 1245666304 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003144232698094283, + "loss": 2.7358, + "theoretical_loss": 3.575186462366459, + "tokens_seen": 1245731840 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003144132397191575, + "loss": 2.8309, + "theoretical_loss": 3.575169283100638, + "tokens_seen": 1245797376 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031440320962888667, + "loss": 2.7039, + "theoretical_loss": 3.575152104991549, + "tokens_seen": 1245862912 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031439317953861585, + "loss": 2.8009, + "theoretical_loss": 3.575134928039051, + "tokens_seen": 1245928448 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003143831494483451, + "loss": 2.7453, + "theoretical_loss": 3.575117752243008, + "tokens_seen": 1245993984 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003143731193580742, + "loss": 2.8509, + "theoretical_loss": 3.5751005776032803, + "tokens_seen": 1246059520 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031436308926780345, + "loss": 2.796, + "theoretical_loss": 3.575083404119729, + "tokens_seen": 1246125056 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003143530591775326, + "loss": 2.74, + "theoretical_loss": 3.5750662317922153, + "tokens_seen": 1246190592 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003143430290872618, + "loss": 2.8391, + "theoretical_loss": 3.5750490606206005, + "tokens_seen": 1246256128 + }, + { + "epoch": 15.01, + "learning_rate": 0.000314332998996991, + "loss": 2.8141, + "theoretical_loss": 3.575031890604747, + "tokens_seen": 1246321664 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031432296890672017, + "loss": 2.6975, + "theoretical_loss": 3.575014721744515, + "tokens_seen": 1246387200 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031431293881644935, + "loss": 2.7411, + "theoretical_loss": 3.5749975540397667, + "tokens_seen": 1246452736 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031430290872617853, + "loss": 2.7955, + "theoretical_loss": 3.574980387490364, + "tokens_seen": 1246518272 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003142928786359077, + "loss": 2.825, + "theoretical_loss": 3.574963222096167, + "tokens_seen": 1246583808 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031428284854563695, + "loss": 2.8231, + "theoretical_loss": 3.574946057857039, + "tokens_seen": 1246649344 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2959764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7330057621002197, + "objective/train/theoretical_loss": 3.5749288947728397, + "objective/train/tokens_used": 1267174880, + "theoretical_loss": 3.5749288947728397, + "tokens_seen": 1246714880 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003142728184553661, + "loss": 2.8189, + "theoretical_loss": 3.5749288947728397, + "tokens_seen": 1246714880 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003142627883650953, + "loss": 2.7498, + "theoretical_loss": 3.5749117328434323, + "tokens_seen": 1246780416 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003142527582748245, + "loss": 2.8439, + "theoretical_loss": 3.574894572068678, + "tokens_seen": 1246845952 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003142427281845537, + "loss": 2.872, + "theoretical_loss": 3.5748774124484375, + "tokens_seen": 1246911488 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031423269809428286, + "loss": 2.8099, + "theoretical_loss": 3.5748602539825733, + "tokens_seen": 1246977024 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031422266800401204, + "loss": 2.8731, + "theoretical_loss": 3.5748430966709472, + "tokens_seen": 1247042560 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003142126379137412, + "loss": 2.7751, + "theoretical_loss": 3.5748259405134206, + "tokens_seen": 1247108096 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031420260782347045, + "loss": 2.791, + "theoretical_loss": 3.5748087855098545, + "tokens_seen": 1247173632 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003141925777331996, + "loss": 2.7556, + "theoretical_loss": 3.574791631660112, + "tokens_seen": 1247239168 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003141825476429288, + "loss": 2.9158, + "theoretical_loss": 3.5747744789640543, + "tokens_seen": 1247304704 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031417251755265794, + "loss": 2.8666, + "theoretical_loss": 3.574757327421543, + "tokens_seen": 1247370240 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003141624874623872, + "loss": 2.8566, + "theoretical_loss": 3.5747401770324405, + "tokens_seen": 1247435776 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031415245737211636, + "loss": 2.814, + "theoretical_loss": 3.5747230277966073, + "tokens_seen": 1247501312 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031414242728184554, + "loss": 2.7267, + "theoretical_loss": 3.574705879713907, + "tokens_seen": 1247566848 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003141323971915747, + "loss": 2.8136, + "theoretical_loss": 3.5746887327842005, + "tokens_seen": 1247632384 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003141223671013039, + "loss": 2.8213, + "theoretical_loss": 3.5746715870073498, + "tokens_seen": 1247697920 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003141123370110331, + "loss": 2.7265, + "theoretical_loss": 3.574654442383217, + "tokens_seen": 1247763456 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003141023069207623, + "loss": 2.7443, + "theoretical_loss": 3.5746372989116644, + "tokens_seen": 1247828992 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031409227683049145, + "loss": 2.8369, + "theoretical_loss": 3.5746201565925526, + "tokens_seen": 1247894528 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003140822467402207, + "loss": 2.9517, + "theoretical_loss": 3.5746030154257458, + "tokens_seen": 1247960064 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031407221664994986, + "loss": 2.7821, + "theoretical_loss": 3.574585875411104, + "tokens_seen": 1248025600 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031406218655967904, + "loss": 2.7887, + "theoretical_loss": 3.57456873654849, + "tokens_seen": 1248091136 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003140521564694082, + "loss": 2.7992, + "theoretical_loss": 3.5745515988377665, + "tokens_seen": 1248156672 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003140421263791374, + "loss": 2.8732, + "theoretical_loss": 3.574534462278795, + "tokens_seen": 1248222208 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003140320962888666, + "loss": 2.8437, + "theoretical_loss": 3.5745173268714376, + "tokens_seen": 1248287744 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2962694, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.849201202392578, + "objective/train/theoretical_loss": 3.574500192615557, + "objective/train/tokens_used": 1268813280, + "theoretical_loss": 3.574500192615557, + "tokens_seen": 1248353280 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003140220661985958, + "loss": 2.7946, + "theoretical_loss": 3.574500192615557, + "tokens_seen": 1248353280 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031401203610832495, + "loss": 2.8455, + "theoretical_loss": 3.5744830595110146, + "tokens_seen": 1248418816 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003140020060180542, + "loss": 2.9084, + "theoretical_loss": 3.5744659275576733, + "tokens_seen": 1248484352 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003139919759277833, + "loss": 2.7326, + "theoretical_loss": 3.5744487967553944, + "tokens_seen": 1248549888 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031398194583751255, + "loss": 2.8466, + "theoretical_loss": 3.574431667104041, + "tokens_seen": 1248615424 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031397191574724173, + "loss": 2.8452, + "theoretical_loss": 3.574414538603475, + "tokens_seen": 1248680960 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003139618856569709, + "loss": 2.8754, + "theoretical_loss": 3.574397411253559, + "tokens_seen": 1248746496 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003139518555667001, + "loss": 2.7456, + "theoretical_loss": 3.574380285054155, + "tokens_seen": 1248812032 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003139418254764293, + "loss": 2.737, + "theoretical_loss": 3.5743631600051256, + "tokens_seen": 1248877568 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031393179538615845, + "loss": 2.8843, + "theoretical_loss": 3.5743460361063333, + "tokens_seen": 1248943104 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003139217652958877, + "loss": 2.8891, + "theoretical_loss": 3.5743289133576397, + "tokens_seen": 1249008640 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003139117352056168, + "loss": 2.8274, + "theoretical_loss": 3.574311791758908, + "tokens_seen": 1249074176 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031390170511534605, + "loss": 2.8396, + "theoretical_loss": 3.5742946713100006, + "tokens_seen": 1249139712 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031389167502507523, + "loss": 2.8056, + "theoretical_loss": 3.5742775520107797, + "tokens_seen": 1249205248 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003138816449348044, + "loss": 2.7776, + "theoretical_loss": 3.5742604338611077, + "tokens_seen": 1249270784 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003138716148445336, + "loss": 2.7601, + "theoretical_loss": 3.574243316860847, + "tokens_seen": 1249336320 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003138615847542628, + "loss": 2.8773, + "theoretical_loss": 3.5742262010098607, + "tokens_seen": 1249401856 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031385155466399196, + "loss": 2.916, + "theoretical_loss": 3.5742090863080107, + "tokens_seen": 1249467392 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003138415245737212, + "loss": 2.8374, + "theoretical_loss": 3.57419197275516, + "tokens_seen": 1249532928 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003138314944834503, + "loss": 2.7965, + "theoretical_loss": 3.5741748603511714, + "tokens_seen": 1249598464 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031382146439317955, + "loss": 2.8798, + "theoretical_loss": 3.574157749095907, + "tokens_seen": 1249664000 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031381143430290873, + "loss": 2.8782, + "theoretical_loss": 3.57414063898923, + "tokens_seen": 1249729536 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003138014042126379, + "loss": 2.7909, + "theoretical_loss": 3.5741235300310024, + "tokens_seen": 1249795072 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031379137412236715, + "loss": 2.7418, + "theoretical_loss": 3.574106422221088, + "tokens_seen": 1249860608 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003137813440320963, + "loss": 2.7937, + "theoretical_loss": 3.574089315559348, + "tokens_seen": 1249926144 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2966191, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.816481113433838, + "objective/train/theoretical_loss": 3.574072210045646, + "objective/train/tokens_used": 1270451680, + "theoretical_loss": 3.574072210045646, + "tokens_seen": 1249991680 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003137713139418255, + "loss": 2.7361, + "theoretical_loss": 3.574072210045646, + "tokens_seen": 1249991680 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003137612838515547, + "loss": 2.8292, + "theoretical_loss": 3.574055105679845, + "tokens_seen": 1250057216 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003137512537612839, + "loss": 2.62, + "theoretical_loss": 3.5740380024618075, + "tokens_seen": 1250122752 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031374122367101306, + "loss": 2.6362, + "theoretical_loss": 3.574020900391396, + "tokens_seen": 1250188288 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031373119358074224, + "loss": 2.822, + "theoretical_loss": 3.574003799468474, + "tokens_seen": 1250253824 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003137211634904714, + "loss": 2.8563, + "theoretical_loss": 3.573986699692904, + "tokens_seen": 1250319360 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031371113340020065, + "loss": 2.851, + "theoretical_loss": 3.573969601064549, + "tokens_seen": 1250384896 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003137011033099298, + "loss": 2.8536, + "theoretical_loss": 3.5739525035832718, + "tokens_seen": 1250450432 + }, + { + "epoch": 15.01, + "learning_rate": 0.000313691073219659, + "loss": 2.7398, + "theoretical_loss": 3.573935407248935, + "tokens_seen": 1250515968 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031368104312938814, + "loss": 2.8687, + "theoretical_loss": 3.5739183120614024, + "tokens_seen": 1250581504 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003136710130391174, + "loss": 2.7887, + "theoretical_loss": 3.573901218020536, + "tokens_seen": 1250647040 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031366098294884656, + "loss": 2.6333, + "theoretical_loss": 3.5738841251261997, + "tokens_seen": 1250712576 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031365095285857574, + "loss": 2.8361, + "theoretical_loss": 3.5738670333782565, + "tokens_seen": 1250778112 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003136409227683049, + "loss": 2.7818, + "theoretical_loss": 3.5738499427765684, + "tokens_seen": 1250843648 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003136308926780341, + "loss": 2.7883, + "theoretical_loss": 3.5738328533209995, + "tokens_seen": 1250909184 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003136208625877633, + "loss": 2.8509, + "theoretical_loss": 3.5738157650114126, + "tokens_seen": 1250974720 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003136108324974925, + "loss": 2.8117, + "theoretical_loss": 3.5737986778476705, + "tokens_seen": 1251040256 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031360080240722165, + "loss": 2.8368, + "theoretical_loss": 3.573781591829637, + "tokens_seen": 1251105792 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003135907723169509, + "loss": 2.9296, + "theoretical_loss": 3.5737645069571746, + "tokens_seen": 1251171328 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031358074222668006, + "loss": 2.8334, + "theoretical_loss": 3.573747423230147, + "tokens_seen": 1251236864 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031357071213640924, + "loss": 2.7435, + "theoretical_loss": 3.5737303406484173, + "tokens_seen": 1251302400 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003135606820461384, + "loss": 2.8188, + "theoretical_loss": 3.5737132592118486, + "tokens_seen": 1251367936 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003135506519558676, + "loss": 2.7808, + "theoretical_loss": 3.5736961789203043, + "tokens_seen": 1251433472 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003135406218655968, + "loss": 2.7278, + "theoretical_loss": 3.5736790997736474, + "tokens_seen": 1251499008 + }, + { + "epoch": 15.01, + "learning_rate": 0.000313530591775326, + "loss": 2.8702, + "theoretical_loss": 3.5736620217717414, + "tokens_seen": 1251564544 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2971446, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.915802478790283, + "objective/train/theoretical_loss": 3.5736449449144496, + "objective/train/tokens_used": 1272090080, + "theoretical_loss": 3.5736449449144496, + "tokens_seen": 1251630080 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031352056168505515, + "loss": 2.7939, + "theoretical_loss": 3.5736449449144496, + "tokens_seen": 1251630080 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003135105315947844, + "loss": 2.7996, + "theoretical_loss": 3.573627869201636, + "tokens_seen": 1251695616 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003135005015045135, + "loss": 2.8185, + "theoretical_loss": 3.573610794633163, + "tokens_seen": 1251761152 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031349047141424275, + "loss": 2.8155, + "theoretical_loss": 3.573593721208894, + "tokens_seen": 1251826688 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031348044132397193, + "loss": 2.8243, + "theoretical_loss": 3.5735766489286935, + "tokens_seen": 1251892224 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003134704112337011, + "loss": 2.8424, + "theoretical_loss": 3.573559577792424, + "tokens_seen": 1251957760 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003134603811434303, + "loss": 2.8609, + "theoretical_loss": 3.5735425077999494, + "tokens_seen": 1252023296 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003134503510531595, + "loss": 2.849, + "theoretical_loss": 3.573525438951133, + "tokens_seen": 1252088832 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031344032096288865, + "loss": 2.7869, + "theoretical_loss": 3.573508371245838, + "tokens_seen": 1252154368 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003134302908726179, + "loss": 2.8367, + "theoretical_loss": 3.5734913046839285, + "tokens_seen": 1252219904 + }, + { + "epoch": 15.01, + "learning_rate": 0.000313420260782347, + "loss": 2.881, + "theoretical_loss": 3.573474239265268, + "tokens_seen": 1252285440 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031341023069207625, + "loss": 2.7291, + "theoretical_loss": 3.5734571749897204, + "tokens_seen": 1252350976 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031340020060180543, + "loss": 2.7483, + "theoretical_loss": 3.5734401118571486, + "tokens_seen": 1252416512 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003133901705115346, + "loss": 2.753, + "theoretical_loss": 3.5734230498674164, + "tokens_seen": 1252482048 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003133801404212638, + "loss": 2.9136, + "theoretical_loss": 3.5734059890203875, + "tokens_seen": 1252547584 + }, + { + "epoch": 15.01, + "learning_rate": 0.000313370110330993, + "loss": 2.9022, + "theoretical_loss": 3.573388929315926, + "tokens_seen": 1252613120 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031336008024072216, + "loss": 2.7284, + "theoretical_loss": 3.5733718707538955, + "tokens_seen": 1252678656 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003133500501504514, + "loss": 2.8223, + "theoretical_loss": 3.5733548133341593, + "tokens_seen": 1252744192 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003133400200601805, + "loss": 2.6999, + "theoretical_loss": 3.5733377570565814, + "tokens_seen": 1252809728 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031332998996990975, + "loss": 2.671, + "theoretical_loss": 3.573320701921025, + "tokens_seen": 1252875264 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003133199598796389, + "loss": 2.7424, + "theoretical_loss": 3.5733036479273554, + "tokens_seen": 1252940800 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003133099297893681, + "loss": 2.8057, + "theoretical_loss": 3.5732865950754347, + "tokens_seen": 1253006336 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003132998996990973, + "loss": 2.7315, + "theoretical_loss": 3.5732695433651287, + "tokens_seen": 1253071872 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003132898696088265, + "loss": 2.7968, + "theoretical_loss": 3.573252492796299, + "tokens_seen": 1253137408 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031327983951855566, + "loss": 2.8262, + "theoretical_loss": 3.573235443368811, + "tokens_seen": 1253202944 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2974309, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7127323150634766, + "objective/train/theoretical_loss": 3.5732183950825283, + "objective/train/tokens_used": 1273728480, + "theoretical_loss": 3.5732183950825283, + "tokens_seen": 1253268480 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003132698094282849, + "loss": 2.7841, + "theoretical_loss": 3.5732183950825283, + "tokens_seen": 1253268480 + }, + { + "epoch": 15.01, + "learning_rate": 0.000313259779338014, + "loss": 2.8351, + "theoretical_loss": 3.5732013479373146, + "tokens_seen": 1253334016 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031324974924774326, + "loss": 2.8684, + "theoretical_loss": 3.5731843019330345, + "tokens_seen": 1253399552 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003132397191574724, + "loss": 2.87, + "theoretical_loss": 3.573167257069551, + "tokens_seen": 1253465088 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003132296890672016, + "loss": 2.7713, + "theoretical_loss": 3.573150213346729, + "tokens_seen": 1253530624 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003132196589769308, + "loss": 2.8625, + "theoretical_loss": 3.5731331707644323, + "tokens_seen": 1253596160 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031320962888666, + "loss": 2.8473, + "theoretical_loss": 3.5731161293225244, + "tokens_seen": 1253661696 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031319959879638916, + "loss": 2.8461, + "theoretical_loss": 3.5730990890208703, + "tokens_seen": 1253727232 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031318956870611834, + "loss": 2.8897, + "theoretical_loss": 3.5730820498593334, + "tokens_seen": 1253792768 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003131795386158475, + "loss": 2.8054, + "theoretical_loss": 3.5730650118377785, + "tokens_seen": 1253858304 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031316950852557676, + "loss": 2.7755, + "theoretical_loss": 3.5730479749560686, + "tokens_seen": 1253923840 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003131594784353059, + "loss": 2.7589, + "theoretical_loss": 3.5730309392140693, + "tokens_seen": 1253989376 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003131494483450351, + "loss": 2.7378, + "theoretical_loss": 3.5730139046116434, + "tokens_seen": 1254054912 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031313941825476425, + "loss": 2.7212, + "theoretical_loss": 3.572996871148656, + "tokens_seen": 1254120448 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003131293881644935, + "loss": 2.6918, + "theoretical_loss": 3.5729798388249714, + "tokens_seen": 1254185984 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031311935807422267, + "loss": 2.8249, + "theoretical_loss": 3.5729628076404536, + "tokens_seen": 1254251520 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031310932798395185, + "loss": 2.799, + "theoretical_loss": 3.5729457775949673, + "tokens_seen": 1254317056 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031309929789368103, + "loss": 2.724, + "theoretical_loss": 3.5729287486883763, + "tokens_seen": 1254382592 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031308926780341026, + "loss": 2.8058, + "theoretical_loss": 3.5729117209205445, + "tokens_seen": 1254448128 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003130792377131394, + "loss": 2.81, + "theoretical_loss": 3.572894694291337, + "tokens_seen": 1254513664 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003130692076228686, + "loss": 2.8226, + "theoretical_loss": 3.5728776688006185, + "tokens_seen": 1254579200 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003130591775325978, + "loss": 2.8517, + "theoretical_loss": 3.572860644448253, + "tokens_seen": 1254644736 + }, + { + "epoch": 15.01, + "learning_rate": 0.000313049147442327, + "loss": 2.7818, + "theoretical_loss": 3.5728436212341044, + "tokens_seen": 1254710272 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003130391173520562, + "loss": 2.81, + "theoretical_loss": 3.572826599158038, + "tokens_seen": 1254775808 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031302908726178535, + "loss": 2.7396, + "theoretical_loss": 3.5728095782199176, + "tokens_seen": 1254841344 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2979084, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8463714122772217, + "objective/train/theoretical_loss": 3.572792558419608, + "objective/train/tokens_used": 1275366880, + "theoretical_loss": 3.572792558419608, + "tokens_seen": 1254906880 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003130190571715146, + "loss": 2.8706, + "theoretical_loss": 3.572792558419608, + "tokens_seen": 1254906880 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003130090270812437, + "loss": 2.795, + "theoretical_loss": 3.5727755397569743, + "tokens_seen": 1254972416 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031299899699097295, + "loss": 2.8086, + "theoretical_loss": 3.57275852223188, + "tokens_seen": 1255037952 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031298896690070213, + "loss": 2.7317, + "theoretical_loss": 3.5727415058441903, + "tokens_seen": 1255103488 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003129789368104313, + "loss": 2.83, + "theoretical_loss": 3.572724490593769, + "tokens_seen": 1255169024 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003129689067201605, + "loss": 2.8007, + "theoretical_loss": 3.5727074764804825, + "tokens_seen": 1255234560 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003129588766298897, + "loss": 2.8443, + "theoretical_loss": 3.572690463504194, + "tokens_seen": 1255300096 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031294884653961885, + "loss": 2.7866, + "theoretical_loss": 3.572673451664768, + "tokens_seen": 1255365632 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003129388164493481, + "loss": 2.729, + "theoretical_loss": 3.57265644096207, + "tokens_seen": 1255431168 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003129287863590772, + "loss": 2.7719, + "theoretical_loss": 3.5726394313959644, + "tokens_seen": 1255496704 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031291875626880645, + "loss": 2.7998, + "theoretical_loss": 3.572622422966316, + "tokens_seen": 1255562240 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031290872617853563, + "loss": 2.93, + "theoretical_loss": 3.572605415672989, + "tokens_seen": 1255627776 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003128986960882648, + "loss": 2.8598, + "theoretical_loss": 3.5725884095158493, + "tokens_seen": 1255693312 + }, + { + "epoch": 15.01, + "learning_rate": 0.000312888665997994, + "loss": 2.8107, + "theoretical_loss": 3.5725714044947603, + "tokens_seen": 1255758848 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003128786359077232, + "loss": 2.7912, + "theoretical_loss": 3.5725544006095884, + "tokens_seen": 1255824384 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031286860581745236, + "loss": 2.8545, + "theoretical_loss": 3.572537397860197, + "tokens_seen": 1255889920 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003128585757271816, + "loss": 2.8673, + "theoretical_loss": 3.5725203962464516, + "tokens_seen": 1255955456 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003128485456369107, + "loss": 2.7233, + "theoretical_loss": 3.5725033957682175, + "tokens_seen": 1256020992 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031283851554663995, + "loss": 2.8192, + "theoretical_loss": 3.572486396425359, + "tokens_seen": 1256086528 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003128284854563691, + "loss": 2.7673, + "theoretical_loss": 3.5724693982177413, + "tokens_seen": 1256152064 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003128184553660983, + "loss": 2.8335, + "theoretical_loss": 3.572452401145229, + "tokens_seen": 1256217600 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003128084252758275, + "loss": 2.8171, + "theoretical_loss": 3.5724354052076883, + "tokens_seen": 1256283136 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003127983951855567, + "loss": 2.8473, + "theoretical_loss": 3.5724184104049828, + "tokens_seen": 1256348672 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031278836509528586, + "loss": 2.7683, + "theoretical_loss": 3.5724014167369775, + "tokens_seen": 1256414208 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003127783350050151, + "loss": 2.7716, + "theoretical_loss": 3.5723844242035385, + "tokens_seen": 1256479744 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2982091, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7662105560302734, + "objective/train/theoretical_loss": 3.5723674328045307, + "objective/train/tokens_used": 1277005280, + "theoretical_loss": 3.5723674328045307, + "tokens_seen": 1256545280 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003127683049147442, + "loss": 2.6642, + "theoretical_loss": 3.5723674328045307, + "tokens_seen": 1256545280 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031275827482447346, + "loss": 2.8606, + "theoretical_loss": 3.5723504425398183, + "tokens_seen": 1256610816 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003127482447342026, + "loss": 2.8086, + "theoretical_loss": 3.5723334534092674, + "tokens_seen": 1256676352 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003127382146439318, + "loss": 2.7846, + "theoretical_loss": 3.572316465412743, + "tokens_seen": 1256741888 + }, + { + "epoch": 15.01, + "learning_rate": 0.000312728184553661, + "loss": 2.756, + "theoretical_loss": 3.57229947855011, + "tokens_seen": 1256807424 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003127181544633902, + "loss": 2.8712, + "theoretical_loss": 3.572282492821233, + "tokens_seen": 1256872960 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031270812437311936, + "loss": 2.7875, + "theoretical_loss": 3.5722655082259784, + "tokens_seen": 1256938496 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031269809428284854, + "loss": 2.7837, + "theoretical_loss": 3.5722485247642104, + "tokens_seen": 1257004032 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003126880641925777, + "loss": 2.8191, + "theoretical_loss": 3.5722315424357953, + "tokens_seen": 1257069568 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031267803410230696, + "loss": 2.783, + "theoretical_loss": 3.5722145612405978, + "tokens_seen": 1257135104 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003126680040120361, + "loss": 2.8603, + "theoretical_loss": 3.572197581178483, + "tokens_seen": 1257200640 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003126579739217653, + "loss": 2.8155, + "theoretical_loss": 3.5721806022493166, + "tokens_seen": 1257266176 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031264794383149445, + "loss": 2.7044, + "theoretical_loss": 3.5721636244529638, + "tokens_seen": 1257331712 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003126379137412237, + "loss": 2.8748, + "theoretical_loss": 3.5721466477892903, + "tokens_seen": 1257397248 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031262788365095287, + "loss": 2.8507, + "theoretical_loss": 3.572129672258161, + "tokens_seen": 1257462784 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031261785356068205, + "loss": 2.8318, + "theoretical_loss": 3.5721126978594415, + "tokens_seen": 1257528320 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031260782347041123, + "loss": 2.7634, + "theoretical_loss": 3.5720957245929976, + "tokens_seen": 1257593856 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031259779338014046, + "loss": 2.7874, + "theoretical_loss": 3.5720787524586948, + "tokens_seen": 1257659392 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003125877632898696, + "loss": 2.8222, + "theoretical_loss": 3.5720617814563975, + "tokens_seen": 1257724928 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003125777331995988, + "loss": 2.7132, + "theoretical_loss": 3.572044811585972, + "tokens_seen": 1257790464 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031256770310932795, + "loss": 2.8297, + "theoretical_loss": 3.5720278428472847, + "tokens_seen": 1257856000 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003125576730190572, + "loss": 2.8562, + "theoretical_loss": 3.5720108752401996, + "tokens_seen": 1257921536 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031254764292878637, + "loss": 2.766, + "theoretical_loss": 3.571993908764583, + "tokens_seen": 1257987072 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031253761283851555, + "loss": 2.8768, + "theoretical_loss": 3.571976943420301, + "tokens_seen": 1258052608 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031252758274824473, + "loss": 2.8165, + "theoretical_loss": 3.571959979207218, + "tokens_seen": 1258118144 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2985822, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8803975582122803, + "objective/train/theoretical_loss": 3.571943016125201, + "objective/train/tokens_used": 1278643680, + "theoretical_loss": 3.571943016125201, + "tokens_seen": 1258183680 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003125175526579739, + "loss": 2.8338, + "theoretical_loss": 3.571943016125201, + "tokens_seen": 1258183680 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003125075225677031, + "loss": 2.6855, + "theoretical_loss": 3.571926054174115, + "tokens_seen": 1258249216 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031249749247743233, + "loss": 2.7998, + "theoretical_loss": 3.571909093353826, + "tokens_seen": 1258314752 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031248746238716146, + "loss": 2.827, + "theoretical_loss": 3.571892133664199, + "tokens_seen": 1258380288 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003124774322968907, + "loss": 2.7701, + "theoretical_loss": 3.5718751751051006, + "tokens_seen": 1258445824 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003124674022066198, + "loss": 2.7788, + "theoretical_loss": 3.5718582176763958, + "tokens_seen": 1258511360 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031245737211634905, + "loss": 2.7877, + "theoretical_loss": 3.5718412613779513, + "tokens_seen": 1258576896 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031244734202607823, + "loss": 2.903, + "theoretical_loss": 3.571824306209632, + "tokens_seen": 1258642432 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003124373119358074, + "loss": 2.8469, + "theoretical_loss": 3.5718073521713043, + "tokens_seen": 1258707968 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003124272818455366, + "loss": 2.7856, + "theoretical_loss": 3.5717903992628344, + "tokens_seen": 1258773504 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031241725175526583, + "loss": 2.7923, + "theoretical_loss": 3.571773447484087, + "tokens_seen": 1258839040 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031240722166499496, + "loss": 2.821, + "theoretical_loss": 3.5717564968349294, + "tokens_seen": 1258904576 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003123971915747242, + "loss": 2.8824, + "theoretical_loss": 3.571739547315226, + "tokens_seen": 1258970112 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003123871614844533, + "loss": 2.857, + "theoretical_loss": 3.5717225989248442, + "tokens_seen": 1259035648 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031237713139418256, + "loss": 2.863, + "theoretical_loss": 3.571705651663649, + "tokens_seen": 1259101184 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031236710130391174, + "loss": 2.6658, + "theoretical_loss": 3.5716887055315074, + "tokens_seen": 1259166720 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003123570712136409, + "loss": 2.8746, + "theoretical_loss": 3.5716717605282846, + "tokens_seen": 1259232256 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003123470411233701, + "loss": 2.879, + "theoretical_loss": 3.5716548166538464, + "tokens_seen": 1259297792 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003123370110330993, + "loss": 2.7941, + "theoretical_loss": 3.57163787390806, + "tokens_seen": 1259363328 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031232698094282846, + "loss": 2.8752, + "theoretical_loss": 3.5716209322907906, + "tokens_seen": 1259428864 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003123169508525577, + "loss": 2.7574, + "theoretical_loss": 3.5716039918019042, + "tokens_seen": 1259494400 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003123069207622869, + "loss": 2.8171, + "theoretical_loss": 3.5715870524412674, + "tokens_seen": 1259559936 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031229689067201606, + "loss": 2.8168, + "theoretical_loss": 3.571570114208747, + "tokens_seen": 1259625472 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003122868605817453, + "loss": 2.8086, + "theoretical_loss": 3.571553177104207, + "tokens_seen": 1259691008 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003122768304914744, + "loss": 2.7416, + "theoretical_loss": 3.5715362411275158, + "tokens_seen": 1259756544 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2988876, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6132752895355225, + "objective/train/theoretical_loss": 3.571519306278539, + "objective/train/tokens_used": 1280282080, + "theoretical_loss": 3.571519306278539, + "tokens_seen": 1259822080 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031226680040120366, + "loss": 2.6696, + "theoretical_loss": 3.571519306278539, + "tokens_seen": 1259822080 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003122567703109328, + "loss": 2.778, + "theoretical_loss": 3.571502372557142, + "tokens_seen": 1259887616 + }, + { + "epoch": 15.01, + "learning_rate": 0.000312246740220662, + "loss": 2.8669, + "theoretical_loss": 3.571485439963193, + "tokens_seen": 1259953152 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003122367101303912, + "loss": 2.7516, + "theoretical_loss": 3.5714685084965563, + "tokens_seen": 1260018688 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003122266800401204, + "loss": 2.8622, + "theoretical_loss": 3.571451578157099, + "tokens_seen": 1260084224 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031221664994984956, + "loss": 2.7867, + "theoretical_loss": 3.5714346489446873, + "tokens_seen": 1260149760 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031220661985957874, + "loss": 2.882, + "theoretical_loss": 3.5714177208591877, + "tokens_seen": 1260215296 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003121965897693079, + "loss": 2.8271, + "theoretical_loss": 3.571400793900467, + "tokens_seen": 1260280832 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031218655967903716, + "loss": 2.8344, + "theoretical_loss": 3.5713838680683905, + "tokens_seen": 1260346368 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003121765295887663, + "loss": 2.8507, + "theoretical_loss": 3.5713669433628255, + "tokens_seen": 1260411904 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003121664994984955, + "loss": 2.8388, + "theoretical_loss": 3.5713500197836385, + "tokens_seen": 1260477440 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031215646940822465, + "loss": 2.8346, + "theoretical_loss": 3.5713330973306956, + "tokens_seen": 1260542976 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003121464393179539, + "loss": 2.9227, + "theoretical_loss": 3.571316176003864, + "tokens_seen": 1260608512 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031213640922768307, + "loss": 2.8191, + "theoretical_loss": 3.5712992558030088, + "tokens_seen": 1260674048 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031212637913741225, + "loss": 2.5767, + "theoretical_loss": 3.571282336727998, + "tokens_seen": 1260739584 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031211634904714143, + "loss": 2.8251, + "theoretical_loss": 3.5712654187786974, + "tokens_seen": 1260805120 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031210631895687066, + "loss": 2.7279, + "theoretical_loss": 3.5712485019549742, + "tokens_seen": 1260870656 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003120962888665998, + "loss": 2.7673, + "theoretical_loss": 3.5712315862566943, + "tokens_seen": 1260936192 + }, + { + "epoch": 15.01, + "learning_rate": 0.000312086258776329, + "loss": 2.7015, + "theoretical_loss": 3.571214671683725, + "tokens_seen": 1261001728 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031207622868605815, + "loss": 2.785, + "theoretical_loss": 3.571197758235932, + "tokens_seen": 1261067264 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003120661985957874, + "loss": 2.7397, + "theoretical_loss": 3.571180845913183, + "tokens_seen": 1261132800 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031205616850551657, + "loss": 2.8618, + "theoretical_loss": 3.571163934715344, + "tokens_seen": 1261198336 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031204613841524575, + "loss": 2.8301, + "theoretical_loss": 3.571147024642282, + "tokens_seen": 1261263872 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031203610832497493, + "loss": 2.823, + "theoretical_loss": 3.5711301156938644, + "tokens_seen": 1261329408 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003120260782347041, + "loss": 2.8744, + "theoretical_loss": 3.571113207869957, + "tokens_seen": 1261394944 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2993644, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5732975006103516, + "objective/train/theoretical_loss": 3.5710963011704266, + "objective/train/tokens_used": 1281920480, + "theoretical_loss": 3.5710963011704266, + "tokens_seen": 1261460480 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003120160481444333, + "loss": 2.6926, + "theoretical_loss": 3.5710963011704266, + "tokens_seen": 1261460480 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031200601805416253, + "loss": 2.8532, + "theoretical_loss": 3.5710793955951408, + "tokens_seen": 1261526016 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031199598796389166, + "loss": 2.8174, + "theoretical_loss": 3.571062491143966, + "tokens_seen": 1261591552 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003119859578736209, + "loss": 2.8453, + "theoretical_loss": 3.5710455878167684, + "tokens_seen": 1261657088 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031197592778335, + "loss": 2.7872, + "theoretical_loss": 3.571028685613417, + "tokens_seen": 1261722624 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031196589769307925, + "loss": 2.8131, + "theoretical_loss": 3.571011784533776, + "tokens_seen": 1261788160 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031195586760280843, + "loss": 2.7958, + "theoretical_loss": 3.570994884577714, + "tokens_seen": 1261853696 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003119458375125376, + "loss": 2.7453, + "theoretical_loss": 3.5709779857450976, + "tokens_seen": 1261919232 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003119358074222668, + "loss": 2.8963, + "theoretical_loss": 3.570961088035794, + "tokens_seen": 1261984768 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031192577733199603, + "loss": 2.7855, + "theoretical_loss": 3.5709441914496693, + "tokens_seen": 1262050304 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031191574724172516, + "loss": 2.8469, + "theoretical_loss": 3.570927295986592, + "tokens_seen": 1262115840 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003119057171514544, + "loss": 2.7694, + "theoretical_loss": 3.5709104016464277, + "tokens_seen": 1262181376 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003118956870611835, + "loss": 2.8003, + "theoretical_loss": 3.5708935084290445, + "tokens_seen": 1262246912 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031188565697091276, + "loss": 2.7923, + "theoretical_loss": 3.5708766163343086, + "tokens_seen": 1262312448 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031187562688064194, + "loss": 2.8674, + "theoretical_loss": 3.570859725362088, + "tokens_seen": 1262377984 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003118655967903711, + "loss": 2.8747, + "theoretical_loss": 3.5708428355122495, + "tokens_seen": 1262443520 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003118555667001003, + "loss": 2.7327, + "theoretical_loss": 3.5708259467846597, + "tokens_seen": 1262509056 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003118455366098295, + "loss": 2.8378, + "theoretical_loss": 3.5708090591791866, + "tokens_seen": 1262574592 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031183550651955866, + "loss": 2.839, + "theoretical_loss": 3.570792172695697, + "tokens_seen": 1262640128 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003118254764292879, + "loss": 2.7632, + "theoretical_loss": 3.570775287334058, + "tokens_seen": 1262705664 + }, + { + "epoch": 15.01, + "learning_rate": 0.000311815446339017, + "loss": 2.8139, + "theoretical_loss": 3.5707584030941377, + "tokens_seen": 1262771200 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031180541624874626, + "loss": 2.815, + "theoretical_loss": 3.5707415199758024, + "tokens_seen": 1262836736 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003117953861584754, + "loss": 2.717, + "theoretical_loss": 3.5707246379789197, + "tokens_seen": 1262902272 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003117853560682046, + "loss": 2.8413, + "theoretical_loss": 3.570707757103357, + "tokens_seen": 1262967808 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003117753259779338, + "loss": 2.7522, + "theoretical_loss": 3.5706908773489814, + "tokens_seen": 1263033344 + }, + { + "epoch": 15.01, + "objective/train/docs_used": 2996590, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.951831340789795, + "objective/train/theoretical_loss": 3.5706739987156606, + "objective/train/tokens_used": 1283558880, + "theoretical_loss": 3.5706739987156606, + "tokens_seen": 1263098880 + }, + { + "epoch": 15.01, + "learning_rate": 0.000311765295887663, + "loss": 2.8858, + "theoretical_loss": 3.5706739987156606, + "tokens_seen": 1263098880 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031175526579739217, + "loss": 2.6841, + "theoretical_loss": 3.5706571212032623, + "tokens_seen": 1263164416 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003117452357071214, + "loss": 2.9003, + "theoretical_loss": 3.5706402448116528, + "tokens_seen": 1263229952 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031173520561685053, + "loss": 2.8699, + "theoretical_loss": 3.5706233695407006, + "tokens_seen": 1263295488 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031172517552657976, + "loss": 2.795, + "theoretical_loss": 3.570606495390272, + "tokens_seen": 1263361024 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003117151454363089, + "loss": 2.9396, + "theoretical_loss": 3.570589622360236, + "tokens_seen": 1263426560 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003117051153460381, + "loss": 2.8163, + "theoretical_loss": 3.5705727504504585, + "tokens_seen": 1263492096 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003116950852557673, + "loss": 2.857, + "theoretical_loss": 3.570555879660809, + "tokens_seen": 1263557632 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003116850551654965, + "loss": 2.7437, + "theoretical_loss": 3.570539009991153, + "tokens_seen": 1263623168 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031167502507522567, + "loss": 2.9009, + "theoretical_loss": 3.5705221414413586, + "tokens_seen": 1263688704 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031166499498495485, + "loss": 2.7827, + "theoretical_loss": 3.5705052740112944, + "tokens_seen": 1263754240 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031165496489468403, + "loss": 2.9077, + "theoretical_loss": 3.5704884077008274, + "tokens_seen": 1263819776 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031164493480441327, + "loss": 2.8159, + "theoretical_loss": 3.570471542509825, + "tokens_seen": 1263885312 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003116349047141424, + "loss": 2.8057, + "theoretical_loss": 3.5704546784381552, + "tokens_seen": 1263950848 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031162487462387163, + "loss": 2.8742, + "theoretical_loss": 3.5704378154856853, + "tokens_seen": 1264016384 + }, + { + "epoch": 15.01, + "learning_rate": 0.0003116148445336008, + "loss": 2.7067, + "theoretical_loss": 3.570420953652283, + "tokens_seen": 1264081920 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031160481444333, + "loss": 2.7778, + "theoretical_loss": 3.5704040929378165, + "tokens_seen": 1264147456 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031159478435305917, + "loss": 2.8042, + "theoretical_loss": 3.570387233342153, + "tokens_seen": 1264212992 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031158475426278835, + "loss": 2.8282, + "theoretical_loss": 3.5703703748651607, + "tokens_seen": 1264278528 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031157472417251753, + "loss": 2.8658, + "theoretical_loss": 3.5703535175067076, + "tokens_seen": 1264344064 + }, + { + "epoch": 15.01, + "learning_rate": 0.00031156469408224677, + "loss": 2.8394, + "theoretical_loss": 3.5703366612666607, + "tokens_seen": 1264409600 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031155466399197595, + "loss": 2.7995, + "theoretical_loss": 3.5703198061448886, + "tokens_seen": 1264475136 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031154463390170513, + "loss": 2.7468, + "theoretical_loss": 3.5703029521412586, + "tokens_seen": 1264540672 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003115346038114343, + "loss": 2.6709, + "theoretical_loss": 3.5702860992556387, + "tokens_seen": 1264606208 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003115245737211635, + "loss": 2.8452, + "theoretical_loss": 3.5702692474878974, + "tokens_seen": 1264671744 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3000502, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.893022060394287, + "objective/train/theoretical_loss": 3.570252396837902, + "objective/train/tokens_used": 1285197280, + "theoretical_loss": 3.570252396837902, + "tokens_seen": 1264737280 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031151454363089273, + "loss": 2.79, + "theoretical_loss": 3.570252396837902, + "tokens_seen": 1264737280 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031150451354062186, + "loss": 2.8776, + "theoretical_loss": 3.5702355473055203, + "tokens_seen": 1264802816 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003114944834503511, + "loss": 2.7401, + "theoretical_loss": 3.5702186988906206, + "tokens_seen": 1264868352 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003114844533600802, + "loss": 2.8347, + "theoretical_loss": 3.5702018515930716, + "tokens_seen": 1264933888 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031147442326980945, + "loss": 2.8161, + "theoretical_loss": 3.57018500541274, + "tokens_seen": 1264999424 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031146439317953863, + "loss": 2.6906, + "theoretical_loss": 3.5701681603494944, + "tokens_seen": 1265064960 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003114543630892678, + "loss": 2.7818, + "theoretical_loss": 3.5701513164032033, + "tokens_seen": 1265130496 + }, + { + "epoch": 15.02, + "learning_rate": 0.000311444332998997, + "loss": 2.8473, + "theoretical_loss": 3.5701344735737335, + "tokens_seen": 1265196032 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031143430290872623, + "loss": 2.7496, + "theoretical_loss": 3.5701176318609544, + "tokens_seen": 1265261568 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031142427281845536, + "loss": 2.8754, + "theoretical_loss": 3.570100791264734, + "tokens_seen": 1265327104 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003114142427281846, + "loss": 2.7797, + "theoretical_loss": 3.57008395178494, + "tokens_seen": 1265392640 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003114042126379137, + "loss": 2.751, + "theoretical_loss": 3.5700671134214406, + "tokens_seen": 1265458176 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031139418254764296, + "loss": 2.6826, + "theoretical_loss": 3.570050276174104, + "tokens_seen": 1265523712 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031138415245737214, + "loss": 2.851, + "theoretical_loss": 3.570033440042799, + "tokens_seen": 1265589248 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003113741223671013, + "loss": 2.8047, + "theoretical_loss": 3.570016605027393, + "tokens_seen": 1265654784 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003113640922768305, + "loss": 2.8468, + "theoretical_loss": 3.5699997711277547, + "tokens_seen": 1265720320 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003113540621865597, + "loss": 2.8433, + "theoretical_loss": 3.5699829383437525, + "tokens_seen": 1265785856 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031134403209628886, + "loss": 2.8105, + "theoretical_loss": 3.5699661066752544, + "tokens_seen": 1265851392 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003113340020060181, + "loss": 2.8636, + "theoretical_loss": 3.569949276122129, + "tokens_seen": 1265916928 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003113239719157472, + "loss": 2.8504, + "theoretical_loss": 3.569932446684244, + "tokens_seen": 1265982464 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031131394182547646, + "loss": 2.8258, + "theoretical_loss": 3.5699156183614686, + "tokens_seen": 1266048000 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003113039117352056, + "loss": 2.7929, + "theoretical_loss": 3.5698987911536713, + "tokens_seen": 1266113536 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003112938816449348, + "loss": 2.9018, + "theoretical_loss": 3.56988196506072, + "tokens_seen": 1266179072 + }, + { + "epoch": 15.02, + "learning_rate": 0.000311283851554664, + "loss": 2.8517, + "theoretical_loss": 3.5698651400824826, + "tokens_seen": 1266244608 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003112738214643932, + "loss": 2.8026, + "theoretical_loss": 3.569848316218829, + "tokens_seen": 1266310144 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3005011, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.953618288040161, + "objective/train/theoretical_loss": 3.5698314934696262, + "objective/train/tokens_used": 1286835680, + "theoretical_loss": 3.5698314934696262, + "tokens_seen": 1266375680 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031126379137412237, + "loss": 2.8292, + "theoretical_loss": 3.5698314934696262, + "tokens_seen": 1266375680 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003112537612838516, + "loss": 2.7038, + "theoretical_loss": 3.5698146718347434, + "tokens_seen": 1266441216 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031124373119358073, + "loss": 2.8196, + "theoretical_loss": 3.56979785131405, + "tokens_seen": 1266506752 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031123370110330996, + "loss": 2.8095, + "theoretical_loss": 3.5697810319074126, + "tokens_seen": 1266572288 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003112236710130391, + "loss": 2.9252, + "theoretical_loss": 3.5697642136147016, + "tokens_seen": 1266637824 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003112136409227683, + "loss": 2.8401, + "theoretical_loss": 3.5697473964357846, + "tokens_seen": 1266703360 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003112036108324975, + "loss": 2.7876, + "theoretical_loss": 3.5697305803705306, + "tokens_seen": 1266768896 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003111935807422267, + "loss": 2.7233, + "theoretical_loss": 3.569713765418808, + "tokens_seen": 1266834432 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031118355065195587, + "loss": 2.8197, + "theoretical_loss": 3.5696969515804855, + "tokens_seen": 1266899968 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031117352056168505, + "loss": 2.7684, + "theoretical_loss": 3.569680138855432, + "tokens_seen": 1266965504 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031116349047141423, + "loss": 2.8189, + "theoretical_loss": 3.569663327243516, + "tokens_seen": 1267031040 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031115346038114347, + "loss": 2.8205, + "theoretical_loss": 3.5696465167446063, + "tokens_seen": 1267096576 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003111434302908726, + "loss": 2.8051, + "theoretical_loss": 3.5696297073585717, + "tokens_seen": 1267162112 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031113340020060183, + "loss": 2.8388, + "theoretical_loss": 3.569612899085281, + "tokens_seen": 1267227648 + }, + { + "epoch": 15.02, + "learning_rate": 0.000311123370110331, + "loss": 2.8305, + "theoretical_loss": 3.569596091924603, + "tokens_seen": 1267293184 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003111133400200602, + "loss": 2.8397, + "theoretical_loss": 3.569579285876406, + "tokens_seen": 1267358720 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031110330992978937, + "loss": 2.7949, + "theoretical_loss": 3.5695624809405597, + "tokens_seen": 1267424256 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031109327983951855, + "loss": 2.8006, + "theoretical_loss": 3.569545677116933, + "tokens_seen": 1267489792 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031108324974924773, + "loss": 2.8298, + "theoretical_loss": 3.5695288744053935, + "tokens_seen": 1267555328 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031107321965897697, + "loss": 2.8469, + "theoretical_loss": 3.5695120728058116, + "tokens_seen": 1267620864 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003110631895687061, + "loss": 2.7207, + "theoretical_loss": 3.569495272318055, + "tokens_seen": 1267686400 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031105315947843533, + "loss": 2.7428, + "theoretical_loss": 3.5694784729419933, + "tokens_seen": 1267751936 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031104312938816446, + "loss": 2.8594, + "theoretical_loss": 3.5694616746774956, + "tokens_seen": 1267817472 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003110330992978937, + "loss": 2.8146, + "theoretical_loss": 3.569444877524431, + "tokens_seen": 1267883008 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003110230692076229, + "loss": 2.7459, + "theoretical_loss": 3.569428081482668, + "tokens_seen": 1267948544 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3008279, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.817399501800537, + "objective/train/theoretical_loss": 3.569411286552076, + "objective/train/tokens_used": 1288474080, + "theoretical_loss": 3.569411286552076, + "tokens_seen": 1268014080 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031101303911735206, + "loss": 2.7471, + "theoretical_loss": 3.569411286552076, + "tokens_seen": 1268014080 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031100300902708124, + "loss": 2.8132, + "theoretical_loss": 3.569394492732524, + "tokens_seen": 1268079616 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003109929789368104, + "loss": 2.8653, + "theoretical_loss": 3.569377700023881, + "tokens_seen": 1268145152 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003109829488465396, + "loss": 2.8009, + "theoretical_loss": 3.569360908426016, + "tokens_seen": 1268210688 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031097291875626883, + "loss": 2.8059, + "theoretical_loss": 3.569344117938798, + "tokens_seen": 1268276224 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031096288866599796, + "loss": 2.843, + "theoretical_loss": 3.5693273285620966, + "tokens_seen": 1268341760 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003109528585757272, + "loss": 2.7542, + "theoretical_loss": 3.5693105402957808, + "tokens_seen": 1268407296 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003109428284854564, + "loss": 2.8372, + "theoretical_loss": 3.56929375313972, + "tokens_seen": 1268472832 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031093279839518556, + "loss": 2.7773, + "theoretical_loss": 3.569276967093783, + "tokens_seen": 1268538368 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031092276830491474, + "loss": 2.7341, + "theoretical_loss": 3.56926018215784, + "tokens_seen": 1268603904 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003109127382146439, + "loss": 2.844, + "theoretical_loss": 3.5692433983317584, + "tokens_seen": 1268669440 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003109027081243731, + "loss": 2.9065, + "theoretical_loss": 3.569226615615409, + "tokens_seen": 1268734976 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031089267803410234, + "loss": 2.7033, + "theoretical_loss": 3.5692098340086607, + "tokens_seen": 1268800512 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031088264794383147, + "loss": 2.8005, + "theoretical_loss": 3.569193053511383, + "tokens_seen": 1268866048 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003108726178535607, + "loss": 2.7841, + "theoretical_loss": 3.569176274123445, + "tokens_seen": 1268931584 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031086258776328983, + "loss": 2.8106, + "theoretical_loss": 3.5691594958447164, + "tokens_seen": 1268997120 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031085255767301906, + "loss": 2.7452, + "theoretical_loss": 3.5691427186750664, + "tokens_seen": 1269062656 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031084252758274824, + "loss": 2.8107, + "theoretical_loss": 3.569125942614364, + "tokens_seen": 1269128192 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003108324974924774, + "loss": 2.8893, + "theoretical_loss": 3.569109167662479, + "tokens_seen": 1269193728 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003108224674022066, + "loss": 2.6977, + "theoretical_loss": 3.5690923938192816, + "tokens_seen": 1269259264 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003108124373119358, + "loss": 2.8341, + "theoretical_loss": 3.5690756210846395, + "tokens_seen": 1269324800 + }, + { + "epoch": 15.02, + "learning_rate": 0.000310802407221665, + "loss": 2.8293, + "theoretical_loss": 3.569058849458424, + "tokens_seen": 1269390336 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003107923771313942, + "loss": 2.9119, + "theoretical_loss": 3.569042078940504, + "tokens_seen": 1269455872 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003107823470411234, + "loss": 2.7709, + "theoretical_loss": 3.5690253095307485, + "tokens_seen": 1269521408 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031077231695085257, + "loss": 2.7963, + "theoretical_loss": 3.569008541229028, + "tokens_seen": 1269586944 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3013065, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0154805183410645, + "objective/train/theoretical_loss": 3.5689917740352115, + "objective/train/tokens_used": 1290112480, + "theoretical_loss": 3.5689917740352115, + "tokens_seen": 1269652480 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003107622868605818, + "loss": 2.9172, + "theoretical_loss": 3.5689917740352115, + "tokens_seen": 1269652480 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031075225677031093, + "loss": 2.839, + "theoretical_loss": 3.568975007949169, + "tokens_seen": 1269718016 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031074222668004016, + "loss": 2.7696, + "theoretical_loss": 3.568958242970769, + "tokens_seen": 1269783552 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003107321965897693, + "loss": 2.7999, + "theoretical_loss": 3.568941479099883, + "tokens_seen": 1269849088 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003107221664994985, + "loss": 2.8063, + "theoretical_loss": 3.5689247163363795, + "tokens_seen": 1269914624 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003107121364092277, + "loss": 2.8144, + "theoretical_loss": 3.568907954680128, + "tokens_seen": 1269980160 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003107021063189569, + "loss": 2.8115, + "theoretical_loss": 3.568891194130999, + "tokens_seen": 1270045696 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031069207622868607, + "loss": 2.863, + "theoretical_loss": 3.5688744346888623, + "tokens_seen": 1270111232 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031068204613841525, + "loss": 2.7929, + "theoretical_loss": 3.568857676353587, + "tokens_seen": 1270176768 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031067201604814443, + "loss": 2.7698, + "theoretical_loss": 3.5688409191250434, + "tokens_seen": 1270242304 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031066198595787367, + "loss": 2.7594, + "theoretical_loss": 3.568824163003101, + "tokens_seen": 1270307840 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003106519558676028, + "loss": 2.7912, + "theoretical_loss": 3.568807407987629, + "tokens_seen": 1270373376 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031064192577733203, + "loss": 2.8131, + "theoretical_loss": 3.568790654078499, + "tokens_seen": 1270438912 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003106318956870612, + "loss": 2.7351, + "theoretical_loss": 3.5687739012755797, + "tokens_seen": 1270504448 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003106218655967904, + "loss": 2.7914, + "theoretical_loss": 3.568757149578741, + "tokens_seen": 1270569984 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031061183550651957, + "loss": 2.7528, + "theoretical_loss": 3.568740398987853, + "tokens_seen": 1270635520 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031060180541624875, + "loss": 2.9092, + "theoretical_loss": 3.5687236495027856, + "tokens_seen": 1270701056 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031059177532597793, + "loss": 2.8035, + "theoretical_loss": 3.5687069011234094, + "tokens_seen": 1270766592 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031058174523570717, + "loss": 2.8514, + "theoretical_loss": 3.568690153849593, + "tokens_seen": 1270832128 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003105717151454363, + "loss": 2.8568, + "theoretical_loss": 3.568673407681208, + "tokens_seen": 1270897664 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031056168505516553, + "loss": 2.8648, + "theoretical_loss": 3.5686566626181233, + "tokens_seen": 1270963200 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031055165496489466, + "loss": 2.8249, + "theoretical_loss": 3.5686399186602094, + "tokens_seen": 1271028736 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003105416248746239, + "loss": 2.7963, + "theoretical_loss": 3.5686231758073363, + "tokens_seen": 1271094272 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003105315947843531, + "loss": 2.7911, + "theoretical_loss": 3.568606434059374, + "tokens_seen": 1271159808 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031052156469408226, + "loss": 2.8388, + "theoretical_loss": 3.568589693416193, + "tokens_seen": 1271225344 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3016022, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8300089836120605, + "objective/train/theoretical_loss": 3.568572953877663, + "objective/train/tokens_used": 1291750880, + "theoretical_loss": 3.568572953877663, + "tokens_seen": 1271290880 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031051153460381144, + "loss": 2.868, + "theoretical_loss": 3.568572953877663, + "tokens_seen": 1271290880 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003105015045135406, + "loss": 2.8609, + "theoretical_loss": 3.5685562154436545, + "tokens_seen": 1271356416 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003104914744232698, + "loss": 2.7767, + "theoretical_loss": 3.5685394781140376, + "tokens_seen": 1271421952 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031048144433299904, + "loss": 2.8426, + "theoretical_loss": 3.5685227418886827, + "tokens_seen": 1271487488 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031047141424272816, + "loss": 2.8804, + "theoretical_loss": 3.568506006767459, + "tokens_seen": 1271553024 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003104613841524574, + "loss": 2.8214, + "theoretical_loss": 3.5684892727502384, + "tokens_seen": 1271618560 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003104513540621866, + "loss": 2.8045, + "theoretical_loss": 3.5684725398368897, + "tokens_seen": 1271684096 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031044132397191576, + "loss": 2.8834, + "theoretical_loss": 3.5684558080272843, + "tokens_seen": 1271749632 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031043129388164494, + "loss": 2.8299, + "theoretical_loss": 3.568439077321292, + "tokens_seen": 1271815168 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003104212637913741, + "loss": 2.9039, + "theoretical_loss": 3.568422347718783, + "tokens_seen": 1271880704 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003104112337011033, + "loss": 2.7483, + "theoretical_loss": 3.5684056192196283, + "tokens_seen": 1271946240 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031040120361083254, + "loss": 2.7642, + "theoretical_loss": 3.5683888918236972, + "tokens_seen": 1272011776 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031039117352056167, + "loss": 2.7849, + "theoretical_loss": 3.568372165530861, + "tokens_seen": 1272077312 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003103811434302909, + "loss": 2.8106, + "theoretical_loss": 3.5683554403409903, + "tokens_seen": 1272142848 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031037111334002003, + "loss": 2.8719, + "theoretical_loss": 3.5683387162539546, + "tokens_seen": 1272208384 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031036108324974926, + "loss": 2.7281, + "theoretical_loss": 3.5683219932696257, + "tokens_seen": 1272273920 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031035105315947844, + "loss": 2.729, + "theoretical_loss": 3.5683052713878727, + "tokens_seen": 1272339456 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003103410230692076, + "loss": 2.8252, + "theoretical_loss": 3.5682885506085666, + "tokens_seen": 1272404992 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003103309929789368, + "loss": 2.7992, + "theoretical_loss": 3.5682718309315784, + "tokens_seen": 1272470528 + }, + { + "epoch": 15.02, + "learning_rate": 0.000310320962888666, + "loss": 2.8312, + "theoretical_loss": 3.5682551123567787, + "tokens_seen": 1272536064 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031031093279839517, + "loss": 2.7373, + "theoretical_loss": 3.568238394884037, + "tokens_seen": 1272601600 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003103009027081244, + "loss": 2.7903, + "theoretical_loss": 3.568221678513225, + "tokens_seen": 1272667136 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031029087261785353, + "loss": 2.7907, + "theoretical_loss": 3.568204963244213, + "tokens_seen": 1272732672 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031028084252758277, + "loss": 2.7342, + "theoretical_loss": 3.5681882490768713, + "tokens_seen": 1272798208 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031027081243731195, + "loss": 2.7966, + "theoretical_loss": 3.5681715360110715, + "tokens_seen": 1272863744 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3019717, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7119643688201904, + "objective/train/theoretical_loss": 3.568154824046683, + "objective/train/tokens_used": 1293389280, + "theoretical_loss": 3.568154824046683, + "tokens_seen": 1272929280 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031026078234704113, + "loss": 2.7318, + "theoretical_loss": 3.568154824046683, + "tokens_seen": 1272929280 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003102507522567703, + "loss": 2.8849, + "theoretical_loss": 3.5681381131835774, + "tokens_seen": 1272994816 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003102407221664995, + "loss": 2.8783, + "theoretical_loss": 3.5681214034216255, + "tokens_seen": 1273060352 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031023069207622867, + "loss": 2.8208, + "theoretical_loss": 3.568104694760698, + "tokens_seen": 1273125888 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003102206619859579, + "loss": 2.8046, + "theoretical_loss": 3.5680879872006646, + "tokens_seen": 1273191424 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031021063189568703, + "loss": 2.8661, + "theoretical_loss": 3.568071280741398, + "tokens_seen": 1273256960 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031020060180541627, + "loss": 2.7928, + "theoretical_loss": 3.5680545753827673, + "tokens_seen": 1273322496 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003101905717151454, + "loss": 2.823, + "theoretical_loss": 3.568037871124644, + "tokens_seen": 1273388032 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031018054162487463, + "loss": 2.759, + "theoretical_loss": 3.568021167966899, + "tokens_seen": 1273453568 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003101705115346038, + "loss": 2.8089, + "theoretical_loss": 3.5680044659094037, + "tokens_seen": 1273519104 + }, + { + "epoch": 15.02, + "learning_rate": 0.000310160481444333, + "loss": 2.7571, + "theoretical_loss": 3.5679877649520284, + "tokens_seen": 1273584640 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003101504513540622, + "loss": 2.8079, + "theoretical_loss": 3.567971065094644, + "tokens_seen": 1273650176 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003101404212637914, + "loss": 2.8892, + "theoretical_loss": 3.567954366337122, + "tokens_seen": 1273715712 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031013039117352054, + "loss": 2.8407, + "theoretical_loss": 3.567937668679332, + "tokens_seen": 1273781248 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031012036108324977, + "loss": 2.9301, + "theoretical_loss": 3.567920972121147, + "tokens_seen": 1273846784 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003101103309929789, + "loss": 2.7248, + "theoretical_loss": 3.5679042766624365, + "tokens_seen": 1273912320 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031010030090270813, + "loss": 2.8327, + "theoretical_loss": 3.5678875823030722, + "tokens_seen": 1273977856 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003100902708124373, + "loss": 2.8871, + "theoretical_loss": 3.5678708890429247, + "tokens_seen": 1274043392 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003100802407221665, + "loss": 2.7208, + "theoretical_loss": 3.567854196881866, + "tokens_seen": 1274108928 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003100702106318957, + "loss": 2.8616, + "theoretical_loss": 3.5678375058197664, + "tokens_seen": 1274174464 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031006018054162486, + "loss": 2.6735, + "theoretical_loss": 3.5678208158564972, + "tokens_seen": 1274240000 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003100501504513541, + "loss": 2.8349, + "theoretical_loss": 3.5678041269919296, + "tokens_seen": 1274305536 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003100401203610833, + "loss": 2.8311, + "theoretical_loss": 3.5677874392259348, + "tokens_seen": 1274371072 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031003009027081246, + "loss": 2.8625, + "theoretical_loss": 3.5677707525583835, + "tokens_seen": 1274436608 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031002006018054164, + "loss": 2.7637, + "theoretical_loss": 3.5677540669891474, + "tokens_seen": 1274502144 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3024453, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.751657724380493, + "objective/train/theoretical_loss": 3.5677373825180982, + "objective/train/tokens_used": 1295027680, + "theoretical_loss": 3.5677373825180982, + "tokens_seen": 1274567680 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003100100300902708, + "loss": 2.8901, + "theoretical_loss": 3.5677373825180982, + "tokens_seen": 1274567680 + }, + { + "epoch": 15.02, + "learning_rate": 0.00031, + "loss": 2.8316, + "theoretical_loss": 3.5677206991451067, + "tokens_seen": 1274633216 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030998996990972924, + "loss": 2.8322, + "theoretical_loss": 3.567704016870044, + "tokens_seen": 1274698752 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030997993981945836, + "loss": 2.8487, + "theoretical_loss": 3.5676873356927814, + "tokens_seen": 1274764288 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003099699097291876, + "loss": 2.7832, + "theoretical_loss": 3.56767065561319, + "tokens_seen": 1274829824 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003099598796389168, + "loss": 2.8235, + "theoretical_loss": 3.567653976631142, + "tokens_seen": 1274895360 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030994984954864596, + "loss": 2.8785, + "theoretical_loss": 3.567637298746509, + "tokens_seen": 1274960896 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030993981945837514, + "loss": 2.8565, + "theoretical_loss": 3.567620621959161, + "tokens_seen": 1275026432 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003099297893681043, + "loss": 2.7262, + "theoretical_loss": 3.56760394626897, + "tokens_seen": 1275091968 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003099197592778335, + "loss": 2.8365, + "theoretical_loss": 3.5675872716758077, + "tokens_seen": 1275157504 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030990972918756274, + "loss": 2.7444, + "theoretical_loss": 3.567570598179545, + "tokens_seen": 1275223040 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030989969909729187, + "loss": 2.7976, + "theoretical_loss": 3.567553925780054, + "tokens_seen": 1275288576 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003098896690070211, + "loss": 2.8348, + "theoretical_loss": 3.567537254477206, + "tokens_seen": 1275354112 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030987963891675023, + "loss": 2.7984, + "theoretical_loss": 3.5675205842708726, + "tokens_seen": 1275419648 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030986960882647946, + "loss": 2.8406, + "theoretical_loss": 3.567503915160925, + "tokens_seen": 1275485184 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030985957873620864, + "loss": 2.708, + "theoretical_loss": 3.5674872471472354, + "tokens_seen": 1275550720 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003098495486459378, + "loss": 2.8005, + "theoretical_loss": 3.5674705802296747, + "tokens_seen": 1275616256 + }, + { + "epoch": 15.02, + "learning_rate": 0.000309839518555667, + "loss": 2.7995, + "theoretical_loss": 3.5674539144081145, + "tokens_seen": 1275681792 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003098294884653962, + "loss": 2.8769, + "theoretical_loss": 3.567437249682427, + "tokens_seen": 1275747328 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030981945837512537, + "loss": 2.8261, + "theoretical_loss": 3.5674205860524837, + "tokens_seen": 1275812864 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003098094282848546, + "loss": 2.8316, + "theoretical_loss": 3.567403923518156, + "tokens_seen": 1275878400 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030979939819458373, + "loss": 2.7764, + "theoretical_loss": 3.5673872620793157, + "tokens_seen": 1275943936 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030978936810431297, + "loss": 2.9311, + "theoretical_loss": 3.5673706017358344, + "tokens_seen": 1276009472 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030977933801404215, + "loss": 2.7085, + "theoretical_loss": 3.5673539424875846, + "tokens_seen": 1276075008 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030976930792377133, + "loss": 2.8691, + "theoretical_loss": 3.5673372843344366, + "tokens_seen": 1276140544 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8284850120544434, + "objective/train/theoretical_loss": 3.5673206272762634, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.5673206272762634, + "tokens_seen": 1276206080 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003097592778335005, + "loss": 2.8128, + "theoretical_loss": 3.5673206272762634, + "tokens_seen": 1276206080 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003097492477432297, + "loss": 2.6953, + "theoretical_loss": 3.567303971312936, + "tokens_seen": 1276271616 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030973921765295887, + "loss": 2.8161, + "theoretical_loss": 3.567287316444327, + "tokens_seen": 1276337152 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003097291875626881, + "loss": 2.7639, + "theoretical_loss": 3.567270662670308, + "tokens_seen": 1276402688 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030971915747241723, + "loss": 2.8303, + "theoretical_loss": 3.5672540099907506, + "tokens_seen": 1276468224 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030970912738214647, + "loss": 2.7373, + "theoretical_loss": 3.567237358405527, + "tokens_seen": 1276533760 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003096990972918756, + "loss": 2.7726, + "theoretical_loss": 3.567220707914509, + "tokens_seen": 1276599296 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030968906720160483, + "loss": 2.8196, + "theoretical_loss": 3.567204058517568, + "tokens_seen": 1276664832 + }, + { + "epoch": 15.02, + "learning_rate": 0.000309679037111334, + "loss": 2.8413, + "theoretical_loss": 3.5671874102145766, + "tokens_seen": 1276730368 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003096690070210632, + "loss": 2.8284, + "theoretical_loss": 3.5671707630054073, + "tokens_seen": 1276795904 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003096589769307924, + "loss": 2.8162, + "theoretical_loss": 3.5671541168899306, + "tokens_seen": 1276861440 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003096489468405216, + "loss": 2.7628, + "theoretical_loss": 3.5671374718680195, + "tokens_seen": 1276926976 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030963891675025074, + "loss": 2.8306, + "theoretical_loss": 3.5671208279395454, + "tokens_seen": 1276992512 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030962888665997997, + "loss": 2.8397, + "theoretical_loss": 3.5671041851043817, + "tokens_seen": 1277058048 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003096188565697091, + "loss": 2.9111, + "theoretical_loss": 3.567087543362399, + "tokens_seen": 1277123584 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030960882647943833, + "loss": 2.867, + "theoretical_loss": 3.5670709027134704, + "tokens_seen": 1277189120 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003095987963891675, + "loss": 2.7429, + "theoretical_loss": 3.567054263157467, + "tokens_seen": 1277254656 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003095887662988967, + "loss": 2.8339, + "theoretical_loss": 3.567037624694262, + "tokens_seen": 1277320192 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003095787362086259, + "loss": 2.7967, + "theoretical_loss": 3.567020987323727, + "tokens_seen": 1277385728 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030956870611835506, + "loss": 2.7347, + "theoretical_loss": 3.5670043510457345, + "tokens_seen": 1277451264 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030955867602808424, + "loss": 2.9158, + "theoretical_loss": 3.5669877158601566, + "tokens_seen": 1277516800 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003095486459378135, + "loss": 2.7795, + "theoretical_loss": 3.5669710817668654, + "tokens_seen": 1277582336 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003095386158475426, + "loss": 2.9458, + "theoretical_loss": 3.566954448765733, + "tokens_seen": 1277647872 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030952858575727184, + "loss": 2.7745, + "theoretical_loss": 3.5669378168566324, + "tokens_seen": 1277713408 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030951855566700096, + "loss": 2.8396, + "theoretical_loss": 3.5669211860394348, + "tokens_seen": 1277778944 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.859895944595337, + "objective/train/theoretical_loss": 3.566904556314013, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.566904556314013, + "tokens_seen": 1277844480 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003095085255767302, + "loss": 2.7489, + "theoretical_loss": 3.566904556314013, + "tokens_seen": 1277844480 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003094984954864594, + "loss": 2.8032, + "theoretical_loss": 3.5668879276802397, + "tokens_seen": 1277910016 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030948846539618856, + "loss": 2.7779, + "theoretical_loss": 3.5668713001379873, + "tokens_seen": 1277975552 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030947843530591774, + "loss": 2.8401, + "theoretical_loss": 3.5668546736871276, + "tokens_seen": 1278041088 + }, + { + "epoch": 15.02, + "learning_rate": 0.000309468405215647, + "loss": 2.7992, + "theoretical_loss": 3.5668380483275337, + "tokens_seen": 1278106624 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003094583751253761, + "loss": 2.886, + "theoretical_loss": 3.566821424059077, + "tokens_seen": 1278172160 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030944834503510534, + "loss": 2.8185, + "theoretical_loss": 3.566804800881631, + "tokens_seen": 1278237696 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030943831494483447, + "loss": 2.8419, + "theoretical_loss": 3.5667881787950675, + "tokens_seen": 1278303232 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003094282848545637, + "loss": 2.9097, + "theoretical_loss": 3.5667715577992594, + "tokens_seen": 1278368768 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003094182547642929, + "loss": 2.8341, + "theoretical_loss": 3.5667549378940793, + "tokens_seen": 1278434304 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030940822467402207, + "loss": 2.8064, + "theoretical_loss": 3.5667383190793993, + "tokens_seen": 1278499840 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030939819458375125, + "loss": 2.7659, + "theoretical_loss": 3.5667217013550916, + "tokens_seen": 1278565376 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030938816449348043, + "loss": 2.7167, + "theoretical_loss": 3.56670508472103, + "tokens_seen": 1278630912 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003093781344032096, + "loss": 2.8492, + "theoretical_loss": 3.5666884691770857, + "tokens_seen": 1278696448 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030936810431293884, + "loss": 2.8452, + "theoretical_loss": 3.5666718547231326, + "tokens_seen": 1278761984 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030935807422266797, + "loss": 2.8583, + "theoretical_loss": 3.5666552413590424, + "tokens_seen": 1278827520 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003093480441323972, + "loss": 2.899, + "theoretical_loss": 3.5666386290846885, + "tokens_seen": 1278893056 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030933801404212633, + "loss": 2.7963, + "theoretical_loss": 3.566622017899943, + "tokens_seen": 1278958592 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030932798395185557, + "loss": 2.7236, + "theoretical_loss": 3.566605407804679, + "tokens_seen": 1279024128 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030931795386158475, + "loss": 2.8577, + "theoretical_loss": 3.5665887987987688, + "tokens_seen": 1279089664 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030930792377131393, + "loss": 2.7905, + "theoretical_loss": 3.566572190882085, + "tokens_seen": 1279155200 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030929789368104317, + "loss": 2.8472, + "theoretical_loss": 3.566555584054501, + "tokens_seen": 1279220736 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030928786359077235, + "loss": 2.8334, + "theoretical_loss": 3.5665389783158896, + "tokens_seen": 1279286272 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030927783350050153, + "loss": 2.8385, + "theoretical_loss": 3.5665223736661233, + "tokens_seen": 1279351808 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003092678034102307, + "loss": 2.8398, + "theoretical_loss": 3.566505770105075, + "tokens_seen": 1279417344 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9913527965545654, + "objective/train/theoretical_loss": 3.5664891676326174, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.5664891676326174, + "tokens_seen": 1279482880 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003092577733199599, + "loss": 2.8031, + "theoretical_loss": 3.5664891676326174, + "tokens_seen": 1279482880 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030924774322968907, + "loss": 2.7401, + "theoretical_loss": 3.5664725662486236, + "tokens_seen": 1279548416 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003092377131394183, + "loss": 2.8377, + "theoretical_loss": 3.566455965952966, + "tokens_seen": 1279613952 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030922768304914743, + "loss": 2.8797, + "theoretical_loss": 3.5664393667455183, + "tokens_seen": 1279679488 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030921765295887667, + "loss": 2.853, + "theoretical_loss": 3.566422768626153, + "tokens_seen": 1279745024 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003092076228686058, + "loss": 2.6886, + "theoretical_loss": 3.5664061715947435, + "tokens_seen": 1279810560 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030919759277833503, + "loss": 2.9235, + "theoretical_loss": 3.566389575651162, + "tokens_seen": 1279876096 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003091875626880642, + "loss": 2.811, + "theoretical_loss": 3.5663729807952818, + "tokens_seen": 1279941632 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003091775325977934, + "loss": 2.7979, + "theoretical_loss": 3.5663563870269765, + "tokens_seen": 1280007168 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003091675025075226, + "loss": 2.7544, + "theoretical_loss": 3.566339794346118, + "tokens_seen": 1280072704 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003091574724172518, + "loss": 2.7251, + "theoretical_loss": 3.5663232027525806, + "tokens_seen": 1280138240 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030914744232698094, + "loss": 2.7874, + "theoretical_loss": 3.566306612246237, + "tokens_seen": 1280203776 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030913741223671017, + "loss": 2.742, + "theoretical_loss": 3.5662900228269594, + "tokens_seen": 1280269312 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003091273821464393, + "loss": 2.8658, + "theoretical_loss": 3.5662734344946223, + "tokens_seen": 1280334848 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030911735205616853, + "loss": 2.8894, + "theoretical_loss": 3.5662568472490976, + "tokens_seen": 1280400384 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003091073219658977, + "loss": 2.9078, + "theoretical_loss": 3.5662402610902597, + "tokens_seen": 1280465920 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003090972918756269, + "loss": 2.8446, + "theoretical_loss": 3.566223676017981, + "tokens_seen": 1280531456 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003090872617853561, + "loss": 2.7488, + "theoretical_loss": 3.566207092032135, + "tokens_seen": 1280596992 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030907723169508526, + "loss": 2.8125, + "theoretical_loss": 3.5661905091325945, + "tokens_seen": 1280662528 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030906720160481444, + "loss": 2.8492, + "theoretical_loss": 3.566173927319234, + "tokens_seen": 1280728064 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003090571715145437, + "loss": 2.7305, + "theoretical_loss": 3.5661573465919245, + "tokens_seen": 1280793600 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003090471414242728, + "loss": 2.8746, + "theoretical_loss": 3.5661407669505416, + "tokens_seen": 1280859136 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030903711133400204, + "loss": 2.7833, + "theoretical_loss": 3.5661241883949577, + "tokens_seen": 1280924672 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030902708124373116, + "loss": 2.7834, + "theoretical_loss": 3.5661076109250462, + "tokens_seen": 1280990208 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003090170511534604, + "loss": 2.8725, + "theoretical_loss": 3.5660910345406798, + "tokens_seen": 1281055744 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8767359256744385, + "objective/train/theoretical_loss": 3.566074459241733, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.566074459241733, + "tokens_seen": 1281121280 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003090070210631896, + "loss": 2.8675, + "theoretical_loss": 3.566074459241733, + "tokens_seen": 1281121280 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030899699097291876, + "loss": 2.7892, + "theoretical_loss": 3.566057885028079, + "tokens_seen": 1281186816 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030898696088264794, + "loss": 2.9062, + "theoretical_loss": 3.5660413118995904, + "tokens_seen": 1281252352 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003089769307923772, + "loss": 2.7531, + "theoretical_loss": 3.5660247398561413, + "tokens_seen": 1281317888 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003089669007021063, + "loss": 2.754, + "theoretical_loss": 3.566008168897605, + "tokens_seen": 1281383424 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030895687061183554, + "loss": 2.8488, + "theoretical_loss": 3.5659915990238553, + "tokens_seen": 1281448960 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030894684052156467, + "loss": 2.7999, + "theoretical_loss": 3.5659750302347653, + "tokens_seen": 1281514496 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003089368104312939, + "loss": 2.8429, + "theoretical_loss": 3.5659584625302085, + "tokens_seen": 1281580032 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003089267803410231, + "loss": 2.7549, + "theoretical_loss": 3.565941895910059, + "tokens_seen": 1281645568 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030891675025075227, + "loss": 2.8583, + "theoretical_loss": 3.5659253303741902, + "tokens_seen": 1281711104 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030890672016048145, + "loss": 2.7491, + "theoretical_loss": 3.565908765922475, + "tokens_seen": 1281776640 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030889669007021063, + "loss": 2.8405, + "theoretical_loss": 3.565892202554788, + "tokens_seen": 1281842176 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003088866599799398, + "loss": 2.7779, + "theoretical_loss": 3.5658756402710026, + "tokens_seen": 1281907712 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030887662988966904, + "loss": 2.829, + "theoretical_loss": 3.5658590790709916, + "tokens_seen": 1281973248 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030886659979939817, + "loss": 2.876, + "theoretical_loss": 3.5658425189546294, + "tokens_seen": 1282038784 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003088565697091274, + "loss": 2.7944, + "theoretical_loss": 3.5658259599217903, + "tokens_seen": 1282104320 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030884653961885653, + "loss": 2.8175, + "theoretical_loss": 3.5658094019723467, + "tokens_seen": 1282169856 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030883650952858577, + "loss": 2.8327, + "theoretical_loss": 3.5657928451061736, + "tokens_seen": 1282235392 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030882647943831495, + "loss": 2.8148, + "theoretical_loss": 3.5657762893231437, + "tokens_seen": 1282300928 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030881644934804413, + "loss": 2.7904, + "theoretical_loss": 3.565759734623131, + "tokens_seen": 1282366464 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003088064192577733, + "loss": 2.8555, + "theoretical_loss": 3.5657431810060105, + "tokens_seen": 1282432000 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030879638916750255, + "loss": 2.9245, + "theoretical_loss": 3.5657266284716544, + "tokens_seen": 1282497536 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003087863590772317, + "loss": 2.8403, + "theoretical_loss": 3.5657100770199373, + "tokens_seen": 1282563072 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003087763289869609, + "loss": 2.7607, + "theoretical_loss": 3.5656935266507337, + "tokens_seen": 1282628608 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030876629889669004, + "loss": 2.7835, + "theoretical_loss": 3.5656769773639163, + "tokens_seen": 1282694144 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7137351036071777, + "objective/train/theoretical_loss": 3.56566042915936, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.56566042915936, + "tokens_seen": 1282759680 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030875626880641927, + "loss": 2.8058, + "theoretical_loss": 3.56566042915936, + "tokens_seen": 1282759680 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030874623871614845, + "loss": 2.8724, + "theoretical_loss": 3.5656438820369374, + "tokens_seen": 1282825216 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030873620862587763, + "loss": 2.7476, + "theoretical_loss": 3.565627335996524, + "tokens_seen": 1282890752 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003087261785356068, + "loss": 2.8398, + "theoretical_loss": 3.565610791037993, + "tokens_seen": 1282956288 + }, + { + "epoch": 15.02, + "learning_rate": 0.000308716148445336, + "loss": 2.8831, + "theoretical_loss": 3.5655942471612185, + "tokens_seen": 1283021824 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003087061183550652, + "loss": 2.8444, + "theoretical_loss": 3.565577704366075, + "tokens_seen": 1283087360 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003086960882647944, + "loss": 2.7945, + "theoretical_loss": 3.5655611626524353, + "tokens_seen": 1283152896 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030868605817452354, + "loss": 2.7535, + "theoretical_loss": 3.565544622020175, + "tokens_seen": 1283218432 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003086760280842528, + "loss": 2.9486, + "theoretical_loss": 3.565528082469167, + "tokens_seen": 1283283968 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003086659979939819, + "loss": 2.8087, + "theoretical_loss": 3.565511543999286, + "tokens_seen": 1283349504 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030865596790371114, + "loss": 2.8058, + "theoretical_loss": 3.565495006610406, + "tokens_seen": 1283415040 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003086459378134403, + "loss": 2.7483, + "theoretical_loss": 3.565478470302401, + "tokens_seen": 1283480576 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003086359077231695, + "loss": 2.7257, + "theoretical_loss": 3.5654619350751453, + "tokens_seen": 1283546112 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003086258776328987, + "loss": 2.8034, + "theoretical_loss": 3.565445400928513, + "tokens_seen": 1283611648 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003086158475426279, + "loss": 2.8276, + "theoretical_loss": 3.5654288678623787, + "tokens_seen": 1283677184 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030860581745235704, + "loss": 2.7749, + "theoretical_loss": 3.5654123358766165, + "tokens_seen": 1283742720 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003085957873620863, + "loss": 2.9183, + "theoretical_loss": 3.5653958049711, + "tokens_seen": 1283808256 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003085857572718154, + "loss": 2.8155, + "theoretical_loss": 3.5653792751457036, + "tokens_seen": 1283873792 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030857572718154464, + "loss": 2.8738, + "theoretical_loss": 3.565362746400303, + "tokens_seen": 1283939328 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003085656970912738, + "loss": 2.8007, + "theoretical_loss": 3.5653462187347706, + "tokens_seen": 1284004864 + }, + { + "epoch": 15.02, + "learning_rate": 0.000308555667001003, + "loss": 2.7964, + "theoretical_loss": 3.565329692148982, + "tokens_seen": 1284070400 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030854563691073224, + "loss": 2.8622, + "theoretical_loss": 3.5653131666428113, + "tokens_seen": 1284135936 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030853560682046137, + "loss": 2.7449, + "theoretical_loss": 3.5652966422161327, + "tokens_seen": 1284201472 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003085255767301906, + "loss": 2.72, + "theoretical_loss": 3.56528011886882, + "tokens_seen": 1284267008 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003085155466399198, + "loss": 2.8042, + "theoretical_loss": 3.565263596600749, + "tokens_seen": 1284332544 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.037022590637207, + "objective/train/theoretical_loss": 3.565247075411793, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.565247075411793, + "tokens_seen": 1284398080 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030850551654964896, + "loss": 2.8137, + "theoretical_loss": 3.565247075411793, + "tokens_seen": 1284398080 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030849548645937814, + "loss": 2.8571, + "theoretical_loss": 3.5652305553018273, + "tokens_seen": 1284463616 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003084854563691074, + "loss": 2.865, + "theoretical_loss": 3.5652140362707256, + "tokens_seen": 1284529152 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003084754262788365, + "loss": 2.9066, + "theoretical_loss": 3.565197518318363, + "tokens_seen": 1284594688 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030846539618856574, + "loss": 2.7697, + "theoretical_loss": 3.5651810014446133, + "tokens_seen": 1284660224 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030845536609829487, + "loss": 2.7411, + "theoretical_loss": 3.565164485649352, + "tokens_seen": 1284725760 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003084453360080241, + "loss": 2.7563, + "theoretical_loss": 3.565147970932453, + "tokens_seen": 1284791296 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003084353059177533, + "loss": 2.8976, + "theoretical_loss": 3.5651314572937913, + "tokens_seen": 1284856832 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030842527582748247, + "loss": 2.8839, + "theoretical_loss": 3.5651149447332413, + "tokens_seen": 1284922368 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030841524573721165, + "loss": 2.8067, + "theoretical_loss": 3.565098433250678, + "tokens_seen": 1284987904 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030840521564694083, + "loss": 2.8205, + "theoretical_loss": 3.565081922845975, + "tokens_seen": 1285053440 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030839518555667, + "loss": 2.7083, + "theoretical_loss": 3.5650654135190076, + "tokens_seen": 1285118976 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030838515546639924, + "loss": 2.8323, + "theoretical_loss": 3.5650489052696512, + "tokens_seen": 1285184512 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030837512537612837, + "loss": 2.8167, + "theoretical_loss": 3.565032398097779, + "tokens_seen": 1285250048 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003083650952858576, + "loss": 2.6924, + "theoretical_loss": 3.5650158920032675, + "tokens_seen": 1285315584 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030835506519558673, + "loss": 2.8683, + "theoretical_loss": 3.56499938698599, + "tokens_seen": 1285381120 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030834503510531597, + "loss": 2.8763, + "theoretical_loss": 3.5649828830458223, + "tokens_seen": 1285446656 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030833500501504515, + "loss": 2.7784, + "theoretical_loss": 3.5649663801826383, + "tokens_seen": 1285512192 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030832497492477433, + "loss": 2.9111, + "theoretical_loss": 3.5649498783963143, + "tokens_seen": 1285577728 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003083149448345035, + "loss": 2.8651, + "theoretical_loss": 3.5649333776867227, + "tokens_seen": 1285643264 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030830491474423275, + "loss": 2.861, + "theoretical_loss": 3.5649168780537406, + "tokens_seen": 1285708800 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003082948846539619, + "loss": 2.8879, + "theoretical_loss": 3.564900379497242, + "tokens_seen": 1285774336 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003082848545636911, + "loss": 2.7779, + "theoretical_loss": 3.564883882017101, + "tokens_seen": 1285839872 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030827482447342024, + "loss": 2.8788, + "theoretical_loss": 3.5648673856131947, + "tokens_seen": 1285905408 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030826479438314947, + "loss": 2.9006, + "theoretical_loss": 3.564850890285396, + "tokens_seen": 1285970944 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6967246532440186, + "objective/train/theoretical_loss": 3.564834396033581, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.564834396033581, + "tokens_seen": 1286036480 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030825476429287865, + "loss": 2.7725, + "theoretical_loss": 3.564834396033581, + "tokens_seen": 1286036480 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030824473420260783, + "loss": 2.8668, + "theoretical_loss": 3.564817902857624, + "tokens_seen": 1286102016 + }, + { + "epoch": 15.02, + "learning_rate": 0.000308234704112337, + "loss": 2.8182, + "theoretical_loss": 3.5648014107574, + "tokens_seen": 1286167552 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003082246740220662, + "loss": 2.6884, + "theoretical_loss": 3.5647849197327846, + "tokens_seen": 1286233088 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003082146439317954, + "loss": 2.8092, + "theoretical_loss": 3.564768429783653, + "tokens_seen": 1286298624 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003082046138415246, + "loss": 2.9089, + "theoretical_loss": 3.564751940909879, + "tokens_seen": 1286364160 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030819458375125374, + "loss": 2.7962, + "theoretical_loss": 3.5647354531113393, + "tokens_seen": 1286429696 + }, + { + "epoch": 15.02, + "learning_rate": 0.000308184553660983, + "loss": 2.7759, + "theoretical_loss": 3.5647189663879075, + "tokens_seen": 1286495232 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003081745235707121, + "loss": 2.8628, + "theoretical_loss": 3.5647024807394603, + "tokens_seen": 1286560768 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030816449348044134, + "loss": 2.8425, + "theoretical_loss": 3.5646859961658715, + "tokens_seen": 1286626304 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003081544633901705, + "loss": 2.863, + "theoretical_loss": 3.564669512667017, + "tokens_seen": 1286691840 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003081444332998997, + "loss": 2.8496, + "theoretical_loss": 3.5646530302427717, + "tokens_seen": 1286757376 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003081344032096289, + "loss": 2.8902, + "theoretical_loss": 3.564636548893011, + "tokens_seen": 1286822912 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003081243731193581, + "loss": 2.858, + "theoretical_loss": 3.5646200686176104, + "tokens_seen": 1286888448 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030811434302908724, + "loss": 2.8347, + "theoretical_loss": 3.5646035894164445, + "tokens_seen": 1286953984 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003081043129388165, + "loss": 2.8672, + "theoretical_loss": 3.5645871112893888, + "tokens_seen": 1287019520 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003080942828485456, + "loss": 2.7005, + "theoretical_loss": 3.564570634236319, + "tokens_seen": 1287085056 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030808425275827484, + "loss": 2.7883, + "theoretical_loss": 3.56455415825711, + "tokens_seen": 1287150592 + }, + { + "epoch": 15.02, + "learning_rate": 0.000308074222668004, + "loss": 2.8609, + "theoretical_loss": 3.564537683351637, + "tokens_seen": 1287216128 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003080641925777332, + "loss": 2.8446, + "theoretical_loss": 3.564521209519776, + "tokens_seen": 1287281664 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003080541624874624, + "loss": 2.8689, + "theoretical_loss": 3.5645047367614016, + "tokens_seen": 1287347200 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030804413239719157, + "loss": 2.814, + "theoretical_loss": 3.56448826507639, + "tokens_seen": 1287412736 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030803410230692075, + "loss": 2.8222, + "theoretical_loss": 3.564471794464616, + "tokens_seen": 1287478272 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030802407221665, + "loss": 2.8481, + "theoretical_loss": 3.5644553249259556, + "tokens_seen": 1287543808 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003080140421263791, + "loss": 2.7694, + "theoretical_loss": 3.564438856460284, + "tokens_seen": 1287609344 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8626184463500977, + "objective/train/theoretical_loss": 3.5644223890674764, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.5644223890674764, + "tokens_seen": 1287674880 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030800401203610834, + "loss": 2.7818, + "theoretical_loss": 3.5644223890674764, + "tokens_seen": 1287674880 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003079939819458375, + "loss": 2.8222, + "theoretical_loss": 3.5644059227474085, + "tokens_seen": 1287740416 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003079839518555667, + "loss": 2.8473, + "theoretical_loss": 3.564389457499956, + "tokens_seen": 1287805952 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003079739217652959, + "loss": 2.8594, + "theoretical_loss": 3.5643729933249944, + "tokens_seen": 1287871488 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030796389167502507, + "loss": 2.8198, + "theoretical_loss": 3.564356530222399, + "tokens_seen": 1287937024 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030795386158475425, + "loss": 2.8167, + "theoretical_loss": 3.5643400681920454, + "tokens_seen": 1288002560 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003079438314944835, + "loss": 2.8882, + "theoretical_loss": 3.56432360723381, + "tokens_seen": 1288068096 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003079338014042126, + "loss": 2.7781, + "theoretical_loss": 3.5643071473475674, + "tokens_seen": 1288133632 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030792377131394185, + "loss": 2.7051, + "theoretical_loss": 3.564290688533194, + "tokens_seen": 1288199168 + }, + { + "epoch": 15.02, + "learning_rate": 0.000307913741223671, + "loss": 2.7214, + "theoretical_loss": 3.564274230790565, + "tokens_seen": 1288264704 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003079037111334002, + "loss": 2.8036, + "theoretical_loss": 3.5642577741195565, + "tokens_seen": 1288330240 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003078936810431294, + "loss": 2.8504, + "theoretical_loss": 3.5642413185200437, + "tokens_seen": 1288395776 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030788365095285857, + "loss": 2.9075, + "theoretical_loss": 3.5642248639919027, + "tokens_seen": 1288461312 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030787362086258775, + "loss": 2.9322, + "theoretical_loss": 3.5642084105350094, + "tokens_seen": 1288526848 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030786359077231693, + "loss": 2.9393, + "theoretical_loss": 3.564191958149239, + "tokens_seen": 1288592384 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003078535606820461, + "loss": 2.8269, + "theoretical_loss": 3.564175506834468, + "tokens_seen": 1288657920 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030784353059177535, + "loss": 2.799, + "theoretical_loss": 3.5641590565905714, + "tokens_seen": 1288723456 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003078335005015045, + "loss": 2.8552, + "theoretical_loss": 3.564142607417426, + "tokens_seen": 1288788992 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003078234704112337, + "loss": 2.7694, + "theoretical_loss": 3.564126159314907, + "tokens_seen": 1288854528 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003078134403209629, + "loss": 2.8077, + "theoretical_loss": 3.5641097122828906, + "tokens_seen": 1288920064 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003078034102306921, + "loss": 2.9154, + "theoretical_loss": 3.564093266321252, + "tokens_seen": 1288985600 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003077933801404213, + "loss": 2.8084, + "theoretical_loss": 3.564076821429868, + "tokens_seen": 1289051136 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030778335005015044, + "loss": 2.779, + "theoretical_loss": 3.5640603776086146, + "tokens_seen": 1289116672 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030777331995987967, + "loss": 2.8462, + "theoretical_loss": 3.5640439348573674, + "tokens_seen": 1289182208 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030776328986960885, + "loss": 2.9121, + "theoretical_loss": 3.5640274931760016, + "tokens_seen": 1289247744 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7914390563964844, + "objective/train/theoretical_loss": 3.5640110525643944, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.5640110525643944, + "tokens_seen": 1289313280 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030775325977933803, + "loss": 2.8117, + "theoretical_loss": 3.5640110525643944, + "tokens_seen": 1289313280 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003077432296890672, + "loss": 2.7223, + "theoretical_loss": 3.563994613022422, + "tokens_seen": 1289378816 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003077331995987964, + "loss": 2.8877, + "theoretical_loss": 3.563978174549959, + "tokens_seen": 1289444352 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003077231695085256, + "loss": 2.8686, + "theoretical_loss": 3.563961737146883, + "tokens_seen": 1289509888 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003077131394182548, + "loss": 2.803, + "theoretical_loss": 3.563945300813069, + "tokens_seen": 1289575424 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030770310932798394, + "loss": 2.9338, + "theoretical_loss": 3.5639288655483936, + "tokens_seen": 1289640960 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003076930792377132, + "loss": 2.9361, + "theoretical_loss": 3.5639124313527333, + "tokens_seen": 1289706496 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003076830491474423, + "loss": 2.7646, + "theoretical_loss": 3.5638959982259633, + "tokens_seen": 1289772032 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030767301905717154, + "loss": 2.6943, + "theoretical_loss": 3.5638795661679605, + "tokens_seen": 1289837568 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003076629889669007, + "loss": 2.8007, + "theoretical_loss": 3.5638631351786003, + "tokens_seen": 1289903104 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003076529588766299, + "loss": 2.7885, + "theoretical_loss": 3.56384670525776, + "tokens_seen": 1289968640 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003076429287863591, + "loss": 2.8629, + "theoretical_loss": 3.5638302764053154, + "tokens_seen": 1290034176 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003076328986960883, + "loss": 2.739, + "theoretical_loss": 3.563813848621143, + "tokens_seen": 1290099712 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030762286860581744, + "loss": 2.9326, + "theoretical_loss": 3.563797421905118, + "tokens_seen": 1290165248 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003076128385155467, + "loss": 2.8721, + "theoretical_loss": 3.5637809962571176, + "tokens_seen": 1290230784 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003076028084252758, + "loss": 2.7345, + "theoretical_loss": 3.563764571677018, + "tokens_seen": 1290296320 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030759277833500504, + "loss": 2.8789, + "theoretical_loss": 3.563748148164696, + "tokens_seen": 1290361856 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003075827482447342, + "loss": 2.8911, + "theoretical_loss": 3.563731725720027, + "tokens_seen": 1290427392 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003075727181544634, + "loss": 2.9235, + "theoretical_loss": 3.5637153043428875, + "tokens_seen": 1290492928 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003075626880641926, + "loss": 2.8339, + "theoretical_loss": 3.5636988840331547, + "tokens_seen": 1290558464 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030755265797392177, + "loss": 2.8107, + "theoretical_loss": 3.5636824647907046, + "tokens_seen": 1290624000 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030754262788365095, + "loss": 2.821, + "theoretical_loss": 3.5636660466154133, + "tokens_seen": 1290689536 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003075325977933802, + "loss": 2.839, + "theoretical_loss": 3.5636496295071574, + "tokens_seen": 1290755072 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003075225677031093, + "loss": 2.7999, + "theoretical_loss": 3.563633213465814, + "tokens_seen": 1290820608 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030751253761283854, + "loss": 2.7723, + "theoretical_loss": 3.563616798491259, + "tokens_seen": 1290886144 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8161182403564453, + "objective/train/theoretical_loss": 3.563600384583369, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.563600384583369, + "tokens_seen": 1290951680 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003075025075225677, + "loss": 2.877, + "theoretical_loss": 3.563600384583369, + "tokens_seen": 1290951680 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003074924774322969, + "loss": 2.8268, + "theoretical_loss": 3.5635839717420206, + "tokens_seen": 1291017216 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003074824473420261, + "loss": 2.9423, + "theoretical_loss": 3.5635675599670904, + "tokens_seen": 1291082752 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030747241725175527, + "loss": 2.8084, + "theoretical_loss": 3.563551149258455, + "tokens_seen": 1291148288 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030746238716148445, + "loss": 2.8879, + "theoretical_loss": 3.5635347396159904, + "tokens_seen": 1291213824 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003074523570712137, + "loss": 2.7977, + "theoretical_loss": 3.563518331039574, + "tokens_seen": 1291279360 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003074423269809428, + "loss": 2.8426, + "theoretical_loss": 3.5635019235290826, + "tokens_seen": 1291344896 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030743229689067205, + "loss": 2.7559, + "theoretical_loss": 3.563485517084392, + "tokens_seen": 1291410432 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003074222668004012, + "loss": 2.809, + "theoretical_loss": 3.5634691117053796, + "tokens_seen": 1291475968 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003074122367101304, + "loss": 2.6895, + "theoretical_loss": 3.5634527073919218, + "tokens_seen": 1291541504 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003074022066198596, + "loss": 2.8958, + "theoretical_loss": 3.5634363041438957, + "tokens_seen": 1291607040 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030739217652958877, + "loss": 2.8599, + "theoretical_loss": 3.5634199019611774, + "tokens_seen": 1291672576 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030738214643931795, + "loss": 2.7785, + "theoretical_loss": 3.563403500843644, + "tokens_seen": 1291738112 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030737211634904713, + "loss": 2.741, + "theoretical_loss": 3.5633871007911724, + "tokens_seen": 1291803648 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003073620862587763, + "loss": 2.8757, + "theoretical_loss": 3.563370701803639, + "tokens_seen": 1291869184 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030735205616850555, + "loss": 2.8263, + "theoretical_loss": 3.5633543038809217, + "tokens_seen": 1291934720 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003073420260782347, + "loss": 2.7211, + "theoretical_loss": 3.5633379070228957, + "tokens_seen": 1292000256 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003073319959879639, + "loss": 2.7615, + "theoretical_loss": 3.5633215112294394, + "tokens_seen": 1292065792 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003073219658976931, + "loss": 2.8002, + "theoretical_loss": 3.5633051165004286, + "tokens_seen": 1292131328 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003073119358074223, + "loss": 2.9743, + "theoretical_loss": 3.5632887228357415, + "tokens_seen": 1292196864 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030730190571715146, + "loss": 2.8731, + "theoretical_loss": 3.563272330235253, + "tokens_seen": 1292262400 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030729187562688064, + "loss": 2.7981, + "theoretical_loss": 3.563255938698842, + "tokens_seen": 1292327936 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003072818455366098, + "loss": 2.8892, + "theoretical_loss": 3.5632395482263846, + "tokens_seen": 1292393472 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030727181544633905, + "loss": 2.9284, + "theoretical_loss": 3.563223158817758, + "tokens_seen": 1292459008 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003072617853560682, + "loss": 2.8025, + "theoretical_loss": 3.563206770472839, + "tokens_seen": 1292524544 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.648789167404175, + "objective/train/theoretical_loss": 3.563190383191505, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.563190383191505, + "tokens_seen": 1292590080 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003072517552657974, + "loss": 2.8001, + "theoretical_loss": 3.563190383191505, + "tokens_seen": 1292590080 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030724172517552654, + "loss": 2.8282, + "theoretical_loss": 3.563173996973632, + "tokens_seen": 1292655616 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003072316950852558, + "loss": 2.8768, + "theoretical_loss": 3.5631576118190984, + "tokens_seen": 1292721152 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030722166499498496, + "loss": 2.7613, + "theoretical_loss": 3.563141227727781, + "tokens_seen": 1292786688 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030721163490471414, + "loss": 2.741, + "theoretical_loss": 3.5631248446995567, + "tokens_seen": 1292852224 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003072016048144433, + "loss": 2.8577, + "theoretical_loss": 3.563108462734302, + "tokens_seen": 1292917760 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003071915747241725, + "loss": 2.896, + "theoretical_loss": 3.563092081831895, + "tokens_seen": 1292983296 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003071815446339017, + "loss": 2.8914, + "theoretical_loss": 3.5630757019922132, + "tokens_seen": 1293048832 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003071715145436309, + "loss": 2.824, + "theoretical_loss": 3.5630593232151324, + "tokens_seen": 1293114368 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030716148445336005, + "loss": 2.8163, + "theoretical_loss": 3.5630429455005306, + "tokens_seen": 1293179904 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003071514543630893, + "loss": 2.8046, + "theoretical_loss": 3.5630265688482856, + "tokens_seen": 1293245440 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030714142427281846, + "loss": 2.7948, + "theoretical_loss": 3.5630101932582736, + "tokens_seen": 1293310976 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030713139418254764, + "loss": 2.8694, + "theoretical_loss": 3.562993818730373, + "tokens_seen": 1293376512 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003071213640922768, + "loss": 2.8209, + "theoretical_loss": 3.5629774452644596, + "tokens_seen": 1293442048 + }, + { + "epoch": 15.02, + "learning_rate": 0.000307111334002006, + "loss": 2.8099, + "theoretical_loss": 3.5629610728604124, + "tokens_seen": 1293507584 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003071013039117352, + "loss": 2.8292, + "theoretical_loss": 3.562944701518107, + "tokens_seen": 1293573120 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003070912738214644, + "loss": 2.7767, + "theoretical_loss": 3.5629283312374223, + "tokens_seen": 1293638656 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030708124373119355, + "loss": 2.8186, + "theoretical_loss": 3.5629119620182346, + "tokens_seen": 1293704192 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003070712136409228, + "loss": 2.8935, + "theoretical_loss": 3.5628955938604223, + "tokens_seen": 1293769728 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003070611835506519, + "loss": 2.7583, + "theoretical_loss": 3.5628792267638625, + "tokens_seen": 1293835264 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030705115346038115, + "loss": 2.8911, + "theoretical_loss": 3.5628628607284316, + "tokens_seen": 1293900800 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003070411233701104, + "loss": 2.8322, + "theoretical_loss": 3.5628464957540085, + "tokens_seen": 1293966336 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003070310932798395, + "loss": 2.9491, + "theoretical_loss": 3.56283013184047, + "tokens_seen": 1294031872 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030702106318956874, + "loss": 2.775, + "theoretical_loss": 3.5628137689876933, + "tokens_seen": 1294097408 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003070110330992979, + "loss": 2.8316, + "theoretical_loss": 3.5627974071955566, + "tokens_seen": 1294162944 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7153496742248535, + "objective/train/theoretical_loss": 3.562781046463937, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.562781046463937, + "tokens_seen": 1294228480 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003070010030090271, + "loss": 2.842, + "theoretical_loss": 3.562781046463937, + "tokens_seen": 1294228480 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003069909729187563, + "loss": 2.8476, + "theoretical_loss": 3.5627646867927125, + "tokens_seen": 1294294016 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030698094282848547, + "loss": 2.8056, + "theoretical_loss": 3.56274832818176, + "tokens_seen": 1294359552 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030697091273821465, + "loss": 2.8902, + "theoretical_loss": 3.562731970630958, + "tokens_seen": 1294425088 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003069608826479439, + "loss": 2.7653, + "theoretical_loss": 3.562715614140183, + "tokens_seen": 1294490624 + }, + { + "epoch": 15.02, + "learning_rate": 0.000306950852557673, + "loss": 2.8139, + "theoretical_loss": 3.562699258709314, + "tokens_seen": 1294556160 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030694082246740225, + "loss": 2.7719, + "theoretical_loss": 3.5626829043382275, + "tokens_seen": 1294621696 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003069307923771314, + "loss": 2.8456, + "theoretical_loss": 3.5626665510268016, + "tokens_seen": 1294687232 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003069207622868606, + "loss": 2.9169, + "theoretical_loss": 3.5626501987749144, + "tokens_seen": 1294752768 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003069107321965898, + "loss": 2.7355, + "theoretical_loss": 3.562633847582443, + "tokens_seen": 1294818304 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030690070210631897, + "loss": 2.8647, + "theoretical_loss": 3.5626174974492653, + "tokens_seen": 1294883840 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030689067201604815, + "loss": 2.7813, + "theoretical_loss": 3.562601148375259, + "tokens_seen": 1294949376 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030688064192577733, + "loss": 2.8796, + "theoretical_loss": 3.5625848003603027, + "tokens_seen": 1295014912 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003068706118355065, + "loss": 2.8322, + "theoretical_loss": 3.562568453404273, + "tokens_seen": 1295080448 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030686058174523575, + "loss": 2.7501, + "theoretical_loss": 3.5625521075070488, + "tokens_seen": 1295145984 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003068505516549649, + "loss": 2.7845, + "theoretical_loss": 3.5625357626685066, + "tokens_seen": 1295211520 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003068405215646941, + "loss": 2.7642, + "theoretical_loss": 3.562519418888526, + "tokens_seen": 1295277056 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003068304914744233, + "loss": 2.8144, + "theoretical_loss": 3.5625030761669834, + "tokens_seen": 1295342592 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003068204613841525, + "loss": 2.826, + "theoretical_loss": 3.562486734503758, + "tokens_seen": 1295408128 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030681043129388166, + "loss": 2.7604, + "theoretical_loss": 3.5624703938987263, + "tokens_seen": 1295473664 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030680040120361084, + "loss": 2.8168, + "theoretical_loss": 3.5624540543517673, + "tokens_seen": 1295539200 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030679037111334, + "loss": 2.848, + "theoretical_loss": 3.5624377158627585, + "tokens_seen": 1295604736 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030678034102306925, + "loss": 2.8588, + "theoretical_loss": 3.5624213784315786, + "tokens_seen": 1295670272 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003067703109327984, + "loss": 2.7896, + "theoretical_loss": 3.5624050420581046, + "tokens_seen": 1295735808 + }, + { + "epoch": 15.02, + "learning_rate": 0.0003067602808425276, + "loss": 2.9102, + "theoretical_loss": 3.5623887067422153, + "tokens_seen": 1295801344 + }, + { + "epoch": 15.02, + "objective/train/docs_used": 3025408, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.847632884979248, + "objective/train/theoretical_loss": 3.562372372483788, + "objective/train/tokens_used": 1295905248, + "theoretical_loss": 3.562372372483788, + "tokens_seen": 1295866880 + }, + { + "epoch": 15.02, + "learning_rate": 0.00030675025075225674, + "loss": 2.804, + "theoretical_loss": 3.562372372483788, + "tokens_seen": 1295866880 + }, + { + "epoch": 15.02, + "learning_rate": 0.000306740220661986, + "loss": 2.952, + "theoretical_loss": 3.5623578256750754, + "tokens_seen": 1295925248 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030673019057171516, + "loss": 2.586, + "theoretical_loss": 3.5623414934155804, + "tokens_seen": 1295990784 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030672016048144434, + "loss": 2.7865, + "theoretical_loss": 3.562325162213195, + "tokens_seen": 1296056320 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003067101303911735, + "loss": 2.7031, + "theoretical_loss": 3.562308832067798, + "tokens_seen": 1296121856 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003067001003009027, + "loss": 2.785, + "theoretical_loss": 3.5622925029792674, + "tokens_seen": 1296187392 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003066900702106319, + "loss": 2.7095, + "theoretical_loss": 3.562276174947481, + "tokens_seen": 1296252928 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003066800401203611, + "loss": 2.6566, + "theoretical_loss": 3.5622598479723173, + "tokens_seen": 1296318464 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030667001003009025, + "loss": 2.8565, + "theoretical_loss": 3.562243522053655, + "tokens_seen": 1296384000 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003066599799398195, + "loss": 2.6802, + "theoretical_loss": 3.5622271971913717, + "tokens_seen": 1296449536 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030664994984954866, + "loss": 2.7915, + "theoretical_loss": 3.5622108733853457, + "tokens_seen": 1296515072 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030663991975927784, + "loss": 2.7282, + "theoretical_loss": 3.5621945506354553, + "tokens_seen": 1296580608 + }, + { + "epoch": 16.0, + "learning_rate": 0.000306629889669007, + "loss": 2.6747, + "theoretical_loss": 3.562178228941579, + "tokens_seen": 1296646144 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003066198595787362, + "loss": 2.771, + "theoretical_loss": 3.5621619083035947, + "tokens_seen": 1296711680 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003066098294884654, + "loss": 2.6418, + "theoretical_loss": 3.562145588721381, + "tokens_seen": 1296777216 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003065997993981946, + "loss": 2.732, + "theoretical_loss": 3.5621292701948164, + "tokens_seen": 1296842752 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030658976930792375, + "loss": 2.6755, + "theoretical_loss": 3.562112952723779, + "tokens_seen": 1296908288 + }, + { + "epoch": 16.0, + "learning_rate": 0.000306579739217653, + "loss": 2.7629, + "theoretical_loss": 3.5620966363081474, + "tokens_seen": 1296973824 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003065697091273821, + "loss": 2.7833, + "theoretical_loss": 3.5620803209478, + "tokens_seen": 1297039360 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030655967903711135, + "loss": 2.7381, + "theoretical_loss": 3.562064006642615, + "tokens_seen": 1297104896 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030654964894684053, + "loss": 2.6649, + "theoretical_loss": 3.562047693392471, + "tokens_seen": 1297170432 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003065396188565697, + "loss": 2.7841, + "theoretical_loss": 3.5620313811972464, + "tokens_seen": 1297235968 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003065295887662989, + "loss": 2.7028, + "theoretical_loss": 3.56201507005682, + "tokens_seen": 1297301504 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003065195586760281, + "loss": 2.7113, + "theoretical_loss": 3.56199875997107, + "tokens_seen": 1297367040 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030650952858575725, + "loss": 2.7665, + "theoretical_loss": 3.561982450939875, + "tokens_seen": 1297432576 + }, + { + "epoch": 16.0, + "objective/train/docs_used": 3077738, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5850837230682373, + "objective/train/theoretical_loss": 3.561966142963114, + "objective/train/tokens_used": 1317958112, + "theoretical_loss": 3.561966142963114, + "tokens_seen": 1297498112 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003064994984954865, + "loss": 2.679, + "theoretical_loss": 3.561966142963114, + "tokens_seen": 1297498112 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003064894684052156, + "loss": 2.7102, + "theoretical_loss": 3.5619498360406645, + "tokens_seen": 1297563648 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030647943831494485, + "loss": 2.7145, + "theoretical_loss": 3.561933530172406, + "tokens_seen": 1297629184 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030646940822467403, + "loss": 2.7333, + "theoretical_loss": 3.5619172253582168, + "tokens_seen": 1297694720 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003064593781344032, + "loss": 2.8264, + "theoretical_loss": 3.561900921597976, + "tokens_seen": 1297760256 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003064493480441324, + "loss": 2.7129, + "theoretical_loss": 3.561884618891561, + "tokens_seen": 1297825792 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003064393179538616, + "loss": 2.7901, + "theoretical_loss": 3.5618683172388526, + "tokens_seen": 1297891328 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030642928786359076, + "loss": 2.7394, + "theoretical_loss": 3.561852016639727, + "tokens_seen": 1297956864 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030641925777332, + "loss": 2.6605, + "theoretical_loss": 3.5618357170940644, + "tokens_seen": 1298022400 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003064092276830491, + "loss": 2.7155, + "theoretical_loss": 3.561819418601744, + "tokens_seen": 1298087936 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030639919759277835, + "loss": 2.803, + "theoretical_loss": 3.561803121162643, + "tokens_seen": 1298153472 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003063891675025075, + "loss": 2.7465, + "theoretical_loss": 3.5617868247766413, + "tokens_seen": 1298219008 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003063791374122367, + "loss": 2.6554, + "theoretical_loss": 3.561770529443617, + "tokens_seen": 1298284544 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003063691073219659, + "loss": 2.8117, + "theoretical_loss": 3.56175423516345, + "tokens_seen": 1298350080 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003063590772316951, + "loss": 2.7293, + "theoretical_loss": 3.561737941936018, + "tokens_seen": 1298415616 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030634904714142426, + "loss": 2.7808, + "theoretical_loss": 3.5617216497612003, + "tokens_seen": 1298481152 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003063390170511535, + "loss": 2.852, + "theoretical_loss": 3.561705358638876, + "tokens_seen": 1298546688 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003063289869608826, + "loss": 2.7237, + "theoretical_loss": 3.5616890685689233, + "tokens_seen": 1298612224 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030631895687061186, + "loss": 2.7754, + "theoretical_loss": 3.561672779551222, + "tokens_seen": 1298677760 + }, + { + "epoch": 16.0, + "learning_rate": 0.000306308926780341, + "loss": 2.7696, + "theoretical_loss": 3.5616564915856506, + "tokens_seen": 1298743296 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003062988966900702, + "loss": 2.7248, + "theoretical_loss": 3.561640204672088, + "tokens_seen": 1298808832 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030628886659979945, + "loss": 2.6247, + "theoretical_loss": 3.5616239188104126, + "tokens_seen": 1298874368 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003062788365095286, + "loss": 2.6912, + "theoretical_loss": 3.561607634000505, + "tokens_seen": 1298939904 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003062688064192578, + "loss": 2.8449, + "theoretical_loss": 3.5615913502422427, + "tokens_seen": 1299005440 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030625877632898694, + "loss": 2.7937, + "theoretical_loss": 3.5615750675355056, + "tokens_seen": 1299070976 + }, + { + "epoch": 16.0, + "objective/train/docs_used": 3080817, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6446311473846436, + "objective/train/theoretical_loss": 3.5615587858801723, + "objective/train/tokens_used": 1319596512, + "theoretical_loss": 3.5615587858801723, + "tokens_seen": 1299136512 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003062487462387162, + "loss": 2.7898, + "theoretical_loss": 3.5615587858801723, + "tokens_seen": 1299136512 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030623871614844536, + "loss": 2.7654, + "theoretical_loss": 3.5615425052761216, + "tokens_seen": 1299202048 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030622868605817454, + "loss": 2.8654, + "theoretical_loss": 3.5615262257232336, + "tokens_seen": 1299267584 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003062186559679037, + "loss": 2.683, + "theoretical_loss": 3.5615099472213867, + "tokens_seen": 1299333120 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003062086258776329, + "loss": 2.6905, + "theoretical_loss": 3.56149366977046, + "tokens_seen": 1299398656 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003061985957873621, + "loss": 2.7332, + "theoretical_loss": 3.5614773933703328, + "tokens_seen": 1299464192 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003061885656970913, + "loss": 2.7622, + "theoretical_loss": 3.561461118020884, + "tokens_seen": 1299529728 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030617853560682045, + "loss": 2.8302, + "theoretical_loss": 3.561444843721994, + "tokens_seen": 1299595264 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003061685055165497, + "loss": 2.721, + "theoretical_loss": 3.5614285704735407, + "tokens_seen": 1299660800 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030615847542627886, + "loss": 2.6939, + "theoretical_loss": 3.5614122982754033, + "tokens_seen": 1299726336 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030614844533600804, + "loss": 2.7342, + "theoretical_loss": 3.561396027127462, + "tokens_seen": 1299791872 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003061384152457372, + "loss": 2.7942, + "theoretical_loss": 3.561379757029595, + "tokens_seen": 1299857408 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003061283851554664, + "loss": 2.7028, + "theoretical_loss": 3.5613634879816827, + "tokens_seen": 1299922944 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003061183550651956, + "loss": 2.7714, + "theoretical_loss": 3.5613472199836034, + "tokens_seen": 1299988480 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003061083249749248, + "loss": 2.7581, + "theoretical_loss": 3.5613309530352373, + "tokens_seen": 1300054016 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030609829488465395, + "loss": 2.6621, + "theoretical_loss": 3.5613146871364636, + "tokens_seen": 1300119552 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003060882647943832, + "loss": 2.7591, + "theoretical_loss": 3.5612984222871606, + "tokens_seen": 1300185088 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003060782347041123, + "loss": 2.8136, + "theoretical_loss": 3.5612821584872094, + "tokens_seen": 1300250624 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030606820461384155, + "loss": 2.7197, + "theoretical_loss": 3.561265895736488, + "tokens_seen": 1300316160 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030605817452357073, + "loss": 2.7283, + "theoretical_loss": 3.5612496340348767, + "tokens_seen": 1300381696 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003060481444332999, + "loss": 2.7377, + "theoretical_loss": 3.5612333733822545, + "tokens_seen": 1300447232 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003060381143430291, + "loss": 2.7404, + "theoretical_loss": 3.5612171137785014, + "tokens_seen": 1300512768 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003060280842527583, + "loss": 2.7502, + "theoretical_loss": 3.561200855223496, + "tokens_seen": 1300578304 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030601805416248745, + "loss": 2.7684, + "theoretical_loss": 3.5611845977171184, + "tokens_seen": 1300643840 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003060080240722167, + "loss": 2.7733, + "theoretical_loss": 3.561168341259248, + "tokens_seen": 1300709376 + }, + { + "epoch": 16.0, + "objective/train/docs_used": 3085855, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8498191833496094, + "objective/train/theoretical_loss": 3.5611520858497645, + "objective/train/tokens_used": 1321234912, + "theoretical_loss": 3.5611520858497645, + "tokens_seen": 1300774912 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003059979939819458, + "loss": 2.8121, + "theoretical_loss": 3.5611520858497645, + "tokens_seen": 1300774912 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030598796389167505, + "loss": 2.7324, + "theoretical_loss": 3.5611358314885475, + "tokens_seen": 1300840448 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030597793380140423, + "loss": 2.7359, + "theoretical_loss": 3.561119578175476, + "tokens_seen": 1300905984 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003059679037111334, + "loss": 2.6895, + "theoretical_loss": 3.5611033259104303, + "tokens_seen": 1300971520 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003059578736208626, + "loss": 2.5973, + "theoretical_loss": 3.5610870746932903, + "tokens_seen": 1301037056 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003059478435305918, + "loss": 2.7225, + "theoretical_loss": 3.561070824523935, + "tokens_seen": 1301102592 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030593781344032096, + "loss": 2.6669, + "theoretical_loss": 3.561054575402244, + "tokens_seen": 1301168128 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003059277833500502, + "loss": 2.6586, + "theoretical_loss": 3.5610383273280974, + "tokens_seen": 1301233664 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003059177532597793, + "loss": 2.6182, + "theoretical_loss": 3.561022080301375, + "tokens_seen": 1301299200 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030590772316950855, + "loss": 2.8041, + "theoretical_loss": 3.5610058343219557, + "tokens_seen": 1301364736 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003058976930792377, + "loss": 2.6175, + "theoretical_loss": 3.5609895893897203, + "tokens_seen": 1301430272 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003058876629889669, + "loss": 2.7379, + "theoretical_loss": 3.5609733455045482, + "tokens_seen": 1301495808 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003058776328986961, + "loss": 2.7317, + "theoretical_loss": 3.5609571026663183, + "tokens_seen": 1301561344 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003058676028084253, + "loss": 2.8001, + "theoretical_loss": 3.5609408608749122, + "tokens_seen": 1301626880 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030585757271815446, + "loss": 2.7002, + "theoretical_loss": 3.5609246201302085, + "tokens_seen": 1301692416 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003058475426278837, + "loss": 2.741, + "theoretical_loss": 3.560908380432087, + "tokens_seen": 1301757952 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003058375125376128, + "loss": 2.8005, + "theoretical_loss": 3.5608921417804282, + "tokens_seen": 1301823488 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030582748244734206, + "loss": 2.6867, + "theoretical_loss": 3.560875904175111, + "tokens_seen": 1301889024 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003058174523570712, + "loss": 2.6666, + "theoretical_loss": 3.560859667616017, + "tokens_seen": 1301954560 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003058074222668004, + "loss": 2.8191, + "theoretical_loss": 3.5608434321030247, + "tokens_seen": 1302020096 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003057973921765296, + "loss": 2.8622, + "theoretical_loss": 3.560827197636014, + "tokens_seen": 1302085632 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003057873620862588, + "loss": 2.6545, + "theoretical_loss": 3.560810964214866, + "tokens_seen": 1302151168 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030577733199598796, + "loss": 2.7619, + "theoretical_loss": 3.5607947318394597, + "tokens_seen": 1302216704 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030576730190571714, + "loss": 2.7793, + "theoretical_loss": 3.5607785005096755, + "tokens_seen": 1302282240 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003057572718154463, + "loss": 2.7765, + "theoretical_loss": 3.5607622702253936, + "tokens_seen": 1302347776 + }, + { + "epoch": 16.0, + "objective/train/docs_used": 3088723, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.690802812576294, + "objective/train/theoretical_loss": 3.5607460409864933, + "objective/train/tokens_used": 1322873312, + "theoretical_loss": 3.5607460409864933, + "tokens_seen": 1302413312 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030574724172517556, + "loss": 2.6518, + "theoretical_loss": 3.5607460409864933, + "tokens_seen": 1302413312 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003057372116349047, + "loss": 2.698, + "theoretical_loss": 3.5607298127928555, + "tokens_seen": 1302478848 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003057271815446339, + "loss": 2.8155, + "theoretical_loss": 3.5607135856443604, + "tokens_seen": 1302544384 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030571715145436305, + "loss": 2.8079, + "theoretical_loss": 3.560697359540887, + "tokens_seen": 1302609920 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003057071213640923, + "loss": 2.7979, + "theoretical_loss": 3.560681134482316, + "tokens_seen": 1302675456 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030569709127382147, + "loss": 2.6666, + "theoretical_loss": 3.560664910468528, + "tokens_seen": 1302740992 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030568706118355065, + "loss": 2.6482, + "theoretical_loss": 3.560648687499403, + "tokens_seen": 1302806528 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030567703109327983, + "loss": 2.73, + "theoretical_loss": 3.5606324655748214, + "tokens_seen": 1302872064 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030566700100300906, + "loss": 2.6321, + "theoretical_loss": 3.5606162446946623, + "tokens_seen": 1302937600 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003056569709127382, + "loss": 2.6703, + "theoretical_loss": 3.5606000248588066, + "tokens_seen": 1303003136 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003056469408224674, + "loss": 2.8161, + "theoretical_loss": 3.560583806067135, + "tokens_seen": 1303068672 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030563691073219655, + "loss": 2.7905, + "theoretical_loss": 3.5605675883195276, + "tokens_seen": 1303134208 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003056268806419258, + "loss": 2.7344, + "theoretical_loss": 3.560551371615864, + "tokens_seen": 1303199744 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030561685055165497, + "loss": 2.6391, + "theoretical_loss": 3.560535155956025, + "tokens_seen": 1303265280 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030560682046138415, + "loss": 2.6905, + "theoretical_loss": 3.5605189413398914, + "tokens_seen": 1303330816 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030559679037111333, + "loss": 2.7066, + "theoretical_loss": 3.560502727767343, + "tokens_seen": 1303396352 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003055867602808425, + "loss": 2.6735, + "theoretical_loss": 3.560486515238259, + "tokens_seen": 1303461888 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003055767301905717, + "loss": 2.7852, + "theoretical_loss": 3.5604703037525223, + "tokens_seen": 1303527424 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030556670010030093, + "loss": 2.8826, + "theoretical_loss": 3.5604540933100117, + "tokens_seen": 1303592960 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030555667001003006, + "loss": 2.7556, + "theoretical_loss": 3.560437883910608, + "tokens_seen": 1303658496 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003055466399197593, + "loss": 2.7732, + "theoretical_loss": 3.560421675554191, + "tokens_seen": 1303724032 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003055366098294885, + "loss": 2.7033, + "theoretical_loss": 3.560405468240643, + "tokens_seen": 1303789568 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030552657973921765, + "loss": 2.7808, + "theoretical_loss": 3.560389261969842, + "tokens_seen": 1303855104 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003055165496489469, + "loss": 2.744, + "theoretical_loss": 3.5603730567416703, + "tokens_seen": 1303920640 + }, + { + "epoch": 16.0, + "learning_rate": 0.000305506519558676, + "loss": 2.8796, + "theoretical_loss": 3.5603568525560076, + "tokens_seen": 1303986176 + }, + { + "epoch": 16.0, + "objective/train/docs_used": 3092391, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8484227657318115, + "objective/train/theoretical_loss": 3.560340649412735, + "objective/train/tokens_used": 1324511712, + "theoretical_loss": 3.560340649412735, + "tokens_seen": 1304051712 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030549648946840525, + "loss": 2.7585, + "theoretical_loss": 3.560340649412735, + "tokens_seen": 1304051712 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030548645937813443, + "loss": 2.7166, + "theoretical_loss": 3.5603244473117326, + "tokens_seen": 1304117248 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003054764292878636, + "loss": 2.7642, + "theoretical_loss": 3.5603082462528812, + "tokens_seen": 1304182784 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003054663991975928, + "loss": 2.6775, + "theoretical_loss": 3.560292046236061, + "tokens_seen": 1304248320 + }, + { + "epoch": 16.0, + "learning_rate": 0.000305456369107322, + "loss": 2.7205, + "theoretical_loss": 3.5602758472611535, + "tokens_seen": 1304313856 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030544633901705116, + "loss": 2.7633, + "theoretical_loss": 3.5602596493280387, + "tokens_seen": 1304379392 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003054363089267804, + "loss": 2.8372, + "theoretical_loss": 3.560243452436597, + "tokens_seen": 1304444928 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003054262788365095, + "loss": 2.6963, + "theoretical_loss": 3.5602272565867095, + "tokens_seen": 1304510464 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030541624874623875, + "loss": 2.8123, + "theoretical_loss": 3.5602110617782574, + "tokens_seen": 1304576000 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003054062186559679, + "loss": 2.7158, + "theoretical_loss": 3.560194868011121, + "tokens_seen": 1304641536 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003053961885656971, + "loss": 2.7438, + "theoretical_loss": 3.5601786752851803, + "tokens_seen": 1304707072 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003053861584754263, + "loss": 2.6398, + "theoretical_loss": 3.5601624836003167, + "tokens_seen": 1304772608 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003053761283851555, + "loss": 2.7168, + "theoretical_loss": 3.560146292956411, + "tokens_seen": 1304838144 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030536609829488466, + "loss": 2.7181, + "theoretical_loss": 3.5601301033533446, + "tokens_seen": 1304903680 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003053560682046139, + "loss": 2.6969, + "theoretical_loss": 3.560113914790997, + "tokens_seen": 1304969216 + }, + { + "epoch": 16.0, + "learning_rate": 0.000305346038114343, + "loss": 2.7339, + "theoretical_loss": 3.56009772726925, + "tokens_seen": 1305034752 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030533600802407226, + "loss": 2.7668, + "theoretical_loss": 3.560081540787984, + "tokens_seen": 1305100288 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003053259779338014, + "loss": 2.7437, + "theoretical_loss": 3.5600653553470805, + "tokens_seen": 1305165824 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003053159478435306, + "loss": 2.7011, + "theoretical_loss": 3.5600491709464195, + "tokens_seen": 1305231360 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003053059177532598, + "loss": 2.8058, + "theoretical_loss": 3.5600329875858825, + "tokens_seen": 1305296896 + }, + { + "epoch": 16.0, + "learning_rate": 0.000305295887662989, + "loss": 2.7481, + "theoretical_loss": 3.5600168052653505, + "tokens_seen": 1305362432 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030528585757271816, + "loss": 2.6496, + "theoretical_loss": 3.560000623984704, + "tokens_seen": 1305427968 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030527582748244734, + "loss": 2.8004, + "theoretical_loss": 3.5599844437438244, + "tokens_seen": 1305493504 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003052657973921765, + "loss": 2.8298, + "theoretical_loss": 3.5599682645425927, + "tokens_seen": 1305559040 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030525576730190576, + "loss": 2.7868, + "theoretical_loss": 3.559952086380889, + "tokens_seen": 1305624576 + }, + { + "epoch": 16.0, + "objective/train/docs_used": 3097467, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8917949199676514, + "objective/train/theoretical_loss": 3.559935909258596, + "objective/train/tokens_used": 1326150112, + "theoretical_loss": 3.559935909258596, + "tokens_seen": 1305690112 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003052457372116349, + "loss": 2.7869, + "theoretical_loss": 3.559935909258596, + "tokens_seen": 1305690112 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003052357071213641, + "loss": 2.8679, + "theoretical_loss": 3.5599197331755934, + "tokens_seen": 1305755648 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030522567703109325, + "loss": 2.8049, + "theoretical_loss": 3.559903558131763, + "tokens_seen": 1305821184 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003052156469408225, + "loss": 2.7404, + "theoretical_loss": 3.559887384126985, + "tokens_seen": 1305886720 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030520561685055167, + "loss": 2.7047, + "theoretical_loss": 3.559871211161142, + "tokens_seen": 1305952256 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030519558676028085, + "loss": 2.7322, + "theoretical_loss": 3.5598550392341135, + "tokens_seen": 1306017792 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030518555667001003, + "loss": 2.8606, + "theoretical_loss": 3.559838868345782, + "tokens_seen": 1306083328 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030517552657973926, + "loss": 2.8017, + "theoretical_loss": 3.5598226984960277, + "tokens_seen": 1306148864 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003051654964894684, + "loss": 2.6949, + "theoretical_loss": 3.5598065296847325, + "tokens_seen": 1306214400 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003051554663991976, + "loss": 2.8151, + "theoretical_loss": 3.559790361911777, + "tokens_seen": 1306279936 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030514543630892675, + "loss": 2.7606, + "theoretical_loss": 3.559774195177043, + "tokens_seen": 1306345472 + }, + { + "epoch": 16.0, + "learning_rate": 0.000305135406218656, + "loss": 2.7893, + "theoretical_loss": 3.5597580294804114, + "tokens_seen": 1306411008 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030512537612838517, + "loss": 2.803, + "theoretical_loss": 3.5597418648217634, + "tokens_seen": 1306476544 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030511534603811435, + "loss": 2.7469, + "theoretical_loss": 3.55972570120098, + "tokens_seen": 1306542080 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030510531594784353, + "loss": 2.7255, + "theoretical_loss": 3.559709538617944, + "tokens_seen": 1306607616 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003050952858575727, + "loss": 2.6099, + "theoretical_loss": 3.559693377072535, + "tokens_seen": 1306673152 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003050852557673019, + "loss": 2.6961, + "theoretical_loss": 3.5596772165646353, + "tokens_seen": 1306738688 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030507522567703113, + "loss": 2.7245, + "theoretical_loss": 3.559661057094125, + "tokens_seen": 1306804224 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030506519558676026, + "loss": 2.7759, + "theoretical_loss": 3.5596448986608875, + "tokens_seen": 1306869760 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003050551654964895, + "loss": 2.7616, + "theoretical_loss": 3.5596287412648033, + "tokens_seen": 1306935296 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003050451354062186, + "loss": 2.7805, + "theoretical_loss": 3.559612584905753, + "tokens_seen": 1307000832 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030503510531594785, + "loss": 2.8722, + "theoretical_loss": 3.5595964295836193, + "tokens_seen": 1307066368 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030502507522567703, + "loss": 2.7361, + "theoretical_loss": 3.559580275298283, + "tokens_seen": 1307131904 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003050150451354062, + "loss": 2.8155, + "theoretical_loss": 3.5595641220496255, + "tokens_seen": 1307197440 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003050050150451354, + "loss": 2.8248, + "theoretical_loss": 3.5595479698375287, + "tokens_seen": 1307262976 + }, + { + "epoch": 16.0, + "objective/train/docs_used": 3100267, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.442708969116211, + "objective/train/theoretical_loss": 3.5595318186618736, + "objective/train/tokens_used": 1327788512, + "theoretical_loss": 3.5595318186618736, + "tokens_seen": 1307328512 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030499498495486463, + "loss": 2.7199, + "theoretical_loss": 3.5595318186618736, + "tokens_seen": 1307328512 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030498495486459376, + "loss": 2.8009, + "theoretical_loss": 3.5595156685225424, + "tokens_seen": 1307394048 + }, + { + "epoch": 16.0, + "learning_rate": 0.000304974924774323, + "loss": 2.7478, + "theoretical_loss": 3.559499519419416, + "tokens_seen": 1307459584 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003049648946840521, + "loss": 2.8381, + "theoretical_loss": 3.5594833713523766, + "tokens_seen": 1307525120 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030495486459378136, + "loss": 2.8505, + "theoretical_loss": 3.5594672243213052, + "tokens_seen": 1307590656 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030494483450351054, + "loss": 2.6977, + "theoretical_loss": 3.5594510783260844, + "tokens_seen": 1307656192 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003049348044132397, + "loss": 2.7045, + "theoretical_loss": 3.5594349333665947, + "tokens_seen": 1307721728 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003049247743229689, + "loss": 2.812, + "theoretical_loss": 3.5594187894427183, + "tokens_seen": 1307787264 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003049147442326981, + "loss": 2.6882, + "theoretical_loss": 3.5594026465543367, + "tokens_seen": 1307852800 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030490471414242726, + "loss": 2.7937, + "theoretical_loss": 3.5593865047013318, + "tokens_seen": 1307918336 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003048946840521565, + "loss": 2.6994, + "theoretical_loss": 3.559370363883585, + "tokens_seen": 1307983872 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003048846539618856, + "loss": 2.8551, + "theoretical_loss": 3.5593542241009786, + "tokens_seen": 1308049408 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030487462387161486, + "loss": 2.882, + "theoretical_loss": 3.559338085353394, + "tokens_seen": 1308114944 + }, + { + "epoch": 16.0, + "learning_rate": 0.000304864593781344, + "loss": 2.7987, + "theoretical_loss": 3.5593219476407127, + "tokens_seen": 1308180480 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003048545636910732, + "loss": 2.7878, + "theoretical_loss": 3.559305810962817, + "tokens_seen": 1308246016 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003048445336008024, + "loss": 2.7768, + "theoretical_loss": 3.5592896753195884, + "tokens_seen": 1308311552 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003048345035105316, + "loss": 2.7835, + "theoretical_loss": 3.559273540710909, + "tokens_seen": 1308377088 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030482447342026076, + "loss": 2.7494, + "theoretical_loss": 3.559257407136661, + "tokens_seen": 1308442624 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030481444332999, + "loss": 2.7376, + "theoretical_loss": 3.559241274596725, + "tokens_seen": 1308508160 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003048044132397192, + "loss": 2.7936, + "theoretical_loss": 3.559225143090984, + "tokens_seen": 1308573696 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030479438314944836, + "loss": 2.7636, + "theoretical_loss": 3.559209012619319, + "tokens_seen": 1308639232 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030478435305917754, + "loss": 2.6919, + "theoretical_loss": 3.5591928831816135, + "tokens_seen": 1308704768 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003047743229689067, + "loss": 2.7183, + "theoretical_loss": 3.5591767547777478, + "tokens_seen": 1308770304 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030476429287863596, + "loss": 2.708, + "theoretical_loss": 3.559160627407605, + "tokens_seen": 1308835840 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003047542627883651, + "loss": 2.7869, + "theoretical_loss": 3.5591445010710663, + "tokens_seen": 1308901376 + }, + { + "epoch": 16.0, + "objective/train/docs_used": 3105189, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8958349227905273, + "objective/train/theoretical_loss": 3.5591283757680134, + "objective/train/tokens_used": 1329426912, + "theoretical_loss": 3.5591283757680134, + "tokens_seen": 1308966912 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003047442326980943, + "loss": 2.7566, + "theoretical_loss": 3.5591283757680134, + "tokens_seen": 1308966912 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030473420260782345, + "loss": 2.822, + "theoretical_loss": 3.55911225149833, + "tokens_seen": 1309032448 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003047241725175527, + "loss": 2.6447, + "theoretical_loss": 3.5590961282618965, + "tokens_seen": 1309097984 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030471414242728187, + "loss": 2.7188, + "theoretical_loss": 3.559080006058596, + "tokens_seen": 1309163520 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030470411233701105, + "loss": 2.6745, + "theoretical_loss": 3.55906388488831, + "tokens_seen": 1309229056 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030469408224674023, + "loss": 2.8291, + "theoretical_loss": 3.559047764750921, + "tokens_seen": 1309294592 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030468405215646946, + "loss": 2.7013, + "theoretical_loss": 3.5590316456463102, + "tokens_seen": 1309360128 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003046740220661986, + "loss": 2.858, + "theoretical_loss": 3.5590155275743607, + "tokens_seen": 1309425664 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003046639919759278, + "loss": 2.7232, + "theoretical_loss": 3.5589994105349545, + "tokens_seen": 1309491200 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030465396188565695, + "loss": 2.7578, + "theoretical_loss": 3.558983294527974, + "tokens_seen": 1309556736 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003046439317953862, + "loss": 2.8136, + "theoretical_loss": 3.5589671795533007, + "tokens_seen": 1309622272 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030463390170511537, + "loss": 2.876, + "theoretical_loss": 3.5589510656108168, + "tokens_seen": 1309687808 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030462387161484455, + "loss": 2.7829, + "theoretical_loss": 3.558934952700406, + "tokens_seen": 1309753344 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030461384152457373, + "loss": 2.8111, + "theoretical_loss": 3.5589188408219488, + "tokens_seen": 1309818880 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003046038114343029, + "loss": 2.7859, + "theoretical_loss": 3.5589027299753284, + "tokens_seen": 1309884416 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003045937813440321, + "loss": 2.7941, + "theoretical_loss": 3.558886620160427, + "tokens_seen": 1309949952 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030458375125376133, + "loss": 2.6661, + "theoretical_loss": 3.558870511377126, + "tokens_seen": 1310015488 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030457372116349046, + "loss": 2.685, + "theoretical_loss": 3.5588544036253094, + "tokens_seen": 1310081024 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003045636910732197, + "loss": 2.7181, + "theoretical_loss": 3.5588382969048586, + "tokens_seen": 1310146560 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003045536609829488, + "loss": 2.768, + "theoretical_loss": 3.558822191215656, + "tokens_seen": 1310212096 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030454363089267805, + "loss": 2.8496, + "theoretical_loss": 3.558806086557584, + "tokens_seen": 1310277632 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030453360080240723, + "loss": 2.7217, + "theoretical_loss": 3.5587899829305254, + "tokens_seen": 1310343168 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003045235707121364, + "loss": 2.7006, + "theoretical_loss": 3.5587738803343623, + "tokens_seen": 1310408704 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003045135406218656, + "loss": 2.7023, + "theoretical_loss": 3.558757778768977, + "tokens_seen": 1310474240 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030450351053159483, + "loss": 2.7476, + "theoretical_loss": 3.558741678234252, + "tokens_seen": 1310539776 + }, + { + "debugging/Self-BLEU-5": 0.670252562645695, + "debugging/distinct-1-grams": 0.7554618913210209, + "debugging/distinct-2-grams": 0.9529359101605587, + "debugging/entropy-1-grams": 6.417847252368519, + "debugging/entropy-2-grams": 7.740590445613455, + "debugging/length": 498.4054054054054, + "debugging/num_segments": 37, + "epoch": 16.0, + "objective/train/docs_used": 3108007, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7862226963043213, + "objective/train/theoretical_loss": 3.55872557873007, + "objective/train/tokens_used": 1331065312, + "theoretical_loss": 3.55872557873007, + "tokens_seen": 1310605312 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030449348044132396, + "loss": 2.748, + "theoretical_loss": 3.55872557873007, + "tokens_seen": 1310605312 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003044834503510532, + "loss": 2.8057, + "theoretical_loss": 3.5587094802563133, + "tokens_seen": 1310670848 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003044734202607823, + "loss": 2.7524, + "theoretical_loss": 3.558693382812865, + "tokens_seen": 1310736384 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030446339017051156, + "loss": 2.6718, + "theoretical_loss": 3.5586772863996066, + "tokens_seen": 1310801920 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030445336008024074, + "loss": 2.7223, + "theoretical_loss": 3.558661191016422, + "tokens_seen": 1310867456 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003044433299899699, + "loss": 2.7069, + "theoretical_loss": 3.558645096663193, + "tokens_seen": 1310932992 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003044332998996991, + "loss": 2.7547, + "theoretical_loss": 3.558629003339802, + "tokens_seen": 1310998528 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003044232698094283, + "loss": 2.7758, + "theoretical_loss": 3.558612911046132, + "tokens_seen": 1311064064 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030441323971915746, + "loss": 2.7945, + "theoretical_loss": 3.558596819782066, + "tokens_seen": 1311129600 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003044032096288867, + "loss": 2.6969, + "theoretical_loss": 3.5585807295474856, + "tokens_seen": 1311195136 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003043931795386158, + "loss": 2.7534, + "theoretical_loss": 3.5585646403422744, + "tokens_seen": 1311260672 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030438314944834506, + "loss": 2.7765, + "theoretical_loss": 3.5585485521663145, + "tokens_seen": 1311326208 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003043731193580742, + "loss": 2.7636, + "theoretical_loss": 3.5585324650194896, + "tokens_seen": 1311391744 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003043630892678034, + "loss": 2.7149, + "theoretical_loss": 3.5585163789016816, + "tokens_seen": 1311457280 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003043530591775326, + "loss": 2.6971, + "theoretical_loss": 3.5585002938127737, + "tokens_seen": 1311522816 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003043430290872618, + "loss": 2.6809, + "theoretical_loss": 3.5584842097526477, + "tokens_seen": 1311588352 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030433299899699097, + "loss": 2.7448, + "theoretical_loss": 3.5584681267211877, + "tokens_seen": 1311653888 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003043229689067202, + "loss": 2.7812, + "theoretical_loss": 3.5584520447182757, + "tokens_seen": 1311719424 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030431293881644933, + "loss": 2.7824, + "theoretical_loss": 3.5584359637437952, + "tokens_seen": 1311784960 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030430290872617856, + "loss": 2.8069, + "theoretical_loss": 3.5584198837976277, + "tokens_seen": 1311850496 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003042928786359077, + "loss": 2.7836, + "theoretical_loss": 3.5584038048796582, + "tokens_seen": 1311916032 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003042828485456369, + "loss": 2.7364, + "theoretical_loss": 3.5583877269897677, + "tokens_seen": 1311981568 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003042728184553661, + "loss": 2.7337, + "theoretical_loss": 3.55837165012784, + "tokens_seen": 1312047104 + }, + { + "epoch": 16.0, + "learning_rate": 0.0003042627883650953, + "loss": 2.778, + "theoretical_loss": 3.558355574293758, + "tokens_seen": 1312112640 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030425275827482447, + "loss": 2.7279, + "theoretical_loss": 3.5583394994874045, + "tokens_seen": 1312178176 + }, + { + "epoch": 16.0, + "objective/train/docs_used": 3111786, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.653829574584961, + "objective/train/theoretical_loss": 3.5583234257086627, + "objective/train/tokens_used": 1332703712, + "theoretical_loss": 3.5583234257086627, + "tokens_seen": 1312243712 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030424272818455365, + "loss": 2.6906, + "theoretical_loss": 3.5583234257086627, + "tokens_seen": 1312243712 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030423269809428283, + "loss": 2.6874, + "theoretical_loss": 3.558307352957415, + "tokens_seen": 1312309248 + }, + { + "epoch": 16.0, + "learning_rate": 0.00030422266800401207, + "loss": 2.7691, + "theoretical_loss": 3.558291281233545, + "tokens_seen": 1312374784 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003042126379137412, + "loss": 2.8442, + "theoretical_loss": 3.5582752105369355, + "tokens_seen": 1312440320 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030420260782347043, + "loss": 2.6677, + "theoretical_loss": 3.5582591408674697, + "tokens_seen": 1312505856 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003041925777331996, + "loss": 2.7499, + "theoretical_loss": 3.5582430722250304, + "tokens_seen": 1312571392 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003041825476429288, + "loss": 2.8231, + "theoretical_loss": 3.558227004609501, + "tokens_seen": 1312636928 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030417251755265797, + "loss": 2.7926, + "theoretical_loss": 3.558210938020764, + "tokens_seen": 1312702464 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030416248746238715, + "loss": 2.7476, + "theoretical_loss": 3.558194872458704, + "tokens_seen": 1312768000 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030415245737211633, + "loss": 2.7561, + "theoretical_loss": 3.558178807923202, + "tokens_seen": 1312833536 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030414242728184557, + "loss": 2.7384, + "theoretical_loss": 3.558162744414143, + "tokens_seen": 1312899072 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003041323971915747, + "loss": 2.7122, + "theoretical_loss": 3.558146681931409, + "tokens_seen": 1312964608 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030412236710130393, + "loss": 2.7222, + "theoretical_loss": 3.558130620474884, + "tokens_seen": 1313030144 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030411233701103306, + "loss": 2.8209, + "theoretical_loss": 3.5581145600444506, + "tokens_seen": 1313095680 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003041023069207623, + "loss": 2.8255, + "theoretical_loss": 3.558098500639993, + "tokens_seen": 1313161216 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003040922768304915, + "loss": 2.6425, + "theoretical_loss": 3.558082442261393, + "tokens_seen": 1313226752 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030408224674022066, + "loss": 2.8278, + "theoretical_loss": 3.558066384908535, + "tokens_seen": 1313292288 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030407221664994984, + "loss": 2.8051, + "theoretical_loss": 3.5580503285813023, + "tokens_seen": 1313357824 + }, + { + "epoch": 16.01, + "learning_rate": 0.000304062186559679, + "loss": 2.7002, + "theoretical_loss": 3.5580342732795773, + "tokens_seen": 1313423360 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030405215646940825, + "loss": 2.614, + "theoretical_loss": 3.5580182190032446, + "tokens_seen": 1313488896 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030404212637913743, + "loss": 2.7651, + "theoretical_loss": 3.5580021657521863, + "tokens_seen": 1313554432 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003040320962888666, + "loss": 2.7068, + "theoretical_loss": 3.5579861135262867, + "tokens_seen": 1313619968 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003040220661985958, + "loss": 2.7667, + "theoretical_loss": 3.5579700623254285, + "tokens_seen": 1313685504 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030401203610832503, + "loss": 2.8027, + "theoretical_loss": 3.5579540121494952, + "tokens_seen": 1313751040 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030400200601805416, + "loss": 2.7085, + "theoretical_loss": 3.5579379629983716, + "tokens_seen": 1313816576 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3116546, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.705919027328491, + "objective/train/theoretical_loss": 3.5579219148719394, + "objective/train/tokens_used": 1334342112, + "theoretical_loss": 3.5579219148719394, + "tokens_seen": 1313882112 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003039919759277834, + "loss": 2.7788, + "theoretical_loss": 3.5579219148719394, + "tokens_seen": 1313882112 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003039819458375125, + "loss": 2.7327, + "theoretical_loss": 3.5579058677700823, + "tokens_seen": 1313947648 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030397191574724176, + "loss": 2.8074, + "theoretical_loss": 3.557889821692685, + "tokens_seen": 1314013184 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030396188565697094, + "loss": 2.7581, + "theoretical_loss": 3.55787377663963, + "tokens_seen": 1314078720 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003039518555667001, + "loss": 2.7433, + "theoretical_loss": 3.5578577326108007, + "tokens_seen": 1314144256 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003039418254764293, + "loss": 2.7726, + "theoretical_loss": 3.557841689606081, + "tokens_seen": 1314209792 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003039317953861585, + "loss": 2.6724, + "theoretical_loss": 3.5578256476253545, + "tokens_seen": 1314275328 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030392176529588766, + "loss": 2.8021, + "theoretical_loss": 3.557809606668505, + "tokens_seen": 1314340864 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003039117352056169, + "loss": 2.7025, + "theoretical_loss": 3.5577935667354157, + "tokens_seen": 1314406400 + }, + { + "epoch": 16.01, + "learning_rate": 0.000303901705115346, + "loss": 2.7315, + "theoretical_loss": 3.5577775278259702, + "tokens_seen": 1314471936 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030389167502507526, + "loss": 2.7455, + "theoretical_loss": 3.5577614899400523, + "tokens_seen": 1314537472 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003038816449348044, + "loss": 2.7661, + "theoretical_loss": 3.5577454530775463, + "tokens_seen": 1314603008 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003038716148445336, + "loss": 2.7328, + "theoretical_loss": 3.5577294172383347, + "tokens_seen": 1314668544 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003038615847542628, + "loss": 2.833, + "theoretical_loss": 3.5577133824223015, + "tokens_seen": 1314734080 + }, + { + "epoch": 16.01, + "learning_rate": 0.000303851554663992, + "loss": 2.6749, + "theoretical_loss": 3.5576973486293313, + "tokens_seen": 1314799616 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030384152457372117, + "loss": 2.8222, + "theoretical_loss": 3.557681315859307, + "tokens_seen": 1314865152 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003038314944834504, + "loss": 2.8341, + "theoretical_loss": 3.557665284112112, + "tokens_seen": 1314930688 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030382146439317953, + "loss": 2.8159, + "theoretical_loss": 3.5576492533876314, + "tokens_seen": 1314996224 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030381143430290876, + "loss": 2.7072, + "theoretical_loss": 3.5576332236857477, + "tokens_seen": 1315061760 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003038014042126379, + "loss": 2.7206, + "theoretical_loss": 3.5576171950063458, + "tokens_seen": 1315127296 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003037913741223671, + "loss": 2.7359, + "theoretical_loss": 3.557601167349308, + "tokens_seen": 1315192832 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003037813440320963, + "loss": 2.8371, + "theoretical_loss": 3.55758514071452, + "tokens_seen": 1315258368 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003037713139418255, + "loss": 2.7246, + "theoretical_loss": 3.5575691151018645, + "tokens_seen": 1315323904 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030376128385155467, + "loss": 2.7548, + "theoretical_loss": 3.5575530905112256, + "tokens_seen": 1315389440 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030375125376128385, + "loss": 2.8583, + "theoretical_loss": 3.557537066942487, + "tokens_seen": 1315454976 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3119536, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7070488929748535, + "objective/train/theoretical_loss": 3.5575210443955334, + "objective/train/tokens_used": 1335980512, + "theoretical_loss": 3.5575210443955334, + "tokens_seen": 1315520512 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030374122367101303, + "loss": 2.684, + "theoretical_loss": 3.5575210443955334, + "tokens_seen": 1315520512 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030373119358074227, + "loss": 2.7748, + "theoretical_loss": 3.5575050228702483, + "tokens_seen": 1315586048 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003037211634904714, + "loss": 2.7197, + "theoretical_loss": 3.5574890023665153, + "tokens_seen": 1315651584 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030371113340020063, + "loss": 2.7323, + "theoretical_loss": 3.557472982884219, + "tokens_seen": 1315717120 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003037011033099298, + "loss": 2.7317, + "theoretical_loss": 3.5574569644232428, + "tokens_seen": 1315782656 + }, + { + "epoch": 16.01, + "learning_rate": 0.000303691073219659, + "loss": 2.8089, + "theoretical_loss": 3.557440946983471, + "tokens_seen": 1315848192 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030368104312938817, + "loss": 2.7457, + "theoretical_loss": 3.557424930564788, + "tokens_seen": 1315913728 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030367101303911735, + "loss": 2.6807, + "theoretical_loss": 3.557408915167077, + "tokens_seen": 1315979264 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030366098294884653, + "loss": 2.8168, + "theoretical_loss": 3.557392900790223, + "tokens_seen": 1316044800 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030365095285857577, + "loss": 2.8494, + "theoretical_loss": 3.55737688743411, + "tokens_seen": 1316110336 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003036409227683049, + "loss": 2.7615, + "theoretical_loss": 3.557360875098621, + "tokens_seen": 1316175872 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030363089267803413, + "loss": 2.6804, + "theoretical_loss": 3.5573448637836416, + "tokens_seen": 1316241408 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030362086258776326, + "loss": 2.8479, + "theoretical_loss": 3.5573288534890546, + "tokens_seen": 1316306944 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003036108324974925, + "loss": 2.6767, + "theoretical_loss": 3.5573128442147457, + "tokens_seen": 1316372480 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003036008024072217, + "loss": 2.6845, + "theoretical_loss": 3.5572968359605976, + "tokens_seen": 1316438016 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030359077231695086, + "loss": 2.7482, + "theoretical_loss": 3.557280828726496, + "tokens_seen": 1316503552 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030358074222668004, + "loss": 2.7324, + "theoretical_loss": 3.557264822512323, + "tokens_seen": 1316569088 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003035707121364092, + "loss": 2.7671, + "theoretical_loss": 3.5572488173179653, + "tokens_seen": 1316634624 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003035606820461384, + "loss": 2.7503, + "theoretical_loss": 3.5572328131433055, + "tokens_seen": 1316700160 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030355065195586763, + "loss": 2.7944, + "theoretical_loss": 3.557216809988229, + "tokens_seen": 1316765696 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030354062186559676, + "loss": 2.6486, + "theoretical_loss": 3.5572008078526185, + "tokens_seen": 1316831232 + }, + { + "epoch": 16.01, + "learning_rate": 0.000303530591775326, + "loss": 2.6942, + "theoretical_loss": 3.55718480673636, + "tokens_seen": 1316896768 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003035205616850552, + "loss": 2.6533, + "theoretical_loss": 3.5571688066393374, + "tokens_seen": 1316962304 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030351053159478436, + "loss": 2.727, + "theoretical_loss": 3.5571528075614345, + "tokens_seen": 1317027840 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030350050150451354, + "loss": 2.7276, + "theoretical_loss": 3.557136809502536, + "tokens_seen": 1317093376 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3124438, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.885336399078369, + "objective/train/theoretical_loss": 3.5571208124625264, + "objective/train/tokens_used": 1337618912, + "theoretical_loss": 3.5571208124625264, + "tokens_seen": 1317158912 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003034904714142427, + "loss": 2.7861, + "theoretical_loss": 3.5571208124625264, + "tokens_seen": 1317158912 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003034804413239719, + "loss": 2.8161, + "theoretical_loss": 3.5571048164412895, + "tokens_seen": 1317224448 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030347041123370114, + "loss": 2.8487, + "theoretical_loss": 3.557088821438711, + "tokens_seen": 1317289984 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030346038114343026, + "loss": 2.8346, + "theoretical_loss": 3.5570728274546743, + "tokens_seen": 1317355520 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003034503510531595, + "loss": 2.713, + "theoretical_loss": 3.5570568344890647, + "tokens_seen": 1317421056 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003034403209628886, + "loss": 2.7761, + "theoretical_loss": 3.557040842541766, + "tokens_seen": 1317486592 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030343029087261786, + "loss": 2.6347, + "theoretical_loss": 3.5570248516126624, + "tokens_seen": 1317552128 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030342026078234704, + "loss": 2.8039, + "theoretical_loss": 3.557008861701639, + "tokens_seen": 1317617664 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003034102306920762, + "loss": 2.8453, + "theoretical_loss": 3.5569928728085807, + "tokens_seen": 1317683200 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003034002006018054, + "loss": 2.804, + "theoretical_loss": 3.5569768849333716, + "tokens_seen": 1317748736 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003033901705115346, + "loss": 2.7961, + "theoretical_loss": 3.5569608980758964, + "tokens_seen": 1317814272 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030338014042126377, + "loss": 2.734, + "theoretical_loss": 3.5569449122360393, + "tokens_seen": 1317879808 + }, + { + "epoch": 16.01, + "learning_rate": 0.000303370110330993, + "loss": 2.8103, + "theoretical_loss": 3.5569289274136855, + "tokens_seen": 1317945344 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030336008024072213, + "loss": 2.7332, + "theoretical_loss": 3.5569129436087197, + "tokens_seen": 1318010880 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030335005015045137, + "loss": 2.7554, + "theoretical_loss": 3.5568969608210255, + "tokens_seen": 1318076416 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030334002006018055, + "loss": 2.7333, + "theoretical_loss": 3.556880979050489, + "tokens_seen": 1318141952 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030332998996990973, + "loss": 2.7803, + "theoretical_loss": 3.556864998296994, + "tokens_seen": 1318207488 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003033199598796389, + "loss": 2.7741, + "theoretical_loss": 3.556849018560426, + "tokens_seen": 1318273024 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003033099297893681, + "loss": 2.7355, + "theoretical_loss": 3.5568330398406687, + "tokens_seen": 1318338560 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003032998996990973, + "loss": 2.8105, + "theoretical_loss": 3.5568170621376076, + "tokens_seen": 1318404096 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003032898696088265, + "loss": 2.7754, + "theoretical_loss": 3.556801085451127, + "tokens_seen": 1318469632 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003032798395185557, + "loss": 2.8413, + "theoretical_loss": 3.556785109781112, + "tokens_seen": 1318535168 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030326980942828487, + "loss": 2.7577, + "theoretical_loss": 3.556769135127448, + "tokens_seen": 1318600704 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030325977933801405, + "loss": 2.7877, + "theoretical_loss": 3.5567531614900183, + "tokens_seen": 1318666240 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030324974924774323, + "loss": 2.8324, + "theoretical_loss": 3.556737188868709, + "tokens_seen": 1318731776 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3127333, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9728188514709473, + "objective/train/theoretical_loss": 3.556721217263405, + "objective/train/tokens_used": 1339257312, + "theoretical_loss": 3.556721217263405, + "tokens_seen": 1318797312 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030323971915747247, + "loss": 2.8926, + "theoretical_loss": 3.556721217263405, + "tokens_seen": 1318797312 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003032296890672016, + "loss": 2.7107, + "theoretical_loss": 3.5567052466739906, + "tokens_seen": 1318862848 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030321965897693083, + "loss": 2.6899, + "theoretical_loss": 3.5566892771003507, + "tokens_seen": 1318928384 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030320962888666, + "loss": 2.7791, + "theoretical_loss": 3.5566733085423703, + "tokens_seen": 1318993920 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003031995987963892, + "loss": 2.7705, + "theoretical_loss": 3.556657340999935, + "tokens_seen": 1319059456 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030318956870611837, + "loss": 2.6736, + "theoretical_loss": 3.556641374472929, + "tokens_seen": 1319124992 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030317953861584755, + "loss": 2.8353, + "theoretical_loss": 3.5566254089612377, + "tokens_seen": 1319190528 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030316950852557673, + "loss": 2.8156, + "theoretical_loss": 3.556609444464746, + "tokens_seen": 1319256064 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030315947843530597, + "loss": 2.8582, + "theoretical_loss": 3.556593480983338, + "tokens_seen": 1319321600 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003031494483450351, + "loss": 2.6954, + "theoretical_loss": 3.5565775185169004, + "tokens_seen": 1319387136 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030313941825476433, + "loss": 2.8315, + "theoretical_loss": 3.5565615570653173, + "tokens_seen": 1319452672 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030312938816449346, + "loss": 2.7975, + "theoretical_loss": 3.556545596628474, + "tokens_seen": 1319518208 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003031193580742227, + "loss": 2.6807, + "theoretical_loss": 3.556529637206255, + "tokens_seen": 1319583744 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003031093279839519, + "loss": 2.7476, + "theoretical_loss": 3.556513678798547, + "tokens_seen": 1319649280 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030309929789368106, + "loss": 2.7873, + "theoretical_loss": 3.556497721405233, + "tokens_seen": 1319714816 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030308926780341024, + "loss": 2.8025, + "theoretical_loss": 3.5564817650262, + "tokens_seen": 1319780352 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003030792377131394, + "loss": 2.8646, + "theoretical_loss": 3.556465809661332, + "tokens_seen": 1319845888 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003030692076228686, + "loss": 2.7725, + "theoretical_loss": 3.5564498553105146, + "tokens_seen": 1319911424 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030305917753259783, + "loss": 2.7551, + "theoretical_loss": 3.556433901973633, + "tokens_seen": 1319976960 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030304914744232696, + "loss": 2.7945, + "theoretical_loss": 3.556417949650572, + "tokens_seen": 1320042496 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003030391173520562, + "loss": 2.8472, + "theoretical_loss": 3.556401998341218, + "tokens_seen": 1320108032 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003030290872617854, + "loss": 2.6944, + "theoretical_loss": 3.556386048045455, + "tokens_seen": 1320173568 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030301905717151456, + "loss": 2.6618, + "theoretical_loss": 3.556370098763169, + "tokens_seen": 1320239104 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030300902708124374, + "loss": 2.7736, + "theoretical_loss": 3.5563541504942453, + "tokens_seen": 1320304640 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003029989969909729, + "loss": 2.7473, + "theoretical_loss": 3.5563382032385684, + "tokens_seen": 1320370176 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3131124, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7374343872070312, + "objective/train/theoretical_loss": 3.5563222569960247, + "objective/train/tokens_used": 1340895712, + "theoretical_loss": 3.5563222569960247, + "tokens_seen": 1320435712 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003029889669007021, + "loss": 2.827, + "theoretical_loss": 3.5563222569960247, + "tokens_seen": 1320435712 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030297893681043134, + "loss": 2.9062, + "theoretical_loss": 3.556306311766499, + "tokens_seen": 1320501248 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030296890672016046, + "loss": 2.8269, + "theoretical_loss": 3.556290367549877, + "tokens_seen": 1320566784 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003029588766298897, + "loss": 2.7357, + "theoretical_loss": 3.556274424346044, + "tokens_seen": 1320632320 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003029488465396188, + "loss": 2.8343, + "theoretical_loss": 3.5562584821548846, + "tokens_seen": 1320697856 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030293881644934806, + "loss": 2.7546, + "theoretical_loss": 3.5562425409762852, + "tokens_seen": 1320763392 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030292878635907724, + "loss": 2.6274, + "theoretical_loss": 3.556226600810131, + "tokens_seen": 1320828928 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003029187562688064, + "loss": 2.7219, + "theoretical_loss": 3.5562106616563076, + "tokens_seen": 1320894464 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003029087261785356, + "loss": 2.7939, + "theoretical_loss": 3.5561947235147002, + "tokens_seen": 1320960000 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003028986960882648, + "loss": 2.7565, + "theoretical_loss": 3.5561787863851944, + "tokens_seen": 1321025536 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030288866599799397, + "loss": 2.7641, + "theoretical_loss": 3.5561628502676763, + "tokens_seen": 1321091072 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003028786359077232, + "loss": 2.7422, + "theoretical_loss": 3.55614691516203, + "tokens_seen": 1321156608 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030286860581745233, + "loss": 2.8192, + "theoretical_loss": 3.5561309810681427, + "tokens_seen": 1321222144 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030285857572718157, + "loss": 2.7974, + "theoretical_loss": 3.556115047985899, + "tokens_seen": 1321287680 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030284854563691075, + "loss": 2.7846, + "theoretical_loss": 3.5560991159151847, + "tokens_seen": 1321353216 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030283851554663993, + "loss": 2.7033, + "theoretical_loss": 3.5560831848558854, + "tokens_seen": 1321418752 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003028284854563691, + "loss": 2.7604, + "theoretical_loss": 3.5560672548078864, + "tokens_seen": 1321484288 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003028184553660983, + "loss": 2.7686, + "theoretical_loss": 3.5560513257710746, + "tokens_seen": 1321549824 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030280842527582747, + "loss": 2.7603, + "theoretical_loss": 3.556035397745334, + "tokens_seen": 1321615360 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003027983951855567, + "loss": 2.7844, + "theoretical_loss": 3.5560194707305515, + "tokens_seen": 1321680896 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030278836509528583, + "loss": 2.7737, + "theoretical_loss": 3.5560035447266127, + "tokens_seen": 1321746432 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030277833500501507, + "loss": 2.7606, + "theoretical_loss": 3.555987619733403, + "tokens_seen": 1321811968 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003027683049147442, + "loss": 2.7979, + "theoretical_loss": 3.5559716957508076, + "tokens_seen": 1321877504 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030275827482447343, + "loss": 2.6954, + "theoretical_loss": 3.5559557727787134, + "tokens_seen": 1321943040 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003027482447342026, + "loss": 2.7111, + "theoretical_loss": 3.5559398508170057, + "tokens_seen": 1322008576 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3136043, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8840291500091553, + "objective/train/theoretical_loss": 3.55592392986557, + "objective/train/tokens_used": 1342534112, + "theoretical_loss": 3.55592392986557, + "tokens_seen": 1322074112 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003027382146439318, + "loss": 2.7345, + "theoretical_loss": 3.55592392986557, + "tokens_seen": 1322074112 + }, + { + "epoch": 16.01, + "learning_rate": 0.000302728184553661, + "loss": 2.8177, + "theoretical_loss": 3.555908009924292, + "tokens_seen": 1322139648 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003027181544633902, + "loss": 2.7599, + "theoretical_loss": 3.5558920909930585, + "tokens_seen": 1322205184 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030270812437311934, + "loss": 2.7529, + "theoretical_loss": 3.5558761730717547, + "tokens_seen": 1322270720 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030269809428284857, + "loss": 2.7751, + "theoretical_loss": 3.5558602561602664, + "tokens_seen": 1322336256 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003026880641925777, + "loss": 2.716, + "theoretical_loss": 3.5558443402584796, + "tokens_seen": 1322401792 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030267803410230693, + "loss": 2.8252, + "theoretical_loss": 3.5558284253662804, + "tokens_seen": 1322467328 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003026680040120361, + "loss": 2.8894, + "theoretical_loss": 3.5558125114835546, + "tokens_seen": 1322532864 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003026579739217653, + "loss": 2.7157, + "theoretical_loss": 3.555796598610188, + "tokens_seen": 1322598400 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003026479438314945, + "loss": 2.7237, + "theoretical_loss": 3.5557806867460666, + "tokens_seen": 1322663936 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030263791374122366, + "loss": 2.7899, + "theoretical_loss": 3.5557647758910766, + "tokens_seen": 1322729472 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030262788365095284, + "loss": 2.7402, + "theoretical_loss": 3.5557488660451044, + "tokens_seen": 1322795008 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003026178535606821, + "loss": 2.7454, + "theoretical_loss": 3.555732957208035, + "tokens_seen": 1322860544 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003026078234704112, + "loss": 2.9404, + "theoretical_loss": 3.555717049379755, + "tokens_seen": 1322926080 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030259779338014044, + "loss": 2.8031, + "theoretical_loss": 3.555701142560151, + "tokens_seen": 1322991616 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030258776328986956, + "loss": 2.8174, + "theoretical_loss": 3.555685236749108, + "tokens_seen": 1323057152 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003025777331995988, + "loss": 2.8272, + "theoretical_loss": 3.5556693319465125, + "tokens_seen": 1323122688 + }, + { + "epoch": 16.01, + "learning_rate": 0.000302567703109328, + "loss": 2.8149, + "theoretical_loss": 3.555653428152251, + "tokens_seen": 1323188224 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030255767301905716, + "loss": 2.7924, + "theoretical_loss": 3.555637525366209, + "tokens_seen": 1323253760 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003025476429287864, + "loss": 2.7681, + "theoretical_loss": 3.555621623588274, + "tokens_seen": 1323319296 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003025376128385156, + "loss": 2.6928, + "theoretical_loss": 3.5556057228183304, + "tokens_seen": 1323384832 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030252758274824476, + "loss": 2.8633, + "theoretical_loss": 3.5555898230562653, + "tokens_seen": 1323450368 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030251755265797394, + "loss": 2.7945, + "theoretical_loss": 3.5555739243019646, + "tokens_seen": 1323515904 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003025075225677031, + "loss": 2.7097, + "theoretical_loss": 3.5555580265553153, + "tokens_seen": 1323581440 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003024974924774323, + "loss": 2.7706, + "theoretical_loss": 3.555542129816203, + "tokens_seen": 1323646976 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3139006, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9233555793762207, + "objective/train/theoretical_loss": 3.555526234084514, + "objective/train/tokens_used": 1344172512, + "theoretical_loss": 3.555526234084514, + "tokens_seen": 1323712512 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030248746238716154, + "loss": 2.8065, + "theoretical_loss": 3.555526234084514, + "tokens_seen": 1323712512 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030248746238716154, + "loss": 2.8079, + "theoretical_loss": 3.555510339360134, + "tokens_seen": 1323778048 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030247743229689066, + "loss": 2.8376, + "theoretical_loss": 3.5554944456429505, + "tokens_seen": 1323843584 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003024674022066199, + "loss": 2.7976, + "theoretical_loss": 3.555478552932849, + "tokens_seen": 1323909120 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030245737211634903, + "loss": 2.7303, + "theoretical_loss": 3.5554626612297167, + "tokens_seen": 1323974656 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030244734202607826, + "loss": 2.8342, + "theoretical_loss": 3.5554467705334387, + "tokens_seen": 1324040192 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030243731193580744, + "loss": 2.6901, + "theoretical_loss": 3.5554308808439026, + "tokens_seen": 1324105728 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003024272818455366, + "loss": 2.7175, + "theoretical_loss": 3.555414992160994, + "tokens_seen": 1324171264 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003024172517552658, + "loss": 2.8044, + "theoretical_loss": 3.555399104484599, + "tokens_seen": 1324236800 + }, + { + "epoch": 16.01, + "learning_rate": 0.000302407221664995, + "loss": 2.7807, + "theoretical_loss": 3.5553832178146054, + "tokens_seen": 1324302336 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030239719157472417, + "loss": 2.7389, + "theoretical_loss": 3.5553673321508983, + "tokens_seen": 1324367872 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003023871614844534, + "loss": 2.7898, + "theoretical_loss": 3.555351447493365, + "tokens_seen": 1324433408 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030237713139418253, + "loss": 2.78, + "theoretical_loss": 3.5553355638418918, + "tokens_seen": 1324498944 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030236710130391177, + "loss": 2.8002, + "theoretical_loss": 3.5553196811963645, + "tokens_seen": 1324564480 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030235707121364095, + "loss": 2.748, + "theoretical_loss": 3.555303799556671, + "tokens_seen": 1324630016 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030234704112337013, + "loss": 2.782, + "theoretical_loss": 3.5552879189226965, + "tokens_seen": 1324695552 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003023370110330993, + "loss": 2.7078, + "theoretical_loss": 3.5552720392943282, + "tokens_seen": 1324761088 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003023269809428285, + "loss": 2.7269, + "theoretical_loss": 3.5552561606714526, + "tokens_seen": 1324826624 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030231695085255767, + "loss": 2.8407, + "theoretical_loss": 3.5552402830539567, + "tokens_seen": 1324892160 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003023069207622869, + "loss": 2.7385, + "theoretical_loss": 3.5552244064417264, + "tokens_seen": 1324957696 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030229689067201603, + "loss": 2.8306, + "theoretical_loss": 3.5552085308346486, + "tokens_seen": 1325023232 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030228686058174527, + "loss": 2.8331, + "theoretical_loss": 3.5551926562326095, + "tokens_seen": 1325088768 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003022768304914744, + "loss": 2.8162, + "theoretical_loss": 3.555176782635497, + "tokens_seen": 1325154304 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030226680040120363, + "loss": 2.8567, + "theoretical_loss": 3.555160910043197, + "tokens_seen": 1325219840 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003022567703109328, + "loss": 2.8244, + "theoretical_loss": 3.555145038455596, + "tokens_seen": 1325285376 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3143806, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.883793830871582, + "objective/train/theoretical_loss": 3.555129167872581, + "objective/train/tokens_used": 1345810912, + "theoretical_loss": 3.555129167872581, + "tokens_seen": 1325350912 + }, + { + "epoch": 16.01, + "learning_rate": 0.000302246740220662, + "loss": 2.6639, + "theoretical_loss": 3.555129167872581, + "tokens_seen": 1325350912 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003022367101303912, + "loss": 2.7726, + "theoretical_loss": 3.5551132982940388, + "tokens_seen": 1325416448 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003022266800401204, + "loss": 2.8486, + "theoretical_loss": 3.5550974297198565, + "tokens_seen": 1325481984 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030221664994984954, + "loss": 2.7466, + "theoretical_loss": 3.55508156214992, + "tokens_seen": 1325547520 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030220661985957877, + "loss": 2.799, + "theoretical_loss": 3.555065695584117, + "tokens_seen": 1325613056 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003021965897693079, + "loss": 2.7374, + "theoretical_loss": 3.555049830022334, + "tokens_seen": 1325678592 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030218655967903713, + "loss": 2.7074, + "theoretical_loss": 3.5550339654644567, + "tokens_seen": 1325744128 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003021765295887663, + "loss": 2.8441, + "theoretical_loss": 3.555018101910374, + "tokens_seen": 1325809664 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003021664994984955, + "loss": 2.7891, + "theoretical_loss": 3.5550022393599714, + "tokens_seen": 1325875200 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003021564694082247, + "loss": 2.791, + "theoretical_loss": 3.5549863778131363, + "tokens_seen": 1325940736 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030214643931795386, + "loss": 2.7899, + "theoretical_loss": 3.5549705172697554, + "tokens_seen": 1326006272 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030213640922768304, + "loss": 2.7368, + "theoretical_loss": 3.5549546577297155, + "tokens_seen": 1326071808 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003021263791374123, + "loss": 2.7394, + "theoretical_loss": 3.5549387991929042, + "tokens_seen": 1326137344 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003021163490471414, + "loss": 2.8275, + "theoretical_loss": 3.554922941659208, + "tokens_seen": 1326202880 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030210631895687064, + "loss": 2.6633, + "theoretical_loss": 3.5549070851285136, + "tokens_seen": 1326268416 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030209628886659976, + "loss": 2.7916, + "theoretical_loss": 3.5548912296007082, + "tokens_seen": 1326333952 + }, + { + "epoch": 16.01, + "learning_rate": 0.000302086258776329, + "loss": 2.8152, + "theoretical_loss": 3.554875375075679, + "tokens_seen": 1326399488 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003020762286860582, + "loss": 2.7753, + "theoretical_loss": 3.5548595215533134, + "tokens_seen": 1326465024 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030206619859578736, + "loss": 2.6847, + "theoretical_loss": 3.5548436690334975, + "tokens_seen": 1326530560 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030205616850551654, + "loss": 2.8636, + "theoretical_loss": 3.554827817516119, + "tokens_seen": 1326596096 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003020461384152458, + "loss": 2.7652, + "theoretical_loss": 3.554811967001065, + "tokens_seen": 1326661632 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003020361083249749, + "loss": 2.7787, + "theoretical_loss": 3.5547961174882223, + "tokens_seen": 1326727168 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030202607823470414, + "loss": 2.7545, + "theoretical_loss": 3.5547802689774786, + "tokens_seen": 1326792704 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030201604814443327, + "loss": 2.8457, + "theoretical_loss": 3.5547644214687204, + "tokens_seen": 1326858240 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003020060180541625, + "loss": 2.7681, + "theoretical_loss": 3.5547485749618355, + "tokens_seen": 1326923776 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3146876, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.887298583984375, + "objective/train/theoretical_loss": 3.55473272945671, + "objective/train/tokens_used": 1347449312, + "theoretical_loss": 3.55473272945671, + "tokens_seen": 1326989312 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003019959879638917, + "loss": 2.7913, + "theoretical_loss": 3.55473272945671, + "tokens_seen": 1326989312 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030198595787362087, + "loss": 2.8195, + "theoretical_loss": 3.5547168849532325, + "tokens_seen": 1327054848 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030197592778335005, + "loss": 2.8266, + "theoretical_loss": 3.554701041451289, + "tokens_seen": 1327120384 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030196589769307923, + "loss": 2.7906, + "theoretical_loss": 3.554685198950768, + "tokens_seen": 1327185920 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003019558676028084, + "loss": 2.7157, + "theoretical_loss": 3.554669357451555, + "tokens_seen": 1327251456 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030194583751253764, + "loss": 2.7668, + "theoretical_loss": 3.5546535169535396, + "tokens_seen": 1327316992 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030193580742226677, + "loss": 2.6744, + "theoretical_loss": 3.554637677456607, + "tokens_seen": 1327382528 + }, + { + "epoch": 16.01, + "learning_rate": 0.000301925777331996, + "loss": 2.7185, + "theoretical_loss": 3.5546218389606454, + "tokens_seen": 1327448064 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030191574724172513, + "loss": 2.722, + "theoretical_loss": 3.554606001465542, + "tokens_seen": 1327513600 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030190571715145437, + "loss": 2.729, + "theoretical_loss": 3.5545901649711844, + "tokens_seen": 1327579136 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030189568706118355, + "loss": 2.6665, + "theoretical_loss": 3.55457432947746, + "tokens_seen": 1327644672 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030188565697091273, + "loss": 2.7787, + "theoretical_loss": 3.5545584949842555, + "tokens_seen": 1327710208 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003018756268806419, + "loss": 2.8362, + "theoretical_loss": 3.554542661491459, + "tokens_seen": 1327775744 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030186559679037115, + "loss": 2.877, + "theoretical_loss": 3.5545268289989576, + "tokens_seen": 1327841280 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003018555667001003, + "loss": 2.6967, + "theoretical_loss": 3.554510997506639, + "tokens_seen": 1327906816 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003018455366098295, + "loss": 2.638, + "theoretical_loss": 3.5544951670143905, + "tokens_seen": 1327972352 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030183550651955864, + "loss": 2.8232, + "theoretical_loss": 3.5544793375220998, + "tokens_seen": 1328037888 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030182547642928787, + "loss": 2.7564, + "theoretical_loss": 3.5544635090296532, + "tokens_seen": 1328103424 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030181544633901705, + "loss": 2.649, + "theoretical_loss": 3.55444768153694, + "tokens_seen": 1328168960 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030180541624874623, + "loss": 2.7597, + "theoretical_loss": 3.5544318550438465, + "tokens_seen": 1328234496 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030179538615847547, + "loss": 2.7679, + "theoretical_loss": 3.554416029550261, + "tokens_seen": 1328300032 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003017853560682046, + "loss": 2.7037, + "theoretical_loss": 3.5544002050560706, + "tokens_seen": 1328365568 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030177532597793383, + "loss": 2.7365, + "theoretical_loss": 3.554384381561163, + "tokens_seen": 1328431104 + }, + { + "epoch": 16.01, + "learning_rate": 0.000301765295887663, + "loss": 2.7882, + "theoretical_loss": 3.554368559065426, + "tokens_seen": 1328496640 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003017552657973922, + "loss": 2.7275, + "theoretical_loss": 3.5543527375687467, + "tokens_seen": 1328562176 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3150796, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.840074300765991, + "objective/train/theoretical_loss": 3.554336917071013, + "objective/train/tokens_used": 1349087712, + "theoretical_loss": 3.554336917071013, + "tokens_seen": 1328627712 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003017452357071214, + "loss": 2.8659, + "theoretical_loss": 3.554336917071013, + "tokens_seen": 1328627712 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003017352056168506, + "loss": 2.7474, + "theoretical_loss": 3.554321097572113, + "tokens_seen": 1328693248 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030172517552657974, + "loss": 2.8684, + "theoretical_loss": 3.554305279071934, + "tokens_seen": 1328758784 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030171514543630897, + "loss": 2.6596, + "theoretical_loss": 3.554289461570363, + "tokens_seen": 1328824320 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003017051153460381, + "loss": 2.8363, + "theoretical_loss": 3.554273645067289, + "tokens_seen": 1328889856 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030169508525576733, + "loss": 2.8783, + "theoretical_loss": 3.5542578295625993, + "tokens_seen": 1328955392 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003016850551654965, + "loss": 2.7429, + "theoretical_loss": 3.5542420150561815, + "tokens_seen": 1329020928 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003016750250752257, + "loss": 2.7305, + "theoretical_loss": 3.554226201547923, + "tokens_seen": 1329086464 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003016649949849549, + "loss": 2.8141, + "theoretical_loss": 3.5542103890377117, + "tokens_seen": 1329152000 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030165496489468406, + "loss": 2.8271, + "theoretical_loss": 3.554194577525436, + "tokens_seen": 1329217536 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030164493480441324, + "loss": 2.7046, + "theoretical_loss": 3.554178767010984, + "tokens_seen": 1329283072 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003016349047141425, + "loss": 2.7976, + "theoretical_loss": 3.554162957494242, + "tokens_seen": 1329348608 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003016248746238716, + "loss": 2.7364, + "theoretical_loss": 3.5541471489750993, + "tokens_seen": 1329414144 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030161484453360084, + "loss": 2.7258, + "theoretical_loss": 3.5541313414534432, + "tokens_seen": 1329479680 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030160481444332996, + "loss": 2.8506, + "theoretical_loss": 3.5541155349291613, + "tokens_seen": 1329545216 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003015947843530592, + "loss": 2.8102, + "theoretical_loss": 3.554099729402142, + "tokens_seen": 1329610752 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003015847542627884, + "loss": 2.7579, + "theoretical_loss": 3.5540839248722733, + "tokens_seen": 1329676288 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030157472417251756, + "loss": 2.6982, + "theoretical_loss": 3.554068121339443, + "tokens_seen": 1329741824 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030156469408224674, + "loss": 2.7917, + "theoretical_loss": 3.5540523188035387, + "tokens_seen": 1329807360 + }, + { + "epoch": 16.01, + "learning_rate": 0.000301554663991976, + "loss": 2.8091, + "theoretical_loss": 3.554036517264449, + "tokens_seen": 1329872896 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003015446339017051, + "loss": 2.8216, + "theoretical_loss": 3.554020716722061, + "tokens_seen": 1329938432 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030153460381143434, + "loss": 2.8189, + "theoretical_loss": 3.5540049171762638, + "tokens_seen": 1330003968 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030152457372116347, + "loss": 2.8527, + "theoretical_loss": 3.553989118626945, + "tokens_seen": 1330069504 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003015145436308927, + "loss": 2.7952, + "theoretical_loss": 3.5539733210739923, + "tokens_seen": 1330135040 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003015045135406219, + "loss": 2.6957, + "theoretical_loss": 3.5539575245172945, + "tokens_seen": 1330200576 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3155279, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.752527952194214, + "objective/train/theoretical_loss": 3.5539417289567394, + "objective/train/tokens_used": 1350726112, + "theoretical_loss": 3.5539417289567394, + "tokens_seen": 1330266112 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030149448345035107, + "loss": 2.7887, + "theoretical_loss": 3.5539417289567394, + "tokens_seen": 1330266112 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030148445336008025, + "loss": 2.7836, + "theoretical_loss": 3.5539259343922147, + "tokens_seen": 1330331648 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030147442326980943, + "loss": 2.7829, + "theoretical_loss": 3.553910140823609, + "tokens_seen": 1330397184 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003014643931795386, + "loss": 2.6886, + "theoretical_loss": 3.5538943482508096, + "tokens_seen": 1330462720 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030145436308926784, + "loss": 2.7431, + "theoretical_loss": 3.553878556673706, + "tokens_seen": 1330528256 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030144433299899697, + "loss": 2.773, + "theoretical_loss": 3.5538627660921853, + "tokens_seen": 1330593792 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003014343029087262, + "loss": 2.6914, + "theoretical_loss": 3.5538469765061365, + "tokens_seen": 1330659328 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030142427281845533, + "loss": 2.8611, + "theoretical_loss": 3.5538311879154474, + "tokens_seen": 1330724864 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030141424272818457, + "loss": 2.8433, + "theoretical_loss": 3.5538154003200058, + "tokens_seen": 1330790400 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030140421263791375, + "loss": 2.772, + "theoretical_loss": 3.5537996137197014, + "tokens_seen": 1330855936 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030139418254764293, + "loss": 2.845, + "theoretical_loss": 3.5537838281144207, + "tokens_seen": 1330921472 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003013841524573721, + "loss": 2.7783, + "theoretical_loss": 3.553768043504053, + "tokens_seen": 1330987008 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030137412236710135, + "loss": 2.8149, + "theoretical_loss": 3.5537522598884865, + "tokens_seen": 1331052544 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003013640922768305, + "loss": 2.6929, + "theoretical_loss": 3.553736477267609, + "tokens_seen": 1331118080 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003013540621865597, + "loss": 2.8318, + "theoretical_loss": 3.55372069564131, + "tokens_seen": 1331183616 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030134403209628884, + "loss": 2.7729, + "theoretical_loss": 3.553704915009477, + "tokens_seen": 1331249152 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030133400200601807, + "loss": 2.7012, + "theoretical_loss": 3.5536891353719984, + "tokens_seen": 1331314688 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030132397191574725, + "loss": 2.6716, + "theoretical_loss": 3.553673356728763, + "tokens_seen": 1331380224 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030131394182547643, + "loss": 2.8829, + "theoretical_loss": 3.5536575790796583, + "tokens_seen": 1331445760 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003013039117352056, + "loss": 2.7042, + "theoretical_loss": 3.5536418024245737, + "tokens_seen": 1331511296 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003012938816449348, + "loss": 2.6955, + "theoretical_loss": 3.5536260267633977, + "tokens_seen": 1331576832 + }, + { + "epoch": 16.01, + "learning_rate": 0.000301283851554664, + "loss": 2.6976, + "theoretical_loss": 3.553610252096018, + "tokens_seen": 1331642368 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003012738214643932, + "loss": 2.8072, + "theoretical_loss": 3.553594478422324, + "tokens_seen": 1331707904 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030126379137412234, + "loss": 2.7601, + "theoretical_loss": 3.5535787057422032, + "tokens_seen": 1331773440 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003012537612838516, + "loss": 2.725, + "theoretical_loss": 3.553562934055545, + "tokens_seen": 1331838976 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3158687, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7252907752990723, + "objective/train/theoretical_loss": 3.5535471633622375, + "objective/train/tokens_used": 1352364512, + "theoretical_loss": 3.5535471633622375, + "tokens_seen": 1331904512 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003012437311935807, + "loss": 2.7647, + "theoretical_loss": 3.5535471633622375, + "tokens_seen": 1331904512 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030123370110330994, + "loss": 2.7574, + "theoretical_loss": 3.553531393662169, + "tokens_seen": 1331970048 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003012236710130391, + "loss": 2.7923, + "theoretical_loss": 3.553515624955229, + "tokens_seen": 1332035584 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003012136409227683, + "loss": 2.7914, + "theoretical_loss": 3.553499857241305, + "tokens_seen": 1332101120 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003012036108324975, + "loss": 2.7223, + "theoretical_loss": 3.553484090520287, + "tokens_seen": 1332166656 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003011935807422267, + "loss": 2.74, + "theoretical_loss": 3.553468324792062, + "tokens_seen": 1332232192 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030118355065195584, + "loss": 2.6496, + "theoretical_loss": 3.5534525600565194, + "tokens_seen": 1332297728 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003011735205616851, + "loss": 2.7671, + "theoretical_loss": 3.5534367963135485, + "tokens_seen": 1332363264 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003011634904714142, + "loss": 2.8831, + "theoretical_loss": 3.553421033563037, + "tokens_seen": 1332428800 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030115346038114344, + "loss": 2.8464, + "theoretical_loss": 3.5534052718048743, + "tokens_seen": 1332494336 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003011434302908726, + "loss": 2.7913, + "theoretical_loss": 3.553389511038948, + "tokens_seen": 1332559872 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003011334002006018, + "loss": 2.8143, + "theoretical_loss": 3.5533737512651484, + "tokens_seen": 1332625408 + }, + { + "epoch": 16.01, + "learning_rate": 0.000301123370110331, + "loss": 2.7978, + "theoretical_loss": 3.5533579924833636, + "tokens_seen": 1332690944 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030111334002006016, + "loss": 2.8274, + "theoretical_loss": 3.553342234693482, + "tokens_seen": 1332756480 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030110330992978935, + "loss": 2.8768, + "theoretical_loss": 3.553326477895393, + "tokens_seen": 1332822016 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003010932798395186, + "loss": 2.8331, + "theoretical_loss": 3.553310722088985, + "tokens_seen": 1332887552 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003010832497492477, + "loss": 2.7085, + "theoretical_loss": 3.553294967274147, + "tokens_seen": 1332953088 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030107321965897694, + "loss": 2.7837, + "theoretical_loss": 3.553279213450768, + "tokens_seen": 1333018624 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030106318956870607, + "loss": 2.8172, + "theoretical_loss": 3.5532634606187363, + "tokens_seen": 1333084160 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003010531594784353, + "loss": 2.9084, + "theoretical_loss": 3.553247708777941, + "tokens_seen": 1333149696 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030104312938816454, + "loss": 2.8582, + "theoretical_loss": 3.5532319579282716, + "tokens_seen": 1333215232 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030103309929789367, + "loss": 2.8643, + "theoretical_loss": 3.553216208069616, + "tokens_seen": 1333280768 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003010230692076229, + "loss": 2.7856, + "theoretical_loss": 3.5532004592018644, + "tokens_seen": 1333346304 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003010130391173521, + "loss": 2.7941, + "theoretical_loss": 3.5531847113249047, + "tokens_seen": 1333411840 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030100300902708127, + "loss": 2.7772, + "theoretical_loss": 3.553168964438626, + "tokens_seen": 1333477376 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3161565, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.784865140914917, + "objective/train/theoretical_loss": 3.5531532185429184, + "objective/train/tokens_used": 1354002912, + "theoretical_loss": 3.5531532185429184, + "tokens_seen": 1333542912 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030099297893681045, + "loss": 2.8045, + "theoretical_loss": 3.5531532185429184, + "tokens_seen": 1333542912 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030098294884653963, + "loss": 2.7685, + "theoretical_loss": 3.5531374736376695, + "tokens_seen": 1333608448 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003009729187562688, + "loss": 2.8536, + "theoretical_loss": 3.553121729722769, + "tokens_seen": 1333673984 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030096288866599804, + "loss": 2.808, + "theoretical_loss": 3.553105986798106, + "tokens_seen": 1333739520 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030095285857572717, + "loss": 2.7774, + "theoretical_loss": 3.553090244863569, + "tokens_seen": 1333805056 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003009428284854564, + "loss": 2.8541, + "theoretical_loss": 3.5530745039190474, + "tokens_seen": 1333870592 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030093279839518553, + "loss": 2.7162, + "theoretical_loss": 3.553058763964431, + "tokens_seen": 1333936128 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030092276830491477, + "loss": 2.7856, + "theoretical_loss": 3.553043024999608, + "tokens_seen": 1334001664 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030091273821464395, + "loss": 2.89, + "theoretical_loss": 3.5530272870244675, + "tokens_seen": 1334067200 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030090270812437313, + "loss": 2.7094, + "theoretical_loss": 3.5530115500388995, + "tokens_seen": 1334132736 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003008926780341023, + "loss": 2.7711, + "theoretical_loss": 3.5529958140427924, + "tokens_seen": 1334198272 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030088264794383155, + "loss": 2.8265, + "theoretical_loss": 3.5529800790360353, + "tokens_seen": 1334263808 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003008726178535607, + "loss": 2.7827, + "theoretical_loss": 3.5529643450185184, + "tokens_seen": 1334329344 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003008625877632899, + "loss": 2.6971, + "theoretical_loss": 3.5529486119901303, + "tokens_seen": 1334394880 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030085255767301904, + "loss": 2.7839, + "theoretical_loss": 3.5529328799507596, + "tokens_seen": 1334460416 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030084252758274827, + "loss": 2.7482, + "theoretical_loss": 3.552917148900297, + "tokens_seen": 1334525952 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030083249749247745, + "loss": 2.7857, + "theoretical_loss": 3.55290141883863, + "tokens_seen": 1334591488 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030082246740220663, + "loss": 2.8699, + "theoretical_loss": 3.55288568976565, + "tokens_seen": 1334657024 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003008124373119358, + "loss": 2.8193, + "theoretical_loss": 3.5528699616812442, + "tokens_seen": 1334722560 + }, + { + "epoch": 16.01, + "learning_rate": 0.000300802407221665, + "loss": 2.748, + "theoretical_loss": 3.552854234585303, + "tokens_seen": 1334788096 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003007923771313942, + "loss": 2.7897, + "theoretical_loss": 3.5528385084777163, + "tokens_seen": 1334853632 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003007823470411234, + "loss": 2.6326, + "theoretical_loss": 3.5528227833583728, + "tokens_seen": 1334919168 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030077231695085254, + "loss": 2.7662, + "theoretical_loss": 3.5528070592271614, + "tokens_seen": 1334984704 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003007622868605818, + "loss": 2.7846, + "theoretical_loss": 3.5527913360839722, + "tokens_seen": 1335050240 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003007522567703109, + "loss": 2.7622, + "theoretical_loss": 3.552775613928694, + "tokens_seen": 1335115776 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3165037, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.740142345428467, + "objective/train/theoretical_loss": 3.552759892761218, + "objective/train/tokens_used": 1355641312, + "theoretical_loss": 3.552759892761218, + "tokens_seen": 1335181312 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030074222668004014, + "loss": 2.7328, + "theoretical_loss": 3.552759892761218, + "tokens_seen": 1335181312 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003007321965897693, + "loss": 2.7576, + "theoretical_loss": 3.552744172581431, + "tokens_seen": 1335246848 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003007221664994985, + "loss": 2.7173, + "theoretical_loss": 3.5527284533892245, + "tokens_seen": 1335312384 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003007121364092277, + "loss": 2.8531, + "theoretical_loss": 3.552712735184487, + "tokens_seen": 1335377920 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003007021063189569, + "loss": 2.818, + "theoretical_loss": 3.5526970179671085, + "tokens_seen": 1335443456 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030069207622868604, + "loss": 2.8405, + "theoretical_loss": 3.5526813017369783, + "tokens_seen": 1335508992 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003006820461384153, + "loss": 2.7701, + "theoretical_loss": 3.552665586493986, + "tokens_seen": 1335574528 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003006720160481444, + "loss": 2.8362, + "theoretical_loss": 3.552649872238021, + "tokens_seen": 1335640064 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030066198595787364, + "loss": 2.7376, + "theoretical_loss": 3.5526341589689734, + "tokens_seen": 1335705600 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003006519558676028, + "loss": 2.8689, + "theoretical_loss": 3.5526184466867323, + "tokens_seen": 1335771136 + }, + { + "epoch": 16.01, + "learning_rate": 0.000300641925777332, + "loss": 2.8232, + "theoretical_loss": 3.5526027353911873, + "tokens_seen": 1335836672 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003006318956870612, + "loss": 2.7734, + "theoretical_loss": 3.5525870250822287, + "tokens_seen": 1335902208 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030062186559679036, + "loss": 2.9077, + "theoretical_loss": 3.552571315759745, + "tokens_seen": 1335967744 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030061183550651955, + "loss": 2.9018, + "theoretical_loss": 3.552555607423627, + "tokens_seen": 1336033280 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003006018054162488, + "loss": 2.7565, + "theoretical_loss": 3.5525399000737634, + "tokens_seen": 1336098816 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003005917753259779, + "loss": 2.8578, + "theoretical_loss": 3.5525241937100445, + "tokens_seen": 1336164352 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030058174523570714, + "loss": 2.7835, + "theoretical_loss": 3.5525084883323603, + "tokens_seen": 1336229888 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030057171514543627, + "loss": 2.7462, + "theoretical_loss": 3.5524927839405995, + "tokens_seen": 1336295424 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003005616850551655, + "loss": 2.7831, + "theoretical_loss": 3.5524770805346533, + "tokens_seen": 1336360960 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003005516549648947, + "loss": 2.8782, + "theoretical_loss": 3.5524613781144105, + "tokens_seen": 1336426496 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030054162487462387, + "loss": 2.7966, + "theoretical_loss": 3.5524456766797607, + "tokens_seen": 1336492032 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030053159478435305, + "loss": 2.7089, + "theoretical_loss": 3.552429976230595, + "tokens_seen": 1336557568 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003005215646940823, + "loss": 2.8148, + "theoretical_loss": 3.5524142767668017, + "tokens_seen": 1336623104 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003005115346038114, + "loss": 2.759, + "theoretical_loss": 3.5523985782882717, + "tokens_seen": 1336688640 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030050150451354065, + "loss": 2.7869, + "theoretical_loss": 3.5523828807948936, + "tokens_seen": 1336754176 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3169970, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.829728603363037, + "objective/train/theoretical_loss": 3.5523671842865587, + "objective/train/tokens_used": 1357279712, + "theoretical_loss": 3.5523671842865587, + "tokens_seen": 1336819712 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003004914744232698, + "loss": 2.8263, + "theoretical_loss": 3.5523671842865587, + "tokens_seen": 1336819712 + }, + { + "epoch": 16.01, + "learning_rate": 0.000300481444332999, + "loss": 2.7354, + "theoretical_loss": 3.5523514887631564, + "tokens_seen": 1336885248 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003004714142427282, + "loss": 2.6999, + "theoretical_loss": 3.5523357942245766, + "tokens_seen": 1336950784 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030046138415245737, + "loss": 2.756, + "theoretical_loss": 3.552320100670709, + "tokens_seen": 1337016320 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030045135406218655, + "loss": 2.7645, + "theoretical_loss": 3.552304408101444, + "tokens_seen": 1337081856 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030044132397191573, + "loss": 2.7602, + "theoretical_loss": 3.552288716516671, + "tokens_seen": 1337147392 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003004312938816449, + "loss": 2.7245, + "theoretical_loss": 3.5522730259162807, + "tokens_seen": 1337212928 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030042126379137415, + "loss": 2.7878, + "theoretical_loss": 3.552257336300163, + "tokens_seen": 1337278464 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003004112337011033, + "loss": 2.7983, + "theoretical_loss": 3.5522416476682066, + "tokens_seen": 1337344000 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003004012036108325, + "loss": 2.7557, + "theoretical_loss": 3.552225960020303, + "tokens_seen": 1337409536 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003003911735205617, + "loss": 2.8097, + "theoretical_loss": 3.5522102733563425, + "tokens_seen": 1337475072 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003003811434302909, + "loss": 2.7947, + "theoretical_loss": 3.552194587676214, + "tokens_seen": 1337540608 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030037111334002006, + "loss": 2.7628, + "theoretical_loss": 3.552178902979808, + "tokens_seen": 1337606144 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030036108324974924, + "loss": 2.8016, + "theoretical_loss": 3.552163219267015, + "tokens_seen": 1337671680 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003003510531594784, + "loss": 2.7968, + "theoretical_loss": 3.552147536537725, + "tokens_seen": 1337737216 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030034102306920765, + "loss": 2.7651, + "theoretical_loss": 3.5521318547918277, + "tokens_seen": 1337802752 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003003309929789368, + "loss": 2.7622, + "theoretical_loss": 3.552116174029214, + "tokens_seen": 1337868288 + }, + { + "epoch": 16.01, + "learning_rate": 0.000300320962888666, + "loss": 2.7823, + "theoretical_loss": 3.552100494249773, + "tokens_seen": 1337933824 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030031093279839514, + "loss": 2.7253, + "theoretical_loss": 3.552084815453396, + "tokens_seen": 1337999360 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003003009027081244, + "loss": 2.8391, + "theoretical_loss": 3.5520691376399727, + "tokens_seen": 1338064896 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003002908726178536, + "loss": 2.6508, + "theoretical_loss": 3.552053460809393, + "tokens_seen": 1338130432 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030028084252758274, + "loss": 2.814, + "theoretical_loss": 3.5520377849615485, + "tokens_seen": 1338195968 + }, + { + "epoch": 16.01, + "learning_rate": 0.000300270812437312, + "loss": 2.8274, + "theoretical_loss": 3.552022110096328, + "tokens_seen": 1338261504 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003002607823470411, + "loss": 2.8339, + "theoretical_loss": 3.552006436213622, + "tokens_seen": 1338327040 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030025075225677034, + "loss": 2.7305, + "theoretical_loss": 3.551990763313322, + "tokens_seen": 1338392576 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3173013, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.773421049118042, + "objective/train/theoretical_loss": 3.5519750913953168, + "objective/train/tokens_used": 1358918112, + "theoretical_loss": 3.5519750913953168, + "tokens_seen": 1338458112 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003002407221664995, + "loss": 2.8185, + "theoretical_loss": 3.5519750913953168, + "tokens_seen": 1338458112 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003002306920762287, + "loss": 2.9185, + "theoretical_loss": 3.5519594204594975, + "tokens_seen": 1338523648 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003002206619859579, + "loss": 2.649, + "theoretical_loss": 3.551943750505755, + "tokens_seen": 1338589184 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003002106318956871, + "loss": 2.8541, + "theoretical_loss": 3.5519280815339784, + "tokens_seen": 1338654720 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030020060180541624, + "loss": 2.8058, + "theoretical_loss": 3.551912413544059, + "tokens_seen": 1338720256 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003001905717151455, + "loss": 2.7623, + "theoretical_loss": 3.5518967465358866, + "tokens_seen": 1338785792 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003001805416248746, + "loss": 2.8154, + "theoretical_loss": 3.551881080509353, + "tokens_seen": 1338851328 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030017051153460384, + "loss": 2.7496, + "theoretical_loss": 3.5518654154643468, + "tokens_seen": 1338916864 + }, + { + "epoch": 16.01, + "learning_rate": 0.000300160481444333, + "loss": 2.7967, + "theoretical_loss": 3.551849751400759, + "tokens_seen": 1338982400 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003001504513540622, + "loss": 2.7688, + "theoretical_loss": 3.551834088318481, + "tokens_seen": 1339047936 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003001404212637914, + "loss": 2.7636, + "theoretical_loss": 3.5518184262174026, + "tokens_seen": 1339113472 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030013039117352056, + "loss": 2.8312, + "theoretical_loss": 3.551802765097414, + "tokens_seen": 1339179008 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030012036108324975, + "loss": 2.6934, + "theoretical_loss": 3.5517871049584064, + "tokens_seen": 1339244544 + }, + { + "epoch": 16.01, + "learning_rate": 0.000300110330992979, + "loss": 2.8254, + "theoretical_loss": 3.5517714458002705, + "tokens_seen": 1339310080 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003001003009027081, + "loss": 2.8375, + "theoretical_loss": 3.5517557876228962, + "tokens_seen": 1339375616 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030009027081243734, + "loss": 2.6883, + "theoretical_loss": 3.5517401304261744, + "tokens_seen": 1339441152 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030008024072216647, + "loss": 2.7065, + "theoretical_loss": 3.551724474209995, + "tokens_seen": 1339506688 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003000702106318957, + "loss": 2.9003, + "theoretical_loss": 3.55170881897425, + "tokens_seen": 1339572224 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003000601805416249, + "loss": 2.8052, + "theoretical_loss": 3.551693164718829, + "tokens_seen": 1339637760 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030005015045135407, + "loss": 2.856, + "theoretical_loss": 3.551677511443623, + "tokens_seen": 1339703296 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030004012036108325, + "loss": 2.7418, + "theoretical_loss": 3.551661859148523, + "tokens_seen": 1339768832 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003000300902708125, + "loss": 2.7551, + "theoretical_loss": 3.5516462078334197, + "tokens_seen": 1339834368 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003000200601805416, + "loss": 2.7359, + "theoretical_loss": 3.5516305574982026, + "tokens_seen": 1339899904 + }, + { + "epoch": 16.01, + "learning_rate": 0.00030001003009027085, + "loss": 2.7631, + "theoretical_loss": 3.5516149081427635, + "tokens_seen": 1339965440 + }, + { + "epoch": 16.01, + "learning_rate": 0.0003, + "loss": 2.9138, + "theoretical_loss": 3.551599259766993, + "tokens_seen": 1340030976 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3177964, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.959418535232544, + "objective/train/theoretical_loss": 3.551583612370782, + "objective/train/tokens_used": 1360556512, + "theoretical_loss": 3.551583612370782, + "tokens_seen": 1340096512 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002999899699097292, + "loss": 2.7999, + "theoretical_loss": 3.551583612370782, + "tokens_seen": 1340096512 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002999799398194584, + "loss": 2.8131, + "theoretical_loss": 3.551567965954021, + "tokens_seen": 1340162048 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029996990972918757, + "loss": 2.7706, + "theoretical_loss": 3.551552320516601, + "tokens_seen": 1340227584 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029995987963891675, + "loss": 2.696, + "theoretical_loss": 3.5515366760584124, + "tokens_seen": 1340293120 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029994984954864593, + "loss": 2.8423, + "theoretical_loss": 3.5515210325793465, + "tokens_seen": 1340358656 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002999398194583751, + "loss": 2.7743, + "theoretical_loss": 3.551505390079294, + "tokens_seen": 1340424192 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029992978936810435, + "loss": 2.6274, + "theoretical_loss": 3.5514897485581463, + "tokens_seen": 1340489728 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002999197592778335, + "loss": 2.8001, + "theoretical_loss": 3.551474108015793, + "tokens_seen": 1340555264 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002999097291875627, + "loss": 2.7199, + "theoretical_loss": 3.5514584684521258, + "tokens_seen": 1340620800 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002998996990972919, + "loss": 2.7844, + "theoretical_loss": 3.5514428298670357, + "tokens_seen": 1340686336 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002998896690070211, + "loss": 2.7754, + "theoretical_loss": 3.5514271922604137, + "tokens_seen": 1340751872 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029987963891675026, + "loss": 2.7677, + "theoretical_loss": 3.5514115556321504, + "tokens_seen": 1340817408 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029986960882647944, + "loss": 2.7736, + "theoretical_loss": 3.551395919982137, + "tokens_seen": 1340882944 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002998595787362086, + "loss": 2.7997, + "theoretical_loss": 3.5513802853102643, + "tokens_seen": 1340948480 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029984954864593785, + "loss": 2.8875, + "theoretical_loss": 3.5513646516164235, + "tokens_seen": 1341014016 + }, + { + "epoch": 16.01, + "learning_rate": 0.000299839518555667, + "loss": 2.8313, + "theoretical_loss": 3.5513490189005057, + "tokens_seen": 1341079552 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002998294884653962, + "loss": 2.8615, + "theoretical_loss": 3.5513333871624018, + "tokens_seen": 1341145088 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029981945837512534, + "loss": 2.8135, + "theoretical_loss": 3.5513177564020024, + "tokens_seen": 1341210624 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002998094282848546, + "loss": 2.7716, + "theoretical_loss": 3.5513021266191998, + "tokens_seen": 1341276160 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029979939819458376, + "loss": 2.8693, + "theoretical_loss": 3.551286497813884, + "tokens_seen": 1341341696 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029978936810431294, + "loss": 2.823, + "theoretical_loss": 3.551270869985946, + "tokens_seen": 1341407232 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002997793380140421, + "loss": 2.753, + "theoretical_loss": 3.5512552431352784, + "tokens_seen": 1341472768 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002997693079237713, + "loss": 2.7542, + "theoretical_loss": 3.551239617261771, + "tokens_seen": 1341538304 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002997592778335005, + "loss": 2.6613, + "theoretical_loss": 3.5512239923653146, + "tokens_seen": 1341603840 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002997492477432297, + "loss": 2.6611, + "theoretical_loss": 3.551208368445802, + "tokens_seen": 1341669376 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3180908, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6064834594726562, + "objective/train/theoretical_loss": 3.551192745503123, + "objective/train/tokens_used": 1362194912, + "theoretical_loss": 3.551192745503123, + "tokens_seen": 1341734912 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029973921765295885, + "loss": 2.7169, + "theoretical_loss": 3.551192745503123, + "tokens_seen": 1341734912 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002997291875626881, + "loss": 2.7837, + "theoretical_loss": 3.5511771235371694, + "tokens_seen": 1341800448 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029971915747241726, + "loss": 2.803, + "theoretical_loss": 3.5511615025478322, + "tokens_seen": 1341865984 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029970912738214644, + "loss": 2.8517, + "theoretical_loss": 3.5511458825350033, + "tokens_seen": 1341931520 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002996990972918756, + "loss": 2.8387, + "theoretical_loss": 3.551130263498573, + "tokens_seen": 1341997056 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002996890672016048, + "loss": 2.7667, + "theoretical_loss": 3.5511146454384335, + "tokens_seen": 1342062592 + }, + { + "epoch": 16.01, + "learning_rate": 0.000299679037111334, + "loss": 2.8251, + "theoretical_loss": 3.551099028354475, + "tokens_seen": 1342128128 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002996690070210632, + "loss": 2.7313, + "theoretical_loss": 3.55108341224659, + "tokens_seen": 1342193664 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029965897693079235, + "loss": 2.6773, + "theoretical_loss": 3.551067797114669, + "tokens_seen": 1342259200 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002996489468405216, + "loss": 2.7359, + "theoretical_loss": 3.551052182958604, + "tokens_seen": 1342324736 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002996389167502507, + "loss": 2.7882, + "theoretical_loss": 3.5510365697782857, + "tokens_seen": 1342390272 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029962888665997995, + "loss": 2.8322, + "theoretical_loss": 3.5510209575736065, + "tokens_seen": 1342455808 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029961885656970913, + "loss": 2.8111, + "theoretical_loss": 3.551005346344457, + "tokens_seen": 1342521344 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002996088264794383, + "loss": 2.7548, + "theoretical_loss": 3.5509897360907283, + "tokens_seen": 1342586880 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002995987963891675, + "loss": 2.6839, + "theoretical_loss": 3.5509741268123127, + "tokens_seen": 1342652416 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029958876629889667, + "loss": 2.6453, + "theoretical_loss": 3.550958518509101, + "tokens_seen": 1342717952 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029957873620862585, + "loss": 2.7508, + "theoretical_loss": 3.5509429111809854, + "tokens_seen": 1342783488 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002995687061183551, + "loss": 2.9283, + "theoretical_loss": 3.550927304827857, + "tokens_seen": 1342849024 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002995586760280842, + "loss": 2.8535, + "theoretical_loss": 3.550911699449607, + "tokens_seen": 1342914560 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029954864593781345, + "loss": 2.8423, + "theoretical_loss": 3.5508960950461272, + "tokens_seen": 1342980096 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002995386158475427, + "loss": 2.7642, + "theoretical_loss": 3.550880491617309, + "tokens_seen": 1343045632 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002995285857572718, + "loss": 2.7577, + "theoretical_loss": 3.5508648891630443, + "tokens_seen": 1343111168 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029951855566700105, + "loss": 2.8335, + "theoretical_loss": 3.5508492876832247, + "tokens_seen": 1343176704 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002995085255767302, + "loss": 2.7776, + "theoretical_loss": 3.5508336871777413, + "tokens_seen": 1343242240 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002994984954864594, + "loss": 2.744, + "theoretical_loss": 3.5508180876464865, + "tokens_seen": 1343307776 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3184602, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.821622133255005, + "objective/train/theoretical_loss": 3.5508024890893513, + "objective/train/tokens_used": 1363833312, + "theoretical_loss": 3.5508024890893513, + "tokens_seen": 1343373312 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002994884653961886, + "loss": 2.6703, + "theoretical_loss": 3.5508024890893513, + "tokens_seen": 1343373312 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029947843530591777, + "loss": 2.7581, + "theoretical_loss": 3.5507868915062275, + "tokens_seen": 1343438848 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029946840521564695, + "loss": 2.7431, + "theoretical_loss": 3.5507712948970065, + "tokens_seen": 1343504384 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029945837512537613, + "loss": 2.8235, + "theoretical_loss": 3.5507556992615807, + "tokens_seen": 1343569920 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002994483450351053, + "loss": 2.6365, + "theoretical_loss": 3.5507401045998415, + "tokens_seen": 1343635456 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029943831494483455, + "loss": 2.7414, + "theoretical_loss": 3.55072451091168, + "tokens_seen": 1343700992 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002994282848545637, + "loss": 2.7161, + "theoretical_loss": 3.550708918196989, + "tokens_seen": 1343766528 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002994182547642929, + "loss": 2.8304, + "theoretical_loss": 3.5506933264556597, + "tokens_seen": 1343832064 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002994082246740221, + "loss": 2.8291, + "theoretical_loss": 3.550677735687583, + "tokens_seen": 1343897600 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002993981945837513, + "loss": 2.6684, + "theoretical_loss": 3.5506621458926526, + "tokens_seen": 1343963136 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029938816449348046, + "loss": 2.7979, + "theoretical_loss": 3.550646557070759, + "tokens_seen": 1344028672 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029937813440320964, + "loss": 2.8668, + "theoretical_loss": 3.550630969221794, + "tokens_seen": 1344094208 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002993681043129388, + "loss": 2.8082, + "theoretical_loss": 3.55061538234565, + "tokens_seen": 1344159744 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029935807422266805, + "loss": 2.7609, + "theoretical_loss": 3.5505997964422185, + "tokens_seen": 1344225280 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002993480441323972, + "loss": 2.8023, + "theoretical_loss": 3.5505842115113913, + "tokens_seen": 1344290816 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002993380140421264, + "loss": 2.8526, + "theoretical_loss": 3.5505686275530612, + "tokens_seen": 1344356352 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029932798395185554, + "loss": 2.8159, + "theoretical_loss": 3.5505530445671187, + "tokens_seen": 1344421888 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002993179538615848, + "loss": 2.7648, + "theoretical_loss": 3.5505374625534567, + "tokens_seen": 1344487424 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029930792377131396, + "loss": 2.8458, + "theoretical_loss": 3.550521881511967, + "tokens_seen": 1344552960 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029929789368104314, + "loss": 2.8167, + "theoretical_loss": 3.5505063014425415, + "tokens_seen": 1344618496 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002992878635907723, + "loss": 2.869, + "theoretical_loss": 3.5504907223450717, + "tokens_seen": 1344684032 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002992778335005015, + "loss": 2.7871, + "theoretical_loss": 3.55047514421945, + "tokens_seen": 1344749568 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002992678034102307, + "loss": 2.7726, + "theoretical_loss": 3.5504595670655688, + "tokens_seen": 1344815104 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002992577733199599, + "loss": 2.771, + "theoretical_loss": 3.55044399088332, + "tokens_seen": 1344880640 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029924774322968905, + "loss": 2.8746, + "theoretical_loss": 3.5504284156725947, + "tokens_seen": 1344946176 + }, + { + "epoch": 16.01, + "objective/train/docs_used": 3189590, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5494935512542725, + "objective/train/theoretical_loss": 3.5504128414332863, + "objective/train/tokens_used": 1365471712, + "theoretical_loss": 3.5504128414332863, + "tokens_seen": 1345011712 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002992377131394183, + "loss": 2.7661, + "theoretical_loss": 3.5504128414332863, + "tokens_seen": 1345011712 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029922768304914746, + "loss": 2.8669, + "theoretical_loss": 3.550397268165286, + "tokens_seen": 1345077248 + }, + { + "epoch": 16.01, + "learning_rate": 0.00029921765295887664, + "loss": 2.7545, + "theoretical_loss": 3.550381695868486, + "tokens_seen": 1345142784 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002992076228686058, + "loss": 2.7685, + "theoretical_loss": 3.550366124542779, + "tokens_seen": 1345208320 + }, + { + "epoch": 16.01, + "learning_rate": 0.000299197592778335, + "loss": 2.6553, + "theoretical_loss": 3.5503505541880567, + "tokens_seen": 1345273856 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002991875626880642, + "loss": 2.8047, + "theoretical_loss": 3.550334984804211, + "tokens_seen": 1345339392 + }, + { + "epoch": 16.01, + "learning_rate": 0.0002991775325977934, + "loss": 2.8184, + "theoretical_loss": 3.550319416391135, + "tokens_seen": 1345404928 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029916750250752255, + "loss": 2.8428, + "theoretical_loss": 3.5503038489487198, + "tokens_seen": 1345470464 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002991574724172518, + "loss": 2.7799, + "theoretical_loss": 3.550288282476858, + "tokens_seen": 1345536000 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002991474423269809, + "loss": 2.7917, + "theoretical_loss": 3.550272716975442, + "tokens_seen": 1345601536 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029913741223671015, + "loss": 2.8335, + "theoretical_loss": 3.5502571524443636, + "tokens_seen": 1345667072 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029912738214643933, + "loss": 2.8118, + "theoretical_loss": 3.550241588883516, + "tokens_seen": 1345732608 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002991173520561685, + "loss": 2.7763, + "theoretical_loss": 3.55022602629279, + "tokens_seen": 1345798144 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002991073219658977, + "loss": 2.7733, + "theoretical_loss": 3.5502104646720793, + "tokens_seen": 1345863680 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029909729187562687, + "loss": 2.7653, + "theoretical_loss": 3.550194904021276, + "tokens_seen": 1345929216 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029908726178535605, + "loss": 2.8956, + "theoretical_loss": 3.5501793443402714, + "tokens_seen": 1345994752 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002990772316950853, + "loss": 2.7311, + "theoretical_loss": 3.5501637856289587, + "tokens_seen": 1346060288 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002990672016048144, + "loss": 2.7331, + "theoretical_loss": 3.55014822788723, + "tokens_seen": 1346125824 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029905717151454365, + "loss": 2.7469, + "theoretical_loss": 3.5501326711149783, + "tokens_seen": 1346191360 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029904714142427283, + "loss": 2.7739, + "theoretical_loss": 3.550117115312095, + "tokens_seen": 1346256896 + }, + { + "epoch": 16.02, + "learning_rate": 0.000299037111334002, + "loss": 2.8564, + "theoretical_loss": 3.550101560478473, + "tokens_seen": 1346322432 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002990270812437312, + "loss": 2.634, + "theoretical_loss": 3.5500860066140048, + "tokens_seen": 1346387968 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002990170511534604, + "loss": 2.7014, + "theoretical_loss": 3.5500704537185825, + "tokens_seen": 1346453504 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029900702106318956, + "loss": 2.7204, + "theoretical_loss": 3.550054901792099, + "tokens_seen": 1346519040 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002989969909729188, + "loss": 2.8389, + "theoretical_loss": 3.5500393508344468, + "tokens_seen": 1346584576 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3192346, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.70182204246521, + "objective/train/theoretical_loss": 3.5500238008455183, + "objective/train/tokens_used": 1367110112, + "theoretical_loss": 3.5500238008455183, + "tokens_seen": 1346650112 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002989869608826479, + "loss": 2.7246, + "theoretical_loss": 3.5500238008455183, + "tokens_seen": 1346650112 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029897693079237715, + "loss": 2.7656, + "theoretical_loss": 3.5500082518252056, + "tokens_seen": 1346715648 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002989669007021063, + "loss": 2.6818, + "theoretical_loss": 3.549992703773402, + "tokens_seen": 1346781184 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002989568706118355, + "loss": 2.808, + "theoretical_loss": 3.549977156689999, + "tokens_seen": 1346846720 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002989468405215647, + "loss": 2.8432, + "theoretical_loss": 3.54996161057489, + "tokens_seen": 1346912256 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002989368104312939, + "loss": 2.8953, + "theoretical_loss": 3.549946065427967, + "tokens_seen": 1346977792 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029892678034102306, + "loss": 2.8146, + "theoretical_loss": 3.5499305212491232, + "tokens_seen": 1347043328 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002989167502507523, + "loss": 2.7604, + "theoretical_loss": 3.5499149780382506, + "tokens_seen": 1347108864 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002989067201604814, + "loss": 2.7772, + "theoretical_loss": 3.549899435795243, + "tokens_seen": 1347174400 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029889669007021066, + "loss": 2.7189, + "theoretical_loss": 3.5498838945199918, + "tokens_seen": 1347239936 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002988866599799398, + "loss": 2.8046, + "theoretical_loss": 3.54986835421239, + "tokens_seen": 1347305472 + }, + { + "epoch": 16.02, + "learning_rate": 0.000298876629889669, + "loss": 2.8935, + "theoretical_loss": 3.5498528148723305, + "tokens_seen": 1347371008 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002988665997993982, + "loss": 2.8126, + "theoretical_loss": 3.549837276499706, + "tokens_seen": 1347436544 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002988565697091274, + "loss": 2.7338, + "theoretical_loss": 3.5498217390944093, + "tokens_seen": 1347502080 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029884653961885656, + "loss": 2.771, + "theoretical_loss": 3.549806202656333, + "tokens_seen": 1347567616 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029883650952858574, + "loss": 2.8683, + "theoretical_loss": 3.549790667185369, + "tokens_seen": 1347633152 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002988264794383149, + "loss": 2.8237, + "theoretical_loss": 3.5497751326814115, + "tokens_seen": 1347698688 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029881644934804416, + "loss": 2.7808, + "theoretical_loss": 3.5497595991443527, + "tokens_seen": 1347764224 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002988064192577733, + "loss": 2.7247, + "theoretical_loss": 3.5497440665740854, + "tokens_seen": 1347829760 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002987963891675025, + "loss": 2.7502, + "theoretical_loss": 3.549728534970502, + "tokens_seen": 1347895296 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002987863590772317, + "loss": 2.8114, + "theoretical_loss": 3.5497130043334963, + "tokens_seen": 1347960832 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002987763289869609, + "loss": 2.6359, + "theoretical_loss": 3.5496974746629606, + "tokens_seen": 1348026368 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002987662988966901, + "loss": 2.8255, + "theoretical_loss": 3.5496819459587874, + "tokens_seen": 1348091904 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029875626880641925, + "loss": 2.7237, + "theoretical_loss": 3.54966641822087, + "tokens_seen": 1348157440 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002987462387161485, + "loss": 2.746, + "theoretical_loss": 3.549650891449102, + "tokens_seen": 1348222976 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3195380, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.992070436477661, + "objective/train/theoretical_loss": 3.5496353656433746, + "objective/train/tokens_used": 1368748512, + "theoretical_loss": 3.5496353656433746, + "tokens_seen": 1348288512 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029873620862587766, + "loss": 2.8251, + "theoretical_loss": 3.5496353656433746, + "tokens_seen": 1348288512 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029872617853560684, + "loss": 2.7754, + "theoretical_loss": 3.549619840803582, + "tokens_seen": 1348354048 + }, + { + "epoch": 16.02, + "learning_rate": 0.000298716148445336, + "loss": 2.8069, + "theoretical_loss": 3.549604316929617, + "tokens_seen": 1348419584 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002987061183550652, + "loss": 2.7541, + "theoretical_loss": 3.549588794021373, + "tokens_seen": 1348485120 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002986960882647944, + "loss": 2.7873, + "theoretical_loss": 3.5495732720787423, + "tokens_seen": 1348550656 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002986860581745236, + "loss": 2.7301, + "theoretical_loss": 3.549557751101618, + "tokens_seen": 1348616192 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029867602808425275, + "loss": 2.8081, + "theoretical_loss": 3.5495422310898928, + "tokens_seen": 1348681728 + }, + { + "epoch": 16.02, + "learning_rate": 0.000298665997993982, + "loss": 2.7645, + "theoretical_loss": 3.549526712043461, + "tokens_seen": 1348747264 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002986559679037111, + "loss": 2.7972, + "theoretical_loss": 3.549511193962214, + "tokens_seen": 1348812800 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029864593781344035, + "loss": 2.8515, + "theoretical_loss": 3.549495676846046, + "tokens_seen": 1348878336 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029863590772316953, + "loss": 2.7787, + "theoretical_loss": 3.5494801606948503, + "tokens_seen": 1348943872 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002986258776328987, + "loss": 2.8096, + "theoretical_loss": 3.549464645508519, + "tokens_seen": 1349009408 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002986158475426279, + "loss": 2.8202, + "theoretical_loss": 3.549449131286946, + "tokens_seen": 1349074944 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029860581745235707, + "loss": 2.7355, + "theoretical_loss": 3.549433618030024, + "tokens_seen": 1349140480 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029859578736208625, + "loss": 2.7824, + "theoretical_loss": 3.5494181057376464, + "tokens_seen": 1349206016 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002985857572718155, + "loss": 2.8055, + "theoretical_loss": 3.5494025944097065, + "tokens_seen": 1349271552 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002985757271815446, + "loss": 2.8618, + "theoretical_loss": 3.5493870840460975, + "tokens_seen": 1349337088 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029856569709127385, + "loss": 2.8556, + "theoretical_loss": 3.5493715746467123, + "tokens_seen": 1349402624 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029855566700100303, + "loss": 2.7253, + "theoretical_loss": 3.549356066211444, + "tokens_seen": 1349468160 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002985456369107322, + "loss": 2.8309, + "theoretical_loss": 3.5493405587401865, + "tokens_seen": 1349533696 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002985356068204614, + "loss": 2.8178, + "theoretical_loss": 3.549325052232833, + "tokens_seen": 1349599232 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002985255767301906, + "loss": 2.896, + "theoretical_loss": 3.5493095466892757, + "tokens_seen": 1349664768 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029851554663991976, + "loss": 2.7302, + "theoretical_loss": 3.549294042109409, + "tokens_seen": 1349730304 + }, + { + "epoch": 16.02, + "learning_rate": 0.000298505516549649, + "loss": 2.8176, + "theoretical_loss": 3.549278538493126, + "tokens_seen": 1349795840 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002984954864593781, + "loss": 2.7546, + "theoretical_loss": 3.54926303584032, + "tokens_seen": 1349861376 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3200290, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8715593814849854, + "objective/train/theoretical_loss": 3.5492475341508847, + "objective/train/tokens_used": 1370386912, + "theoretical_loss": 3.5492475341508847, + "tokens_seen": 1349926912 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029848545636910735, + "loss": 2.8237, + "theoretical_loss": 3.5492475341508847, + "tokens_seen": 1349926912 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002984754262788365, + "loss": 2.7359, + "theoretical_loss": 3.5492320334247123, + "tokens_seen": 1349992448 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002984653961885657, + "loss": 2.8074, + "theoretical_loss": 3.5492165336616974, + "tokens_seen": 1350057984 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002984553660982949, + "loss": 2.8099, + "theoretical_loss": 3.549201034861733, + "tokens_seen": 1350123520 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002984453360080241, + "loss": 2.6854, + "theoretical_loss": 3.5491855370247123, + "tokens_seen": 1350189056 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029843530591775326, + "loss": 2.7871, + "theoretical_loss": 3.549170040150529, + "tokens_seen": 1350254592 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002984252758274825, + "loss": 2.7615, + "theoretical_loss": 3.5491545442390766, + "tokens_seen": 1350320128 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002984152457372116, + "loss": 2.8005, + "theoretical_loss": 3.549139049290248, + "tokens_seen": 1350385664 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029840521564694086, + "loss": 2.7788, + "theoretical_loss": 3.5491235553039377, + "tokens_seen": 1350451200 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029839518555667, + "loss": 2.8053, + "theoretical_loss": 3.5491080622800384, + "tokens_seen": 1350516736 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002983851554663992, + "loss": 2.8459, + "theoretical_loss": 3.549092570218444, + "tokens_seen": 1350582272 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002983751253761284, + "loss": 2.8334, + "theoretical_loss": 3.5490770791190474, + "tokens_seen": 1350647808 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002983650952858576, + "loss": 2.7367, + "theoretical_loss": 3.5490615889817434, + "tokens_seen": 1350713344 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029835506519558676, + "loss": 2.7559, + "theoretical_loss": 3.549046099806424, + "tokens_seen": 1350778880 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029834503510531594, + "loss": 2.8397, + "theoretical_loss": 3.5490306115929844, + "tokens_seen": 1350844416 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002983350050150451, + "loss": 2.7812, + "theoretical_loss": 3.5490151243413166, + "tokens_seen": 1350909952 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029832497492477436, + "loss": 2.8676, + "theoretical_loss": 3.548999638051316, + "tokens_seen": 1350975488 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002983149448345035, + "loss": 2.8622, + "theoretical_loss": 3.5489841527228743, + "tokens_seen": 1351041024 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002983049147442327, + "loss": 2.776, + "theoretical_loss": 3.5489686683558865, + "tokens_seen": 1351106560 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029829488465396185, + "loss": 2.7984, + "theoretical_loss": 3.5489531849502463, + "tokens_seen": 1351172096 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002982848545636911, + "loss": 2.7995, + "theoretical_loss": 3.548937702505847, + "tokens_seen": 1351237632 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029827482447342026, + "loss": 2.7766, + "theoretical_loss": 3.5489222210225817, + "tokens_seen": 1351303168 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029826479438314945, + "loss": 2.7831, + "theoretical_loss": 3.548906740500345, + "tokens_seen": 1351368704 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002982547642928786, + "loss": 2.8679, + "theoretical_loss": 3.5488912609390306, + "tokens_seen": 1351434240 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029824473420260786, + "loss": 2.8021, + "theoretical_loss": 3.548875782338532, + "tokens_seen": 1351499776 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3204150, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.627852201461792, + "objective/train/theoretical_loss": 3.548860304698743, + "objective/train/tokens_used": 1372025312, + "theoretical_loss": 3.548860304698743, + "tokens_seen": 1351565312 + }, + { + "epoch": 16.02, + "learning_rate": 0.000298234704112337, + "loss": 2.7117, + "theoretical_loss": 3.548860304698743, + "tokens_seen": 1351565312 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002982246740220662, + "loss": 2.6851, + "theoretical_loss": 3.548844828019557, + "tokens_seen": 1351630848 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029821464393179535, + "loss": 2.7921, + "theoretical_loss": 3.548829352300869, + "tokens_seen": 1351696384 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002982046138415246, + "loss": 2.9198, + "theoretical_loss": 3.548813877542571, + "tokens_seen": 1351761920 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029819458375125377, + "loss": 2.7755, + "theoretical_loss": 3.5487984037445583, + "tokens_seen": 1351827456 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029818455366098295, + "loss": 2.8173, + "theoretical_loss": 3.5487829309067247, + "tokens_seen": 1351892992 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029817452357071213, + "loss": 2.7838, + "theoretical_loss": 3.5487674590289635, + "tokens_seen": 1351958528 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002981644934804413, + "loss": 2.7262, + "theoretical_loss": 3.548751988111169, + "tokens_seen": 1352024064 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002981544633901705, + "loss": 2.7383, + "theoretical_loss": 3.548736518153235, + "tokens_seen": 1352089600 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029814443329989973, + "loss": 2.9108, + "theoretical_loss": 3.5487210491550547, + "tokens_seen": 1352155136 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029813440320962885, + "loss": 2.7903, + "theoretical_loss": 3.548705581116523, + "tokens_seen": 1352220672 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002981243731193581, + "loss": 2.8893, + "theoretical_loss": 3.5486901140375338, + "tokens_seen": 1352286208 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002981143430290872, + "loss": 2.7213, + "theoretical_loss": 3.5486746479179807, + "tokens_seen": 1352351744 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029810431293881645, + "loss": 2.7917, + "theoretical_loss": 3.5486591827577576, + "tokens_seen": 1352417280 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029809428284854563, + "loss": 2.6841, + "theoretical_loss": 3.548643718556759, + "tokens_seen": 1352482816 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002980842527582748, + "loss": 2.8355, + "theoretical_loss": 3.5486282553148785, + "tokens_seen": 1352548352 + }, + { + "epoch": 16.02, + "learning_rate": 0.000298074222668004, + "loss": 2.7841, + "theoretical_loss": 3.5486127930320106, + "tokens_seen": 1352613888 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029806419257773323, + "loss": 2.8042, + "theoretical_loss": 3.5485973317080486, + "tokens_seen": 1352679424 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029805416248746236, + "loss": 2.7576, + "theoretical_loss": 3.5485818713428876, + "tokens_seen": 1352744960 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002980441323971916, + "loss": 2.8052, + "theoretical_loss": 3.548566411936421, + "tokens_seen": 1352810496 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002980341023069208, + "loss": 2.8504, + "theoretical_loss": 3.548550953488543, + "tokens_seen": 1352876032 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029802407221664996, + "loss": 2.7887, + "theoretical_loss": 3.5485354959991477, + "tokens_seen": 1352941568 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002980140421263792, + "loss": 2.7078, + "theoretical_loss": 3.548520039468129, + "tokens_seen": 1353007104 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002980040120361083, + "loss": 2.8499, + "theoretical_loss": 3.548504583895382, + "tokens_seen": 1353072640 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029799398194583755, + "loss": 2.8461, + "theoretical_loss": 3.5484891292808003, + "tokens_seen": 1353138176 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3206949, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8383185863494873, + "objective/train/theoretical_loss": 3.5484736756242774, + "objective/train/tokens_used": 1373663712, + "theoretical_loss": 3.5484736756242774, + "tokens_seen": 1353203712 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002979839518555667, + "loss": 2.7732, + "theoretical_loss": 3.5484736756242774, + "tokens_seen": 1353203712 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002979739217652959, + "loss": 2.8141, + "theoretical_loss": 3.548458222925709, + "tokens_seen": 1353269248 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002979638916750251, + "loss": 2.8825, + "theoretical_loss": 3.548442771184988, + "tokens_seen": 1353334784 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002979538615847543, + "loss": 2.7459, + "theoretical_loss": 3.548427320402009, + "tokens_seen": 1353400320 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029794383149448346, + "loss": 2.7496, + "theoretical_loss": 3.548411870576667, + "tokens_seen": 1353465856 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002979338014042127, + "loss": 2.7563, + "theoretical_loss": 3.5483964217088557, + "tokens_seen": 1353531392 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002979237713139418, + "loss": 2.7799, + "theoretical_loss": 3.548380973798469, + "tokens_seen": 1353596928 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029791374122367106, + "loss": 2.7761, + "theoretical_loss": 3.5483655268454015, + "tokens_seen": 1353662464 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002979037111334002, + "loss": 2.7743, + "theoretical_loss": 3.548350080849548, + "tokens_seen": 1353728000 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002978936810431294, + "loss": 2.6891, + "theoretical_loss": 3.5483346358108028, + "tokens_seen": 1353793536 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002978836509528586, + "loss": 2.8329, + "theoretical_loss": 3.5483191917290595, + "tokens_seen": 1353859072 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002978736208625878, + "loss": 2.8969, + "theoretical_loss": 3.548303748604213, + "tokens_seen": 1353924608 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029786359077231696, + "loss": 2.7062, + "theoretical_loss": 3.5482883064361577, + "tokens_seen": 1353990144 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029785356068204614, + "loss": 2.8033, + "theoretical_loss": 3.5482728652247877, + "tokens_seen": 1354055680 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002978435305917753, + "loss": 2.8314, + "theoretical_loss": 3.5482574249699983, + "tokens_seen": 1354121216 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029783350050150456, + "loss": 2.7022, + "theoretical_loss": 3.548241985671683, + "tokens_seen": 1354186752 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002978234704112337, + "loss": 2.7698, + "theoretical_loss": 3.5482265473297363, + "tokens_seen": 1354252288 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002978134403209629, + "loss": 2.8403, + "theoretical_loss": 3.548211109944053, + "tokens_seen": 1354317824 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029780341023069205, + "loss": 2.7842, + "theoretical_loss": 3.5481956735145284, + "tokens_seen": 1354383360 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002977933801404213, + "loss": 2.781, + "theoretical_loss": 3.5481802380410556, + "tokens_seen": 1354448896 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029778335005015046, + "loss": 2.7781, + "theoretical_loss": 3.5481648035235294, + "tokens_seen": 1354514432 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029777331995987965, + "loss": 2.7137, + "theoretical_loss": 3.548149369961845, + "tokens_seen": 1354579968 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029776328986960883, + "loss": 2.8331, + "theoretical_loss": 3.548133937355897, + "tokens_seen": 1354645504 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029775325977933806, + "loss": 2.8294, + "theoretical_loss": 3.5481185057055793, + "tokens_seen": 1354711040 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002977432296890672, + "loss": 2.7029, + "theoretical_loss": 3.548103075010787, + "tokens_seen": 1354776576 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3211571, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.983362913131714, + "objective/train/theoretical_loss": 3.5480876452714143, + "objective/train/tokens_used": 1375302112, + "theoretical_loss": 3.5480876452714143, + "tokens_seen": 1354842112 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002977331995987964, + "loss": 2.7393, + "theoretical_loss": 3.5480876452714143, + "tokens_seen": 1354842112 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029772316950852555, + "loss": 2.6964, + "theoretical_loss": 3.5480722164873564, + "tokens_seen": 1354907648 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002977131394182548, + "loss": 2.735, + "theoretical_loss": 3.5480567886585073, + "tokens_seen": 1354973184 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029770310932798397, + "loss": 2.7514, + "theoretical_loss": 3.5480413617847617, + "tokens_seen": 1355038720 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029769307923771315, + "loss": 2.8487, + "theoretical_loss": 3.5480259358660153, + "tokens_seen": 1355104256 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029768304914744233, + "loss": 2.9238, + "theoretical_loss": 3.548010510902162, + "tokens_seen": 1355169792 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002976730190571715, + "loss": 2.8121, + "theoretical_loss": 3.547995086893096, + "tokens_seen": 1355235328 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002976629889669007, + "loss": 2.7807, + "theoretical_loss": 3.547979663838713, + "tokens_seen": 1355300864 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029765295887662993, + "loss": 2.8074, + "theoretical_loss": 3.5479642417389075, + "tokens_seen": 1355366400 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029764292878635905, + "loss": 2.8099, + "theoretical_loss": 3.547948820593574, + "tokens_seen": 1355431936 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002976328986960883, + "loss": 2.8638, + "theoretical_loss": 3.5479334004026075, + "tokens_seen": 1355497472 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002976228686058174, + "loss": 2.7437, + "theoretical_loss": 3.5479179811659023, + "tokens_seen": 1355563008 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029761283851554665, + "loss": 2.785, + "theoretical_loss": 3.547902562883354, + "tokens_seen": 1355628544 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029760280842527583, + "loss": 2.7963, + "theoretical_loss": 3.5478871455548573, + "tokens_seen": 1355694080 + }, + { + "epoch": 16.02, + "learning_rate": 0.000297592778335005, + "loss": 2.7581, + "theoretical_loss": 3.547871729180306, + "tokens_seen": 1355759616 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002975827482447342, + "loss": 2.826, + "theoretical_loss": 3.5478563137595964, + "tokens_seen": 1355825152 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029757271815446343, + "loss": 2.7781, + "theoretical_loss": 3.547840899292623, + "tokens_seen": 1355890688 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029756268806419256, + "loss": 2.822, + "theoretical_loss": 3.5478254857792804, + "tokens_seen": 1355956224 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002975526579739218, + "loss": 2.7921, + "theoretical_loss": 3.547810073219463, + "tokens_seen": 1356021760 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002975426278836509, + "loss": 2.7413, + "theoretical_loss": 3.5477946616130667, + "tokens_seen": 1356087296 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029753259779338016, + "loss": 2.7918, + "theoretical_loss": 3.5477792509599864, + "tokens_seen": 1356152832 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029752256770310934, + "loss": 2.7603, + "theoretical_loss": 3.547763841260116, + "tokens_seen": 1356218368 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002975125376128385, + "loss": 2.7488, + "theoretical_loss": 3.5477484325133517, + "tokens_seen": 1356283904 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002975025075225677, + "loss": 2.8432, + "theoretical_loss": 3.5477330247195877, + "tokens_seen": 1356349440 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002974924774322969, + "loss": 2.7319, + "theoretical_loss": 3.5477176178787198, + "tokens_seen": 1356414976 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7913577556610107, + "objective/train/theoretical_loss": 3.547702211990642, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.547702211990642, + "tokens_seen": 1356480512 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029748244734202606, + "loss": 2.764, + "theoretical_loss": 3.547702211990642, + "tokens_seen": 1356480512 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002974724172517553, + "loss": 2.7275, + "theoretical_loss": 3.5476868070552503, + "tokens_seen": 1356546048 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002974623871614844, + "loss": 2.7393, + "theoretical_loss": 3.5476714030724392, + "tokens_seen": 1356611584 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029745235707121366, + "loss": 2.7731, + "theoretical_loss": 3.547656000042104, + "tokens_seen": 1356677120 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002974423269809428, + "loss": 2.9228, + "theoretical_loss": 3.54764059796414, + "tokens_seen": 1356742656 + }, + { + "epoch": 16.02, + "learning_rate": 0.000297432296890672, + "loss": 2.7699, + "theoretical_loss": 3.5476251968384416, + "tokens_seen": 1356808192 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002974222668004012, + "loss": 2.91, + "theoretical_loss": 3.5476097966649047, + "tokens_seen": 1356873728 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002974122367101304, + "loss": 2.8351, + "theoretical_loss": 3.547594397443424, + "tokens_seen": 1356939264 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029740220661985956, + "loss": 2.6651, + "theoretical_loss": 3.5475789991738953, + "tokens_seen": 1357004800 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002973921765295888, + "loss": 2.7918, + "theoretical_loss": 3.547563601856213, + "tokens_seen": 1357070336 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002973821464393179, + "loss": 2.8792, + "theoretical_loss": 3.5475482054902723, + "tokens_seen": 1357135872 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029737211634904716, + "loss": 2.7288, + "theoretical_loss": 3.5475328100759693, + "tokens_seen": 1357201408 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002973620862587763, + "loss": 2.8229, + "theoretical_loss": 3.547517415613198, + "tokens_seen": 1357266944 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002973520561685055, + "loss": 2.8186, + "theoretical_loss": 3.5475020221018547, + "tokens_seen": 1357332480 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002973420260782347, + "loss": 2.8711, + "theoretical_loss": 3.5474866295418344, + "tokens_seen": 1357398016 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002973319959879639, + "loss": 2.8015, + "theoretical_loss": 3.5474712379330318, + "tokens_seen": 1357463552 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029732196589769307, + "loss": 2.7967, + "theoretical_loss": 3.547455847275343, + "tokens_seen": 1357529088 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029731193580742225, + "loss": 2.7764, + "theoretical_loss": 3.547440457568663, + "tokens_seen": 1357594624 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029730190571715143, + "loss": 2.7923, + "theoretical_loss": 3.5474250688128866, + "tokens_seen": 1357660160 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029729187562688067, + "loss": 2.8131, + "theoretical_loss": 3.54740968100791, + "tokens_seen": 1357725696 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029728184553660985, + "loss": 2.8075, + "theoretical_loss": 3.5473942941536283, + "tokens_seen": 1357791232 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029727181544633903, + "loss": 2.7461, + "theoretical_loss": 3.5473789082499367, + "tokens_seen": 1357856768 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029726178535606826, + "loss": 2.8074, + "theoretical_loss": 3.54736352329673, + "tokens_seen": 1357922304 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002972517552657974, + "loss": 2.7723, + "theoretical_loss": 3.5473481392939057, + "tokens_seen": 1357987840 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002972417251755266, + "loss": 2.8602, + "theoretical_loss": 3.5473327562413566, + "tokens_seen": 1358053376 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7404630184173584, + "objective/train/theoretical_loss": 3.54731737413898, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.54731737413898, + "tokens_seen": 1358118912 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029723169508525575, + "loss": 2.7907, + "theoretical_loss": 3.54731737413898, + "tokens_seen": 1358118912 + }, + { + "epoch": 16.02, + "learning_rate": 0.000297221664994985, + "loss": 2.7939, + "theoretical_loss": 3.5473019929866707, + "tokens_seen": 1358184448 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029721163490471417, + "loss": 2.5678, + "theoretical_loss": 3.5472866127843243, + "tokens_seen": 1358249984 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029720160481444335, + "loss": 2.8411, + "theoretical_loss": 3.547271233531836, + "tokens_seen": 1358315520 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029719157472417253, + "loss": 2.7406, + "theoretical_loss": 3.5472558552291016, + "tokens_seen": 1358381056 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002971815446339017, + "loss": 2.8221, + "theoretical_loss": 3.5472404778760165, + "tokens_seen": 1358446592 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002971715145436309, + "loss": 2.7015, + "theoretical_loss": 3.547225101472476, + "tokens_seen": 1358512128 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029716148445336013, + "loss": 2.7932, + "theoretical_loss": 3.5472097260183766, + "tokens_seen": 1358577664 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029715145436308926, + "loss": 2.8766, + "theoretical_loss": 3.5471943515136126, + "tokens_seen": 1358643200 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002971414242728185, + "loss": 2.826, + "theoretical_loss": 3.5471789779580805, + "tokens_seen": 1358708736 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002971313941825476, + "loss": 2.8139, + "theoretical_loss": 3.5471636053516757, + "tokens_seen": 1358774272 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029712136409227685, + "loss": 2.8083, + "theoretical_loss": 3.547148233694294, + "tokens_seen": 1358839808 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029711133400200603, + "loss": 2.7865, + "theoretical_loss": 3.5471328629858307, + "tokens_seen": 1358905344 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002971013039117352, + "loss": 2.7885, + "theoretical_loss": 3.5471174932261818, + "tokens_seen": 1358970880 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002970912738214644, + "loss": 2.8712, + "theoretical_loss": 3.5471021244152423, + "tokens_seen": 1359036416 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029708124373119363, + "loss": 2.7899, + "theoretical_loss": 3.5470867565529085, + "tokens_seen": 1359101952 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029707121364092276, + "loss": 2.8564, + "theoretical_loss": 3.5470713896390764, + "tokens_seen": 1359167488 + }, + { + "epoch": 16.02, + "learning_rate": 0.000297061183550652, + "loss": 2.6942, + "theoretical_loss": 3.547056023673641, + "tokens_seen": 1359233024 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002970511534603811, + "loss": 2.7757, + "theoretical_loss": 3.547040658656498, + "tokens_seen": 1359298560 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029704112337011036, + "loss": 2.7376, + "theoretical_loss": 3.5470252945875442, + "tokens_seen": 1359364096 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029703109327983954, + "loss": 2.8027, + "theoretical_loss": 3.5470099314666745, + "tokens_seen": 1359429632 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002970210631895687, + "loss": 2.8386, + "theoretical_loss": 3.5469945692937848, + "tokens_seen": 1359495168 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002970110330992979, + "loss": 2.8891, + "theoretical_loss": 3.546979208068771, + "tokens_seen": 1359560704 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002970010030090271, + "loss": 2.8194, + "theoretical_loss": 3.546963847791529, + "tokens_seen": 1359626240 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029699097291875626, + "loss": 2.8015, + "theoretical_loss": 3.5469484884619544, + "tokens_seen": 1359691776 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7387497425079346, + "objective/train/theoretical_loss": 3.5469331300799434, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.5469331300799434, + "tokens_seen": 1359757312 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002969809428284855, + "loss": 2.8058, + "theoretical_loss": 3.5469331300799434, + "tokens_seen": 1359757312 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002969709127382146, + "loss": 2.7541, + "theoretical_loss": 3.5469177726453913, + "tokens_seen": 1359822848 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029696088264794386, + "loss": 2.802, + "theoretical_loss": 3.546902416158195, + "tokens_seen": 1359888384 + }, + { + "epoch": 16.02, + "learning_rate": 0.000296950852557673, + "loss": 2.742, + "theoretical_loss": 3.5468870606182494, + "tokens_seen": 1359953920 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002969408224674022, + "loss": 2.8273, + "theoretical_loss": 3.546871706025451, + "tokens_seen": 1360019456 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002969307923771314, + "loss": 2.8013, + "theoretical_loss": 3.5468563523796957, + "tokens_seen": 1360084992 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002969207622868606, + "loss": 2.779, + "theoretical_loss": 3.546840999680879, + "tokens_seen": 1360150528 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029691073219658976, + "loss": 2.637, + "theoretical_loss": 3.5468256479288973, + "tokens_seen": 1360216064 + }, + { + "epoch": 16.02, + "learning_rate": 0.000296900702106319, + "loss": 2.8068, + "theoretical_loss": 3.546810297123647, + "tokens_seen": 1360281600 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002968906720160481, + "loss": 2.8233, + "theoretical_loss": 3.546794947265023, + "tokens_seen": 1360347136 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029688064192577736, + "loss": 2.7797, + "theoretical_loss": 3.5467795983529222, + "tokens_seen": 1360412672 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002968706118355065, + "loss": 2.7811, + "theoretical_loss": 3.5467642503872403, + "tokens_seen": 1360478208 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002968605817452357, + "loss": 2.6522, + "theoretical_loss": 3.546748903367874, + "tokens_seen": 1360543744 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002968505516549649, + "loss": 2.7332, + "theoretical_loss": 3.5467335572947176, + "tokens_seen": 1360609280 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002968405215646941, + "loss": 2.7333, + "theoretical_loss": 3.5467182121676695, + "tokens_seen": 1360674816 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029683049147442327, + "loss": 2.7268, + "theoretical_loss": 3.546702867986624, + "tokens_seen": 1360740352 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029682046138415245, + "loss": 2.7907, + "theoretical_loss": 3.546687524751478, + "tokens_seen": 1360805888 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029681043129388163, + "loss": 2.7846, + "theoretical_loss": 3.546672182462128, + "tokens_seen": 1360871424 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029680040120361087, + "loss": 2.7194, + "theoretical_loss": 3.5466568411184696, + "tokens_seen": 1360936960 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029679037111334, + "loss": 2.7594, + "theoretical_loss": 3.5466415007203995, + "tokens_seen": 1361002496 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029678034102306923, + "loss": 2.8708, + "theoretical_loss": 3.5466261612678127, + "tokens_seen": 1361068032 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002967703109327984, + "loss": 2.7983, + "theoretical_loss": 3.5466108227606066, + "tokens_seen": 1361133568 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002967602808425276, + "loss": 2.7826, + "theoretical_loss": 3.5465954851986767, + "tokens_seen": 1361199104 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029675025075225677, + "loss": 2.7847, + "theoretical_loss": 3.5465801485819197, + "tokens_seen": 1361264640 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029674022066198595, + "loss": 2.8815, + "theoretical_loss": 3.546564812910232, + "tokens_seen": 1361330176 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.746945858001709, + "objective/train/theoretical_loss": 3.5465494781835094, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.5465494781835094, + "tokens_seen": 1361395712 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029673019057171513, + "loss": 2.7892, + "theoretical_loss": 3.5465494781835094, + "tokens_seen": 1361395712 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029672016048144437, + "loss": 2.7524, + "theoretical_loss": 3.5465341444016483, + "tokens_seen": 1361461248 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002967101303911735, + "loss": 2.7521, + "theoretical_loss": 3.546518811564545, + "tokens_seen": 1361526784 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029670010030090273, + "loss": 2.7966, + "theoretical_loss": 3.5465034796720962, + "tokens_seen": 1361592320 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029669007021063186, + "loss": 2.849, + "theoretical_loss": 3.5464881487241975, + "tokens_seen": 1361657856 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002966800401203611, + "loss": 2.8176, + "theoretical_loss": 3.546472818720746, + "tokens_seen": 1361723392 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002966700100300903, + "loss": 2.7609, + "theoretical_loss": 3.5464574896616377, + "tokens_seen": 1361788928 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029665997993981946, + "loss": 2.8325, + "theoretical_loss": 3.5464421615467687, + "tokens_seen": 1361854464 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029664994984954864, + "loss": 2.7979, + "theoretical_loss": 3.546426834376036, + "tokens_seen": 1361920000 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002966399197592778, + "loss": 2.7986, + "theoretical_loss": 3.546411508149336, + "tokens_seen": 1361985536 + }, + { + "epoch": 16.02, + "learning_rate": 0.000296629889669007, + "loss": 2.805, + "theoretical_loss": 3.5463961828665647, + "tokens_seen": 1362051072 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029661985957873623, + "loss": 2.731, + "theoretical_loss": 3.546380858527619, + "tokens_seen": 1362116608 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029660982948846536, + "loss": 2.9186, + "theoretical_loss": 3.546365535132394, + "tokens_seen": 1362182144 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002965997993981946, + "loss": 2.7659, + "theoretical_loss": 3.546350212680789, + "tokens_seen": 1362247680 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002965897693079238, + "loss": 2.7927, + "theoretical_loss": 3.5463348911726973, + "tokens_seen": 1362313216 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029657973921765296, + "loss": 2.785, + "theoretical_loss": 3.5463195706080173, + "tokens_seen": 1362378752 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029656970912738214, + "loss": 2.8115, + "theoretical_loss": 3.546304250986646, + "tokens_seen": 1362444288 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002965596790371113, + "loss": 2.8358, + "theoretical_loss": 3.546288932308478, + "tokens_seen": 1362509824 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002965496489468405, + "loss": 2.7061, + "theoretical_loss": 3.546273614573411, + "tokens_seen": 1362575360 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029653961885656974, + "loss": 2.8893, + "theoretical_loss": 3.5462582977813417, + "tokens_seen": 1362640896 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002965295887662989, + "loss": 2.7308, + "theoretical_loss": 3.546242981932167, + "tokens_seen": 1362706432 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002965195586760281, + "loss": 2.6791, + "theoretical_loss": 3.5462276670257826, + "tokens_seen": 1362771968 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002965095285857573, + "loss": 2.8976, + "theoretical_loss": 3.546212353062086, + "tokens_seen": 1362837504 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029649949849548646, + "loss": 2.8615, + "theoretical_loss": 3.5461970400409726, + "tokens_seen": 1362903040 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002964894684052157, + "loss": 2.8593, + "theoretical_loss": 3.546181727962341, + "tokens_seen": 1362968576 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8163931369781494, + "objective/train/theoretical_loss": 3.5461664168260856, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.5461664168260856, + "tokens_seen": 1363034112 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002964794383149448, + "loss": 2.699, + "theoretical_loss": 3.5461664168260856, + "tokens_seen": 1363034112 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029646940822467406, + "loss": 2.8319, + "theoretical_loss": 3.546151106632105, + "tokens_seen": 1363099648 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002964593781344032, + "loss": 2.7835, + "theoretical_loss": 3.5461357973802947, + "tokens_seen": 1363165184 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002964493480441324, + "loss": 2.8595, + "theoretical_loss": 3.546120489070552, + "tokens_seen": 1363230720 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002964393179538616, + "loss": 2.7691, + "theoretical_loss": 3.546105181702774, + "tokens_seen": 1363296256 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002964292878635908, + "loss": 2.8819, + "theoretical_loss": 3.5460898752768566, + "tokens_seen": 1363361792 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029641925777331996, + "loss": 2.811, + "theoretical_loss": 3.5460745697926974, + "tokens_seen": 1363427328 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002964092276830492, + "loss": 2.891, + "theoretical_loss": 3.546059265250192, + "tokens_seen": 1363492864 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002963991975927783, + "loss": 2.7695, + "theoretical_loss": 3.5460439616492385, + "tokens_seen": 1363558400 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029638916750250756, + "loss": 2.7271, + "theoretical_loss": 3.5460286589897336, + "tokens_seen": 1363623936 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002963791374122367, + "loss": 2.7851, + "theoretical_loss": 3.546013357271573, + "tokens_seen": 1363689472 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002963691073219659, + "loss": 2.8936, + "theoretical_loss": 3.5459980564946543, + "tokens_seen": 1363755008 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002963590772316951, + "loss": 2.816, + "theoretical_loss": 3.5459827566588746, + "tokens_seen": 1363820544 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002963490471414243, + "loss": 2.7414, + "theoretical_loss": 3.5459674577641307, + "tokens_seen": 1363886080 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029633901705115347, + "loss": 2.8085, + "theoretical_loss": 3.5459521598103194, + "tokens_seen": 1363951616 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029632898696088265, + "loss": 2.8347, + "theoretical_loss": 3.5459368627973378, + "tokens_seen": 1364017152 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029631895687061183, + "loss": 2.7811, + "theoretical_loss": 3.545921566725082, + "tokens_seen": 1364082688 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029630892678034107, + "loss": 2.829, + "theoretical_loss": 3.54590627159345, + "tokens_seen": 1364148224 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002962988966900702, + "loss": 2.8399, + "theoretical_loss": 3.545890977402338, + "tokens_seen": 1364213760 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029628886659979943, + "loss": 2.8546, + "theoretical_loss": 3.545875684151644, + "tokens_seen": 1364279296 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002962788365095286, + "loss": 2.8048, + "theoretical_loss": 3.5458603918412637, + "tokens_seen": 1364344832 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002962688064192578, + "loss": 2.7954, + "theoretical_loss": 3.5458451004710954, + "tokens_seen": 1364410368 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029625877632898697, + "loss": 2.7966, + "theoretical_loss": 3.545829810041035, + "tokens_seen": 1364475904 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029624874623871615, + "loss": 2.7295, + "theoretical_loss": 3.54581452055098, + "tokens_seen": 1364541440 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029623871614844533, + "loss": 2.6622, + "theoretical_loss": 3.545799232000828, + "tokens_seen": 1364606976 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.836733818054199, + "objective/train/theoretical_loss": 3.5457839443904757, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.5457839443904757, + "tokens_seen": 1364672512 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029622868605817457, + "loss": 2.8585, + "theoretical_loss": 3.5457839443904757, + "tokens_seen": 1364672512 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002962186559679037, + "loss": 2.8194, + "theoretical_loss": 3.5457686577198197, + "tokens_seen": 1364738048 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029620862587763293, + "loss": 2.7339, + "theoretical_loss": 3.5457533719887575, + "tokens_seen": 1364803584 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029619859578736206, + "loss": 2.798, + "theoretical_loss": 3.5457380871971864, + "tokens_seen": 1364869120 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002961885656970913, + "loss": 2.8281, + "theoretical_loss": 3.545722803345004, + "tokens_seen": 1364934656 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002961785356068205, + "loss": 2.8297, + "theoretical_loss": 3.5457075204321065, + "tokens_seen": 1365000192 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029616850551654966, + "loss": 2.9049, + "theoretical_loss": 3.5456922384583915, + "tokens_seen": 1365065728 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029615847542627884, + "loss": 2.8299, + "theoretical_loss": 3.5456769574237557, + "tokens_seen": 1365131264 + }, + { + "epoch": 16.02, + "learning_rate": 0.000296148445336008, + "loss": 2.7865, + "theoretical_loss": 3.5456616773280976, + "tokens_seen": 1365196800 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002961384152457372, + "loss": 2.743, + "theoretical_loss": 3.545646398171313, + "tokens_seen": 1365262336 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029612838515546643, + "loss": 2.7856, + "theoretical_loss": 3.5456311199533, + "tokens_seen": 1365327872 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029611835506519556, + "loss": 2.7597, + "theoretical_loss": 3.5456158426739552, + "tokens_seen": 1365393408 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002961083249749248, + "loss": 2.8665, + "theoretical_loss": 3.5456005663331767, + "tokens_seen": 1365458944 + }, + { + "epoch": 16.02, + "learning_rate": 0.000296098294884654, + "loss": 2.7529, + "theoretical_loss": 3.5455852909308616, + "tokens_seen": 1365524480 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029608826479438316, + "loss": 2.88, + "theoretical_loss": 3.5455700164669066, + "tokens_seen": 1365590016 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029607823470411234, + "loss": 2.8368, + "theoretical_loss": 3.54555474294121, + "tokens_seen": 1365655552 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002960682046138415, + "loss": 2.9167, + "theoretical_loss": 3.545539470353668, + "tokens_seen": 1365721088 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002960581745235707, + "loss": 2.6781, + "theoretical_loss": 3.545524198704179, + "tokens_seen": 1365786624 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029604814443329994, + "loss": 2.8758, + "theoretical_loss": 3.54550892799264, + "tokens_seen": 1365852160 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029603811434302906, + "loss": 2.9048, + "theoretical_loss": 3.545493658218948, + "tokens_seen": 1365917696 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002960280842527583, + "loss": 2.8266, + "theoretical_loss": 3.5454783893830006, + "tokens_seen": 1365983232 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002960180541624874, + "loss": 2.7729, + "theoretical_loss": 3.545463121484696, + "tokens_seen": 1366048768 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029600802407221666, + "loss": 2.7399, + "theoretical_loss": 3.54544785452393, + "tokens_seen": 1366114304 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029599799398194584, + "loss": 2.7699, + "theoretical_loss": 3.545432588500602, + "tokens_seen": 1366179840 + }, + { + "epoch": 16.02, + "learning_rate": 0.000295987963891675, + "loss": 2.866, + "theoretical_loss": 3.5454173234146076, + "tokens_seen": 1366245376 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.886842727661133, + "objective/train/theoretical_loss": 3.545402059265846, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.545402059265846, + "tokens_seen": 1366310912 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002959779338014042, + "loss": 2.8468, + "theoretical_loss": 3.545402059265846, + "tokens_seen": 1366310912 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002959679037111334, + "loss": 2.8499, + "theoretical_loss": 3.545386796054214, + "tokens_seen": 1366376448 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029595787362086257, + "loss": 2.7864, + "theoretical_loss": 3.5453715337796083, + "tokens_seen": 1366441984 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002959478435305918, + "loss": 2.8644, + "theoretical_loss": 3.545356272441928, + "tokens_seen": 1366507520 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029593781344032093, + "loss": 2.8186, + "theoretical_loss": 3.545341012041069, + "tokens_seen": 1366573056 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029592778335005016, + "loss": 2.8973, + "theoretical_loss": 3.5453257525769306, + "tokens_seen": 1366638592 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029591775325977935, + "loss": 2.7723, + "theoretical_loss": 3.545310494049409, + "tokens_seen": 1366704128 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029590772316950853, + "loss": 2.847, + "theoretical_loss": 3.5452952364584025, + "tokens_seen": 1366769664 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002958976930792377, + "loss": 2.7708, + "theoretical_loss": 3.5452799798038086, + "tokens_seen": 1366835200 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002958876629889669, + "loss": 2.7434, + "theoretical_loss": 3.5452647240855244, + "tokens_seen": 1366900736 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029587763289869607, + "loss": 2.7834, + "theoretical_loss": 3.5452494693034486, + "tokens_seen": 1366966272 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002958676028084253, + "loss": 2.7284, + "theoretical_loss": 3.5452342154574774, + "tokens_seen": 1367031808 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029585757271815443, + "loss": 2.7084, + "theoretical_loss": 3.5452189625475103, + "tokens_seen": 1367097344 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029584754262788367, + "loss": 2.7688, + "theoretical_loss": 3.5452037105734435, + "tokens_seen": 1367162880 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002958375125376128, + "loss": 2.8004, + "theoretical_loss": 3.5451884595351757, + "tokens_seen": 1367228416 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029582748244734203, + "loss": 2.8498, + "theoretical_loss": 3.545173209432604, + "tokens_seen": 1367293952 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002958174523570712, + "loss": 2.78, + "theoretical_loss": 3.545157960265626, + "tokens_seen": 1367359488 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002958074222668004, + "loss": 2.776, + "theoretical_loss": 3.5451427120341403, + "tokens_seen": 1367425024 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002957973921765296, + "loss": 2.8389, + "theoretical_loss": 3.5451274647380435, + "tokens_seen": 1367490560 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002957873620862588, + "loss": 2.7861, + "theoretical_loss": 3.545112218377235, + "tokens_seen": 1367556096 + }, + { + "epoch": 16.02, + "learning_rate": 0.000295777331995988, + "loss": 2.7449, + "theoretical_loss": 3.545096972951611, + "tokens_seen": 1367621632 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029576730190571717, + "loss": 2.8765, + "theoretical_loss": 3.5450817284610707, + "tokens_seen": 1367687168 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029575727181544635, + "loss": 2.8373, + "theoretical_loss": 3.5450664849055107, + "tokens_seen": 1367752704 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029574724172517553, + "loss": 2.7646, + "theoretical_loss": 3.5450512422848295, + "tokens_seen": 1367818240 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029573721163490477, + "loss": 2.7136, + "theoretical_loss": 3.545036000598925, + "tokens_seen": 1367883776 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.831223249435425, + "objective/train/theoretical_loss": 3.545020759847695, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.545020759847695, + "tokens_seen": 1367949312 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002957271815446339, + "loss": 2.8821, + "theoretical_loss": 3.545020759847695, + "tokens_seen": 1367949312 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029571715145436313, + "loss": 2.7456, + "theoretical_loss": 3.5450055200310375, + "tokens_seen": 1368014848 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029570712136409226, + "loss": 2.7866, + "theoretical_loss": 3.54499028114885, + "tokens_seen": 1368080384 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002956970912738215, + "loss": 2.6659, + "theoretical_loss": 3.544975043201031, + "tokens_seen": 1368145920 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002956870611835507, + "loss": 2.8778, + "theoretical_loss": 3.544959806187478, + "tokens_seen": 1368211456 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029567703109327986, + "loss": 2.7448, + "theoretical_loss": 3.5449445701080897, + "tokens_seen": 1368276992 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029566700100300904, + "loss": 2.8257, + "theoretical_loss": 3.544929334962763, + "tokens_seen": 1368342528 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002956569709127382, + "loss": 2.7727, + "theoretical_loss": 3.544914100751397, + "tokens_seen": 1368408064 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002956469408224674, + "loss": 2.7482, + "theoretical_loss": 3.544898867473889, + "tokens_seen": 1368473600 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029563691073219663, + "loss": 2.7433, + "theoretical_loss": 3.5448836351301374, + "tokens_seen": 1368539136 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029562688064192576, + "loss": 2.872, + "theoretical_loss": 3.5448684037200398, + "tokens_seen": 1368604672 + }, + { + "epoch": 16.02, + "learning_rate": 0.000295616850551655, + "loss": 2.7753, + "theoretical_loss": 3.544853173243495, + "tokens_seen": 1368670208 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002956068204613842, + "loss": 2.8426, + "theoretical_loss": 3.5448379437004, + "tokens_seen": 1368735744 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029559679037111336, + "loss": 2.8575, + "theoretical_loss": 3.544822715090654, + "tokens_seen": 1368801280 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029558676028084254, + "loss": 2.8365, + "theoretical_loss": 3.544807487414155, + "tokens_seen": 1368866816 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002955767301905717, + "loss": 2.869, + "theoretical_loss": 3.5447922606707998, + "tokens_seen": 1368932352 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002955667001003009, + "loss": 2.7095, + "theoretical_loss": 3.544777034860488, + "tokens_seen": 1368997888 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029555667001003014, + "loss": 2.7355, + "theoretical_loss": 3.5447618099831177, + "tokens_seen": 1369063424 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029554663991975926, + "loss": 2.7862, + "theoretical_loss": 3.5447465860385865, + "tokens_seen": 1369128960 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002955366098294885, + "loss": 2.7924, + "theoretical_loss": 3.5447313630267923, + "tokens_seen": 1369194496 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002955265797392176, + "loss": 2.7887, + "theoretical_loss": 3.544716140947634, + "tokens_seen": 1369260032 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029551654964894686, + "loss": 2.769, + "theoretical_loss": 3.54470091980101, + "tokens_seen": 1369325568 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029550651955867604, + "loss": 2.8781, + "theoretical_loss": 3.544685699586818, + "tokens_seen": 1369391104 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002954964894684052, + "loss": 2.8048, + "theoretical_loss": 3.5446704803049567, + "tokens_seen": 1369456640 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002954864593781344, + "loss": 2.8052, + "theoretical_loss": 3.5446552619553238, + "tokens_seen": 1369522176 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.907487392425537, + "objective/train/theoretical_loss": 3.544640044537818, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.544640044537818, + "tokens_seen": 1369587712 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002954764292878636, + "loss": 2.8363, + "theoretical_loss": 3.544640044537818, + "tokens_seen": 1369587712 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029546639919759277, + "loss": 2.7458, + "theoretical_loss": 3.5446248280523376, + "tokens_seen": 1369653248 + }, + { + "epoch": 16.02, + "learning_rate": 0.000295456369107322, + "loss": 2.8049, + "theoretical_loss": 3.544609612498781, + "tokens_seen": 1369718784 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029544633901705113, + "loss": 2.8361, + "theoretical_loss": 3.544594397877046, + "tokens_seen": 1369784320 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029543630892678037, + "loss": 2.7778, + "theoretical_loss": 3.5445791841870316, + "tokens_seen": 1369849856 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029542627883650955, + "loss": 2.871, + "theoretical_loss": 3.5445639714286354, + "tokens_seen": 1369915392 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029541624874623873, + "loss": 2.6845, + "theoretical_loss": 3.5445487596017564, + "tokens_seen": 1369980928 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002954062186559679, + "loss": 2.731, + "theoretical_loss": 3.5445335487062932, + "tokens_seen": 1370046464 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002953961885656971, + "loss": 2.8263, + "theoretical_loss": 3.5445183387421437, + "tokens_seen": 1370112000 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029538615847542627, + "loss": 2.8186, + "theoretical_loss": 3.5445031297092067, + "tokens_seen": 1370177536 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002953761283851555, + "loss": 2.8887, + "theoretical_loss": 3.544487921607381, + "tokens_seen": 1370243072 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029536609829488463, + "loss": 2.8692, + "theoretical_loss": 3.5444727144365635, + "tokens_seen": 1370308608 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029535606820461387, + "loss": 2.7979, + "theoretical_loss": 3.544457508196654, + "tokens_seen": 1370374144 + }, + { + "epoch": 16.02, + "learning_rate": 0.000295346038114343, + "loss": 2.8074, + "theoretical_loss": 3.544442302887551, + "tokens_seen": 1370439680 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029533600802407223, + "loss": 2.7619, + "theoretical_loss": 3.5444270985091526, + "tokens_seen": 1370505216 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002953259779338014, + "loss": 2.8525, + "theoretical_loss": 3.5444118950613577, + "tokens_seen": 1370570752 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002953159478435306, + "loss": 2.8025, + "theoretical_loss": 3.544396692544065, + "tokens_seen": 1370636288 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002953059177532598, + "loss": 2.8415, + "theoretical_loss": 3.5443814909571723, + "tokens_seen": 1370701824 + }, + { + "epoch": 16.02, + "learning_rate": 0.000295295887662989, + "loss": 2.827, + "theoretical_loss": 3.5443662903005784, + "tokens_seen": 1370767360 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029528585757271814, + "loss": 2.8763, + "theoretical_loss": 3.5443510905741826, + "tokens_seen": 1370832896 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029527582748244737, + "loss": 2.795, + "theoretical_loss": 3.5443358917778824, + "tokens_seen": 1370898432 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002952657973921765, + "loss": 2.7928, + "theoretical_loss": 3.5443206939115774, + "tokens_seen": 1370963968 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029525576730190573, + "loss": 2.8678, + "theoretical_loss": 3.5443054969751655, + "tokens_seen": 1371029504 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002952457372116349, + "loss": 2.835, + "theoretical_loss": 3.5442903009685462, + "tokens_seen": 1371095040 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002952357071213641, + "loss": 2.7292, + "theoretical_loss": 3.5442751058916175, + "tokens_seen": 1371160576 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8179733753204346, + "objective/train/theoretical_loss": 3.5442599117442786, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.5442599117442786, + "tokens_seen": 1371226112 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002952256770310933, + "loss": 2.8525, + "theoretical_loss": 3.5442599117442786, + "tokens_seen": 1371226112 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029521564694082246, + "loss": 2.7785, + "theoretical_loss": 3.544244718526427, + "tokens_seen": 1371291648 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029520561685055164, + "loss": 2.7721, + "theoretical_loss": 3.544229526237963, + "tokens_seen": 1371357184 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002951955867602809, + "loss": 2.7918, + "theoretical_loss": 3.5442143348787845, + "tokens_seen": 1371422720 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029518555667001, + "loss": 2.7532, + "theoretical_loss": 3.5441991444487906, + "tokens_seen": 1371488256 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029517552657973924, + "loss": 2.8065, + "theoretical_loss": 3.5441839549478793, + "tokens_seen": 1371553792 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029516549648946836, + "loss": 2.7733, + "theoretical_loss": 3.5441687663759507, + "tokens_seen": 1371619328 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002951554663991976, + "loss": 2.8542, + "theoretical_loss": 3.544153578732902, + "tokens_seen": 1371684864 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002951454363089268, + "loss": 2.8543, + "theoretical_loss": 3.544138392018634, + "tokens_seen": 1371750400 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029513540621865596, + "loss": 2.8496, + "theoretical_loss": 3.5441232062330434, + "tokens_seen": 1371815936 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029512537612838514, + "loss": 2.8688, + "theoretical_loss": 3.5441080213760303, + "tokens_seen": 1371881472 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002951153460381144, + "loss": 2.7346, + "theoretical_loss": 3.5440928374474936, + "tokens_seen": 1371947008 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002951053159478435, + "loss": 2.7486, + "theoretical_loss": 3.5440776544473316, + "tokens_seen": 1372012544 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029509528585757274, + "loss": 2.8681, + "theoretical_loss": 3.544062472375444, + "tokens_seen": 1372078080 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029508525576730187, + "loss": 2.7733, + "theoretical_loss": 3.5440472912317285, + "tokens_seen": 1372143616 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002950752256770311, + "loss": 2.7954, + "theoretical_loss": 3.544032111016085, + "tokens_seen": 1372209152 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002950651955867603, + "loss": 2.768, + "theoretical_loss": 3.5440169317284123, + "tokens_seen": 1372274688 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029505516549648946, + "loss": 2.7294, + "theoretical_loss": 3.544001753368609, + "tokens_seen": 1372340224 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029504513540621865, + "loss": 2.743, + "theoretical_loss": 3.5439865759365747, + "tokens_seen": 1372405760 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002950351053159478, + "loss": 2.8594, + "theoretical_loss": 3.543971399432208, + "tokens_seen": 1372471296 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029502507522567706, + "loss": 2.844, + "theoretical_loss": 3.5439562238554077, + "tokens_seen": 1372536832 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029501504513540624, + "loss": 2.7718, + "theoretical_loss": 3.5439410492060732, + "tokens_seen": 1372602368 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002950050150451354, + "loss": 2.7316, + "theoretical_loss": 3.5439258754841028, + "tokens_seen": 1372667904 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002949949849548646, + "loss": 2.8157, + "theoretical_loss": 3.543910702689397, + "tokens_seen": 1372733440 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002949849548645938, + "loss": 2.7021, + "theoretical_loss": 3.5438955308218536, + "tokens_seen": 1372798976 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8816654682159424, + "objective/train/theoretical_loss": 3.543880359881372, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.543880359881372, + "tokens_seen": 1372864512 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029497492477432297, + "loss": 2.7783, + "theoretical_loss": 3.543880359881372, + "tokens_seen": 1372864512 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002949648946840522, + "loss": 2.8872, + "theoretical_loss": 3.5438651898678515, + "tokens_seen": 1372930048 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029495486459378133, + "loss": 2.7709, + "theoretical_loss": 3.543850020781191, + "tokens_seen": 1372995584 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029494483450351057, + "loss": 2.8178, + "theoretical_loss": 3.54383485262129, + "tokens_seen": 1373061120 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029493480441323975, + "loss": 2.8042, + "theoretical_loss": 3.543819685388047, + "tokens_seen": 1373126656 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029492477432296893, + "loss": 2.9086, + "theoretical_loss": 3.5438045190813616, + "tokens_seen": 1373192192 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002949147442326981, + "loss": 2.7702, + "theoretical_loss": 3.5437893537011336, + "tokens_seen": 1373257728 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002949047141424273, + "loss": 2.8629, + "theoretical_loss": 3.5437741892472605, + "tokens_seen": 1373323264 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029489468405215647, + "loss": 2.8178, + "theoretical_loss": 3.5437590257196434, + "tokens_seen": 1373388800 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002948846539618857, + "loss": 2.7811, + "theoretical_loss": 3.5437438631181806, + "tokens_seen": 1373454336 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029487462387161483, + "loss": 2.8903, + "theoretical_loss": 3.5437287014427707, + "tokens_seen": 1373519872 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029486459378134407, + "loss": 2.7572, + "theoretical_loss": 3.5437135406933145, + "tokens_seen": 1373585408 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002948545636910732, + "loss": 2.773, + "theoretical_loss": 3.54369838086971, + "tokens_seen": 1373650944 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029484453360080243, + "loss": 2.9238, + "theoretical_loss": 3.543683221971857, + "tokens_seen": 1373716480 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002948345035105316, + "loss": 2.744, + "theoretical_loss": 3.5436680639996547, + "tokens_seen": 1373782016 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002948244734202608, + "loss": 2.8168, + "theoretical_loss": 3.5436529069530027, + "tokens_seen": 1373847552 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029481444332999, + "loss": 2.7308, + "theoretical_loss": 3.5436377508317998, + "tokens_seen": 1373913088 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002948044132397192, + "loss": 2.7314, + "theoretical_loss": 3.543622595635946, + "tokens_seen": 1373978624 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029479438314944834, + "loss": 2.8374, + "theoretical_loss": 3.5436074413653404, + "tokens_seen": 1374044160 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029478435305917757, + "loss": 2.7884, + "theoretical_loss": 3.543592288019882, + "tokens_seen": 1374109696 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002947743229689067, + "loss": 2.7697, + "theoretical_loss": 3.5435771355994707, + "tokens_seen": 1374175232 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029476429287863593, + "loss": 2.8384, + "theoretical_loss": 3.543561984104006, + "tokens_seen": 1374240768 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002947542627883651, + "loss": 2.8474, + "theoretical_loss": 3.5435468335333864, + "tokens_seen": 1374306304 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002947442326980943, + "loss": 2.8148, + "theoretical_loss": 3.5435316838875126, + "tokens_seen": 1374371840 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002947342026078235, + "loss": 2.8122, + "theoretical_loss": 3.543516535166283, + "tokens_seen": 1374437376 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.746006488800049, + "objective/train/theoretical_loss": 3.5435013873695977, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.5435013873695977, + "tokens_seen": 1374502912 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029472417251755266, + "loss": 2.7682, + "theoretical_loss": 3.5435013873695977, + "tokens_seen": 1374502912 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029471414242728184, + "loss": 2.7787, + "theoretical_loss": 3.543486240497357, + "tokens_seen": 1374568448 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002947041123370111, + "loss": 2.8259, + "theoretical_loss": 3.5434710945494583, + "tokens_seen": 1374633984 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002946940822467402, + "loss": 2.7829, + "theoretical_loss": 3.5434559495258027, + "tokens_seen": 1374699520 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029468405215646944, + "loss": 2.8383, + "theoretical_loss": 3.5434408054262896, + "tokens_seen": 1374765056 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029467402206619856, + "loss": 2.677, + "theoretical_loss": 3.543425662250818, + "tokens_seen": 1374830592 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002946639919759278, + "loss": 2.757, + "theoretical_loss": 3.543410519999288, + "tokens_seen": 1374896128 + }, + { + "epoch": 16.02, + "learning_rate": 0.000294653961885657, + "loss": 2.7358, + "theoretical_loss": 3.5433953786715984, + "tokens_seen": 1374961664 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029464393179538616, + "loss": 2.8561, + "theoretical_loss": 3.54338023826765, + "tokens_seen": 1375027200 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029463390170511534, + "loss": 2.8297, + "theoretical_loss": 3.5433650987873415, + "tokens_seen": 1375092736 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002946238716148446, + "loss": 2.7595, + "theoretical_loss": 3.543349960230573, + "tokens_seen": 1375158272 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002946138415245737, + "loss": 2.8764, + "theoretical_loss": 3.543334822597244, + "tokens_seen": 1375223808 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029460381143430294, + "loss": 2.8006, + "theoretical_loss": 3.543319685887254, + "tokens_seen": 1375289344 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029459378134403207, + "loss": 2.8165, + "theoretical_loss": 3.543304550100503, + "tokens_seen": 1375354880 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002945837512537613, + "loss": 2.8423, + "theoretical_loss": 3.5432894152368903, + "tokens_seen": 1375420416 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002945737211634905, + "loss": 2.7278, + "theoretical_loss": 3.5432742812963163, + "tokens_seen": 1375485952 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029456369107321966, + "loss": 2.8669, + "theoretical_loss": 3.54325914827868, + "tokens_seen": 1375551488 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029455366098294885, + "loss": 2.7461, + "theoretical_loss": 3.5432440161838814, + "tokens_seen": 1375617024 + }, + { + "epoch": 16.02, + "learning_rate": 0.000294543630892678, + "loss": 2.7448, + "theoretical_loss": 3.5432288850118203, + "tokens_seen": 1375682560 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002945336008024072, + "loss": 2.8382, + "theoretical_loss": 3.5432137547623963, + "tokens_seen": 1375748096 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029452357071213644, + "loss": 2.8889, + "theoretical_loss": 3.54319862543551, + "tokens_seen": 1375813632 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029451354062186557, + "loss": 2.8174, + "theoretical_loss": 3.54318349703106, + "tokens_seen": 1375879168 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002945035105315948, + "loss": 2.8586, + "theoretical_loss": 3.5431683695489467, + "tokens_seen": 1375944704 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029449348044132393, + "loss": 2.8485, + "theoretical_loss": 3.54315324298907, + "tokens_seen": 1376010240 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029448345035105317, + "loss": 2.8018, + "theoretical_loss": 3.5431381173513303, + "tokens_seen": 1376075776 + }, + { + "epoch": 16.02, + "objective/train/docs_used": 3214496, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5908331871032715, + "objective/train/theoretical_loss": 3.5431229926356265, + "objective/train/tokens_used": 1376900576, + "theoretical_loss": 3.5431229926356265, + "tokens_seen": 1376141312 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029447342026078235, + "loss": 2.6765, + "theoretical_loss": 3.5431229926356265, + "tokens_seen": 1376141312 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029446339017051153, + "loss": 2.8845, + "theoretical_loss": 3.5431078688418585, + "tokens_seen": 1376206848 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002944533600802407, + "loss": 2.7758, + "theoretical_loss": 3.543092745969927, + "tokens_seen": 1376272384 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029444332998996995, + "loss": 2.8255, + "theoretical_loss": 3.5430776240197313, + "tokens_seen": 1376337920 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002944332998996991, + "loss": 2.8712, + "theoretical_loss": 3.5430625029911713, + "tokens_seen": 1376403456 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002944232698094283, + "loss": 2.8503, + "theoretical_loss": 3.5430473828841476, + "tokens_seen": 1376468992 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029441323971915744, + "loss": 2.838, + "theoretical_loss": 3.5430322636985596, + "tokens_seen": 1376534528 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029440320962888667, + "loss": 2.8447, + "theoretical_loss": 3.543017145434307, + "tokens_seen": 1376600064 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029439317953861585, + "loss": 2.9336, + "theoretical_loss": 3.543002028091291, + "tokens_seen": 1376665600 + }, + { + "epoch": 16.02, + "learning_rate": 0.00029438314944834503, + "loss": 2.7253, + "theoretical_loss": 3.54298691166941, + "tokens_seen": 1376731136 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002943731193580742, + "loss": 2.7968, + "theoretical_loss": 3.5429717961685654, + "tokens_seen": 1376796672 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002943630892678034, + "loss": 2.79, + "theoretical_loss": 3.542956681588657, + "tokens_seen": 1376862208 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002943530591775326, + "loss": 2.8085, + "theoretical_loss": 3.542943220941198, + "tokens_seen": 1376920576 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002943430290872618, + "loss": 2.645, + "theoretical_loss": 3.5429281081021604, + "tokens_seen": 1376986112 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029433299899699094, + "loss": 2.7384, + "theoretical_loss": 3.54291299618377, + "tokens_seen": 1377051648 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002943229689067202, + "loss": 2.7294, + "theoretical_loss": 3.542897885185927, + "tokens_seen": 1377117184 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002943129388164493, + "loss": 2.5654, + "theoretical_loss": 3.542882775108531, + "tokens_seen": 1377182720 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029430290872617854, + "loss": 2.7342, + "theoretical_loss": 3.5428676659514826, + "tokens_seen": 1377248256 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029429287863590777, + "loss": 2.7624, + "theoretical_loss": 3.5428525577146814, + "tokens_seen": 1377313792 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002942828485456369, + "loss": 2.729, + "theoretical_loss": 3.5428374503980287, + "tokens_seen": 1377379328 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029427281845536613, + "loss": 2.7145, + "theoretical_loss": 3.542822344001423, + "tokens_seen": 1377444864 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002942627883650953, + "loss": 2.7177, + "theoretical_loss": 3.542807238524766, + "tokens_seen": 1377510400 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002942527582748245, + "loss": 2.6403, + "theoretical_loss": 3.542792133967957, + "tokens_seen": 1377575936 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002942427281845537, + "loss": 2.7458, + "theoretical_loss": 3.542777030330897, + "tokens_seen": 1377641472 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029423269809428286, + "loss": 2.8553, + "theoretical_loss": 3.542761927613485, + "tokens_seen": 1377707008 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3264940, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.588876962661743, + "objective/train/theoretical_loss": 3.5427468258156227, + "objective/train/tokens_used": 1398232544, + "theoretical_loss": 3.5427468258156227, + "tokens_seen": 1377772544 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029422266800401204, + "loss": 2.6414, + "theoretical_loss": 3.5427468258156227, + "tokens_seen": 1377772544 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002942126379137413, + "loss": 2.6465, + "theoretical_loss": 3.5427317249372097, + "tokens_seen": 1377838080 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002942026078234704, + "loss": 2.7616, + "theoretical_loss": 3.542716624978146, + "tokens_seen": 1377903616 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029419257773319964, + "loss": 2.7706, + "theoretical_loss": 3.542701525938333, + "tokens_seen": 1377969152 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029418254764292876, + "loss": 2.7149, + "theoretical_loss": 3.5426864278176695, + "tokens_seen": 1378034688 + }, + { + "epoch": 17.0, + "learning_rate": 0.000294172517552658, + "loss": 2.6001, + "theoretical_loss": 3.5426713306160567, + "tokens_seen": 1378100224 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002941624874623872, + "loss": 2.7057, + "theoretical_loss": 3.5426562343333954, + "tokens_seen": 1378165760 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029415245737211636, + "loss": 2.8025, + "theoretical_loss": 3.542641138969585, + "tokens_seen": 1378231296 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029414242728184554, + "loss": 2.7104, + "theoretical_loss": 3.5426260445245266, + "tokens_seen": 1378296832 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002941323971915748, + "loss": 2.7117, + "theoretical_loss": 3.5426109509981196, + "tokens_seen": 1378362368 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002941223671013039, + "loss": 2.8079, + "theoretical_loss": 3.542595858390266, + "tokens_seen": 1378427904 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029411233701103314, + "loss": 2.5988, + "theoretical_loss": 3.5425807667008646, + "tokens_seen": 1378493440 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029410230692076227, + "loss": 2.6408, + "theoretical_loss": 3.542565675929817, + "tokens_seen": 1378558976 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002940922768304915, + "loss": 2.7189, + "theoretical_loss": 3.5425505860770232, + "tokens_seen": 1378624512 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002940822467402207, + "loss": 2.5507, + "theoretical_loss": 3.542535497142384, + "tokens_seen": 1378690048 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029407221664994986, + "loss": 2.7176, + "theoretical_loss": 3.5425204091257996, + "tokens_seen": 1378755584 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029406218655967905, + "loss": 2.6288, + "theoretical_loss": 3.54250532202717, + "tokens_seen": 1378821120 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002940521564694082, + "loss": 2.6463, + "theoretical_loss": 3.5424902358463966, + "tokens_seen": 1378886656 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002940421263791374, + "loss": 2.8082, + "theoretical_loss": 3.54247515058338, + "tokens_seen": 1378952192 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029403209628886664, + "loss": 2.6686, + "theoretical_loss": 3.5424600662380197, + "tokens_seen": 1379017728 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029402206619859577, + "loss": 2.7644, + "theoretical_loss": 3.5424449828102174, + "tokens_seen": 1379083264 + }, + { + "epoch": 17.0, + "learning_rate": 0.000294012036108325, + "loss": 2.631, + "theoretical_loss": 3.542429900299873, + "tokens_seen": 1379148800 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029400200601805413, + "loss": 2.7127, + "theoretical_loss": 3.5424148187068876, + "tokens_seen": 1379214336 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029399197592778337, + "loss": 2.6738, + "theoretical_loss": 3.542399738031161, + "tokens_seen": 1379279872 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029398194583751255, + "loss": 2.7839, + "theoretical_loss": 3.542384658272595, + "tokens_seen": 1379345408 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3268086, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.771439552307129, + "objective/train/theoretical_loss": 3.5423695794310897, + "objective/train/tokens_used": 1399870944, + "theoretical_loss": 3.5423695794310897, + "tokens_seen": 1379410944 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029397191574724173, + "loss": 2.7606, + "theoretical_loss": 3.5423695794310897, + "tokens_seen": 1379410944 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002939618856569709, + "loss": 2.727, + "theoretical_loss": 3.542354501506545, + "tokens_seen": 1379476480 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029395185556670015, + "loss": 2.7289, + "theoretical_loss": 3.542339424498863, + "tokens_seen": 1379542016 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002939418254764293, + "loss": 2.7258, + "theoretical_loss": 3.542324348407943, + "tokens_seen": 1379607552 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002939317953861585, + "loss": 2.7036, + "theoretical_loss": 3.5423092732336867, + "tokens_seen": 1379673088 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029392176529588764, + "loss": 2.7337, + "theoretical_loss": 3.542294198975995, + "tokens_seen": 1379738624 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029391173520561687, + "loss": 2.7506, + "theoretical_loss": 3.542279125634767, + "tokens_seen": 1379804160 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029390170511534605, + "loss": 2.7629, + "theoretical_loss": 3.542264053209906, + "tokens_seen": 1379869696 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029389167502507523, + "loss": 2.6329, + "theoretical_loss": 3.5422489817013103, + "tokens_seen": 1379935232 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002938816449348044, + "loss": 2.7514, + "theoretical_loss": 3.542233911108882, + "tokens_seen": 1380000768 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002938716148445336, + "loss": 2.6728, + "theoretical_loss": 3.542218841432522, + "tokens_seen": 1380066304 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002938615847542628, + "loss": 2.7326, + "theoretical_loss": 3.5422037726721305, + "tokens_seen": 1380131840 + }, + { + "epoch": 17.0, + "learning_rate": 0.000293851554663992, + "loss": 2.7913, + "theoretical_loss": 3.542188704827609, + "tokens_seen": 1380197376 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029384152457372114, + "loss": 2.7362, + "theoretical_loss": 3.5421736378988573, + "tokens_seen": 1380262912 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002938314944834504, + "loss": 2.8028, + "theoretical_loss": 3.5421585718857775, + "tokens_seen": 1380328448 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002938214643931795, + "loss": 2.7013, + "theoretical_loss": 3.5421435067882694, + "tokens_seen": 1380393984 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029381143430290874, + "loss": 2.7592, + "theoretical_loss": 3.542128442606235, + "tokens_seen": 1380459520 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002938014042126379, + "loss": 2.7652, + "theoretical_loss": 3.5421133793395745, + "tokens_seen": 1380525056 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002937913741223671, + "loss": 2.6778, + "theoretical_loss": 3.5420983169881888, + "tokens_seen": 1380590592 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002937813440320963, + "loss": 2.7775, + "theoretical_loss": 3.5420832555519786, + "tokens_seen": 1380656128 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002937713139418255, + "loss": 2.6979, + "theoretical_loss": 3.542068195030846, + "tokens_seen": 1380721664 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029376128385155464, + "loss": 2.742, + "theoretical_loss": 3.5420531354246907, + "tokens_seen": 1380787200 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002937512537612839, + "loss": 2.6762, + "theoretical_loss": 3.5420380767334145, + "tokens_seen": 1380852736 + }, + { + "epoch": 17.0, + "learning_rate": 0.000293741223671013, + "loss": 2.5869, + "theoretical_loss": 3.542023018956918, + "tokens_seen": 1380918272 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029373119358074224, + "loss": 2.7167, + "theoretical_loss": 3.542007962095102, + "tokens_seen": 1380983808 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3271759, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7307074069976807, + "objective/train/theoretical_loss": 3.5419929061478683, + "objective/train/tokens_used": 1401509344, + "theoretical_loss": 3.5419929061478683, + "tokens_seen": 1381049344 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002937211634904714, + "loss": 2.7715, + "theoretical_loss": 3.5419929061478683, + "tokens_seen": 1381049344 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002937111334002006, + "loss": 2.7876, + "theoretical_loss": 3.5419778511151176, + "tokens_seen": 1381114880 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002937011033099298, + "loss": 2.7418, + "theoretical_loss": 3.541962796996751, + "tokens_seen": 1381180416 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029369107321965896, + "loss": 2.7084, + "theoretical_loss": 3.5419477437926687, + "tokens_seen": 1381245952 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029368104312938815, + "loss": 2.7449, + "theoretical_loss": 3.5419326915027733, + "tokens_seen": 1381311488 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002936710130391174, + "loss": 2.6608, + "theoretical_loss": 3.541917640126965, + "tokens_seen": 1381377024 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002936609829488465, + "loss": 2.7635, + "theoretical_loss": 3.541902589665145, + "tokens_seen": 1381442560 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029365095285857574, + "loss": 2.6799, + "theoretical_loss": 3.5418875401172145, + "tokens_seen": 1381508096 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029364092276830487, + "loss": 2.6618, + "theoretical_loss": 3.541872491483075, + "tokens_seen": 1381573632 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002936308926780341, + "loss": 2.7723, + "theoretical_loss": 3.541857443762627, + "tokens_seen": 1381639168 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002936208625877633, + "loss": 2.7467, + "theoretical_loss": 3.541842396955772, + "tokens_seen": 1381704704 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029361083249749247, + "loss": 2.7531, + "theoretical_loss": 3.5418273510624116, + "tokens_seen": 1381770240 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029360080240722165, + "loss": 2.7622, + "theoretical_loss": 3.5418123060824467, + "tokens_seen": 1381835776 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002935907723169509, + "loss": 2.6279, + "theoretical_loss": 3.5417972620157787, + "tokens_seen": 1381901312 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029358074222668, + "loss": 2.7877, + "theoretical_loss": 3.5417822188623083, + "tokens_seen": 1381966848 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029357071213640925, + "loss": 2.7263, + "theoretical_loss": 3.5417671766219367, + "tokens_seen": 1382032384 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002935606820461384, + "loss": 2.7307, + "theoretical_loss": 3.5417521352945665, + "tokens_seen": 1382097920 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002935506519558676, + "loss": 2.7662, + "theoretical_loss": 3.541737094880098, + "tokens_seen": 1382163456 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029354062186559684, + "loss": 2.7687, + "theoretical_loss": 3.541722055378432, + "tokens_seen": 1382228992 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029353059177532597, + "loss": 2.6796, + "theoretical_loss": 3.541707016789471, + "tokens_seen": 1382294528 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002935205616850552, + "loss": 2.8569, + "theoretical_loss": 3.5416919791131156, + "tokens_seen": 1382360064 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029351053159478433, + "loss": 2.6293, + "theoretical_loss": 3.5416769423492673, + "tokens_seen": 1382425600 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029350050150451357, + "loss": 2.6702, + "theoretical_loss": 3.5416619064978274, + "tokens_seen": 1382491136 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029349047141424275, + "loss": 2.6909, + "theoretical_loss": 3.541646871558698, + "tokens_seen": 1382556672 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029348044132397193, + "loss": 2.6544, + "theoretical_loss": 3.5416318375317797, + "tokens_seen": 1382622208 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3276906, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.69108247756958, + "objective/train/theoretical_loss": 3.5416168044169734, + "objective/train/tokens_used": 1403147744, + "theoretical_loss": 3.5416168044169734, + "tokens_seen": 1382687744 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002934704112337011, + "loss": 2.7227, + "theoretical_loss": 3.5416168044169734, + "tokens_seen": 1382687744 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029346038114343035, + "loss": 2.7207, + "theoretical_loss": 3.5416017722141824, + "tokens_seen": 1382753280 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002934503510531595, + "loss": 2.7183, + "theoretical_loss": 3.5415867409233064, + "tokens_seen": 1382818816 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002934403209628887, + "loss": 2.7881, + "theoretical_loss": 3.5415717105442477, + "tokens_seen": 1382884352 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029343029087261784, + "loss": 2.6316, + "theoretical_loss": 3.5415566810769072, + "tokens_seen": 1382949888 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029342026078234707, + "loss": 2.6537, + "theoretical_loss": 3.541541652521187, + "tokens_seen": 1383015424 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029341023069207625, + "loss": 2.7078, + "theoretical_loss": 3.5415266248769885, + "tokens_seen": 1383080960 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029340020060180543, + "loss": 2.6346, + "theoretical_loss": 3.541511598144213, + "tokens_seen": 1383146496 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002933901705115346, + "loss": 2.6769, + "theoretical_loss": 3.541496572322762, + "tokens_seen": 1383212032 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002933801404212638, + "loss": 2.7519, + "theoretical_loss": 3.541481547412537, + "tokens_seen": 1383277568 + }, + { + "epoch": 17.0, + "learning_rate": 0.000293370110330993, + "loss": 2.7502, + "theoretical_loss": 3.5414665234134404, + "tokens_seen": 1383343104 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002933600802407222, + "loss": 2.7327, + "theoretical_loss": 3.5414515003253726, + "tokens_seen": 1383408640 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029335005015045134, + "loss": 2.8014, + "theoretical_loss": 3.541436478148236, + "tokens_seen": 1383474176 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002933400200601806, + "loss": 2.7009, + "theoretical_loss": 3.541421456881932, + "tokens_seen": 1383539712 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002933299899699097, + "loss": 2.6746, + "theoretical_loss": 3.5414064365263624, + "tokens_seen": 1383605248 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029331995987963894, + "loss": 2.755, + "theoretical_loss": 3.5413914170814285, + "tokens_seen": 1383670784 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002933099297893681, + "loss": 2.8091, + "theoretical_loss": 3.541376398547032, + "tokens_seen": 1383736320 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002932998996990973, + "loss": 2.6923, + "theoretical_loss": 3.5413613809230746, + "tokens_seen": 1383801856 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002932898696088265, + "loss": 2.8104, + "theoretical_loss": 3.5413463642094585, + "tokens_seen": 1383867392 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002932798395185557, + "loss": 2.626, + "theoretical_loss": 3.541331348406085, + "tokens_seen": 1383932928 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029326980942828484, + "loss": 2.6496, + "theoretical_loss": 3.5413163335128552, + "tokens_seen": 1383998464 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002932597793380141, + "loss": 2.714, + "theoretical_loss": 3.541301319529672, + "tokens_seen": 1384064000 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002932497492477432, + "loss": 2.6574, + "theoretical_loss": 3.5412863064564366, + "tokens_seen": 1384129536 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029323971915747244, + "loss": 2.7365, + "theoretical_loss": 3.5412712942930504, + "tokens_seen": 1384195072 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002932296890672016, + "loss": 2.7382, + "theoretical_loss": 3.541256283039416, + "tokens_seen": 1384260608 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3279603, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.759375810623169, + "objective/train/theoretical_loss": 3.5412412726954345, + "objective/train/tokens_used": 1404786144, + "theoretical_loss": 3.5412412726954345, + "tokens_seen": 1384326144 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002932196589769308, + "loss": 2.7325, + "theoretical_loss": 3.5412412726954345, + "tokens_seen": 1384326144 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029320962888666, + "loss": 2.7135, + "theoretical_loss": 3.5412262632610085, + "tokens_seen": 1384391680 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029319959879638916, + "loss": 2.6168, + "theoretical_loss": 3.541211254736039, + "tokens_seen": 1384457216 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029318956870611835, + "loss": 2.728, + "theoretical_loss": 3.5411962471204284, + "tokens_seen": 1384522752 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002931795386158476, + "loss": 2.727, + "theoretical_loss": 3.541181240414078, + "tokens_seen": 1384588288 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002931695085255767, + "loss": 2.716, + "theoretical_loss": 3.54116623461689, + "tokens_seen": 1384653824 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029315947843530594, + "loss": 2.7549, + "theoretical_loss": 3.5411512297287664, + "tokens_seen": 1384719360 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029314944834503507, + "loss": 2.7295, + "theoretical_loss": 3.541136225749609, + "tokens_seen": 1384784896 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002931394182547643, + "loss": 2.6929, + "theoretical_loss": 3.54112122267932, + "tokens_seen": 1384850432 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002931293881644935, + "loss": 2.7237, + "theoretical_loss": 3.541106220517801, + "tokens_seen": 1384915968 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029311935807422267, + "loss": 2.6992, + "theoretical_loss": 3.5410912192649535, + "tokens_seen": 1384981504 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029310932798395185, + "loss": 2.781, + "theoretical_loss": 3.541076218920681, + "tokens_seen": 1385047040 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002930992978936811, + "loss": 2.7185, + "theoretical_loss": 3.5410612194848836, + "tokens_seen": 1385112576 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002930892678034102, + "loss": 2.757, + "theoretical_loss": 3.5410462209574645, + "tokens_seen": 1385178112 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029307923771313945, + "loss": 2.649, + "theoretical_loss": 3.5410312233383254, + "tokens_seen": 1385243648 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002930692076228686, + "loss": 2.7346, + "theoretical_loss": 3.5410162266273684, + "tokens_seen": 1385309184 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002930591775325978, + "loss": 2.6326, + "theoretical_loss": 3.541001230824495, + "tokens_seen": 1385374720 + }, + { + "epoch": 17.0, + "learning_rate": 0.000293049147442327, + "loss": 2.7728, + "theoretical_loss": 3.5409862359296085, + "tokens_seen": 1385440256 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029303911735205617, + "loss": 2.71, + "theoretical_loss": 3.54097124194261, + "tokens_seen": 1385505792 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029302908726178535, + "loss": 2.7041, + "theoretical_loss": 3.5409562488634014, + "tokens_seen": 1385571328 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029301905717151453, + "loss": 2.6777, + "theoretical_loss": 3.5409412566918856, + "tokens_seen": 1385636864 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002930090270812437, + "loss": 2.7682, + "theoretical_loss": 3.5409262654279643, + "tokens_seen": 1385702400 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029299899699097295, + "loss": 2.6366, + "theoretical_loss": 3.5409112750715397, + "tokens_seen": 1385767936 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002929889669007021, + "loss": 2.8405, + "theoretical_loss": 3.5408962856225132, + "tokens_seen": 1385833472 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002929789368104313, + "loss": 2.746, + "theoretical_loss": 3.540881297080789, + "tokens_seen": 1385899008 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3284664, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.746143102645874, + "objective/train/theoretical_loss": 3.540866309446267, + "objective/train/tokens_used": 1406424544, + "theoretical_loss": 3.540866309446267, + "tokens_seen": 1385964544 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002929689067201605, + "loss": 2.7393, + "theoretical_loss": 3.540866309446267, + "tokens_seen": 1385964544 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002929588766298897, + "loss": 2.7235, + "theoretical_loss": 3.5408513227188507, + "tokens_seen": 1386030080 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029294884653961885, + "loss": 2.6699, + "theoretical_loss": 3.5408363368984412, + "tokens_seen": 1386095616 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029293881644934804, + "loss": 2.6371, + "theoretical_loss": 3.5408213519849427, + "tokens_seen": 1386161152 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002929287863590772, + "loss": 2.7321, + "theoretical_loss": 3.5408063679782553, + "tokens_seen": 1386226688 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029291875626880645, + "loss": 2.6869, + "theoretical_loss": 3.5407913848782826, + "tokens_seen": 1386292224 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002929087261785356, + "loss": 2.748, + "theoretical_loss": 3.5407764026849264, + "tokens_seen": 1386357760 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002928986960882648, + "loss": 2.7042, + "theoretical_loss": 3.540761421398089, + "tokens_seen": 1386423296 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029288866599799394, + "loss": 2.7352, + "theoretical_loss": 3.540746441017673, + "tokens_seen": 1386488832 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002928786359077232, + "loss": 2.8029, + "theoretical_loss": 3.5407314615435803, + "tokens_seen": 1386554368 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029286860581745236, + "loss": 2.7425, + "theoretical_loss": 3.5407164829757134, + "tokens_seen": 1386619904 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029285857572718154, + "loss": 2.7552, + "theoretical_loss": 3.540701505313975, + "tokens_seen": 1386685440 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002928485456369107, + "loss": 2.7911, + "theoretical_loss": 3.5406865285582665, + "tokens_seen": 1386750976 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002928385155466399, + "loss": 2.6798, + "theoretical_loss": 3.540671552708491, + "tokens_seen": 1386816512 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002928284854563691, + "loss": 2.7044, + "theoretical_loss": 3.540656577764551, + "tokens_seen": 1386882048 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002928184553660983, + "loss": 2.6991, + "theoretical_loss": 3.540641603726349, + "tokens_seen": 1386947584 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029280842527582744, + "loss": 2.7926, + "theoretical_loss": 3.5406266305937866, + "tokens_seen": 1387013120 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002927983951855567, + "loss": 2.765, + "theoretical_loss": 3.5406116583667666, + "tokens_seen": 1387078656 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002927883650952859, + "loss": 2.7032, + "theoretical_loss": 3.540596687045192, + "tokens_seen": 1387144192 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029277833500501504, + "loss": 2.7516, + "theoretical_loss": 3.5405817166289655, + "tokens_seen": 1387209728 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002927683049147443, + "loss": 2.6818, + "theoretical_loss": 3.540566747117988, + "tokens_seen": 1387275264 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002927582748244734, + "loss": 2.7584, + "theoretical_loss": 3.5405517785121634, + "tokens_seen": 1387340800 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029274824473420264, + "loss": 2.6402, + "theoretical_loss": 3.5405368108113935, + "tokens_seen": 1387406336 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002927382146439318, + "loss": 2.8419, + "theoretical_loss": 3.5405218440155815, + "tokens_seen": 1387471872 + }, + { + "epoch": 17.0, + "learning_rate": 0.000292728184553661, + "loss": 2.6935, + "theoretical_loss": 3.540506878124629, + "tokens_seen": 1387537408 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3287501, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.606881856918335, + "objective/train/theoretical_loss": 3.54049191313844, + "objective/train/tokens_used": 1408062944, + "theoretical_loss": 3.54049191313844, + "tokens_seen": 1387602944 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002927181544633902, + "loss": 2.6635, + "theoretical_loss": 3.54049191313844, + "tokens_seen": 1387602944 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029270812437311936, + "loss": 2.7096, + "theoretical_loss": 3.5404769490569157, + "tokens_seen": 1387668480 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029269809428284855, + "loss": 2.7227, + "theoretical_loss": 3.540461985879959, + "tokens_seen": 1387734016 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002926880641925778, + "loss": 2.7578, + "theoretical_loss": 3.5404470236074728, + "tokens_seen": 1387799552 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002926780341023069, + "loss": 2.7626, + "theoretical_loss": 3.5404320622393595, + "tokens_seen": 1387865088 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029266800401203614, + "loss": 2.8013, + "theoretical_loss": 3.540417101775522, + "tokens_seen": 1387930624 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029265797392176527, + "loss": 2.8357, + "theoretical_loss": 3.540402142215863, + "tokens_seen": 1387996160 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002926479438314945, + "loss": 2.6969, + "theoretical_loss": 3.5403871835602847, + "tokens_seen": 1388061696 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002926379137412237, + "loss": 2.7376, + "theoretical_loss": 3.54037222580869, + "tokens_seen": 1388127232 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029262788365095287, + "loss": 2.7153, + "theoretical_loss": 3.5403572689609817, + "tokens_seen": 1388192768 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029261785356068205, + "loss": 2.7954, + "theoretical_loss": 3.5403423130170624, + "tokens_seen": 1388258304 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002926078234704113, + "loss": 2.6598, + "theoretical_loss": 3.5403273579768353, + "tokens_seen": 1388323840 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002925977933801404, + "loss": 2.7411, + "theoretical_loss": 3.5403124038402023, + "tokens_seen": 1388389376 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029258776328986965, + "loss": 2.7049, + "theoretical_loss": 3.540297450607066, + "tokens_seen": 1388454912 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002925777331995988, + "loss": 2.8427, + "theoretical_loss": 3.540282498277331, + "tokens_seen": 1388520448 + }, + { + "epoch": 17.0, + "learning_rate": 0.000292567703109328, + "loss": 2.6699, + "theoretical_loss": 3.540267546850898, + "tokens_seen": 1388585984 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002925576730190572, + "loss": 2.5811, + "theoretical_loss": 3.5402525963276705, + "tokens_seen": 1388651520 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029254764292878637, + "loss": 2.6468, + "theoretical_loss": 3.540237646707552, + "tokens_seen": 1388717056 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029253761283851555, + "loss": 2.8104, + "theoretical_loss": 3.5402226979904445, + "tokens_seen": 1388782592 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029252758274824473, + "loss": 2.8065, + "theoretical_loss": 3.5402077501762514, + "tokens_seen": 1388848128 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002925175526579739, + "loss": 2.7737, + "theoretical_loss": 3.540192803264875, + "tokens_seen": 1388913664 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029250752256770315, + "loss": 2.7333, + "theoretical_loss": 3.5401778572562184, + "tokens_seen": 1388979200 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002924974924774323, + "loss": 2.6787, + "theoretical_loss": 3.5401629121501843, + "tokens_seen": 1389044736 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002924874623871615, + "loss": 2.7276, + "theoretical_loss": 3.5401479679466763, + "tokens_seen": 1389110272 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002924774322968907, + "loss": 2.6678, + "theoretical_loss": 3.5401330246455966, + "tokens_seen": 1389175808 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3291155, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6092779636383057, + "objective/train/theoretical_loss": 3.540118082246849, + "objective/train/tokens_used": 1409701344, + "theoretical_loss": 3.540118082246849, + "tokens_seen": 1389241344 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002924674022066199, + "loss": 2.7448, + "theoretical_loss": 3.540118082246849, + "tokens_seen": 1389241344 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029245737211634906, + "loss": 2.5485, + "theoretical_loss": 3.540103140750335, + "tokens_seen": 1389306880 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029244734202607824, + "loss": 2.753, + "theoretical_loss": 3.540088200155959, + "tokens_seen": 1389372416 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002924373119358074, + "loss": 2.7027, + "theoretical_loss": 3.540073260463623, + "tokens_seen": 1389437952 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029242728184553665, + "loss": 2.8183, + "theoretical_loss": 3.5400583216732304, + "tokens_seen": 1389503488 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002924172517552658, + "loss": 2.6745, + "theoretical_loss": 3.5400433837846847, + "tokens_seen": 1389569024 + }, + { + "epoch": 17.0, + "learning_rate": 0.000292407221664995, + "loss": 2.6843, + "theoretical_loss": 3.5400284467978884, + "tokens_seen": 1389634560 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029239719157472414, + "loss": 2.7352, + "theoretical_loss": 3.540013510712744, + "tokens_seen": 1389700096 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002923871614844534, + "loss": 2.7286, + "theoretical_loss": 3.5399985755291556, + "tokens_seen": 1389765632 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029237713139418256, + "loss": 2.7979, + "theoretical_loss": 3.539983641247026, + "tokens_seen": 1389831168 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029236710130391174, + "loss": 2.8462, + "theoretical_loss": 3.539968707866258, + "tokens_seen": 1389896704 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002923570712136409, + "loss": 2.6088, + "theoretical_loss": 3.5399537753867545, + "tokens_seen": 1389962240 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002923470411233701, + "loss": 2.7159, + "theoretical_loss": 3.539938843808419, + "tokens_seen": 1390027776 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002923370110330993, + "loss": 2.803, + "theoretical_loss": 3.5399239131311546, + "tokens_seen": 1390093312 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002923269809428285, + "loss": 2.6647, + "theoretical_loss": 3.5399089833548643, + "tokens_seen": 1390158848 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029231695085255765, + "loss": 2.6904, + "theoretical_loss": 3.5398940544794515, + "tokens_seen": 1390224384 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002923069207622869, + "loss": 2.7367, + "theoretical_loss": 3.539879126504819, + "tokens_seen": 1390289920 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029229689067201606, + "loss": 2.7251, + "theoretical_loss": 3.5398641994308706, + "tokens_seen": 1390355456 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029228686058174524, + "loss": 2.7013, + "theoretical_loss": 3.5398492732575084, + "tokens_seen": 1390420992 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002922768304914744, + "loss": 2.8174, + "theoretical_loss": 3.5398343479846375, + "tokens_seen": 1390486528 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002922668004012036, + "loss": 2.7621, + "theoretical_loss": 3.539819423612159, + "tokens_seen": 1390552064 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002922567703109328, + "loss": 2.7065, + "theoretical_loss": 3.5398045001399776, + "tokens_seen": 1390617600 + }, + { + "epoch": 17.0, + "learning_rate": 0.000292246740220662, + "loss": 2.772, + "theoretical_loss": 3.539789577567996, + "tokens_seen": 1390683136 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029223671013039115, + "loss": 2.7246, + "theoretical_loss": 3.5397746558961174, + "tokens_seen": 1390748672 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002922266800401204, + "loss": 2.6718, + "theoretical_loss": 3.5397597351242456, + "tokens_seen": 1390814208 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3296014, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7494664192199707, + "objective/train/theoretical_loss": 3.5397448152522832, + "objective/train/tokens_used": 1411339744, + "theoretical_loss": 3.5397448152522832, + "tokens_seen": 1390879744 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002922166499498495, + "loss": 2.6394, + "theoretical_loss": 3.5397448152522832, + "tokens_seen": 1390879744 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029220661985957875, + "loss": 2.6896, + "theoretical_loss": 3.539729896280134, + "tokens_seen": 1390945280 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002921965897693079, + "loss": 2.6869, + "theoretical_loss": 3.5397149782077015, + "tokens_seen": 1391010816 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002921865596790371, + "loss": 2.704, + "theoretical_loss": 3.539700061034889, + "tokens_seen": 1391076352 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002921765295887663, + "loss": 2.7509, + "theoretical_loss": 3.5396851447615996, + "tokens_seen": 1391141888 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029216649949849547, + "loss": 2.7276, + "theoretical_loss": 3.5396702293877365, + "tokens_seen": 1391207424 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029215646940822465, + "loss": 2.8166, + "theoretical_loss": 3.539655314913203, + "tokens_seen": 1391272960 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002921464393179539, + "loss": 2.7009, + "theoretical_loss": 3.5396404013379037, + "tokens_seen": 1391338496 + }, + { + "epoch": 17.0, + "learning_rate": 0.000292136409227683, + "loss": 2.6734, + "theoretical_loss": 3.5396254886617404, + "tokens_seen": 1391404032 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029212637913741225, + "loss": 2.6812, + "theoretical_loss": 3.539610576884618, + "tokens_seen": 1391469568 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029211634904714143, + "loss": 2.6954, + "theoretical_loss": 3.5395956660064387, + "tokens_seen": 1391535104 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002921063189568706, + "loss": 2.6677, + "theoretical_loss": 3.5395807560271075, + "tokens_seen": 1391600640 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002920962888665998, + "loss": 2.7489, + "theoretical_loss": 3.539565846946526, + "tokens_seen": 1391666176 + }, + { + "epoch": 17.0, + "learning_rate": 0.000292086258776329, + "loss": 2.7966, + "theoretical_loss": 3.539550938764599, + "tokens_seen": 1391731712 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029207622868605815, + "loss": 2.7308, + "theoretical_loss": 3.53953603148123, + "tokens_seen": 1391797248 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002920661985957874, + "loss": 2.7524, + "theoretical_loss": 3.5395211250963223, + "tokens_seen": 1391862784 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002920561685055165, + "loss": 2.7176, + "theoretical_loss": 3.539506219609779, + "tokens_seen": 1391928320 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029204613841524575, + "loss": 2.6986, + "theoretical_loss": 3.539491315021504, + "tokens_seen": 1391993856 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029203610832497493, + "loss": 2.7214, + "theoretical_loss": 3.5394764113314015, + "tokens_seen": 1392059392 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002920260782347041, + "loss": 2.7089, + "theoretical_loss": 3.5394615085393744, + "tokens_seen": 1392124928 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029201604814443335, + "loss": 2.6329, + "theoretical_loss": 3.539446606645326, + "tokens_seen": 1392190464 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002920060180541625, + "loss": 2.8387, + "theoretical_loss": 3.5394317056491604, + "tokens_seen": 1392256000 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002919959879638917, + "loss": 2.8074, + "theoretical_loss": 3.5394168055507818, + "tokens_seen": 1392321536 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002919859578736209, + "loss": 2.8183, + "theoretical_loss": 3.5394019063500926, + "tokens_seen": 1392387072 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002919759277833501, + "loss": 2.6717, + "theoretical_loss": 3.5393870080469974, + "tokens_seen": 1392452608 + }, + { + "epoch": 17.0, + "objective/train/docs_used": 3298918, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.744810104370117, + "objective/train/theoretical_loss": 3.5393721106413993, + "objective/train/tokens_used": 1412978144, + "theoretical_loss": 3.5393721106413993, + "tokens_seen": 1392518144 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029196589769307926, + "loss": 2.7611, + "theoretical_loss": 3.5393721106413993, + "tokens_seen": 1392518144 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029195586760280844, + "loss": 2.6946, + "theoretical_loss": 3.5393572141332026, + "tokens_seen": 1392583680 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002919458375125376, + "loss": 2.7858, + "theoretical_loss": 3.5393423185223107, + "tokens_seen": 1392649216 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029193580742226685, + "loss": 2.6806, + "theoretical_loss": 3.5393274238086274, + "tokens_seen": 1392714752 + }, + { + "epoch": 17.0, + "learning_rate": 0.000291925777331996, + "loss": 2.7589, + "theoretical_loss": 3.5393125299920563, + "tokens_seen": 1392780288 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002919157472417252, + "loss": 2.7314, + "theoretical_loss": 3.539297637072501, + "tokens_seen": 1392845824 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029190571715145434, + "loss": 2.5287, + "theoretical_loss": 3.5392827450498663, + "tokens_seen": 1392911360 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002918956870611836, + "loss": 2.6978, + "theoretical_loss": 3.5392678539240543, + "tokens_seen": 1392976896 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029188565697091276, + "loss": 2.7269, + "theoretical_loss": 3.5392529636949703, + "tokens_seen": 1393042432 + }, + { + "epoch": 17.0, + "learning_rate": 0.00029187562688064194, + "loss": 2.7643, + "theoretical_loss": 3.539238074362517, + "tokens_seen": 1393107968 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002918655967903711, + "loss": 2.7285, + "theoretical_loss": 3.5392231859265992, + "tokens_seen": 1393173504 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002918555667001003, + "loss": 2.8075, + "theoretical_loss": 3.53920829838712, + "tokens_seen": 1393239040 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002918455366098295, + "loss": 2.8173, + "theoretical_loss": 3.539193411743984, + "tokens_seen": 1393304576 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002918355065195587, + "loss": 2.7022, + "theoretical_loss": 3.5391785259970945, + "tokens_seen": 1393370112 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029182547642928785, + "loss": 2.7918, + "theoretical_loss": 3.539163641146355, + "tokens_seen": 1393435648 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002918154463390171, + "loss": 2.7288, + "theoretical_loss": 3.53914875719167, + "tokens_seen": 1393501184 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029180541624874626, + "loss": 2.5548, + "theoretical_loss": 3.5391338741329443, + "tokens_seen": 1393566720 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029179538615847544, + "loss": 2.8244, + "theoretical_loss": 3.53911899197008, + "tokens_seen": 1393632256 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002917853560682046, + "loss": 2.7232, + "theoretical_loss": 3.5391041107029824, + "tokens_seen": 1393697792 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002917753259779338, + "loss": 2.7557, + "theoretical_loss": 3.539089230331555, + "tokens_seen": 1393763328 + }, + { + "epoch": 17.01, + "learning_rate": 0.000291765295887663, + "loss": 2.7283, + "theoretical_loss": 3.539074350855701, + "tokens_seen": 1393828864 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002917552657973922, + "loss": 2.5596, + "theoretical_loss": 3.5390594722753264, + "tokens_seen": 1393894400 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029174523570712135, + "loss": 2.7324, + "theoretical_loss": 3.539044594590333, + "tokens_seen": 1393959936 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002917352056168506, + "loss": 2.7646, + "theoretical_loss": 3.5390297178006263, + "tokens_seen": 1394025472 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002917251755265797, + "loss": 2.7685, + "theoretical_loss": 3.53901484190611, + "tokens_seen": 1394091008 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3303793, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.884026288986206, + "objective/train/theoretical_loss": 3.5389999669066876, + "objective/train/tokens_used": 1414616544, + "theoretical_loss": 3.5389999669066876, + "tokens_seen": 1394156544 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029171514543630895, + "loss": 2.7853, + "theoretical_loss": 3.5389999669066876, + "tokens_seen": 1394156544 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002917051153460381, + "loss": 2.6975, + "theoretical_loss": 3.538985092802264, + "tokens_seen": 1394222080 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002916950852557673, + "loss": 2.7675, + "theoretical_loss": 3.538970219592743, + "tokens_seen": 1394287616 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002916850551654965, + "loss": 2.8264, + "theoretical_loss": 3.538955347278028, + "tokens_seen": 1394353152 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029167502507522567, + "loss": 2.8048, + "theoretical_loss": 3.538940475858024, + "tokens_seen": 1394418688 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029166499498495485, + "loss": 2.7518, + "theoretical_loss": 3.5389256053326346, + "tokens_seen": 1394484224 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002916549648946841, + "loss": 2.6564, + "theoretical_loss": 3.5389107357017644, + "tokens_seen": 1394549760 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002916449348044132, + "loss": 2.6418, + "theoretical_loss": 3.538895866965317, + "tokens_seen": 1394615296 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029163490471414245, + "loss": 2.9036, + "theoretical_loss": 3.5388809991231973, + "tokens_seen": 1394680832 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029162487462387163, + "loss": 2.7029, + "theoretical_loss": 3.5388661321753085, + "tokens_seen": 1394746368 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002916148445336008, + "loss": 2.7336, + "theoretical_loss": 3.538851266121556, + "tokens_seen": 1394811904 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029160481444333, + "loss": 2.7479, + "theoretical_loss": 3.538836400961843, + "tokens_seen": 1394877440 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002915947843530592, + "loss": 2.6671, + "theoretical_loss": 3.5388215366960742, + "tokens_seen": 1394942976 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029158475426278835, + "loss": 2.6544, + "theoretical_loss": 3.538806673324154, + "tokens_seen": 1395008512 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002915747241725176, + "loss": 2.5924, + "theoretical_loss": 3.5387918108459857, + "tokens_seen": 1395074048 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002915646940822467, + "loss": 2.7386, + "theoretical_loss": 3.538776949261475, + "tokens_seen": 1395139584 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029155466399197595, + "loss": 2.7816, + "theoretical_loss": 3.5387620885705253, + "tokens_seen": 1395205120 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002915446339017051, + "loss": 2.8159, + "theoretical_loss": 3.5387472287730413, + "tokens_seen": 1395270656 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002915346038114343, + "loss": 2.7402, + "theoretical_loss": 3.5387323698689266, + "tokens_seen": 1395336192 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002915245737211635, + "loss": 2.6881, + "theoretical_loss": 3.5387175118580867, + "tokens_seen": 1395401728 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002915145436308927, + "loss": 2.756, + "theoretical_loss": 3.5387026547404243, + "tokens_seen": 1395467264 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029150451354062186, + "loss": 2.7441, + "theoretical_loss": 3.538687798515846, + "tokens_seen": 1395532800 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002914944834503511, + "loss": 2.7531, + "theoretical_loss": 3.5386729431842543, + "tokens_seen": 1395598336 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002914844533600802, + "loss": 2.7703, + "theoretical_loss": 3.538658088745554, + "tokens_seen": 1395663872 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029147442326980946, + "loss": 2.7469, + "theoretical_loss": 3.5386432351996504, + "tokens_seen": 1395729408 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3306733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9329476356506348, + "objective/train/theoretical_loss": 3.5386283825464466, + "objective/train/tokens_used": 1416254944, + "theoretical_loss": 3.5386283825464466, + "tokens_seen": 1395794944 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002914643931795386, + "loss": 2.842, + "theoretical_loss": 3.5386283825464466, + "tokens_seen": 1395794944 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002914543630892678, + "loss": 2.6767, + "theoretical_loss": 3.538613530785848, + "tokens_seen": 1395860480 + }, + { + "epoch": 17.01, + "learning_rate": 0.000291444332998997, + "loss": 2.7368, + "theoretical_loss": 3.538598679917759, + "tokens_seen": 1395926016 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002914343029087262, + "loss": 2.7957, + "theoretical_loss": 3.538583829942083, + "tokens_seen": 1395991552 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029142427281845536, + "loss": 2.7252, + "theoretical_loss": 3.5385689808587264, + "tokens_seen": 1396057088 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029141424272818454, + "loss": 2.7756, + "theoretical_loss": 3.538554132667592, + "tokens_seen": 1396122624 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002914042126379137, + "loss": 2.7189, + "theoretical_loss": 3.538539285368585, + "tokens_seen": 1396188160 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029139418254764296, + "loss": 2.7716, + "theoretical_loss": 3.53852443896161, + "tokens_seen": 1396253696 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002913841524573721, + "loss": 2.7519, + "theoretical_loss": 3.5385095934465713, + "tokens_seen": 1396319232 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002913741223671013, + "loss": 2.7172, + "theoretical_loss": 3.538494748823373, + "tokens_seen": 1396384768 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029136409227683045, + "loss": 2.8016, + "theoretical_loss": 3.538479905091921, + "tokens_seen": 1396450304 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002913540621865597, + "loss": 2.7292, + "theoretical_loss": 3.538465062252119, + "tokens_seen": 1396515840 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029134403209628886, + "loss": 2.6517, + "theoretical_loss": 3.5384502203038712, + "tokens_seen": 1396581376 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029133400200601805, + "loss": 2.7458, + "theoretical_loss": 3.5384353792470833, + "tokens_seen": 1396646912 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002913239719157472, + "loss": 2.7557, + "theoretical_loss": 3.5384205390816588, + "tokens_seen": 1396712448 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029131394182547646, + "loss": 2.6191, + "theoretical_loss": 3.5384056998075035, + "tokens_seen": 1396777984 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002913039117352056, + "loss": 2.7623, + "theoretical_loss": 3.5383908614245208, + "tokens_seen": 1396843520 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002912938816449348, + "loss": 2.8161, + "theoretical_loss": 3.5383760239326163, + "tokens_seen": 1396909056 + }, + { + "epoch": 17.01, + "learning_rate": 0.000291283851554664, + "loss": 2.6754, + "theoretical_loss": 3.5383611873316942, + "tokens_seen": 1396974592 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002912738214643932, + "loss": 2.6796, + "theoretical_loss": 3.53834635162166, + "tokens_seen": 1397040128 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002912637913741224, + "loss": 2.6841, + "theoretical_loss": 3.538331516802417, + "tokens_seen": 1397105664 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029125376128385155, + "loss": 2.6483, + "theoretical_loss": 3.5383166828738712, + "tokens_seen": 1397171200 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002912437311935808, + "loss": 2.7608, + "theoretical_loss": 3.538301849835927, + "tokens_seen": 1397236736 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002912337011033099, + "loss": 2.7203, + "theoretical_loss": 3.538287017688489, + "tokens_seen": 1397302272 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029122367101303915, + "loss": 2.767, + "theoretical_loss": 3.538272186431462, + "tokens_seen": 1397367808 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3310544, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.913447141647339, + "objective/train/theoretical_loss": 3.538257356064751, + "objective/train/tokens_used": 1417893344, + "theoretical_loss": 3.538257356064751, + "tokens_seen": 1397433344 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029121364092276833, + "loss": 2.8451, + "theoretical_loss": 3.538257356064751, + "tokens_seen": 1397433344 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002912036108324975, + "loss": 2.7086, + "theoretical_loss": 3.53824252658826, + "tokens_seen": 1397498880 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002911935807422267, + "loss": 2.7663, + "theoretical_loss": 3.538227698001895, + "tokens_seen": 1397564416 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029118355065195587, + "loss": 2.7919, + "theoretical_loss": 3.5382128703055598, + "tokens_seen": 1397629952 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029117352056168505, + "loss": 2.7069, + "theoretical_loss": 3.5381980434991602, + "tokens_seen": 1397695488 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002911634904714143, + "loss": 2.6218, + "theoretical_loss": 3.538183217582601, + "tokens_seen": 1397761024 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002911534603811434, + "loss": 2.7788, + "theoretical_loss": 3.538168392555786, + "tokens_seen": 1397826560 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029114343029087265, + "loss": 2.7166, + "theoretical_loss": 3.5381535684186205, + "tokens_seen": 1397892096 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029113340020060183, + "loss": 2.7211, + "theoretical_loss": 3.5381387451710102, + "tokens_seen": 1397957632 + }, + { + "epoch": 17.01, + "learning_rate": 0.000291123370110331, + "loss": 2.7702, + "theoretical_loss": 3.5381239228128596, + "tokens_seen": 1398023168 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002911133400200602, + "loss": 2.7145, + "theoretical_loss": 3.538109101344073, + "tokens_seen": 1398088704 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002911033099297894, + "loss": 2.7662, + "theoretical_loss": 3.5380942807645566, + "tokens_seen": 1398154240 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029109327983951855, + "loss": 2.7297, + "theoretical_loss": 3.538079461074214, + "tokens_seen": 1398219776 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002910832497492478, + "loss": 2.7826, + "theoretical_loss": 3.5380646422729507, + "tokens_seen": 1398285312 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002910732196589769, + "loss": 2.6362, + "theoretical_loss": 3.538049824360672, + "tokens_seen": 1398350848 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029106318956870615, + "loss": 2.8671, + "theoretical_loss": 3.5380350073372835, + "tokens_seen": 1398416384 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002910531594784353, + "loss": 2.687, + "theoretical_loss": 3.5380201912026887, + "tokens_seen": 1398481920 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002910431293881645, + "loss": 2.7186, + "theoretical_loss": 3.5380053759567938, + "tokens_seen": 1398547456 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002910330992978937, + "loss": 2.7348, + "theoretical_loss": 3.5379905615995026, + "tokens_seen": 1398612992 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002910230692076229, + "loss": 2.8455, + "theoretical_loss": 3.537975748130722, + "tokens_seen": 1398678528 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029101303911735206, + "loss": 2.6898, + "theoretical_loss": 3.5379609355503554, + "tokens_seen": 1398744064 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002910030090270813, + "loss": 2.7394, + "theoretical_loss": 3.537946123858309, + "tokens_seen": 1398809600 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002909929789368104, + "loss": 2.7927, + "theoretical_loss": 3.5379313130544876, + "tokens_seen": 1398875136 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029098294884653966, + "loss": 2.7256, + "theoretical_loss": 3.537916503138796, + "tokens_seen": 1398940672 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002909729187562688, + "loss": 2.7098, + "theoretical_loss": 3.53790169411114, + "tokens_seen": 1399006208 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3315459, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4966039657592773, + "objective/train/theoretical_loss": 3.5378868859714236, + "objective/train/tokens_used": 1419531744, + "theoretical_loss": 3.5378868859714236, + "tokens_seen": 1399071744 + }, + { + "epoch": 17.01, + "learning_rate": 0.000290962888665998, + "loss": 2.722, + "theoretical_loss": 3.5378868859714236, + "tokens_seen": 1399071744 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002909528585757272, + "loss": 2.8254, + "theoretical_loss": 3.5378720787195532, + "tokens_seen": 1399137280 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002909428284854564, + "loss": 2.8381, + "theoretical_loss": 3.537857272355433, + "tokens_seen": 1399202816 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029093279839518556, + "loss": 2.6686, + "theoretical_loss": 3.5378424668789696, + "tokens_seen": 1399268352 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029092276830491474, + "loss": 2.7877, + "theoretical_loss": 3.5378276622900664, + "tokens_seen": 1399333888 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002909127382146439, + "loss": 2.6745, + "theoretical_loss": 3.53781285858863, + "tokens_seen": 1399399424 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029090270812437316, + "loss": 2.6987, + "theoretical_loss": 3.537798055774565, + "tokens_seen": 1399464960 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002908926780341023, + "loss": 2.8235, + "theoretical_loss": 3.5377832538477767, + "tokens_seen": 1399530496 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002908826479438315, + "loss": 2.7296, + "theoretical_loss": 3.537768452808171, + "tokens_seen": 1399596032 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029087261785356065, + "loss": 2.6443, + "theoretical_loss": 3.537753652655652, + "tokens_seen": 1399661568 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002908625877632899, + "loss": 2.6489, + "theoretical_loss": 3.5377388533901266, + "tokens_seen": 1399727104 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029085255767301906, + "loss": 2.7226, + "theoretical_loss": 3.5377240550114983, + "tokens_seen": 1399792640 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029084252758274825, + "loss": 2.6794, + "theoretical_loss": 3.5377092575196736, + "tokens_seen": 1399858176 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002908324974924774, + "loss": 2.7531, + "theoretical_loss": 3.5376944609145577, + "tokens_seen": 1399923712 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029082246740220666, + "loss": 2.7646, + "theoretical_loss": 3.537679665196056, + "tokens_seen": 1399989248 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002908124373119358, + "loss": 2.7189, + "theoretical_loss": 3.537664870364073, + "tokens_seen": 1400054784 + }, + { + "epoch": 17.01, + "learning_rate": 0.000290802407221665, + "loss": 2.7882, + "theoretical_loss": 3.5376500764185157, + "tokens_seen": 1400120320 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029079237713139415, + "loss": 2.6371, + "theoretical_loss": 3.537635283359288, + "tokens_seen": 1400185856 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002907823470411234, + "loss": 2.7944, + "theoretical_loss": 3.5376204911862965, + "tokens_seen": 1400251392 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029077231695085257, + "loss": 2.6949, + "theoretical_loss": 3.537605699899445, + "tokens_seen": 1400316928 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029076228686058175, + "loss": 2.6876, + "theoretical_loss": 3.537590909498641, + "tokens_seen": 1400382464 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029075225677031093, + "loss": 2.8217, + "theoretical_loss": 3.5375761199837887, + "tokens_seen": 1400448000 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002907422266800401, + "loss": 2.6441, + "theoretical_loss": 3.5375613313547936, + "tokens_seen": 1400513536 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002907321965897693, + "loss": 2.7108, + "theoretical_loss": 3.537546543611562, + "tokens_seen": 1400579072 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029072216649949853, + "loss": 2.7993, + "theoretical_loss": 3.5375317567539986, + "tokens_seen": 1400644608 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3318436, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.734802722930908, + "objective/train/theoretical_loss": 3.5375169707820087, + "objective/train/tokens_used": 1421170144, + "theoretical_loss": 3.5375169707820087, + "tokens_seen": 1400710144 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029071213640922765, + "loss": 2.7412, + "theoretical_loss": 3.5375169707820087, + "tokens_seen": 1400710144 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002907021063189569, + "loss": 2.5922, + "theoretical_loss": 3.5375021856954985, + "tokens_seen": 1400775680 + }, + { + "epoch": 17.01, + "learning_rate": 0.000290692076228686, + "loss": 2.8014, + "theoretical_loss": 3.5374874014943734, + "tokens_seen": 1400841216 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029068204613841525, + "loss": 2.8019, + "theoretical_loss": 3.5374726181785388, + "tokens_seen": 1400906752 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029067201604814443, + "loss": 2.7956, + "theoretical_loss": 3.5374578357479005, + "tokens_seen": 1400972288 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002906619859578736, + "loss": 2.8286, + "theoretical_loss": 3.537443054202364, + "tokens_seen": 1401037824 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002906519558676028, + "loss": 2.7734, + "theoretical_loss": 3.5374282735418348, + "tokens_seen": 1401103360 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029064192577733203, + "loss": 2.714, + "theoretical_loss": 3.537413493766218, + "tokens_seen": 1401168896 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029063189568706116, + "loss": 2.738, + "theoretical_loss": 3.5373987148754207, + "tokens_seen": 1401234432 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002906218655967904, + "loss": 2.7548, + "theoretical_loss": 3.5373839368693476, + "tokens_seen": 1401299968 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002906118355065195, + "loss": 2.7842, + "theoretical_loss": 3.537369159747904, + "tokens_seen": 1401365504 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029060180541624876, + "loss": 2.7844, + "theoretical_loss": 3.537354383510996, + "tokens_seen": 1401431040 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029059177532597794, + "loss": 2.8587, + "theoretical_loss": 3.537339608158529, + "tokens_seen": 1401496576 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002905817452357071, + "loss": 2.8069, + "theoretical_loss": 3.53732483369041, + "tokens_seen": 1401562112 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002905717151454363, + "loss": 2.7077, + "theoretical_loss": 3.5373100601065435, + "tokens_seen": 1401627648 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002905616850551655, + "loss": 2.6751, + "theoretical_loss": 3.5372952874068346, + "tokens_seen": 1401693184 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029055165496489466, + "loss": 2.6032, + "theoretical_loss": 3.537280515591191, + "tokens_seen": 1401758720 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002905416248746239, + "loss": 2.7159, + "theoretical_loss": 3.537265744659517, + "tokens_seen": 1401824256 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002905315947843531, + "loss": 2.7619, + "theoretical_loss": 3.5372509746117182, + "tokens_seen": 1401889792 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029052156469408226, + "loss": 2.8127, + "theoretical_loss": 3.5372362054477016, + "tokens_seen": 1401955328 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002905115346038115, + "loss": 2.7216, + "theoretical_loss": 3.5372214371673723, + "tokens_seen": 1402020864 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002905015045135406, + "loss": 2.8332, + "theoretical_loss": 3.5372066697706357, + "tokens_seen": 1402086400 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029049147442326986, + "loss": 2.8908, + "theoretical_loss": 3.5371919032573986, + "tokens_seen": 1402151936 + }, + { + "epoch": 17.01, + "learning_rate": 0.000290481444332999, + "loss": 2.7815, + "theoretical_loss": 3.537177137627566, + "tokens_seen": 1402217472 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002904714142427282, + "loss": 2.7218, + "theoretical_loss": 3.5371623728810446, + "tokens_seen": 1402283008 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3323292, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7711591720581055, + "objective/train/theoretical_loss": 3.5371476090177394, + "objective/train/tokens_used": 1422808544, + "theoretical_loss": 3.5371476090177394, + "tokens_seen": 1402348544 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002904613841524574, + "loss": 2.6898, + "theoretical_loss": 3.5371476090177394, + "tokens_seen": 1402348544 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002904513540621866, + "loss": 2.7943, + "theoretical_loss": 3.537132846037557, + "tokens_seen": 1402414080 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029044132397191576, + "loss": 2.7613, + "theoretical_loss": 3.5371180839404035, + "tokens_seen": 1402479616 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029043129388164494, + "loss": 2.7915, + "theoretical_loss": 3.5371033227261837, + "tokens_seen": 1402545152 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002904212637913741, + "loss": 2.7128, + "theoretical_loss": 3.5370885623948043, + "tokens_seen": 1402610688 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029041123370110336, + "loss": 2.7348, + "theoretical_loss": 3.537073802946171, + "tokens_seen": 1402676224 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002904012036108325, + "loss": 2.6853, + "theoretical_loss": 3.53705904438019, + "tokens_seen": 1402741760 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002903911735205617, + "loss": 2.6719, + "theoretical_loss": 3.5370442866967675, + "tokens_seen": 1402807296 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029038114343029085, + "loss": 2.6847, + "theoretical_loss": 3.5370295298958085, + "tokens_seen": 1402872832 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002903711133400201, + "loss": 2.7885, + "theoretical_loss": 3.53701477397722, + "tokens_seen": 1402938368 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029036108324974926, + "loss": 2.7583, + "theoretical_loss": 3.5370000189409083, + "tokens_seen": 1403003904 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029035105315947845, + "loss": 2.7691, + "theoretical_loss": 3.5369852647867783, + "tokens_seen": 1403069440 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002903410230692076, + "loss": 2.7962, + "theoretical_loss": 3.5369705115147365, + "tokens_seen": 1403134976 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029033099297893686, + "loss": 2.8855, + "theoretical_loss": 3.536955759124689, + "tokens_seen": 1403200512 + }, + { + "epoch": 17.01, + "learning_rate": 0.000290320962888666, + "loss": 2.7479, + "theoretical_loss": 3.5369410076165426, + "tokens_seen": 1403266048 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002903109327983952, + "loss": 2.716, + "theoretical_loss": 3.5369262569902027, + "tokens_seen": 1403331584 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029030090270812435, + "loss": 2.7108, + "theoretical_loss": 3.5369115072455752, + "tokens_seen": 1403397120 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002902908726178536, + "loss": 2.7932, + "theoretical_loss": 3.5368967583825666, + "tokens_seen": 1403462656 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029028084252758277, + "loss": 2.8148, + "theoretical_loss": 3.536882010401083, + "tokens_seen": 1403528192 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029027081243731195, + "loss": 2.8059, + "theoretical_loss": 3.53686726330103, + "tokens_seen": 1403593728 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029026078234704113, + "loss": 2.8003, + "theoretical_loss": 3.5368525170823144, + "tokens_seen": 1403659264 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002902507522567703, + "loss": 2.799, + "theoretical_loss": 3.5368377717448425, + "tokens_seen": 1403724800 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002902407221664995, + "loss": 2.7848, + "theoretical_loss": 3.5368230272885204, + "tokens_seen": 1403790336 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029023069207622873, + "loss": 2.7159, + "theoretical_loss": 3.536808283713254, + "tokens_seen": 1403855872 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029022066198595785, + "loss": 2.7494, + "theoretical_loss": 3.5367935410189495, + "tokens_seen": 1403921408 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3326178, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7054150104522705, + "objective/train/theoretical_loss": 3.5367787992055133, + "objective/train/tokens_used": 1424446944, + "theoretical_loss": 3.5367787992055133, + "tokens_seen": 1403986944 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002902106318956871, + "loss": 2.8118, + "theoretical_loss": 3.5367787992055133, + "tokens_seen": 1403986944 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002902006018054162, + "loss": 2.7625, + "theoretical_loss": 3.536764058272852, + "tokens_seen": 1404052480 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029019057171514545, + "loss": 2.6935, + "theoretical_loss": 3.5367493182208714, + "tokens_seen": 1404118016 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029018054162487463, + "loss": 2.7402, + "theoretical_loss": 3.5367345790494777, + "tokens_seen": 1404183552 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002901705115346038, + "loss": 2.7357, + "theoretical_loss": 3.5367198407585776, + "tokens_seen": 1404249088 + }, + { + "epoch": 17.01, + "learning_rate": 0.000290160481444333, + "loss": 2.7602, + "theoretical_loss": 3.536705103348077, + "tokens_seen": 1404314624 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029015045135406223, + "loss": 2.6641, + "theoretical_loss": 3.536690366817883, + "tokens_seen": 1404380160 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029014042126379136, + "loss": 2.775, + "theoretical_loss": 3.536675631167901, + "tokens_seen": 1404445696 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002901303911735206, + "loss": 2.7493, + "theoretical_loss": 3.536660896398038, + "tokens_seen": 1404511232 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002901203610832497, + "loss": 2.7102, + "theoretical_loss": 3.5366461625082, + "tokens_seen": 1404576768 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029011033099297896, + "loss": 2.799, + "theoretical_loss": 3.5366314294982932, + "tokens_seen": 1404642304 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029010030090270814, + "loss": 2.7756, + "theoretical_loss": 3.5366166973682245, + "tokens_seen": 1404707840 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002900902708124373, + "loss": 2.7895, + "theoretical_loss": 3.5366019661179005, + "tokens_seen": 1404773376 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002900802407221665, + "loss": 2.8453, + "theoretical_loss": 3.536587235747227, + "tokens_seen": 1404838912 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002900702106318957, + "loss": 2.8699, + "theoretical_loss": 3.5365725062561104, + "tokens_seen": 1404904448 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029006018054162486, + "loss": 2.6918, + "theoretical_loss": 3.5365577776444574, + "tokens_seen": 1404969984 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002900501504513541, + "loss": 2.7532, + "theoretical_loss": 3.536543049912175, + "tokens_seen": 1405035520 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002900401203610832, + "loss": 2.7417, + "theoretical_loss": 3.536528323059169, + "tokens_seen": 1405101056 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029003009027081246, + "loss": 2.7898, + "theoretical_loss": 3.536513597085346, + "tokens_seen": 1405166592 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002900200601805416, + "loss": 2.8268, + "theoretical_loss": 3.5364988719906125, + "tokens_seen": 1405232128 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002900100300902708, + "loss": 2.7636, + "theoretical_loss": 3.5364841477748756, + "tokens_seen": 1405297664 + }, + { + "epoch": 17.01, + "learning_rate": 0.00029, + "loss": 2.8022, + "theoretical_loss": 3.536469424438041, + "tokens_seen": 1405363200 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002899899699097292, + "loss": 2.8222, + "theoretical_loss": 3.5364547019800154, + "tokens_seen": 1405428736 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028997993981945836, + "loss": 2.84, + "theoretical_loss": 3.536439980400706, + "tokens_seen": 1405494272 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002899699097291876, + "loss": 2.6154, + "theoretical_loss": 3.536425259700019, + "tokens_seen": 1405559808 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3329926, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6496922969818115, + "objective/train/theoretical_loss": 3.5364105398778607, + "objective/train/tokens_used": 1426085344, + "theoretical_loss": 3.5364105398778607, + "tokens_seen": 1405625344 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002899598796389167, + "loss": 2.6416, + "theoretical_loss": 3.5364105398778607, + "tokens_seen": 1405625344 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028994984954864596, + "loss": 2.7945, + "theoretical_loss": 3.536395820934138, + "tokens_seen": 1405690880 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002899398194583751, + "loss": 2.7185, + "theoretical_loss": 3.536381102868758, + "tokens_seen": 1405756416 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002899297893681043, + "loss": 2.6579, + "theoretical_loss": 3.5363663856816263, + "tokens_seen": 1405821952 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002899197592778335, + "loss": 2.7448, + "theoretical_loss": 3.5363516693726504, + "tokens_seen": 1405887488 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002899097291875627, + "loss": 2.7428, + "theoretical_loss": 3.536336953941737, + "tokens_seen": 1405953024 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028989969909729187, + "loss": 2.7964, + "theoretical_loss": 3.536322239388792, + "tokens_seen": 1406018560 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028988966900702105, + "loss": 2.731, + "theoretical_loss": 3.536307525713723, + "tokens_seen": 1406084096 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028987963891675023, + "loss": 2.8377, + "theoretical_loss": 3.536292812916436, + "tokens_seen": 1406149632 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028986960882647946, + "loss": 2.6733, + "theoretical_loss": 3.536278100996838, + "tokens_seen": 1406215168 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002898595787362086, + "loss": 2.8395, + "theoretical_loss": 3.5362633899548355, + "tokens_seen": 1406280704 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002898495486459378, + "loss": 2.7711, + "theoretical_loss": 3.5362486797903356, + "tokens_seen": 1406346240 + }, + { + "epoch": 17.01, + "learning_rate": 0.000289839518555667, + "loss": 2.7188, + "theoretical_loss": 3.5362339705032455, + "tokens_seen": 1406411776 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002898294884653962, + "loss": 2.7997, + "theoretical_loss": 3.536219262093471, + "tokens_seen": 1406477312 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028981945837512537, + "loss": 2.7381, + "theoretical_loss": 3.5362045545609195, + "tokens_seen": 1406542848 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028980942828485455, + "loss": 2.8367, + "theoretical_loss": 3.5361898479054976, + "tokens_seen": 1406608384 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028979939819458373, + "loss": 2.7295, + "theoretical_loss": 3.536175142127112, + "tokens_seen": 1406673920 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028978936810431297, + "loss": 2.7776, + "theoretical_loss": 3.53616043722567, + "tokens_seen": 1406739456 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028977933801404215, + "loss": 2.7773, + "theoretical_loss": 3.5361457332010784, + "tokens_seen": 1406804992 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028976930792377133, + "loss": 2.8293, + "theoretical_loss": 3.5361310300532436, + "tokens_seen": 1406870528 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002897592778335005, + "loss": 2.7684, + "theoretical_loss": 3.5361163277820724, + "tokens_seen": 1406936064 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002897492477432297, + "loss": 2.7394, + "theoretical_loss": 3.5361016263874725, + "tokens_seen": 1407001600 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028973921765295893, + "loss": 2.7277, + "theoretical_loss": 3.53608692586935, + "tokens_seen": 1407067136 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028972918756268805, + "loss": 2.7833, + "theoretical_loss": 3.5360722262276125, + "tokens_seen": 1407132672 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002897191574724173, + "loss": 2.7666, + "theoretical_loss": 3.536057527462167, + "tokens_seen": 1407198208 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3334736, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9078001976013184, + "objective/train/theoretical_loss": 3.5360428295729194, + "objective/train/tokens_used": 1427723744, + "theoretical_loss": 3.5360428295729194, + "tokens_seen": 1407263744 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002897091273821464, + "loss": 2.7749, + "theoretical_loss": 3.5360428295729194, + "tokens_seen": 1407263744 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028969909729187565, + "loss": 2.8095, + "theoretical_loss": 3.5360281325597773, + "tokens_seen": 1407329280 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028968906720160483, + "loss": 2.7124, + "theoretical_loss": 3.536013436422648, + "tokens_seen": 1407394816 + }, + { + "epoch": 17.01, + "learning_rate": 0.000289679037111334, + "loss": 2.7028, + "theoretical_loss": 3.535998741161438, + "tokens_seen": 1407460352 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002896690070210632, + "loss": 2.7751, + "theoretical_loss": 3.535984046776055, + "tokens_seen": 1407525888 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028965897693079243, + "loss": 2.6588, + "theoretical_loss": 3.535969353266405, + "tokens_seen": 1407591424 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028964894684052156, + "loss": 2.7859, + "theoretical_loss": 3.535954660632396, + "tokens_seen": 1407656960 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002896389167502508, + "loss": 2.7884, + "theoretical_loss": 3.535939968873935, + "tokens_seen": 1407722496 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002896288866599799, + "loss": 2.7608, + "theoretical_loss": 3.5359252779909283, + "tokens_seen": 1407788032 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028961885656970916, + "loss": 2.7143, + "theoretical_loss": 3.5359105879832833, + "tokens_seen": 1407853568 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028960882647943834, + "loss": 2.7925, + "theoretical_loss": 3.5358958988509075, + "tokens_seen": 1407919104 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002895987963891675, + "loss": 2.6971, + "theoretical_loss": 3.535881210593707, + "tokens_seen": 1407984640 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002895887662988967, + "loss": 2.8829, + "theoretical_loss": 3.5358665232115905, + "tokens_seen": 1408050176 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002895787362086259, + "loss": 2.7081, + "theoretical_loss": 3.535851836704464, + "tokens_seen": 1408115712 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028956870611835506, + "loss": 2.8272, + "theoretical_loss": 3.5358371510722346, + "tokens_seen": 1408181248 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002895586760280843, + "loss": 2.7517, + "theoretical_loss": 3.5358224663148103, + "tokens_seen": 1408246784 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002895486459378134, + "loss": 2.8231, + "theoretical_loss": 3.535807782432097, + "tokens_seen": 1408312320 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028953861584754266, + "loss": 2.7854, + "theoretical_loss": 3.535793099424003, + "tokens_seen": 1408377856 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002895285857572718, + "loss": 2.7945, + "theoretical_loss": 3.5357784172904356, + "tokens_seen": 1408443392 + }, + { + "epoch": 17.01, + "learning_rate": 0.000289518555667001, + "loss": 2.7938, + "theoretical_loss": 3.535763736031302, + "tokens_seen": 1408508928 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002895085255767302, + "loss": 2.7841, + "theoretical_loss": 3.535749055646508, + "tokens_seen": 1408574464 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002894984954864594, + "loss": 2.7317, + "theoretical_loss": 3.535734376135962, + "tokens_seen": 1408640000 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028948846539618856, + "loss": 2.7704, + "theoretical_loss": 3.535719697499572, + "tokens_seen": 1408705536 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002894784353059178, + "loss": 2.8004, + "theoretical_loss": 3.535705019737244, + "tokens_seen": 1408771072 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002894684052156469, + "loss": 2.6317, + "theoretical_loss": 3.5356903428488855, + "tokens_seen": 1408836608 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3337940, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.801241874694824, + "objective/train/theoretical_loss": 3.535675666834404, + "objective/train/tokens_used": 1429362144, + "theoretical_loss": 3.535675666834404, + "tokens_seen": 1408902144 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028945837512537616, + "loss": 2.8201, + "theoretical_loss": 3.535675666834404, + "tokens_seen": 1408902144 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002894483450351053, + "loss": 2.6626, + "theoretical_loss": 3.535660991693707, + "tokens_seen": 1408967680 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002894383149448345, + "loss": 2.76, + "theoretical_loss": 3.535646317426702, + "tokens_seen": 1409033216 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002894282848545637, + "loss": 2.7482, + "theoretical_loss": 3.535631644033296, + "tokens_seen": 1409098752 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002894182547642929, + "loss": 2.7751, + "theoretical_loss": 3.535616971513396, + "tokens_seen": 1409164288 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028940822467402207, + "loss": 2.7788, + "theoretical_loss": 3.5356022998669103, + "tokens_seen": 1409229824 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028939819458375125, + "loss": 2.762, + "theoretical_loss": 3.5355876290937456, + "tokens_seen": 1409295360 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028938816449348043, + "loss": 2.7832, + "theoretical_loss": 3.53557295919381, + "tokens_seen": 1409360896 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028937813440320966, + "loss": 2.7558, + "theoretical_loss": 3.5355582901670095, + "tokens_seen": 1409426432 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002893681043129388, + "loss": 2.6996, + "theoretical_loss": 3.535543622013253, + "tokens_seen": 1409491968 + }, + { + "epoch": 17.01, + "learning_rate": 0.000289358074222668, + "loss": 2.7256, + "theoretical_loss": 3.535528954732447, + "tokens_seen": 1409557504 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002893480441323972, + "loss": 2.85, + "theoretical_loss": 3.5355142883245, + "tokens_seen": 1409623040 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002893380140421264, + "loss": 2.7364, + "theoretical_loss": 3.5354996227893185, + "tokens_seen": 1409688576 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028932798395185557, + "loss": 2.7465, + "theoretical_loss": 3.53548495812681, + "tokens_seen": 1409754112 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028931795386158475, + "loss": 2.764, + "theoretical_loss": 3.535470294336883, + "tokens_seen": 1409819648 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028930792377131393, + "loss": 2.7251, + "theoretical_loss": 3.5354556314194445, + "tokens_seen": 1409885184 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028929789368104317, + "loss": 2.6729, + "theoretical_loss": 3.5354409693744016, + "tokens_seen": 1409950720 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002892878635907723, + "loss": 2.7494, + "theoretical_loss": 3.535426308201662, + "tokens_seen": 1410016256 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028927783350050153, + "loss": 2.812, + "theoretical_loss": 3.5354116479011335, + "tokens_seen": 1410081792 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028926780341023066, + "loss": 2.7737, + "theoretical_loss": 3.5353969884727237, + "tokens_seen": 1410147328 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002892577733199599, + "loss": 2.7124, + "theoretical_loss": 3.53538232991634, + "tokens_seen": 1410212864 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002892477432296891, + "loss": 2.7799, + "theoretical_loss": 3.53536767223189, + "tokens_seen": 1410278400 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028923771313941825, + "loss": 2.7183, + "theoretical_loss": 3.5353530154192816, + "tokens_seen": 1410343936 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028922768304914744, + "loss": 2.8156, + "theoretical_loss": 3.5353383594784225, + "tokens_seen": 1410409472 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002892176529588766, + "loss": 2.7447, + "theoretical_loss": 3.53532370440922, + "tokens_seen": 1410475008 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3342569, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7712435722351074, + "objective/train/theoretical_loss": 3.5353090502115814, + "objective/train/tokens_used": 1431000544, + "theoretical_loss": 3.5353090502115814, + "tokens_seen": 1410540544 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002892076228686058, + "loss": 2.6967, + "theoretical_loss": 3.5353090502115814, + "tokens_seen": 1410540544 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028919759277833503, + "loss": 2.7776, + "theoretical_loss": 3.5352943968854156, + "tokens_seen": 1410606080 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028918756268806416, + "loss": 2.7595, + "theoretical_loss": 3.5352797444306288, + "tokens_seen": 1410671616 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002891775325977934, + "loss": 2.6348, + "theoretical_loss": 3.53526509284713, + "tokens_seen": 1410737152 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002891675025075226, + "loss": 2.6902, + "theoretical_loss": 3.5352504421348256, + "tokens_seen": 1410802688 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028915747241725176, + "loss": 2.6914, + "theoretical_loss": 3.5352357922936246, + "tokens_seen": 1410868224 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028914744232698094, + "loss": 2.6503, + "theoretical_loss": 3.535221143323434, + "tokens_seen": 1410933760 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002891374122367101, + "loss": 2.7802, + "theoretical_loss": 3.535206495224162, + "tokens_seen": 1410999296 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002891273821464393, + "loss": 2.7268, + "theoretical_loss": 3.535191847995716, + "tokens_seen": 1411064832 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028911735205616854, + "loss": 2.7511, + "theoretical_loss": 3.535177201638004, + "tokens_seen": 1411130368 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028910732196589766, + "loss": 2.8376, + "theoretical_loss": 3.535162556150934, + "tokens_seen": 1411195904 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002890972918756269, + "loss": 2.7702, + "theoretical_loss": 3.5351479115344135, + "tokens_seen": 1411261440 + }, + { + "epoch": 17.01, + "learning_rate": 0.000289087261785356, + "loss": 2.7479, + "theoretical_loss": 3.5351332677883502, + "tokens_seen": 1411326976 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028907723169508526, + "loss": 2.737, + "theoretical_loss": 3.535118624912652, + "tokens_seen": 1411392512 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028906720160481444, + "loss": 2.8007, + "theoretical_loss": 3.5351039829072275, + "tokens_seen": 1411458048 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002890571715145436, + "loss": 2.692, + "theoretical_loss": 3.5350893417719833, + "tokens_seen": 1411523584 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002890471414242728, + "loss": 2.7161, + "theoretical_loss": 3.5350747015068285, + "tokens_seen": 1411589120 + }, + { + "epoch": 17.01, + "learning_rate": 0.000289037111334002, + "loss": 2.6406, + "theoretical_loss": 3.53506006211167, + "tokens_seen": 1411654656 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002890270812437312, + "loss": 2.8508, + "theoretical_loss": 3.535045423586417, + "tokens_seen": 1411720192 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002890170511534604, + "loss": 2.7375, + "theoretical_loss": 3.5350307859309753, + "tokens_seen": 1411785728 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002890070210631896, + "loss": 2.7355, + "theoretical_loss": 3.535016149145255, + "tokens_seen": 1411851264 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028899699097291876, + "loss": 2.7091, + "theoretical_loss": 3.5350015132291635, + "tokens_seen": 1411916800 + }, + { + "epoch": 17.01, + "learning_rate": 0.000288986960882648, + "loss": 2.7696, + "theoretical_loss": 3.534986878182608, + "tokens_seen": 1411982336 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002889769307923771, + "loss": 2.7598, + "theoretical_loss": 3.534972244005497, + "tokens_seen": 1412047872 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028896690070210636, + "loss": 2.8979, + "theoretical_loss": 3.534957610697738, + "tokens_seen": 1412113408 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3345835, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9373745918273926, + "objective/train/theoretical_loss": 3.5349429782592403, + "objective/train/tokens_used": 1432638944, + "theoretical_loss": 3.5349429782592403, + "tokens_seen": 1412178944 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002889568706118355, + "loss": 2.7933, + "theoretical_loss": 3.5349429782592403, + "tokens_seen": 1412178944 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002889468405215647, + "loss": 2.7406, + "theoretical_loss": 3.5349283466899104, + "tokens_seen": 1412244480 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002889368104312939, + "loss": 2.7991, + "theoretical_loss": 3.5349137159896573, + "tokens_seen": 1412310016 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002889267803410231, + "loss": 2.7717, + "theoretical_loss": 3.534899086158389, + "tokens_seen": 1412375552 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028891675025075227, + "loss": 2.7292, + "theoretical_loss": 3.5348844571960134, + "tokens_seen": 1412441088 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028890672016048145, + "loss": 2.8275, + "theoretical_loss": 3.5348698291024383, + "tokens_seen": 1412506624 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028889669007021063, + "loss": 2.8181, + "theoretical_loss": 3.534855201877572, + "tokens_seen": 1412572160 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028888665997993986, + "loss": 2.6338, + "theoretical_loss": 3.534840575521323, + "tokens_seen": 1412637696 + }, + { + "epoch": 17.01, + "learning_rate": 0.000288876629889669, + "loss": 2.7357, + "theoretical_loss": 3.534825950033599, + "tokens_seen": 1412703232 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028886659979939823, + "loss": 2.8331, + "theoretical_loss": 3.5348113254143074, + "tokens_seen": 1412768768 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002888565697091274, + "loss": 2.7702, + "theoretical_loss": 3.534796701663358, + "tokens_seen": 1412834304 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002888465396188566, + "loss": 2.7537, + "theoretical_loss": 3.5347820787806583, + "tokens_seen": 1412899840 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028883650952858577, + "loss": 2.8188, + "theoretical_loss": 3.534767456766116, + "tokens_seen": 1412965376 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028882647943831495, + "loss": 2.8444, + "theoretical_loss": 3.5347528356196394, + "tokens_seen": 1413030912 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028881644934804413, + "loss": 2.7607, + "theoretical_loss": 3.534738215341137, + "tokens_seen": 1413096448 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028880641925777337, + "loss": 2.8367, + "theoretical_loss": 3.534723595930517, + "tokens_seen": 1413161984 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002887963891675025, + "loss": 2.7743, + "theoretical_loss": 3.534708977387688, + "tokens_seen": 1413227520 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028878635907723173, + "loss": 2.7013, + "theoretical_loss": 3.534694359712557, + "tokens_seen": 1413293056 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028877632898696086, + "loss": 2.8393, + "theoretical_loss": 3.5346797429050336, + "tokens_seen": 1413358592 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002887662988966901, + "loss": 2.8289, + "theoretical_loss": 3.5346651269650256, + "tokens_seen": 1413424128 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002887562688064193, + "loss": 2.8077, + "theoretical_loss": 3.534650511892441, + "tokens_seen": 1413489664 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028874623871614845, + "loss": 2.8096, + "theoretical_loss": 3.5346358976871883, + "tokens_seen": 1413555200 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028873620862587764, + "loss": 2.7621, + "theoretical_loss": 3.5346212843491767, + "tokens_seen": 1413620736 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002887261785356068, + "loss": 2.7833, + "theoretical_loss": 3.5346066718783127, + "tokens_seen": 1413686272 + }, + { + "epoch": 17.01, + "learning_rate": 0.000288716148445336, + "loss": 2.7281, + "theoretical_loss": 3.5345920602745062, + "tokens_seen": 1413751808 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3349622, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7486042976379395, + "objective/train/theoretical_loss": 3.534577449537665, + "objective/train/tokens_used": 1434277344, + "theoretical_loss": 3.534577449537665, + "tokens_seen": 1413817344 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028870611835506523, + "loss": 2.7757, + "theoretical_loss": 3.534577449537665, + "tokens_seen": 1413817344 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028869608826479436, + "loss": 2.771, + "theoretical_loss": 3.5345628396676974, + "tokens_seen": 1413882880 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002886860581745236, + "loss": 2.7667, + "theoretical_loss": 3.534548230664512, + "tokens_seen": 1413948416 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002886760280842528, + "loss": 2.6712, + "theoretical_loss": 3.5345336225280173, + "tokens_seen": 1414013952 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028866599799398196, + "loss": 2.7754, + "theoretical_loss": 3.5345190152581205, + "tokens_seen": 1414079488 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028865596790371114, + "loss": 2.7542, + "theoretical_loss": 3.5345044088547324, + "tokens_seen": 1414145024 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002886459378134403, + "loss": 2.8297, + "theoretical_loss": 3.534489803317759, + "tokens_seen": 1414210560 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002886359077231695, + "loss": 2.6932, + "theoretical_loss": 3.534475198647111, + "tokens_seen": 1414276096 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028862587763289874, + "loss": 2.8387, + "theoretical_loss": 3.5344605948426944, + "tokens_seen": 1414341632 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028861584754262786, + "loss": 2.8003, + "theoretical_loss": 3.5344459919044198, + "tokens_seen": 1414407168 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002886058174523571, + "loss": 2.6475, + "theoretical_loss": 3.5344313898321946, + "tokens_seen": 1414472704 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002885957873620862, + "loss": 2.7575, + "theoretical_loss": 3.534416788625928, + "tokens_seen": 1414538240 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028858575727181546, + "loss": 2.8037, + "theoretical_loss": 3.534402188285528, + "tokens_seen": 1414603776 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028857572718154464, + "loss": 2.7475, + "theoretical_loss": 3.5343875888109033, + "tokens_seen": 1414669312 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002885656970912738, + "loss": 2.7049, + "theoretical_loss": 3.5343729902019625, + "tokens_seen": 1414734848 + }, + { + "epoch": 17.01, + "learning_rate": 0.000288555667001003, + "loss": 2.8133, + "theoretical_loss": 3.534358392458614, + "tokens_seen": 1414800384 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002885456369107322, + "loss": 2.8339, + "theoretical_loss": 3.5343437955807664, + "tokens_seen": 1414865920 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028853560682046137, + "loss": 2.6916, + "theoretical_loss": 3.5343291995683286, + "tokens_seen": 1414931456 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002885255767301906, + "loss": 2.7802, + "theoretical_loss": 3.5343146044212093, + "tokens_seen": 1414996992 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028851554663991973, + "loss": 2.6945, + "theoretical_loss": 3.5343000101393165, + "tokens_seen": 1415062528 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028850551654964896, + "loss": 2.781, + "theoretical_loss": 3.5342854167225592, + "tokens_seen": 1415128064 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028849548645937815, + "loss": 2.8253, + "theoretical_loss": 3.534270824170846, + "tokens_seen": 1415193600 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002884854563691073, + "loss": 2.7622, + "theoretical_loss": 3.534256232484086, + "tokens_seen": 1415259136 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002884754262788365, + "loss": 2.7579, + "theoretical_loss": 3.534241641662187, + "tokens_seen": 1415324672 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002884653961885657, + "loss": 2.6703, + "theoretical_loss": 3.5342270517050585, + "tokens_seen": 1415390208 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3354125, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5863230228424072, + "objective/train/theoretical_loss": 3.534212462612609, + "objective/train/tokens_used": 1435915744, + "theoretical_loss": 3.534212462612609, + "tokens_seen": 1415455744 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028845536609829487, + "loss": 2.7901, + "theoretical_loss": 3.534212462612609, + "tokens_seen": 1415455744 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002884453360080241, + "loss": 2.743, + "theoretical_loss": 3.5341978743847466, + "tokens_seen": 1415521280 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028843530591775323, + "loss": 2.7184, + "theoretical_loss": 3.5341832870213805, + "tokens_seen": 1415586816 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028842527582748247, + "loss": 2.751, + "theoretical_loss": 3.53416870052242, + "tokens_seen": 1415652352 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002884152457372116, + "loss": 2.8024, + "theoretical_loss": 3.534154114887773, + "tokens_seen": 1415717888 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028840521564694083, + "loss": 2.7887, + "theoretical_loss": 3.5341395301173493, + "tokens_seen": 1415783424 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028839518555667, + "loss": 2.7275, + "theoretical_loss": 3.5341249462110564, + "tokens_seen": 1415848960 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002883851554663992, + "loss": 2.6995, + "theoretical_loss": 3.534110363168804, + "tokens_seen": 1415914496 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002883751253761284, + "loss": 2.7794, + "theoretical_loss": 3.5340957809905005, + "tokens_seen": 1415980032 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002883650952858576, + "loss": 2.6934, + "theoretical_loss": 3.5340811996760553, + "tokens_seen": 1416045568 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028835506519558674, + "loss": 2.7072, + "theoretical_loss": 3.534066619225377, + "tokens_seen": 1416111104 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028834503510531597, + "loss": 2.708, + "theoretical_loss": 3.5340520396383734, + "tokens_seen": 1416176640 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002883350050150451, + "loss": 2.6789, + "theoretical_loss": 3.5340374609149547, + "tokens_seen": 1416242176 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028832497492477433, + "loss": 2.7266, + "theoretical_loss": 3.5340228830550298, + "tokens_seen": 1416307712 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002883149448345035, + "loss": 2.8125, + "theoretical_loss": 3.5340083060585066, + "tokens_seen": 1416373248 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002883049147442327, + "loss": 2.8038, + "theoretical_loss": 3.533993729925295, + "tokens_seen": 1416438784 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002882948846539619, + "loss": 2.7698, + "theoretical_loss": 3.5339791546553037, + "tokens_seen": 1416504320 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028828485456369106, + "loss": 2.7842, + "theoretical_loss": 3.533964580248441, + "tokens_seen": 1416569856 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002882748244734203, + "loss": 2.6878, + "theoretical_loss": 3.5339500067046163, + "tokens_seen": 1416635392 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002882647943831495, + "loss": 2.853, + "theoretical_loss": 3.533935434023739, + "tokens_seen": 1416700928 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028825476429287866, + "loss": 2.7405, + "theoretical_loss": 3.5339208622057177, + "tokens_seen": 1416766464 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028824473420260784, + "loss": 2.7868, + "theoretical_loss": 3.533906291250461, + "tokens_seen": 1416832000 + }, + { + "epoch": 17.01, + "learning_rate": 0.000288234704112337, + "loss": 2.669, + "theoretical_loss": 3.533891721157879, + "tokens_seen": 1416897536 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002882246740220662, + "loss": 2.6816, + "theoretical_loss": 3.533877151927879, + "tokens_seen": 1416963072 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028821464393179543, + "loss": 2.8269, + "theoretical_loss": 3.5338625835603716, + "tokens_seen": 1417028608 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3357260, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8090014457702637, + "objective/train/theoretical_loss": 3.5338480160552654, + "objective/train/tokens_used": 1437554144, + "theoretical_loss": 3.5338480160552654, + "tokens_seen": 1417094144 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028820461384152456, + "loss": 2.7842, + "theoretical_loss": 3.5338480160552654, + "tokens_seen": 1417094144 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002881945837512538, + "loss": 2.7101, + "theoretical_loss": 3.5338334494124695, + "tokens_seen": 1417159680 + }, + { + "epoch": 17.01, + "learning_rate": 0.000288184553660983, + "loss": 2.6885, + "theoretical_loss": 3.5338188836318927, + "tokens_seen": 1417225216 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028817452357071216, + "loss": 2.7596, + "theoretical_loss": 3.533804318713444, + "tokens_seen": 1417290752 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028816449348044134, + "loss": 2.789, + "theoretical_loss": 3.5337897546570334, + "tokens_seen": 1417356288 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002881544633901705, + "loss": 2.8308, + "theoretical_loss": 3.533775191462569, + "tokens_seen": 1417421824 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002881444332998997, + "loss": 2.792, + "theoretical_loss": 3.5337606291299606, + "tokens_seen": 1417487360 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028813440320962894, + "loss": 2.7639, + "theoretical_loss": 3.5337460676591164, + "tokens_seen": 1417552896 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028812437311935806, + "loss": 2.7073, + "theoretical_loss": 3.5337315070499473, + "tokens_seen": 1417618432 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002881143430290873, + "loss": 2.65, + "theoretical_loss": 3.5337169473023606, + "tokens_seen": 1417683968 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002881043129388164, + "loss": 2.77, + "theoretical_loss": 3.533702388416267, + "tokens_seen": 1417749504 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028809428284854566, + "loss": 2.6689, + "theoretical_loss": 3.5336878303915746, + "tokens_seen": 1417815040 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028808425275827484, + "loss": 2.7254, + "theoretical_loss": 3.5336732732281932, + "tokens_seen": 1417880576 + }, + { + "epoch": 17.01, + "learning_rate": 0.000288074222668004, + "loss": 2.6867, + "theoretical_loss": 3.533658716926032, + "tokens_seen": 1417946112 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002880641925777332, + "loss": 2.7039, + "theoretical_loss": 3.533644161485, + "tokens_seen": 1418011648 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002880541624874624, + "loss": 2.6878, + "theoretical_loss": 3.5336296069050066, + "tokens_seen": 1418077184 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028804413239719157, + "loss": 2.7643, + "theoretical_loss": 3.533615053185961, + "tokens_seen": 1418142720 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002880341023069208, + "loss": 2.7213, + "theoretical_loss": 3.533600500327773, + "tokens_seen": 1418208256 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028802407221664993, + "loss": 2.7611, + "theoretical_loss": 3.5335859483303507, + "tokens_seen": 1418273792 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028801404212637916, + "loss": 2.707, + "theoretical_loss": 3.533571397193605, + "tokens_seen": 1418339328 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028800401203610835, + "loss": 2.8004, + "theoretical_loss": 3.5335568469174437, + "tokens_seen": 1418404864 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002879939819458375, + "loss": 2.7028, + "theoretical_loss": 3.5335422975017776, + "tokens_seen": 1418470400 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002879839518555667, + "loss": 2.6497, + "theoretical_loss": 3.533527748946515, + "tokens_seen": 1418535936 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002879739217652959, + "loss": 2.7409, + "theoretical_loss": 3.5335132012515658, + "tokens_seen": 1418601472 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028796389167502507, + "loss": 2.8474, + "theoretical_loss": 3.5334986544168387, + "tokens_seen": 1418667008 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3360267, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7880311012268066, + "objective/train/theoretical_loss": 3.533484108442244, + "objective/train/tokens_used": 1439192544, + "theoretical_loss": 3.533484108442244, + "tokens_seen": 1418732544 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002879538615847543, + "loss": 2.7307, + "theoretical_loss": 3.533484108442244, + "tokens_seen": 1418732544 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028794383149448343, + "loss": 2.8518, + "theoretical_loss": 3.5334695633276905, + "tokens_seen": 1418798080 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028793380140421267, + "loss": 2.7923, + "theoretical_loss": 3.533455019073088, + "tokens_seen": 1418863616 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002879237713139418, + "loss": 2.7202, + "theoretical_loss": 3.5334404756783453, + "tokens_seen": 1418929152 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028791374122367103, + "loss": 2.753, + "theoretical_loss": 3.5334259331433726, + "tokens_seen": 1418994688 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002879037111334002, + "loss": 2.797, + "theoretical_loss": 3.5334113914680794, + "tokens_seen": 1419060224 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002878936810431294, + "loss": 2.7803, + "theoretical_loss": 3.533396850652375, + "tokens_seen": 1419125760 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002878836509528586, + "loss": 2.6924, + "theoretical_loss": 3.533382310696168, + "tokens_seen": 1419191296 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002878736208625878, + "loss": 2.7208, + "theoretical_loss": 3.533367771599369, + "tokens_seen": 1419256832 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028786359077231694, + "loss": 2.6614, + "theoretical_loss": 3.533353233361887, + "tokens_seen": 1419322368 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028785356068204617, + "loss": 2.8174, + "theoretical_loss": 3.533338695983632, + "tokens_seen": 1419387904 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002878435305917753, + "loss": 2.6521, + "theoretical_loss": 3.5333241594645135, + "tokens_seen": 1419453440 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028783350050150453, + "loss": 2.7711, + "theoretical_loss": 3.53330962380444, + "tokens_seen": 1419518976 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002878234704112337, + "loss": 2.7691, + "theoretical_loss": 3.533295089003323, + "tokens_seen": 1419584512 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002878134403209629, + "loss": 2.7702, + "theoretical_loss": 3.5332805550610704, + "tokens_seen": 1419650048 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002878034102306921, + "loss": 2.7886, + "theoretical_loss": 3.5332660219775924, + "tokens_seen": 1419715584 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028779338014042126, + "loss": 2.7095, + "theoretical_loss": 3.533251489752799, + "tokens_seen": 1419781120 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028778335005015044, + "loss": 2.7874, + "theoretical_loss": 3.533236958386599, + "tokens_seen": 1419846656 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002877733199598797, + "loss": 2.754, + "theoretical_loss": 3.533222427878903, + "tokens_seen": 1419912192 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002877632898696088, + "loss": 2.7195, + "theoretical_loss": 3.5332078982296196, + "tokens_seen": 1419977728 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028775325977933804, + "loss": 2.8062, + "theoretical_loss": 3.5331933694386595, + "tokens_seen": 1420043264 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028774322968906716, + "loss": 2.8939, + "theoretical_loss": 3.5331788415059315, + "tokens_seen": 1420108800 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002877331995987964, + "loss": 2.736, + "theoretical_loss": 3.533164314431346, + "tokens_seen": 1420174336 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002877231695085256, + "loss": 2.7867, + "theoretical_loss": 3.5331497882148124, + "tokens_seen": 1420239872 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028771313941825476, + "loss": 2.8045, + "theoretical_loss": 3.5331352628562405, + "tokens_seen": 1420305408 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3363998, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.715198516845703, + "objective/train/theoretical_loss": 3.5331207383555405, + "objective/train/tokens_used": 1440830944, + "theoretical_loss": 3.5331207383555405, + "tokens_seen": 1420370944 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028770310932798394, + "loss": 2.7292, + "theoretical_loss": 3.5331207383555405, + "tokens_seen": 1420370944 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002876930792377132, + "loss": 2.6988, + "theoretical_loss": 3.533106214712621, + "tokens_seen": 1420436480 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002876830491474423, + "loss": 2.8065, + "theoretical_loss": 3.5330916919273925, + "tokens_seen": 1420502016 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028767301905717154, + "loss": 2.6927, + "theoretical_loss": 3.533077169999765, + "tokens_seen": 1420567552 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028766298896690067, + "loss": 2.8098, + "theoretical_loss": 3.533062648929648, + "tokens_seen": 1420633088 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002876529588766299, + "loss": 2.8113, + "theoretical_loss": 3.533048128716951, + "tokens_seen": 1420698624 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002876429287863591, + "loss": 2.7624, + "theoretical_loss": 3.5330336093615844, + "tokens_seen": 1420764160 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028763289869608826, + "loss": 2.7743, + "theoretical_loss": 3.5330190908634576, + "tokens_seen": 1420829696 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028762286860581745, + "loss": 2.797, + "theoretical_loss": 3.533004573222481, + "tokens_seen": 1420895232 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002876128385155466, + "loss": 2.6475, + "theoretical_loss": 3.532990056438564, + "tokens_seen": 1420960768 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002876028084252758, + "loss": 2.7231, + "theoretical_loss": 3.5329755405116163, + "tokens_seen": 1421026304 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028759277833500504, + "loss": 2.802, + "theoretical_loss": 3.532961025441548, + "tokens_seen": 1421091840 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028758274824473417, + "loss": 2.7196, + "theoretical_loss": 3.5329465112282694, + "tokens_seen": 1421157376 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002875727181544634, + "loss": 2.8245, + "theoretical_loss": 3.53293199787169, + "tokens_seen": 1421222912 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028756268806419253, + "loss": 2.8212, + "theoretical_loss": 3.53291748537172, + "tokens_seen": 1421288448 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028755265797392177, + "loss": 2.8127, + "theoretical_loss": 3.532902973728269, + "tokens_seen": 1421353984 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028754262788365095, + "loss": 2.7865, + "theoretical_loss": 3.532888462941247, + "tokens_seen": 1421419520 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028753259779338013, + "loss": 2.7256, + "theoretical_loss": 3.532873953010564, + "tokens_seen": 1421485056 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028752256770310936, + "loss": 2.7605, + "theoretical_loss": 3.5328594439361307, + "tokens_seen": 1421550592 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028751253761283855, + "loss": 2.7872, + "theoretical_loss": 3.532844935717856, + "tokens_seen": 1421616128 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002875025075225677, + "loss": 2.6168, + "theoretical_loss": 3.532830428355651, + "tokens_seen": 1421681664 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002874924774322969, + "loss": 2.821, + "theoretical_loss": 3.5328159218494246, + "tokens_seen": 1421747200 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002874824473420261, + "loss": 2.744, + "theoretical_loss": 3.5328014161990873, + "tokens_seen": 1421812736 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028747241725175527, + "loss": 2.7846, + "theoretical_loss": 3.5327869114045494, + "tokens_seen": 1421878272 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002874623871614845, + "loss": 2.7927, + "theoretical_loss": 3.532772407465721, + "tokens_seen": 1421943808 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3368996, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8142764568328857, + "objective/train/theoretical_loss": 3.5327579043825117, + "objective/train/tokens_used": 1442469344, + "theoretical_loss": 3.5327579043825117, + "tokens_seen": 1422009344 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028745235707121363, + "loss": 2.7712, + "theoretical_loss": 3.5327579043825117, + "tokens_seen": 1422009344 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028744232698094287, + "loss": 2.7437, + "theoretical_loss": 3.532743402154832, + "tokens_seen": 1422074880 + }, + { + "epoch": 17.01, + "learning_rate": 0.000287432296890672, + "loss": 2.8286, + "theoretical_loss": 3.5327289007825917, + "tokens_seen": 1422140416 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028742226680040123, + "loss": 2.7533, + "theoretical_loss": 3.5327144002657014, + "tokens_seen": 1422205952 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002874122367101304, + "loss": 2.7792, + "theoretical_loss": 3.532699900604071, + "tokens_seen": 1422271488 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002874022066198596, + "loss": 2.7068, + "theoretical_loss": 3.53268540179761, + "tokens_seen": 1422337024 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002873921765295888, + "loss": 2.7263, + "theoretical_loss": 3.53267090384623, + "tokens_seen": 1422402560 + }, + { + "epoch": 17.01, + "learning_rate": 0.000287382146439318, + "loss": 2.6732, + "theoretical_loss": 3.53265640674984, + "tokens_seen": 1422468096 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028737211634904714, + "loss": 2.7137, + "theoretical_loss": 3.5326419105083504, + "tokens_seen": 1422533632 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028736208625877637, + "loss": 2.817, + "theoretical_loss": 3.5326274151216714, + "tokens_seen": 1422599168 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002873520561685055, + "loss": 2.6914, + "theoretical_loss": 3.5326129205897137, + "tokens_seen": 1422664704 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028734202607823473, + "loss": 2.8059, + "theoretical_loss": 3.532598426912387, + "tokens_seen": 1422730240 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002873319959879639, + "loss": 2.7359, + "theoretical_loss": 3.532583934089602, + "tokens_seen": 1422795776 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002873219658976931, + "loss": 2.8119, + "theoretical_loss": 3.532569442121268, + "tokens_seen": 1422861312 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002873119358074223, + "loss": 2.6915, + "theoretical_loss": 3.5325549510072967, + "tokens_seen": 1422926848 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028730190571715146, + "loss": 2.7991, + "theoretical_loss": 3.5325404607475974, + "tokens_seen": 1422992384 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028729187562688064, + "loss": 2.7275, + "theoretical_loss": 3.5325259713420802, + "tokens_seen": 1423057920 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002872818455366099, + "loss": 2.7541, + "theoretical_loss": 3.5325114827906567, + "tokens_seen": 1423123456 + }, + { + "epoch": 17.01, + "learning_rate": 0.000287271815446339, + "loss": 2.7134, + "theoretical_loss": 3.5324969950932354, + "tokens_seen": 1423188992 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028726178535606824, + "loss": 2.7802, + "theoretical_loss": 3.5324825082497284, + "tokens_seen": 1423254528 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028725175526579736, + "loss": 2.7303, + "theoretical_loss": 3.532468022260045, + "tokens_seen": 1423320064 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002872417251755266, + "loss": 2.8885, + "theoretical_loss": 3.532453537124096, + "tokens_seen": 1423385600 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002872316950852558, + "loss": 2.7768, + "theoretical_loss": 3.5324390528417915, + "tokens_seen": 1423451136 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028722166499498496, + "loss": 2.7354, + "theoretical_loss": 3.5324245694130423, + "tokens_seen": 1423516672 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028721163490471414, + "loss": 2.6896, + "theoretical_loss": 3.532410086837758, + "tokens_seen": 1423582208 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3371820, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6039061546325684, + "objective/train/theoretical_loss": 3.53239560511585, + "objective/train/tokens_used": 1444107744, + "theoretical_loss": 3.53239560511585, + "tokens_seen": 1423647744 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002872016048144434, + "loss": 2.7709, + "theoretical_loss": 3.53239560511585, + "tokens_seen": 1423647744 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002871915747241725, + "loss": 2.8772, + "theoretical_loss": 3.532381124247228, + "tokens_seen": 1423713280 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028718154463390174, + "loss": 2.765, + "theoretical_loss": 3.5323666442318027, + "tokens_seen": 1423778816 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028717151454363087, + "loss": 2.7916, + "theoretical_loss": 3.532352165069485, + "tokens_seen": 1423844352 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002871614844533601, + "loss": 2.685, + "theoretical_loss": 3.532337686760185, + "tokens_seen": 1423909888 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002871514543630893, + "loss": 2.7707, + "theoretical_loss": 3.5323232093038124, + "tokens_seen": 1423975424 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028714142427281846, + "loss": 2.8517, + "theoretical_loss": 3.532308732700279, + "tokens_seen": 1424040960 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028713139418254765, + "loss": 2.7529, + "theoretical_loss": 3.532294256949495, + "tokens_seen": 1424106496 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002871213640922768, + "loss": 2.832, + "theoretical_loss": 3.5322797820513703, + "tokens_seen": 1424172032 + }, + { + "epoch": 17.01, + "learning_rate": 0.000287111334002006, + "loss": 2.8265, + "theoretical_loss": 3.532265308005816, + "tokens_seen": 1424237568 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028710130391173524, + "loss": 2.7939, + "theoretical_loss": 3.5322508348127424, + "tokens_seen": 1424303104 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028709127382146437, + "loss": 2.7106, + "theoretical_loss": 3.5322363624720605, + "tokens_seen": 1424368640 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002870812437311936, + "loss": 2.7804, + "theoretical_loss": 3.53222189098368, + "tokens_seen": 1424434176 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028707121364092273, + "loss": 2.7517, + "theoretical_loss": 3.532207420347512, + "tokens_seen": 1424499712 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028706118355065197, + "loss": 2.7718, + "theoretical_loss": 3.5321929505634677, + "tokens_seen": 1424565248 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028705115346038115, + "loss": 2.6698, + "theoretical_loss": 3.5321784816314574, + "tokens_seen": 1424630784 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028704112337011033, + "loss": 2.8585, + "theoretical_loss": 3.532164013551391, + "tokens_seen": 1424696320 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002870310932798395, + "loss": 2.7457, + "theoretical_loss": 3.53214954632318, + "tokens_seen": 1424761856 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028702106318956875, + "loss": 2.7025, + "theoretical_loss": 3.532135079946734, + "tokens_seen": 1424827392 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002870110330992979, + "loss": 2.764, + "theoretical_loss": 3.5321206144219652, + "tokens_seen": 1424892928 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002870010030090271, + "loss": 2.6971, + "theoretical_loss": 3.5321061497487833, + "tokens_seen": 1424958464 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028699097291875624, + "loss": 2.8068, + "theoretical_loss": 3.5320916859270994, + "tokens_seen": 1425024000 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028698094282848547, + "loss": 2.7493, + "theoretical_loss": 3.532077222956824, + "tokens_seen": 1425089536 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028697091273821465, + "loss": 2.8078, + "theoretical_loss": 3.532062760837867, + "tokens_seen": 1425155072 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028696088264794383, + "loss": 2.6923, + "theoretical_loss": 3.5320482995701408, + "tokens_seen": 1425220608 + }, + { + "epoch": 17.01, + "objective/train/docs_used": 3376726, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7302286624908447, + "objective/train/theoretical_loss": 3.5320338391535557, + "objective/train/tokens_used": 1445746144, + "theoretical_loss": 3.5320338391535557, + "tokens_seen": 1425286144 + }, + { + "epoch": 17.01, + "learning_rate": 0.000286950852557673, + "loss": 2.7372, + "theoretical_loss": 3.5320338391535557, + "tokens_seen": 1425286144 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002869408224674022, + "loss": 2.777, + "theoretical_loss": 3.5320193795880215, + "tokens_seen": 1425351680 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002869307923771314, + "loss": 2.8019, + "theoretical_loss": 3.5320049208734496, + "tokens_seen": 1425417216 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002869207622868606, + "loss": 2.7182, + "theoretical_loss": 3.531990463009751, + "tokens_seen": 1425482752 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028691073219658974, + "loss": 2.9098, + "theoretical_loss": 3.5319760059968366, + "tokens_seen": 1425548288 + }, + { + "epoch": 17.01, + "learning_rate": 0.000286900702106319, + "loss": 2.756, + "theoretical_loss": 3.531961549834617, + "tokens_seen": 1425613824 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002868906720160481, + "loss": 2.7823, + "theoretical_loss": 3.5319470945230025, + "tokens_seen": 1425679360 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028688064192577734, + "loss": 2.8439, + "theoretical_loss": 3.531932640061905, + "tokens_seen": 1425744896 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002868706118355065, + "loss": 2.7638, + "theoretical_loss": 3.531918186451234, + "tokens_seen": 1425810432 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002868605817452357, + "loss": 2.7669, + "theoretical_loss": 3.5319037336909016, + "tokens_seen": 1425875968 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002868505516549649, + "loss": 2.9002, + "theoretical_loss": 3.5318892817808187, + "tokens_seen": 1425941504 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002868405215646941, + "loss": 2.7341, + "theoretical_loss": 3.531874830720896, + "tokens_seen": 1426007040 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028683049147442324, + "loss": 2.742, + "theoretical_loss": 3.531860380511043, + "tokens_seen": 1426072576 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002868204613841525, + "loss": 2.7145, + "theoretical_loss": 3.5318459311511727, + "tokens_seen": 1426138112 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002868104312938816, + "loss": 2.7959, + "theoretical_loss": 3.5318314826411954, + "tokens_seen": 1426203648 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028680040120361084, + "loss": 2.8582, + "theoretical_loss": 3.5318170349810214, + "tokens_seen": 1426269184 + }, + { + "epoch": 17.01, + "learning_rate": 0.00028679037111334, + "loss": 2.7537, + "theoretical_loss": 3.5318025881705624, + "tokens_seen": 1426334720 + }, + { + "epoch": 17.01, + "learning_rate": 0.0002867803410230692, + "loss": 2.647, + "theoretical_loss": 3.531788142209729, + "tokens_seen": 1426400256 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028677031093279844, + "loss": 2.8699, + "theoretical_loss": 3.5317736970984326, + "tokens_seen": 1426465792 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028676028084252756, + "loss": 2.6301, + "theoretical_loss": 3.5317592528365833, + "tokens_seen": 1426531328 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002867502507522568, + "loss": 2.8061, + "theoretical_loss": 3.531744809424093, + "tokens_seen": 1426596864 + }, + { + "epoch": 17.02, + "learning_rate": 0.000286740220661986, + "loss": 2.8246, + "theoretical_loss": 3.531730366860873, + "tokens_seen": 1426662400 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028673019057171516, + "loss": 2.7153, + "theoretical_loss": 3.531715925146833, + "tokens_seen": 1426727936 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028672016048144434, + "loss": 2.803, + "theoretical_loss": 3.5317014842818857, + "tokens_seen": 1426793472 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002867101303911736, + "loss": 2.6802, + "theoretical_loss": 3.531687044265941, + "tokens_seen": 1426859008 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3379552, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.839383125305176, + "objective/train/theoretical_loss": 3.5316726050989105, + "objective/train/tokens_used": 1447384544, + "theoretical_loss": 3.5316726050989105, + "tokens_seen": 1426924544 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002867001003009027, + "loss": 2.7596, + "theoretical_loss": 3.5316726050989105, + "tokens_seen": 1426924544 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028669007021063194, + "loss": 2.7542, + "theoretical_loss": 3.531658166780705, + "tokens_seen": 1426990080 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028668004012036107, + "loss": 2.8158, + "theoretical_loss": 3.531643729311236, + "tokens_seen": 1427055616 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002866700100300903, + "loss": 2.851, + "theoretical_loss": 3.5316292926904147, + "tokens_seen": 1427121152 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002866599799398195, + "loss": 2.8323, + "theoretical_loss": 3.531614856918152, + "tokens_seen": 1427186688 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028664994984954866, + "loss": 2.8281, + "theoretical_loss": 3.5316004219943586, + "tokens_seen": 1427252224 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028663991975927785, + "loss": 2.7587, + "theoretical_loss": 3.5315859879189464, + "tokens_seen": 1427317760 + }, + { + "epoch": 17.02, + "learning_rate": 0.000286629889669007, + "loss": 2.6807, + "theoretical_loss": 3.5315715546918263, + "tokens_seen": 1427383296 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002866198595787362, + "loss": 2.7532, + "theoretical_loss": 3.5315571223129094, + "tokens_seen": 1427448832 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028660982948846544, + "loss": 2.7262, + "theoretical_loss": 3.5315426907821075, + "tokens_seen": 1427514368 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028659979939819457, + "loss": 2.7344, + "theoretical_loss": 3.531528260099331, + "tokens_seen": 1427579904 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002865897693079238, + "loss": 2.6945, + "theoretical_loss": 3.5315138302644917, + "tokens_seen": 1427645440 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028657973921765293, + "loss": 2.773, + "theoretical_loss": 3.5314994012775003, + "tokens_seen": 1427710976 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028656970912738217, + "loss": 2.6309, + "theoretical_loss": 3.531484973138269, + "tokens_seen": 1427776512 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028655967903711135, + "loss": 2.7495, + "theoretical_loss": 3.531470545846708, + "tokens_seen": 1427842048 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028654964894684053, + "loss": 2.7227, + "theoretical_loss": 3.5314561194027294, + "tokens_seen": 1427907584 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002865396188565697, + "loss": 2.8267, + "theoretical_loss": 3.5314416938062445, + "tokens_seen": 1427973120 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028652958876629895, + "loss": 2.6837, + "theoretical_loss": 3.531427269057164, + "tokens_seen": 1428038656 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002865195586760281, + "loss": 2.7851, + "theoretical_loss": 3.5314128451553994, + "tokens_seen": 1428104192 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002865095285857573, + "loss": 2.8063, + "theoretical_loss": 3.531398422100862, + "tokens_seen": 1428169728 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028649949849548644, + "loss": 2.644, + "theoretical_loss": 3.531383999893464, + "tokens_seen": 1428235264 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028648946840521567, + "loss": 2.8384, + "theoretical_loss": 3.5313695785331163, + "tokens_seen": 1428300800 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028647943831494485, + "loss": 2.7338, + "theoretical_loss": 3.5313551580197298, + "tokens_seen": 1428366336 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028646940822467403, + "loss": 2.6857, + "theoretical_loss": 3.531340738353216, + "tokens_seen": 1428431872 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002864593781344032, + "loss": 2.6877, + "theoretical_loss": 3.531326319533487, + "tokens_seen": 1428497408 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3383187, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6201987266540527, + "objective/train/theoretical_loss": 3.5313119015604535, + "objective/train/tokens_used": 1449022944, + "theoretical_loss": 3.5313119015604535, + "tokens_seen": 1428562944 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002864493480441324, + "loss": 2.7331, + "theoretical_loss": 3.5313119015604535, + "tokens_seen": 1428562944 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002864393179538616, + "loss": 2.7645, + "theoretical_loss": 3.5312974844340275, + "tokens_seen": 1428628480 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002864292878635908, + "loss": 2.7994, + "theoretical_loss": 3.5312830681541203, + "tokens_seen": 1428694016 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028641925777331994, + "loss": 2.7612, + "theoretical_loss": 3.5312686527206427, + "tokens_seen": 1428759552 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002864092276830492, + "loss": 2.6508, + "theoretical_loss": 3.5312542381335073, + "tokens_seen": 1428825088 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002863991975927783, + "loss": 2.8059, + "theoretical_loss": 3.531239824392625, + "tokens_seen": 1428890624 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028638916750250754, + "loss": 2.7639, + "theoretical_loss": 3.531225411497907, + "tokens_seen": 1428956160 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002863791374122367, + "loss": 2.7394, + "theoretical_loss": 3.5312109994492653, + "tokens_seen": 1429021696 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002863691073219659, + "loss": 2.849, + "theoretical_loss": 3.5311965882466114, + "tokens_seen": 1429087232 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002863590772316951, + "loss": 2.7988, + "theoretical_loss": 3.531182177889857, + "tokens_seen": 1429152768 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002863490471414243, + "loss": 2.8243, + "theoretical_loss": 3.5311677683789133, + "tokens_seen": 1429218304 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028633901705115344, + "loss": 2.7217, + "theoretical_loss": 3.5311533597136915, + "tokens_seen": 1429283840 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002863289869608827, + "loss": 2.7799, + "theoretical_loss": 3.531138951894104, + "tokens_seen": 1429349376 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002863189568706118, + "loss": 2.8697, + "theoretical_loss": 3.531124544920062, + "tokens_seen": 1429414912 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028630892678034104, + "loss": 2.8179, + "theoretical_loss": 3.5311101387914774, + "tokens_seen": 1429480448 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002862988966900702, + "loss": 2.796, + "theoretical_loss": 3.531095733508262, + "tokens_seen": 1429545984 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002862888665997994, + "loss": 2.7519, + "theoretical_loss": 3.531081329070326, + "tokens_seen": 1429611520 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002862788365095286, + "loss": 2.7261, + "theoretical_loss": 3.5310669254775826, + "tokens_seen": 1429677056 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028626880641925776, + "loss": 2.8553, + "theoretical_loss": 3.5310525227299427, + "tokens_seen": 1429742592 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028625877632898694, + "loss": 2.7228, + "theoretical_loss": 3.531038120827319, + "tokens_seen": 1429808128 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002862487462387162, + "loss": 2.7996, + "theoretical_loss": 3.531023719769622, + "tokens_seen": 1429873664 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002862387161484453, + "loss": 2.7435, + "theoretical_loss": 3.531009319556764, + "tokens_seen": 1429939200 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028622868605817454, + "loss": 2.7553, + "theoretical_loss": 3.5309949201886566, + "tokens_seen": 1430004736 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028621865596790367, + "loss": 2.7507, + "theoretical_loss": 3.530980521665211, + "tokens_seen": 1430070272 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002862086258776329, + "loss": 2.7126, + "theoretical_loss": 3.53096612398634, + "tokens_seen": 1430135808 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3388443, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6819229125976562, + "objective/train/theoretical_loss": 3.5309517271519546, + "objective/train/tokens_used": 1450661344, + "theoretical_loss": 3.5309517271519546, + "tokens_seen": 1430201344 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002861985957873621, + "loss": 2.7066, + "theoretical_loss": 3.5309517271519546, + "tokens_seen": 1430201344 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028618856569709127, + "loss": 2.812, + "theoretical_loss": 3.530937331161967, + "tokens_seen": 1430266880 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028617853560682045, + "loss": 2.7782, + "theoretical_loss": 3.530922936016289, + "tokens_seen": 1430332416 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002861685055165497, + "loss": 2.7968, + "theoretical_loss": 3.5309085417148314, + "tokens_seen": 1430397952 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002861584754262788, + "loss": 2.7193, + "theoretical_loss": 3.5308941482575076, + "tokens_seen": 1430463488 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028614844533600805, + "loss": 2.7765, + "theoretical_loss": 3.5308797556442286, + "tokens_seen": 1430529024 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028613841524573717, + "loss": 2.7246, + "theoretical_loss": 3.5308653638749057, + "tokens_seen": 1430594560 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002861283851554664, + "loss": 2.6921, + "theoretical_loss": 3.5308509729494517, + "tokens_seen": 1430660096 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002861183550651956, + "loss": 2.7696, + "theoretical_loss": 3.530836582867778, + "tokens_seen": 1430725632 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028610832497492477, + "loss": 2.7456, + "theoretical_loss": 3.5308221936297963, + "tokens_seen": 1430791168 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028609829488465395, + "loss": 2.8926, + "theoretical_loss": 3.5308078052354195, + "tokens_seen": 1430856704 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028608826479438313, + "loss": 2.8422, + "theoretical_loss": 3.530793417684558, + "tokens_seen": 1430922240 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002860782347041123, + "loss": 2.6765, + "theoretical_loss": 3.5307790309771248, + "tokens_seen": 1430987776 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028606820461384155, + "loss": 2.807, + "theoretical_loss": 3.530764645113032, + "tokens_seen": 1431053312 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002860581745235707, + "loss": 2.7857, + "theoretical_loss": 3.5307502600921903, + "tokens_seen": 1431118848 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002860481444332999, + "loss": 2.7693, + "theoretical_loss": 3.5307358759145124, + "tokens_seen": 1431184384 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002860381143430291, + "loss": 2.8023, + "theoretical_loss": 3.530721492579911, + "tokens_seen": 1431249920 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002860280842527583, + "loss": 2.7629, + "theoretical_loss": 3.5307071100882967, + "tokens_seen": 1431315456 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002860180541624875, + "loss": 2.884, + "theoretical_loss": 3.5306927284395826, + "tokens_seen": 1431380992 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028600802407221664, + "loss": 2.7234, + "theoretical_loss": 3.53067834763368, + "tokens_seen": 1431446528 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028599799398194587, + "loss": 2.793, + "theoretical_loss": 3.5306639676705016, + "tokens_seen": 1431512064 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028598796389167505, + "loss": 2.7724, + "theoretical_loss": 3.530649588549959, + "tokens_seen": 1431577600 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028597793380140423, + "loss": 2.7932, + "theoretical_loss": 3.530635210271964, + "tokens_seen": 1431643136 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002859679037111334, + "loss": 2.7006, + "theoretical_loss": 3.530620832836429, + "tokens_seen": 1431708672 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002859578736208626, + "loss": 2.844, + "theoretical_loss": 3.5306064562432664, + "tokens_seen": 1431774208 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3391241, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.582108497619629, + "objective/train/theoretical_loss": 3.530592080492388, + "objective/train/tokens_used": 1452299744, + "theoretical_loss": 3.530592080492388, + "tokens_seen": 1431839744 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002859478435305918, + "loss": 2.7405, + "theoretical_loss": 3.530592080492388, + "tokens_seen": 1431839744 + }, + { + "epoch": 17.02, + "learning_rate": 0.000285937813440321, + "loss": 2.7759, + "theoretical_loss": 3.5305777055837053, + "tokens_seen": 1431905280 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028592778335005014, + "loss": 2.7351, + "theoretical_loss": 3.530563331517131, + "tokens_seen": 1431970816 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002859177532597794, + "loss": 2.7276, + "theoretical_loss": 3.5305489582925778, + "tokens_seen": 1432036352 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002859077231695085, + "loss": 2.7787, + "theoretical_loss": 3.5305345859099573, + "tokens_seen": 1432101888 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028589769307923774, + "loss": 2.7693, + "theoretical_loss": 3.5305202143691807, + "tokens_seen": 1432167424 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002858876629889669, + "loss": 2.7882, + "theoretical_loss": 3.530505843670162, + "tokens_seen": 1432232960 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002858776328986961, + "loss": 2.8539, + "theoretical_loss": 3.5304914738128117, + "tokens_seen": 1432298496 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002858676028084253, + "loss": 2.7533, + "theoretical_loss": 3.5304771047970434, + "tokens_seen": 1432364032 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002858575727181545, + "loss": 2.8026, + "theoretical_loss": 3.530462736622768, + "tokens_seen": 1432429568 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028584754262788364, + "loss": 2.8105, + "theoretical_loss": 3.530448369289899, + "tokens_seen": 1432495104 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002858375125376129, + "loss": 2.8899, + "theoretical_loss": 3.530434002798348, + "tokens_seen": 1432560640 + }, + { + "epoch": 17.02, + "learning_rate": 0.000285827482447342, + "loss": 2.8134, + "theoretical_loss": 3.530419637148027, + "tokens_seen": 1432626176 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028581745235707124, + "loss": 2.7392, + "theoretical_loss": 3.5304052723388484, + "tokens_seen": 1432691712 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002858074222668004, + "loss": 2.7933, + "theoretical_loss": 3.530390908370725, + "tokens_seen": 1432757248 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002857973921765296, + "loss": 2.6979, + "theoretical_loss": 3.530376545243568, + "tokens_seen": 1432822784 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002857873620862588, + "loss": 2.76, + "theoretical_loss": 3.5303621829572913, + "tokens_seen": 1432888320 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028577733199598796, + "loss": 2.8424, + "theoretical_loss": 3.5303478215118056, + "tokens_seen": 1432953856 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028576730190571714, + "loss": 2.6451, + "theoretical_loss": 3.5303334609070243, + "tokens_seen": 1433019392 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002857572718154464, + "loss": 2.7323, + "theoretical_loss": 3.530319101142859, + "tokens_seen": 1433084928 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002857472417251755, + "loss": 2.7173, + "theoretical_loss": 3.530304742219223, + "tokens_seen": 1433150464 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028573721163490474, + "loss": 2.7889, + "theoretical_loss": 3.530290384136028, + "tokens_seen": 1433216000 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028572718154463387, + "loss": 2.8189, + "theoretical_loss": 3.530276026893186, + "tokens_seen": 1433281536 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002857171514543631, + "loss": 2.7846, + "theoretical_loss": 3.530261670490611, + "tokens_seen": 1433347072 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002857071213640923, + "loss": 2.769, + "theoretical_loss": 3.5302473149282134, + "tokens_seen": 1433412608 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3396037, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.924757242202759, + "objective/train/theoretical_loss": 3.5302329602059066, + "objective/train/tokens_used": 1453938144, + "theoretical_loss": 3.5302329602059066, + "tokens_seen": 1433478144 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028569709127382147, + "loss": 2.7402, + "theoretical_loss": 3.5302329602059066, + "tokens_seen": 1433478144 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028568706118355065, + "loss": 2.7539, + "theoretical_loss": 3.5302186063236025, + "tokens_seen": 1433543680 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002856770310932799, + "loss": 2.7075, + "theoretical_loss": 3.530204253281215, + "tokens_seen": 1433609216 + }, + { + "epoch": 17.02, + "learning_rate": 0.000285667001003009, + "loss": 2.7395, + "theoretical_loss": 3.5301899010786553, + "tokens_seen": 1433674752 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028565697091273825, + "loss": 2.7893, + "theoretical_loss": 3.5301755497158354, + "tokens_seen": 1433740288 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028564694082246737, + "loss": 2.6053, + "theoretical_loss": 3.530161199192669, + "tokens_seen": 1433805824 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002856369107321966, + "loss": 2.7658, + "theoretical_loss": 3.530146849509068, + "tokens_seen": 1433871360 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002856268806419258, + "loss": 2.7738, + "theoretical_loss": 3.530132500664945, + "tokens_seen": 1433936896 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028561685055165497, + "loss": 2.8032, + "theoretical_loss": 3.5301181526602132, + "tokens_seen": 1434002432 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028560682046138415, + "loss": 2.7393, + "theoretical_loss": 3.5301038054947838, + "tokens_seen": 1434067968 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028559679037111333, + "loss": 2.8123, + "theoretical_loss": 3.53008945916857, + "tokens_seen": 1434133504 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002855867602808425, + "loss": 2.7761, + "theoretical_loss": 3.5300751136814847, + "tokens_seen": 1434199040 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028557673019057175, + "loss": 2.8042, + "theoretical_loss": 3.5300607690334402, + "tokens_seen": 1434264576 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002855667001003009, + "loss": 2.7353, + "theoretical_loss": 3.5300464252243486, + "tokens_seen": 1434330112 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002855566700100301, + "loss": 2.8012, + "theoretical_loss": 3.530032082254124, + "tokens_seen": 1434395648 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002855466399197593, + "loss": 2.7055, + "theoretical_loss": 3.530017740122677, + "tokens_seen": 1434461184 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002855366098294885, + "loss": 2.6965, + "theoretical_loss": 3.5300033988299218, + "tokens_seen": 1434526720 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028552657973921765, + "loss": 2.8024, + "theoretical_loss": 3.52998905837577, + "tokens_seen": 1434592256 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028551654964894684, + "loss": 2.8281, + "theoretical_loss": 3.529974718760135, + "tokens_seen": 1434657792 + }, + { + "epoch": 17.02, + "learning_rate": 0.000285506519558676, + "loss": 2.7952, + "theoretical_loss": 3.5299603799829296, + "tokens_seen": 1434723328 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028549648946840525, + "loss": 2.6981, + "theoretical_loss": 3.5299460420440654, + "tokens_seen": 1434788864 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002854864593781344, + "loss": 2.7878, + "theoretical_loss": 3.529931704943456, + "tokens_seen": 1434854400 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002854764292878636, + "loss": 2.7496, + "theoretical_loss": 3.529917368681014, + "tokens_seen": 1434919936 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028546639919759274, + "loss": 2.7092, + "theoretical_loss": 3.5299030332566517, + "tokens_seen": 1434985472 + }, + { + "epoch": 17.02, + "learning_rate": 0.000285456369107322, + "loss": 2.7153, + "theoretical_loss": 3.529888698670282, + "tokens_seen": 1435051008 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3398853, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8553903102874756, + "objective/train/theoretical_loss": 3.5298743649218185, + "objective/train/tokens_used": 1455576544, + "theoretical_loss": 3.5298743649218185, + "tokens_seen": 1435116544 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028544633901705116, + "loss": 2.8387, + "theoretical_loss": 3.5298743649218185, + "tokens_seen": 1435116544 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028543630892678034, + "loss": 2.8459, + "theoretical_loss": 3.5298600320111726, + "tokens_seen": 1435182080 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002854262788365095, + "loss": 2.739, + "theoretical_loss": 3.5298456999382584, + "tokens_seen": 1435247616 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002854162487462387, + "loss": 2.8131, + "theoretical_loss": 3.5298313687029874, + "tokens_seen": 1435313152 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002854062186559679, + "loss": 2.7159, + "theoretical_loss": 3.529817038305273, + "tokens_seen": 1435378688 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002853961885656971, + "loss": 2.8256, + "theoretical_loss": 3.5298027087450285, + "tokens_seen": 1435444224 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028538615847542624, + "loss": 2.8367, + "theoretical_loss": 3.5297883800221665, + "tokens_seen": 1435509760 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002853761283851555, + "loss": 2.7465, + "theoretical_loss": 3.529774052136599, + "tokens_seen": 1435575296 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028536609829488466, + "loss": 2.8492, + "theoretical_loss": 3.5297597250882395, + "tokens_seen": 1435640832 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028535606820461384, + "loss": 2.9094, + "theoretical_loss": 3.5297453988770005, + "tokens_seen": 1435706368 + }, + { + "epoch": 17.02, + "learning_rate": 0.000285346038114343, + "loss": 2.7584, + "theoretical_loss": 3.529731073502796, + "tokens_seen": 1435771904 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002853360080240722, + "loss": 2.7435, + "theoretical_loss": 3.529716748965537, + "tokens_seen": 1435837440 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002853259779338014, + "loss": 2.8403, + "theoretical_loss": 3.529702425265138, + "tokens_seen": 1435902976 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002853159478435306, + "loss": 2.8038, + "theoretical_loss": 3.529688102401512, + "tokens_seen": 1435968512 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028530591775325975, + "loss": 2.7003, + "theoretical_loss": 3.529673780374571, + "tokens_seen": 1436034048 + }, + { + "epoch": 17.02, + "learning_rate": 0.000285295887662989, + "loss": 2.7113, + "theoretical_loss": 3.529659459184228, + "tokens_seen": 1436099584 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002852858575727181, + "loss": 2.7509, + "theoretical_loss": 3.5296451388303964, + "tokens_seen": 1436165120 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028527582748244735, + "loss": 2.8354, + "theoretical_loss": 3.529630819312989, + "tokens_seen": 1436230656 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002852657973921766, + "loss": 2.7184, + "theoretical_loss": 3.5296165006319185, + "tokens_seen": 1436296192 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002852557673019057, + "loss": 2.7899, + "theoretical_loss": 3.529602182787098, + "tokens_seen": 1436361728 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028524573721163494, + "loss": 2.8388, + "theoretical_loss": 3.529587865778441, + "tokens_seen": 1436427264 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028523570712136407, + "loss": 2.7449, + "theoretical_loss": 3.5295735496058604, + "tokens_seen": 1436492800 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002852256770310933, + "loss": 2.7406, + "theoretical_loss": 3.529559234269269, + "tokens_seen": 1436558336 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002852156469408225, + "loss": 2.8529, + "theoretical_loss": 3.52954491976858, + "tokens_seen": 1436623872 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028520561685055167, + "loss": 2.8294, + "theoretical_loss": 3.529530606103706, + "tokens_seen": 1436689408 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3402629, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8787732124328613, + "objective/train/theoretical_loss": 3.529516293274561, + "objective/train/tokens_used": 1457214944, + "theoretical_loss": 3.529516293274561, + "tokens_seen": 1436754944 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028519558676028085, + "loss": 2.7945, + "theoretical_loss": 3.529516293274561, + "tokens_seen": 1436754944 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002851855566700101, + "loss": 2.7022, + "theoretical_loss": 3.529501981281057, + "tokens_seen": 1436820480 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002851755265797392, + "loss": 2.7408, + "theoretical_loss": 3.5294876701231077, + "tokens_seen": 1436886016 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028516549648946845, + "loss": 2.7971, + "theoretical_loss": 3.529473359800626, + "tokens_seen": 1436951552 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028515546639919757, + "loss": 2.7323, + "theoretical_loss": 3.529459050313526, + "tokens_seen": 1437017088 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002851454363089268, + "loss": 2.7218, + "theoretical_loss": 3.5294447416617194, + "tokens_seen": 1437082624 + }, + { + "epoch": 17.02, + "learning_rate": 0.000285135406218656, + "loss": 2.8456, + "theoretical_loss": 3.5294304338451195, + "tokens_seen": 1437148160 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028512537612838517, + "loss": 2.8292, + "theoretical_loss": 3.5294161268636404, + "tokens_seen": 1437213696 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028511534603811435, + "loss": 2.8477, + "theoretical_loss": 3.5294018207171947, + "tokens_seen": 1437279232 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028510531594784353, + "loss": 2.7452, + "theoretical_loss": 3.5293875154056957, + "tokens_seen": 1437344768 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002850952858575727, + "loss": 2.6753, + "theoretical_loss": 3.5293732109290565, + "tokens_seen": 1437410304 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028508525576730195, + "loss": 2.768, + "theoretical_loss": 3.5293589072871905, + "tokens_seen": 1437475840 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002850752256770311, + "loss": 2.7663, + "theoretical_loss": 3.529344604480011, + "tokens_seen": 1437541376 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002850651955867603, + "loss": 2.7703, + "theoretical_loss": 3.5293303025074305, + "tokens_seen": 1437606912 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002850551654964895, + "loss": 2.8402, + "theoretical_loss": 3.529316001369364, + "tokens_seen": 1437672448 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002850451354062187, + "loss": 2.641, + "theoretical_loss": 3.529301701065722, + "tokens_seen": 1437737984 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028503510531594785, + "loss": 2.794, + "theoretical_loss": 3.5292874015964206, + "tokens_seen": 1437803520 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028502507522567704, + "loss": 2.7807, + "theoretical_loss": 3.5292731029613718, + "tokens_seen": 1437869056 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002850150451354062, + "loss": 2.7979, + "theoretical_loss": 3.529258805160488, + "tokens_seen": 1437934592 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028500501504513545, + "loss": 2.8158, + "theoretical_loss": 3.5292445081936847, + "tokens_seen": 1438000128 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002849949849548646, + "loss": 2.8035, + "theoretical_loss": 3.529230212060873, + "tokens_seen": 1438065664 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002849849548645938, + "loss": 2.805, + "theoretical_loss": 3.5292159167619683, + "tokens_seen": 1438131200 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028497492477432294, + "loss": 2.7315, + "theoretical_loss": 3.529201622296882, + "tokens_seen": 1438196736 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002849648946840522, + "loss": 2.8057, + "theoretical_loss": 3.529187328665529, + "tokens_seen": 1438262272 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028495486459378136, + "loss": 2.8317, + "theoretical_loss": 3.5291730358678217, + "tokens_seen": 1438327808 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.673631191253662, + "objective/train/theoretical_loss": 3.5291587439036745, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.5291587439036745, + "tokens_seen": 1438393344 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028494483450351054, + "loss": 2.7105, + "theoretical_loss": 3.5291587439036745, + "tokens_seen": 1438393344 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002849348044132397, + "loss": 2.7484, + "theoretical_loss": 3.529144452773, + "tokens_seen": 1438458880 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002849247743229689, + "loss": 2.7664, + "theoretical_loss": 3.5291301624757114, + "tokens_seen": 1438524416 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002849147442326981, + "loss": 2.8329, + "theoretical_loss": 3.529115873011723, + "tokens_seen": 1438589952 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002849047141424273, + "loss": 2.7709, + "theoretical_loss": 3.529101584380948, + "tokens_seen": 1438655488 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028489468405215644, + "loss": 2.8562, + "theoretical_loss": 3.529087296583299, + "tokens_seen": 1438721024 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002848846539618857, + "loss": 2.7258, + "theoretical_loss": 3.5290730096186906, + "tokens_seen": 1438786560 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028487462387161486, + "loss": 2.7881, + "theoretical_loss": 3.5290587234870356, + "tokens_seen": 1438852096 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028486459378134404, + "loss": 2.658, + "theoretical_loss": 3.529044438188248, + "tokens_seen": 1438917632 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002848545636910732, + "loss": 2.7148, + "theoretical_loss": 3.5290301537222413, + "tokens_seen": 1438983168 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002848445336008024, + "loss": 2.7136, + "theoretical_loss": 3.5290158700889283, + "tokens_seen": 1439048704 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002848345035105316, + "loss": 2.7503, + "theoretical_loss": 3.5290015872882234, + "tokens_seen": 1439114240 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002848244734202608, + "loss": 2.8159, + "theoretical_loss": 3.5289873053200393, + "tokens_seen": 1439179776 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028481444332998995, + "loss": 2.7029, + "theoretical_loss": 3.5289730241842907, + "tokens_seen": 1439245312 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002848044132397192, + "loss": 2.6686, + "theoretical_loss": 3.5289587438808905, + "tokens_seen": 1439310848 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002847943831494483, + "loss": 2.7776, + "theoretical_loss": 3.5289444644097516, + "tokens_seen": 1439376384 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028478435305917755, + "loss": 2.7681, + "theoretical_loss": 3.528930185770789, + "tokens_seen": 1439441920 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002847743229689067, + "loss": 2.7071, + "theoretical_loss": 3.528915907963915, + "tokens_seen": 1439507456 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002847642928786359, + "loss": 2.7303, + "theoretical_loss": 3.528901630989045, + "tokens_seen": 1439572992 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002847542627883651, + "loss": 2.7935, + "theoretical_loss": 3.528887354846091, + "tokens_seen": 1439638528 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028474423269809427, + "loss": 2.8594, + "theoretical_loss": 3.528873079534967, + "tokens_seen": 1439704064 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028473420260782345, + "loss": 2.7898, + "theoretical_loss": 3.5288588050555867, + "tokens_seen": 1439769600 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002847241725175527, + "loss": 2.8586, + "theoretical_loss": 3.5288445314078642, + "tokens_seen": 1439835136 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002847141424272818, + "loss": 2.7901, + "theoretical_loss": 3.5288302585917126, + "tokens_seen": 1439900672 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028470411233701105, + "loss": 2.7524, + "theoretical_loss": 3.5288159866070465, + "tokens_seen": 1439966208 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.766927480697632, + "objective/train/theoretical_loss": 3.5288017154537785, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.5288017154537785, + "tokens_seen": 1440031744 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028469408224674023, + "loss": 2.8241, + "theoretical_loss": 3.5288017154537785, + "tokens_seen": 1440031744 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002846840521564694, + "loss": 2.8262, + "theoretical_loss": 3.5287874451318233, + "tokens_seen": 1440097280 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002846740220661986, + "loss": 2.7194, + "theoretical_loss": 3.528773175641094, + "tokens_seen": 1440162816 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002846639919759278, + "loss": 2.8036, + "theoretical_loss": 3.528758906981505, + "tokens_seen": 1440228352 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028465396188565695, + "loss": 2.7772, + "theoretical_loss": 3.5287446391529693, + "tokens_seen": 1440293888 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002846439317953862, + "loss": 2.8468, + "theoretical_loss": 3.528730372155401, + "tokens_seen": 1440359424 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002846339017051153, + "loss": 2.7381, + "theoretical_loss": 3.528716105988714, + "tokens_seen": 1440424960 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028462387161484455, + "loss": 2.8037, + "theoretical_loss": 3.5287018406528228, + "tokens_seen": 1440490496 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002846138415245737, + "loss": 2.7651, + "theoretical_loss": 3.5286875761476395, + "tokens_seen": 1440556032 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002846038114343029, + "loss": 2.8687, + "theoretical_loss": 3.528673312473079, + "tokens_seen": 1440621568 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002845937813440321, + "loss": 2.6954, + "theoretical_loss": 3.5286590496290557, + "tokens_seen": 1440687104 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002845837512537613, + "loss": 2.7709, + "theoretical_loss": 3.5286447876154825, + "tokens_seen": 1440752640 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028457372116349046, + "loss": 2.8199, + "theoretical_loss": 3.5286305264322735, + "tokens_seen": 1440818176 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002845636910732197, + "loss": 2.8415, + "theoretical_loss": 3.528616266079343, + "tokens_seen": 1440883712 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002845536609829488, + "loss": 2.7695, + "theoretical_loss": 3.5286020065566044, + "tokens_seen": 1440949248 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028454363089267805, + "loss": 2.8134, + "theoretical_loss": 3.5285877478639716, + "tokens_seen": 1441014784 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028453360080240724, + "loss": 2.7941, + "theoretical_loss": 3.5285734900013592, + "tokens_seen": 1441080320 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002845235707121364, + "loss": 2.7897, + "theoretical_loss": 3.5285592329686803, + "tokens_seen": 1441145856 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028451354062186565, + "loss": 2.8404, + "theoretical_loss": 3.5285449767658497, + "tokens_seen": 1441211392 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002845035105315948, + "loss": 2.749, + "theoretical_loss": 3.528530721392781, + "tokens_seen": 1441276928 + }, + { + "epoch": 17.02, + "learning_rate": 0.000284493480441324, + "loss": 2.7624, + "theoretical_loss": 3.5285164668493874, + "tokens_seen": 1441342464 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028448345035105314, + "loss": 2.8204, + "theoretical_loss": 3.528502213135584, + "tokens_seen": 1441408000 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002844734202607824, + "loss": 2.7785, + "theoretical_loss": 3.5284879602512844, + "tokens_seen": 1441473536 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028446339017051156, + "loss": 2.7154, + "theoretical_loss": 3.5284737081964024, + "tokens_seen": 1441539072 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028445336008024074, + "loss": 2.8507, + "theoretical_loss": 3.528459456970852, + "tokens_seen": 1441604608 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6464128494262695, + "objective/train/theoretical_loss": 3.528445206574548, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.528445206574548, + "tokens_seen": 1441670144 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002844433299899699, + "loss": 2.7772, + "theoretical_loss": 3.528445206574548, + "tokens_seen": 1441670144 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002844332998996991, + "loss": 2.7722, + "theoretical_loss": 3.5284309570074037, + "tokens_seen": 1441735680 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002844232698094283, + "loss": 2.7901, + "theoretical_loss": 3.528416708269334, + "tokens_seen": 1441801216 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002844132397191575, + "loss": 2.6514, + "theoretical_loss": 3.528402460360251, + "tokens_seen": 1441866752 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028440320962888664, + "loss": 2.7891, + "theoretical_loss": 3.5283882132800715, + "tokens_seen": 1441932288 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002843931795386159, + "loss": 2.6435, + "theoretical_loss": 3.528373967028707, + "tokens_seen": 1441997824 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028438314944834506, + "loss": 2.7305, + "theoretical_loss": 3.528359721606074, + "tokens_seen": 1442063360 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028437311935807424, + "loss": 2.7634, + "theoretical_loss": 3.5283454770120852, + "tokens_seen": 1442128896 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002843630892678034, + "loss": 2.7615, + "theoretical_loss": 3.528331233246655, + "tokens_seen": 1442194432 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002843530591775326, + "loss": 2.7656, + "theoretical_loss": 3.5283169903096976, + "tokens_seen": 1442259968 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002843430290872618, + "loss": 2.7793, + "theoretical_loss": 3.5283027482011273, + "tokens_seen": 1442325504 + }, + { + "epoch": 17.02, + "learning_rate": 0.000284332998996991, + "loss": 2.7644, + "theoretical_loss": 3.528288506920858, + "tokens_seen": 1442391040 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028432296890672015, + "loss": 2.7322, + "theoretical_loss": 3.528274266468804, + "tokens_seen": 1442456576 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002843129388164494, + "loss": 2.825, + "theoretical_loss": 3.52826002684488, + "tokens_seen": 1442522112 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002843029087261785, + "loss": 2.7449, + "theoretical_loss": 3.5282457880489995, + "tokens_seen": 1442587648 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028429287863590775, + "loss": 2.7963, + "theoretical_loss": 3.5282315500810775, + "tokens_seen": 1442653184 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002842828485456369, + "loss": 2.7667, + "theoretical_loss": 3.5282173129410275, + "tokens_seen": 1442718720 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002842728184553661, + "loss": 2.7751, + "theoretical_loss": 3.5282030766287633, + "tokens_seen": 1442784256 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002842627883650953, + "loss": 2.8582, + "theoretical_loss": 3.5281888411442006, + "tokens_seen": 1442849792 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028425275827482447, + "loss": 2.7417, + "theoretical_loss": 3.5281746064872532, + "tokens_seen": 1442915328 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028424272818455365, + "loss": 2.7373, + "theoretical_loss": 3.528160372657835, + "tokens_seen": 1442980864 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002842326980942829, + "loss": 2.7071, + "theoretical_loss": 3.5281461396558607, + "tokens_seen": 1443046400 + }, + { + "epoch": 17.02, + "learning_rate": 0.000284222668004012, + "loss": 2.7347, + "theoretical_loss": 3.528131907481244, + "tokens_seen": 1443111936 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028421263791374125, + "loss": 2.752, + "theoretical_loss": 3.5281176761339, + "tokens_seen": 1443177472 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028420260782347043, + "loss": 2.7535, + "theoretical_loss": 3.528103445613743, + "tokens_seen": 1443243008 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6151833534240723, + "objective/train/theoretical_loss": 3.5280892159206863, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.5280892159206863, + "tokens_seen": 1443308544 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002841925777331996, + "loss": 2.7472, + "theoretical_loss": 3.5280892159206863, + "tokens_seen": 1443308544 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002841825476429288, + "loss": 2.7142, + "theoretical_loss": 3.5280749870546453, + "tokens_seen": 1443374080 + }, + { + "epoch": 17.02, + "learning_rate": 0.000284172517552658, + "loss": 2.7539, + "theoretical_loss": 3.528060759015535, + "tokens_seen": 1443439616 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028416248746238715, + "loss": 2.8355, + "theoretical_loss": 3.528046531803268, + "tokens_seen": 1443505152 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002841524573721164, + "loss": 2.8152, + "theoretical_loss": 3.52803230541776, + "tokens_seen": 1443570688 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002841424272818455, + "loss": 2.8042, + "theoretical_loss": 3.528018079858925, + "tokens_seen": 1443636224 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028413239719157475, + "loss": 2.7706, + "theoretical_loss": 3.528003855126678, + "tokens_seen": 1443701760 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002841223671013039, + "loss": 2.7542, + "theoretical_loss": 3.5279896312209322, + "tokens_seen": 1443767296 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002841123370110331, + "loss": 2.6868, + "theoretical_loss": 3.527975408141603, + "tokens_seen": 1443832832 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002841023069207623, + "loss": 2.8051, + "theoretical_loss": 3.5279611858886053, + "tokens_seen": 1443898368 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002840922768304915, + "loss": 2.7896, + "theoretical_loss": 3.5279469644618526, + "tokens_seen": 1443963904 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028408224674022066, + "loss": 2.8015, + "theoretical_loss": 3.5279327438612595, + "tokens_seen": 1444029440 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002840722166499499, + "loss": 2.7847, + "theoretical_loss": 3.5279185240867417, + "tokens_seen": 1444094976 + }, + { + "epoch": 17.02, + "learning_rate": 0.000284062186559679, + "loss": 2.7783, + "theoretical_loss": 3.527904305138212, + "tokens_seen": 1444160512 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028405215646940825, + "loss": 2.6439, + "theoretical_loss": 3.527890087015586, + "tokens_seen": 1444226048 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002840421263791374, + "loss": 2.8505, + "theoretical_loss": 3.5278758697187786, + "tokens_seen": 1444291584 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002840320962888666, + "loss": 2.8687, + "theoretical_loss": 3.5278616532477027, + "tokens_seen": 1444357120 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002840220661985958, + "loss": 2.7661, + "theoretical_loss": 3.527847437602275, + "tokens_seen": 1444422656 + }, + { + "epoch": 17.02, + "learning_rate": 0.000284012036108325, + "loss": 2.8475, + "theoretical_loss": 3.5278332227824087, + "tokens_seen": 1444488192 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028400200601805416, + "loss": 2.7962, + "theoretical_loss": 3.527819008788019, + "tokens_seen": 1444553728 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028399197592778334, + "loss": 2.7583, + "theoretical_loss": 3.52780479561902, + "tokens_seen": 1444619264 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002839819458375125, + "loss": 2.7668, + "theoretical_loss": 3.5277905832753267, + "tokens_seen": 1444684800 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028397191574724176, + "loss": 2.8062, + "theoretical_loss": 3.527776371756854, + "tokens_seen": 1444750336 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002839618856569709, + "loss": 2.8653, + "theoretical_loss": 3.5277621610635155, + "tokens_seen": 1444815872 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002839518555667001, + "loss": 2.7805, + "theoretical_loss": 3.527747951195227, + "tokens_seen": 1444881408 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.897705316543579, + "objective/train/theoretical_loss": 3.527733742151903, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.527733742151903, + "tokens_seen": 1444946944 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028394182547642925, + "loss": 2.8486, + "theoretical_loss": 3.527733742151903, + "tokens_seen": 1444946944 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002839317953861585, + "loss": 2.8076, + "theoretical_loss": 3.527719533933458, + "tokens_seen": 1445012480 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028392176529588766, + "loss": 2.6804, + "theoretical_loss": 3.527705326539807, + "tokens_seen": 1445078016 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028391173520561684, + "loss": 2.8338, + "theoretical_loss": 3.5276911199708634, + "tokens_seen": 1445143552 + }, + { + "epoch": 17.02, + "learning_rate": 0.000283901705115346, + "loss": 2.8819, + "theoretical_loss": 3.527676914226544, + "tokens_seen": 1445209088 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028389167502507526, + "loss": 2.7597, + "theoretical_loss": 3.527662709306761, + "tokens_seen": 1445274624 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002838816449348044, + "loss": 2.9163, + "theoretical_loss": 3.527648505211432, + "tokens_seen": 1445340160 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002838716148445336, + "loss": 2.7318, + "theoretical_loss": 3.52763430194047, + "tokens_seen": 1445405696 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028386158475426275, + "loss": 2.7491, + "theoretical_loss": 3.52762009949379, + "tokens_seen": 1445471232 + }, + { + "epoch": 17.02, + "learning_rate": 0.000283851554663992, + "loss": 2.7691, + "theoretical_loss": 3.527605897871307, + "tokens_seen": 1445536768 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028384152457372117, + "loss": 2.8114, + "theoretical_loss": 3.527591697072936, + "tokens_seen": 1445602304 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028383149448345035, + "loss": 2.7752, + "theoretical_loss": 3.5275774970985916, + "tokens_seen": 1445667840 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028382146439317953, + "loss": 2.7899, + "theoretical_loss": 3.527563297948188, + "tokens_seen": 1445733376 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002838114343029087, + "loss": 2.8225, + "theoretical_loss": 3.527549099621641, + "tokens_seen": 1445798912 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002838014042126379, + "loss": 2.7968, + "theoretical_loss": 3.5275349021188656, + "tokens_seen": 1445864448 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002837913741223671, + "loss": 2.7554, + "theoretical_loss": 3.527520705439776, + "tokens_seen": 1445929984 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002837813440320963, + "loss": 2.7603, + "theoretical_loss": 3.5275065095842866, + "tokens_seen": 1445995520 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002837713139418255, + "loss": 2.7967, + "theoretical_loss": 3.5274923145523136, + "tokens_seen": 1446061056 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028376128385155467, + "loss": 2.6977, + "theoretical_loss": 3.5274781203437717, + "tokens_seen": 1446126592 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028375125376128385, + "loss": 2.7107, + "theoretical_loss": 3.527463926958575, + "tokens_seen": 1446192128 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002837412236710131, + "loss": 2.7414, + "theoretical_loss": 3.5274497343966384, + "tokens_seen": 1446257664 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002837311935807422, + "loss": 2.8417, + "theoretical_loss": 3.5274355426578774, + "tokens_seen": 1446323200 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028372116349047145, + "loss": 2.749, + "theoretical_loss": 3.5274213517422073, + "tokens_seen": 1446388736 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028371113340020063, + "loss": 2.7019, + "theoretical_loss": 3.527407161649543, + "tokens_seen": 1446454272 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002837011033099298, + "loss": 2.768, + "theoretical_loss": 3.527392972379798, + "tokens_seen": 1446519808 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.73848819732666, + "objective/train/theoretical_loss": 3.527378783932889, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.527378783932889, + "tokens_seen": 1446585344 + }, + { + "epoch": 17.02, + "learning_rate": 0.000283691073219659, + "loss": 2.7463, + "theoretical_loss": 3.527378783932889, + "tokens_seen": 1446585344 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002836810431293882, + "loss": 2.8292, + "theoretical_loss": 3.5273645963087303, + "tokens_seen": 1446650880 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028367101303911735, + "loss": 2.7942, + "theoretical_loss": 3.527350409507237, + "tokens_seen": 1446716416 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002836609829488466, + "loss": 2.7486, + "theoretical_loss": 3.5273362235283243, + "tokens_seen": 1446781952 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002836509528585757, + "loss": 2.7103, + "theoretical_loss": 3.5273220383719073, + "tokens_seen": 1446847488 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028364092276830495, + "loss": 2.8136, + "theoretical_loss": 3.5273078540379004, + "tokens_seen": 1446913024 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002836308926780341, + "loss": 2.7895, + "theoretical_loss": 3.5272936705262197, + "tokens_seen": 1446978560 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002836208625877633, + "loss": 2.7351, + "theoretical_loss": 3.527279487836779, + "tokens_seen": 1447044096 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002836108324974925, + "loss": 2.8532, + "theoretical_loss": 3.527265305969495, + "tokens_seen": 1447109632 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002836008024072217, + "loss": 2.7369, + "theoretical_loss": 3.5272511249242813, + "tokens_seen": 1447175168 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028359077231695086, + "loss": 2.7759, + "theoretical_loss": 3.5272369447010536, + "tokens_seen": 1447240704 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002835807422266801, + "loss": 2.7985, + "theoretical_loss": 3.5272227652997272, + "tokens_seen": 1447306240 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002835707121364092, + "loss": 2.6779, + "theoretical_loss": 3.527208586720217, + "tokens_seen": 1447371776 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028356068204613846, + "loss": 2.7166, + "theoretical_loss": 3.5271944089624387, + "tokens_seen": 1447437312 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002835506519558676, + "loss": 2.7603, + "theoretical_loss": 3.527180232026307, + "tokens_seen": 1447502848 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002835406218655968, + "loss": 2.8009, + "theoretical_loss": 3.5271660559117377, + "tokens_seen": 1447568384 + }, + { + "epoch": 17.02, + "learning_rate": 0.000283530591775326, + "loss": 2.8128, + "theoretical_loss": 3.5271518806186446, + "tokens_seen": 1447633920 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002835205616850552, + "loss": 2.6868, + "theoretical_loss": 3.527137706146944, + "tokens_seen": 1447699456 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028351053159478436, + "loss": 2.7682, + "theoretical_loss": 3.527123532496551, + "tokens_seen": 1447764992 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028350050150451354, + "loss": 2.802, + "theoretical_loss": 3.5271093596673806, + "tokens_seen": 1447830528 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002834904714142427, + "loss": 2.7678, + "theoretical_loss": 3.527095187659348, + "tokens_seen": 1447896064 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028348044132397196, + "loss": 2.8112, + "theoretical_loss": 3.527081016472369, + "tokens_seen": 1447961600 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002834704112337011, + "loss": 2.7906, + "theoretical_loss": 3.527066846106359, + "tokens_seen": 1448027136 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002834603811434303, + "loss": 2.7594, + "theoretical_loss": 3.5270526765612322, + "tokens_seen": 1448092672 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028345035105315945, + "loss": 2.7717, + "theoretical_loss": 3.527038507836904, + "tokens_seen": 1448158208 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7795612812042236, + "objective/train/theoretical_loss": 3.5270243399332912, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.5270243399332912, + "tokens_seen": 1448223744 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002834403209628887, + "loss": 2.8352, + "theoretical_loss": 3.5270243399332912, + "tokens_seen": 1448223744 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028343029087261786, + "loss": 2.7733, + "theoretical_loss": 3.5270101728503076, + "tokens_seen": 1448289280 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028342026078234705, + "loss": 2.8325, + "theoretical_loss": 3.5269960065878694, + "tokens_seen": 1448354816 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002834102306920762, + "loss": 2.8591, + "theoretical_loss": 3.526981841145891, + "tokens_seen": 1448420352 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028340020060180546, + "loss": 2.8236, + "theoretical_loss": 3.526967676524289, + "tokens_seen": 1448485888 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002833901705115346, + "loss": 2.7815, + "theoretical_loss": 3.5269535127229776, + "tokens_seen": 1448551424 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002833801404212638, + "loss": 2.8431, + "theoretical_loss": 3.5269393497418733, + "tokens_seen": 1448616960 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028337011033099295, + "loss": 2.7817, + "theoretical_loss": 3.5269251875808907, + "tokens_seen": 1448682496 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002833600802407222, + "loss": 2.7214, + "theoretical_loss": 3.526911026239946, + "tokens_seen": 1448748032 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028335005015045137, + "loss": 2.7522, + "theoretical_loss": 3.5268968657189532, + "tokens_seen": 1448813568 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028334002006018055, + "loss": 2.723, + "theoretical_loss": 3.526882706017829, + "tokens_seen": 1448879104 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028332998996990973, + "loss": 2.7261, + "theoretical_loss": 3.526868547136489, + "tokens_seen": 1448944640 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002833199598796389, + "loss": 2.7515, + "theoretical_loss": 3.5268543890748476, + "tokens_seen": 1449010176 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002833099297893681, + "loss": 2.8242, + "theoretical_loss": 3.5268402318328205, + "tokens_seen": 1449075712 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002832998996990973, + "loss": 2.8499, + "theoretical_loss": 3.526826075410324, + "tokens_seen": 1449141248 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028328986960882645, + "loss": 2.7833, + "theoretical_loss": 3.5268119198072734, + "tokens_seen": 1449206784 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002832798395185557, + "loss": 2.7654, + "theoretical_loss": 3.5267977650235833, + "tokens_seen": 1449272320 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002832698094282848, + "loss": 2.7731, + "theoretical_loss": 3.52678361105917, + "tokens_seen": 1449337856 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028325977933801405, + "loss": 2.7095, + "theoretical_loss": 3.526769457913949, + "tokens_seen": 1449403392 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028324974924774323, + "loss": 2.8386, + "theoretical_loss": 3.5267553055878356, + "tokens_seen": 1449468928 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002832397191574724, + "loss": 2.7883, + "theoretical_loss": 3.5267411540807454, + "tokens_seen": 1449534464 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002832296890672016, + "loss": 2.7713, + "theoretical_loss": 3.526727003392594, + "tokens_seen": 1449600000 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028321965897693083, + "loss": 2.7976, + "theoretical_loss": 3.526712853523297, + "tokens_seen": 1449665536 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028320962888665996, + "loss": 2.8291, + "theoretical_loss": 3.5266987044727705, + "tokens_seen": 1449731072 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002831995987963892, + "loss": 2.7168, + "theoretical_loss": 3.5266845562409297, + "tokens_seen": 1449796608 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6539671421051025, + "objective/train/theoretical_loss": 3.52667040882769, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.52667040882769, + "tokens_seen": 1449862144 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002831895687061183, + "loss": 2.7588, + "theoretical_loss": 3.52667040882769, + "tokens_seen": 1449862144 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028317953861584755, + "loss": 2.8086, + "theoretical_loss": 3.526656262232967, + "tokens_seen": 1449927680 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028316950852557674, + "loss": 2.5865, + "theoretical_loss": 3.526642116456677, + "tokens_seen": 1449993216 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002831594784353059, + "loss": 2.6769, + "theoretical_loss": 3.526627971498735, + "tokens_seen": 1450058752 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002831494483450351, + "loss": 2.7434, + "theoretical_loss": 3.526613827359057, + "tokens_seen": 1450124288 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002831394182547643, + "loss": 2.7643, + "theoretical_loss": 3.526599684037558, + "tokens_seen": 1450189824 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028312938816449346, + "loss": 2.8702, + "theoretical_loss": 3.526585541534155, + "tokens_seen": 1450255360 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002831193580742227, + "loss": 2.7993, + "theoretical_loss": 3.526571399848763, + "tokens_seen": 1450320896 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002831093279839518, + "loss": 2.7968, + "theoretical_loss": 3.5265572589812972, + "tokens_seen": 1450386432 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028309929789368106, + "loss": 2.8091, + "theoretical_loss": 3.5265431189316745, + "tokens_seen": 1450451968 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002830892678034102, + "loss": 2.8661, + "theoretical_loss": 3.52652897969981, + "tokens_seen": 1450517504 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002830792377131394, + "loss": 2.8856, + "theoretical_loss": 3.526514841285619, + "tokens_seen": 1450583040 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002830692076228686, + "loss": 2.7691, + "theoretical_loss": 3.526500703689018, + "tokens_seen": 1450648576 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002830591775325978, + "loss": 2.7312, + "theoretical_loss": 3.526486566909923, + "tokens_seen": 1450714112 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028304914744232696, + "loss": 2.7595, + "theoretical_loss": 3.5264724309482487, + "tokens_seen": 1450779648 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002830391173520562, + "loss": 2.737, + "theoretical_loss": 3.526458295803912, + "tokens_seen": 1450845184 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002830290872617854, + "loss": 2.7431, + "theoretical_loss": 3.526444161476828, + "tokens_seen": 1450910720 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028301905717151456, + "loss": 2.8312, + "theoretical_loss": 3.5264300279669127, + "tokens_seen": 1450976256 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028300902708124374, + "loss": 2.6901, + "theoretical_loss": 3.5264158952740825, + "tokens_seen": 1451041792 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002829989969909729, + "loss": 2.7693, + "theoretical_loss": 3.526401763398253, + "tokens_seen": 1451107328 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028298896690070216, + "loss": 2.799, + "theoretical_loss": 3.5263876323393397, + "tokens_seen": 1451172864 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002829789368104313, + "loss": 2.7605, + "theoretical_loss": 3.5263735020972584, + "tokens_seen": 1451238400 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002829689067201605, + "loss": 2.8386, + "theoretical_loss": 3.5263593726719256, + "tokens_seen": 1451303936 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028295887662988965, + "loss": 2.8414, + "theoretical_loss": 3.526345244063257, + "tokens_seen": 1451369472 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002829488465396189, + "loss": 2.7224, + "theoretical_loss": 3.526331116271168, + "tokens_seen": 1451435008 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8081960678100586, + "objective/train/theoretical_loss": 3.5263169892955752, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.5263169892955752, + "tokens_seen": 1451500544 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028293881644934806, + "loss": 2.8294, + "theoretical_loss": 3.5263169892955752, + "tokens_seen": 1451500544 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028292878635907725, + "loss": 2.857, + "theoretical_loss": 3.5263028631363946, + "tokens_seen": 1451566080 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002829187562688064, + "loss": 2.8923, + "theoretical_loss": 3.526288737793542, + "tokens_seen": 1451631616 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028290872617853566, + "loss": 2.809, + "theoretical_loss": 3.5262746132669323, + "tokens_seen": 1451697152 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002828986960882648, + "loss": 2.7667, + "theoretical_loss": 3.5262604895564835, + "tokens_seen": 1451762688 + }, + { + "epoch": 17.02, + "learning_rate": 0.000282888665997994, + "loss": 2.8445, + "theoretical_loss": 3.52624636666211, + "tokens_seen": 1451828224 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028287863590772315, + "loss": 2.8323, + "theoretical_loss": 3.5262322445837286, + "tokens_seen": 1451893760 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002828686058174524, + "loss": 2.8269, + "theoretical_loss": 3.526218123321255, + "tokens_seen": 1451959296 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028285857572718157, + "loss": 2.8175, + "theoretical_loss": 3.5262040028746053, + "tokens_seen": 1452024832 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028284854563691075, + "loss": 2.8218, + "theoretical_loss": 3.5261898832436955, + "tokens_seen": 1452090368 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028283851554663993, + "loss": 2.7333, + "theoretical_loss": 3.526175764428442, + "tokens_seen": 1452155904 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002828284854563691, + "loss": 2.7479, + "theoretical_loss": 3.5261616464287604, + "tokens_seen": 1452221440 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002828184553660983, + "loss": 2.7972, + "theoretical_loss": 3.526147529244567, + "tokens_seen": 1452286976 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002828084252758275, + "loss": 2.7701, + "theoretical_loss": 3.526133412875778, + "tokens_seen": 1452352512 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028279839518555665, + "loss": 2.8404, + "theoretical_loss": 3.5261192973223094, + "tokens_seen": 1452418048 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002827883650952859, + "loss": 2.8047, + "theoretical_loss": 3.526105182584078, + "tokens_seen": 1452483584 + }, + { + "epoch": 17.02, + "learning_rate": 0.000282778335005015, + "loss": 2.8167, + "theoretical_loss": 3.526091068660998, + "tokens_seen": 1452549120 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028276830491474425, + "loss": 2.8222, + "theoretical_loss": 3.5260769555529876, + "tokens_seen": 1452614656 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028275827482447343, + "loss": 2.7338, + "theoretical_loss": 3.526062843259962, + "tokens_seen": 1452680192 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002827482447342026, + "loss": 2.7855, + "theoretical_loss": 3.526048731781837, + "tokens_seen": 1452745728 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002827382146439318, + "loss": 2.8126, + "theoretical_loss": 3.52603462111853, + "tokens_seen": 1452811264 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028272818455366103, + "loss": 2.7855, + "theoretical_loss": 3.526020511269957, + "tokens_seen": 1452876800 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028271815446339016, + "loss": 2.749, + "theoretical_loss": 3.5260064022360327, + "tokens_seen": 1452942336 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002827081243731194, + "loss": 2.8189, + "theoretical_loss": 3.525992294016675, + "tokens_seen": 1453007872 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002826980942828485, + "loss": 2.6336, + "theoretical_loss": 3.525978186611799, + "tokens_seen": 1453073408 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9555892944335938, + "objective/train/theoretical_loss": 3.525964080021322, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.525964080021322, + "tokens_seen": 1453138944 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028268806419257775, + "loss": 2.8548, + "theoretical_loss": 3.525964080021322, + "tokens_seen": 1453138944 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028267803410230694, + "loss": 2.7455, + "theoretical_loss": 3.52594997424516, + "tokens_seen": 1453204480 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002826680040120361, + "loss": 2.7173, + "theoretical_loss": 3.525935869283228, + "tokens_seen": 1453270016 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002826579739217653, + "loss": 2.7691, + "theoretical_loss": 3.525921765135444, + "tokens_seen": 1453335552 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002826479438314945, + "loss": 2.8007, + "theoretical_loss": 3.5259076618017233, + "tokens_seen": 1453401088 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028263791374122366, + "loss": 2.8138, + "theoretical_loss": 3.5258935592819824, + "tokens_seen": 1453466624 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002826278836509529, + "loss": 2.7806, + "theoretical_loss": 3.5258794575761376, + "tokens_seen": 1453532160 + }, + { + "epoch": 17.02, + "learning_rate": 0.000282617853560682, + "loss": 2.783, + "theoretical_loss": 3.525865356684106, + "tokens_seen": 1453597696 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028260782347041126, + "loss": 2.8221, + "theoretical_loss": 3.5258512566058027, + "tokens_seen": 1453663232 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002825977933801404, + "loss": 2.7329, + "theoretical_loss": 3.5258371573411447, + "tokens_seen": 1453728768 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002825877632898696, + "loss": 2.756, + "theoretical_loss": 3.5258230588900483, + "tokens_seen": 1453794304 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002825777331995988, + "loss": 2.8494, + "theoretical_loss": 3.52580896125243, + "tokens_seen": 1453859840 + }, + { + "epoch": 17.02, + "learning_rate": 0.000282567703109328, + "loss": 2.8103, + "theoretical_loss": 3.525794864428206, + "tokens_seen": 1453925376 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028255767301905716, + "loss": 2.725, + "theoretical_loss": 3.525780768417293, + "tokens_seen": 1453990912 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002825476429287864, + "loss": 2.7675, + "theoretical_loss": 3.525766673219607, + "tokens_seen": 1454056448 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002825376128385155, + "loss": 2.8136, + "theoretical_loss": 3.525752578835065, + "tokens_seen": 1454121984 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028252758274824476, + "loss": 2.8135, + "theoretical_loss": 3.5257384852635827, + "tokens_seen": 1454187520 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002825175526579739, + "loss": 2.8041, + "theoretical_loss": 3.525724392505077, + "tokens_seen": 1454253056 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002825075225677031, + "loss": 2.803, + "theoretical_loss": 3.5257103005594645, + "tokens_seen": 1454318592 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002824974924774323, + "loss": 2.8595, + "theoretical_loss": 3.5256962094266617, + "tokens_seen": 1454384128 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002824874623871615, + "loss": 2.6805, + "theoretical_loss": 3.525682119106585, + "tokens_seen": 1454449664 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028247743229689067, + "loss": 2.8561, + "theoretical_loss": 3.52566802959915, + "tokens_seen": 1454515200 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028246740220661985, + "loss": 2.7975, + "theoretical_loss": 3.5256539409042746, + "tokens_seen": 1454580736 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028245737211634903, + "loss": 2.836, + "theoretical_loss": 3.5256398530218744, + "tokens_seen": 1454646272 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028244734202607826, + "loss": 2.7652, + "theoretical_loss": 3.5256257659518666, + "tokens_seen": 1454711808 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8113508224487305, + "objective/train/theoretical_loss": 3.525611679694167, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.525611679694167, + "tokens_seen": 1454777344 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002824373119358074, + "loss": 2.8421, + "theoretical_loss": 3.525611679694167, + "tokens_seen": 1454777344 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002824272818455366, + "loss": 2.78, + "theoretical_loss": 3.5255975942486932, + "tokens_seen": 1454842880 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002824172517552658, + "loss": 2.8255, + "theoretical_loss": 3.5255835096153616, + "tokens_seen": 1454908416 + }, + { + "epoch": 17.02, + "learning_rate": 0.000282407221664995, + "loss": 2.7563, + "theoretical_loss": 3.525569425794088, + "tokens_seen": 1454973952 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028239719157472417, + "loss": 2.7755, + "theoretical_loss": 3.525555342784789, + "tokens_seen": 1455039488 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028238716148445335, + "loss": 2.7547, + "theoretical_loss": 3.5255412605873824, + "tokens_seen": 1455105024 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028237713139418253, + "loss": 2.8177, + "theoretical_loss": 3.5255271792017835, + "tokens_seen": 1455170560 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028236710130391177, + "loss": 2.8242, + "theoretical_loss": 3.5255130986279095, + "tokens_seen": 1455236096 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002823570712136409, + "loss": 2.7271, + "theoretical_loss": 3.5254990188656774, + "tokens_seen": 1455301632 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028234704112337013, + "loss": 2.7946, + "theoretical_loss": 3.5254849399150032, + "tokens_seen": 1455367168 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028233701103309926, + "loss": 2.7102, + "theoretical_loss": 3.525470861775804, + "tokens_seen": 1455432704 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002823269809428285, + "loss": 2.8143, + "theoretical_loss": 3.5254567844479965, + "tokens_seen": 1455498240 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002823169508525577, + "loss": 2.725, + "theoretical_loss": 3.5254427079314974, + "tokens_seen": 1455563776 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028230692076228685, + "loss": 2.7806, + "theoretical_loss": 3.525428632226223, + "tokens_seen": 1455629312 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028229689067201604, + "loss": 2.7751, + "theoretical_loss": 3.5254145573320907, + "tokens_seen": 1455694848 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002822868605817452, + "loss": 2.7838, + "theoretical_loss": 3.5254004832490167, + "tokens_seen": 1455760384 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028227683049147445, + "loss": 2.8175, + "theoretical_loss": 3.5253864099769183, + "tokens_seen": 1455825920 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028226680040120363, + "loss": 2.8074, + "theoretical_loss": 3.5253723375157113, + "tokens_seen": 1455891456 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002822567703109328, + "loss": 2.7767, + "theoretical_loss": 3.5253582658653135, + "tokens_seen": 1455956992 + }, + { + "epoch": 17.02, + "learning_rate": 0.000282246740220662, + "loss": 2.7987, + "theoretical_loss": 3.5253441950256414, + "tokens_seen": 1456022528 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028223671013039123, + "loss": 2.7923, + "theoretical_loss": 3.5253301249966116, + "tokens_seen": 1456088064 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028222668004012036, + "loss": 2.7115, + "theoretical_loss": 3.5253160557781404, + "tokens_seen": 1456153600 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002822166499498496, + "loss": 2.7481, + "theoretical_loss": 3.5253019873701463, + "tokens_seen": 1456219136 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002822066198595787, + "loss": 2.8124, + "theoretical_loss": 3.525287919772544, + "tokens_seen": 1456284672 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028219658976930795, + "loss": 2.8453, + "theoretical_loss": 3.525273852985252, + "tokens_seen": 1456350208 + }, + { + "epoch": 17.02, + "objective/train/docs_used": 3403584, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8192009925842285, + "objective/train/theoretical_loss": 3.5252597870081868, + "objective/train/tokens_used": 1457895904, + "theoretical_loss": 3.5252597870081868, + "tokens_seen": 1456415744 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028218655967903714, + "loss": 2.7528, + "theoretical_loss": 3.5252597870081868, + "tokens_seen": 1456415744 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002821765295887663, + "loss": 2.6717, + "theoretical_loss": 3.525245721841265, + "tokens_seen": 1456481280 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002821664994984955, + "loss": 2.7422, + "theoretical_loss": 3.525231657484403, + "tokens_seen": 1456546816 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002821564694082247, + "loss": 2.8828, + "theoretical_loss": 3.5252175939375188, + "tokens_seen": 1456612352 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028214643931795386, + "loss": 2.8682, + "theoretical_loss": 3.5252035312005283, + "tokens_seen": 1456677888 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002821364092276831, + "loss": 2.7391, + "theoretical_loss": 3.525189469273349, + "tokens_seen": 1456743424 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002821263791374122, + "loss": 2.8013, + "theoretical_loss": 3.5251754081558984, + "tokens_seen": 1456808960 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028211634904714146, + "loss": 2.812, + "theoretical_loss": 3.5251613478480923, + "tokens_seen": 1456874496 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002821063189568706, + "loss": 2.8189, + "theoretical_loss": 3.525147288349848, + "tokens_seen": 1456940032 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002820962888665998, + "loss": 2.8377, + "theoretical_loss": 3.5251332296610824, + "tokens_seen": 1457005568 + }, + { + "epoch": 17.02, + "learning_rate": 0.000282086258776329, + "loss": 2.7537, + "theoretical_loss": 3.5251191717817134, + "tokens_seen": 1457071104 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002820762286860582, + "loss": 2.8313, + "theoretical_loss": 3.525105114711657, + "tokens_seen": 1457136640 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028206619859578736, + "loss": 2.8181, + "theoretical_loss": 3.5250910584508306, + "tokens_seen": 1457202176 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002820561685055166, + "loss": 2.7819, + "theoretical_loss": 3.5250770029991507, + "tokens_seen": 1457267712 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002820461384152457, + "loss": 2.7704, + "theoretical_loss": 3.525062948356535, + "tokens_seen": 1457333248 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028203610832497496, + "loss": 2.7687, + "theoretical_loss": 3.525048894522901, + "tokens_seen": 1457398784 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002820260782347041, + "loss": 2.7991, + "theoretical_loss": 3.5250348414981643, + "tokens_seen": 1457464320 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002820160481444333, + "loss": 2.6753, + "theoretical_loss": 3.5250207892822427, + "tokens_seen": 1457529856 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002820060180541625, + "loss": 2.8984, + "theoretical_loss": 3.525006737875054, + "tokens_seen": 1457595392 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002819959879638917, + "loss": 2.6198, + "theoretical_loss": 3.524992687276514, + "tokens_seen": 1457660928 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028198595787362087, + "loss": 2.7629, + "theoretical_loss": 3.524978637486541, + "tokens_seen": 1457726464 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028197592778335005, + "loss": 2.7976, + "theoretical_loss": 3.5249645885050507, + "tokens_seen": 1457792000 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028196589769307923, + "loss": 2.7982, + "theoretical_loss": 3.524950540331962, + "tokens_seen": 1457857536 + }, + { + "epoch": 17.02, + "learning_rate": 0.00028195586760280846, + "loss": 2.7696, + "theoretical_loss": 3.5249380293583448, + "tokens_seen": 1457915904 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002819458375125376, + "loss": 2.7643, + "theoretical_loss": 3.5249239827134122, + "tokens_seen": 1457981440 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3452259, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.652331590652466, + "objective/train/theoretical_loss": 3.5249099368766403, + "objective/train/tokens_used": 1478506976, + "theoretical_loss": 3.5249099368766403, + "tokens_seen": 1458046976 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002819358074222668, + "loss": 2.6916, + "theoretical_loss": 3.5249099368766403, + "tokens_seen": 1458046976 + }, + { + "epoch": 18.0, + "learning_rate": 0.000281925777331996, + "loss": 2.7479, + "theoretical_loss": 3.5248958918479474, + "tokens_seen": 1458112512 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002819157472417252, + "loss": 2.7004, + "theoretical_loss": 3.52488184762725, + "tokens_seen": 1458178048 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028190571715145437, + "loss": 2.616, + "theoretical_loss": 3.524867804214465, + "tokens_seen": 1458243584 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028189568706118355, + "loss": 2.7509, + "theoretical_loss": 3.5248537616095104, + "tokens_seen": 1458309120 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028188565697091273, + "loss": 2.655, + "theoretical_loss": 3.524839719812303, + "tokens_seen": 1458374656 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028187562688064197, + "loss": 2.709, + "theoretical_loss": 3.52482567882276, + "tokens_seen": 1458440192 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002818655967903711, + "loss": 2.7933, + "theoretical_loss": 3.5248116386407986, + "tokens_seen": 1458505728 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028185556670010033, + "loss": 2.6192, + "theoretical_loss": 3.5247975992663365, + "tokens_seen": 1458571264 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028184553660982946, + "loss": 2.739, + "theoretical_loss": 3.5247835606992908, + "tokens_seen": 1458636800 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002818355065195587, + "loss": 2.699, + "theoretical_loss": 3.524769522939579, + "tokens_seen": 1458702336 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002818254764292879, + "loss": 2.7366, + "theoretical_loss": 3.524755485987117, + "tokens_seen": 1458767872 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028181544633901705, + "loss": 2.6685, + "theoretical_loss": 3.524741449841824, + "tokens_seen": 1458833408 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028180541624874624, + "loss": 2.7056, + "theoretical_loss": 3.5247274145036167, + "tokens_seen": 1458898944 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002817953861584754, + "loss": 2.6834, + "theoretical_loss": 3.524713379972412, + "tokens_seen": 1458964480 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002817853560682046, + "loss": 2.6297, + "theoretical_loss": 3.524699346248127, + "tokens_seen": 1459030016 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028177532597793383, + "loss": 2.5608, + "theoretical_loss": 3.5246853133306804, + "tokens_seen": 1459095552 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028176529588766296, + "loss": 2.7239, + "theoretical_loss": 3.5246712812199883, + "tokens_seen": 1459161088 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002817552657973922, + "loss": 2.7455, + "theoretical_loss": 3.524657249915969, + "tokens_seen": 1459226624 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002817452357071214, + "loss": 2.7981, + "theoretical_loss": 3.5246432194185395, + "tokens_seen": 1459292160 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028173520561685056, + "loss": 2.6213, + "theoretical_loss": 3.5246291897276167, + "tokens_seen": 1459357696 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028172517552657974, + "loss": 2.5383, + "theoretical_loss": 3.5246151608431187, + "tokens_seen": 1459423232 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002817151454363089, + "loss": 2.5183, + "theoretical_loss": 3.5246011327649627, + "tokens_seen": 1459488768 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002817051153460381, + "loss": 2.6914, + "theoretical_loss": 3.524587105493066, + "tokens_seen": 1459554304 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028169508525576734, + "loss": 2.6261, + "theoretical_loss": 3.5245730790273466, + "tokens_seen": 1459619840 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3455914, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3166511058807373, + "objective/train/theoretical_loss": 3.524559053367722, + "objective/train/tokens_used": 1480145376, + "theoretical_loss": 3.524559053367722, + "tokens_seen": 1459685376 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028168505516549646, + "loss": 2.5617, + "theoretical_loss": 3.524559053367722, + "tokens_seen": 1459685376 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002816750250752257, + "loss": 2.8445, + "theoretical_loss": 3.5245450285141082, + "tokens_seen": 1459750912 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002816649949849548, + "loss": 2.6094, + "theoretical_loss": 3.524531004466425, + "tokens_seen": 1459816448 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028165496489468406, + "loss": 2.645, + "theoretical_loss": 3.524516981224588, + "tokens_seen": 1459881984 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028164493480441324, + "loss": 2.7443, + "theoretical_loss": 3.5245029587885153, + "tokens_seen": 1459947520 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002816349047141424, + "loss": 2.6508, + "theoretical_loss": 3.524488937158125, + "tokens_seen": 1460013056 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002816248746238716, + "loss": 2.7273, + "theoretical_loss": 3.524474916333334, + "tokens_seen": 1460078592 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002816148445336008, + "loss": 2.6755, + "theoretical_loss": 3.5244608963140607, + "tokens_seen": 1460144128 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028160481444332997, + "loss": 2.6554, + "theoretical_loss": 3.524446877100221, + "tokens_seen": 1460209664 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002815947843530592, + "loss": 2.7373, + "theoretical_loss": 3.524432858691734, + "tokens_seen": 1460275200 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028158475426278833, + "loss": 2.6839, + "theoretical_loss": 3.5244188410885173, + "tokens_seen": 1460340736 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028157472417251756, + "loss": 2.7206, + "theoretical_loss": 3.5244048242904875, + "tokens_seen": 1460406272 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028156469408224674, + "loss": 2.6869, + "theoretical_loss": 3.5243908082975626, + "tokens_seen": 1460471808 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002815546639919759, + "loss": 2.6743, + "theoretical_loss": 3.524376793109661, + "tokens_seen": 1460537344 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002815446339017051, + "loss": 2.6893, + "theoretical_loss": 3.524362778726699, + "tokens_seen": 1460602880 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002815346038114343, + "loss": 2.5421, + "theoretical_loss": 3.524348765148596, + "tokens_seen": 1460668416 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002815245737211635, + "loss": 2.7222, + "theoretical_loss": 3.5243347523752675, + "tokens_seen": 1460733952 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002815145436308927, + "loss": 2.7125, + "theoretical_loss": 3.5243207404066332, + "tokens_seen": 1460799488 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002815045135406219, + "loss": 2.7942, + "theoretical_loss": 3.5243067292426096, + "tokens_seen": 1460865024 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028149448345035107, + "loss": 2.7481, + "theoretical_loss": 3.524292718883115, + "tokens_seen": 1460930560 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028148445336008025, + "loss": 2.7598, + "theoretical_loss": 3.5242787093280663, + "tokens_seen": 1460996096 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028147442326980943, + "loss": 2.6071, + "theoretical_loss": 3.5242647005773824, + "tokens_seen": 1461061632 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028146439317953866, + "loss": 2.7215, + "theoretical_loss": 3.5242506926309796, + "tokens_seen": 1461127168 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002814543630892678, + "loss": 2.7836, + "theoretical_loss": 3.524236685488777, + "tokens_seen": 1461192704 + }, + { + "epoch": 18.0, + "learning_rate": 0.000281444332998997, + "loss": 2.7338, + "theoretical_loss": 3.524222679150692, + "tokens_seen": 1461258240 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3460847, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6322484016418457, + "objective/train/theoretical_loss": 3.524208673616642, + "objective/train/tokens_used": 1481783776, + "theoretical_loss": 3.524208673616642, + "tokens_seen": 1461323776 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002814343029087262, + "loss": 2.621, + "theoretical_loss": 3.524208673616642, + "tokens_seen": 1461323776 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002814242728184554, + "loss": 2.6356, + "theoretical_loss": 3.524194668886545, + "tokens_seen": 1461389312 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028141424272818457, + "loss": 2.73, + "theoretical_loss": 3.5241806649603187, + "tokens_seen": 1461454848 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028140421263791375, + "loss": 2.531, + "theoretical_loss": 3.524166661837881, + "tokens_seen": 1461520384 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028139418254764293, + "loss": 2.7203, + "theoretical_loss": 3.52415265951915, + "tokens_seen": 1461585920 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028138415245737217, + "loss": 2.5977, + "theoretical_loss": 3.524138658004043, + "tokens_seen": 1461651456 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002813741223671013, + "loss": 2.6037, + "theoretical_loss": 3.524124657292478, + "tokens_seen": 1461716992 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028136409227683053, + "loss": 2.693, + "theoretical_loss": 3.524110657384373, + "tokens_seen": 1461782528 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028135406218655966, + "loss": 2.6761, + "theoretical_loss": 3.524096658279646, + "tokens_seen": 1461848064 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002813440320962889, + "loss": 2.6029, + "theoretical_loss": 3.524082659978215, + "tokens_seen": 1461913600 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002813340020060181, + "loss": 2.6286, + "theoretical_loss": 3.5240686624799977, + "tokens_seen": 1461979136 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028132397191574725, + "loss": 2.7365, + "theoretical_loss": 3.5240546657849117, + "tokens_seen": 1462044672 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028131394182547644, + "loss": 2.7554, + "theoretical_loss": 3.524040669892875, + "tokens_seen": 1462110208 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002813039117352056, + "loss": 2.7382, + "theoretical_loss": 3.524026674803806, + "tokens_seen": 1462175744 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002812938816449348, + "loss": 2.7207, + "theoretical_loss": 3.524012680517622, + "tokens_seen": 1462241280 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028128385155466403, + "loss": 2.6243, + "theoretical_loss": 3.5239986870342417, + "tokens_seen": 1462306816 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028127382146439316, + "loss": 2.7206, + "theoretical_loss": 3.523984694353582, + "tokens_seen": 1462372352 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002812637913741224, + "loss": 2.695, + "theoretical_loss": 3.5239707024755624, + "tokens_seen": 1462437888 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002812537612838516, + "loss": 2.737, + "theoretical_loss": 3.5239567114001, + "tokens_seen": 1462503424 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028124373119358076, + "loss": 2.5648, + "theoretical_loss": 3.5239427211271126, + "tokens_seen": 1462568960 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028123370110330994, + "loss": 2.7241, + "theoretical_loss": 3.523928731656518, + "tokens_seen": 1462634496 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002812236710130391, + "loss": 2.6891, + "theoretical_loss": 3.523914742988236, + "tokens_seen": 1462700032 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002812136409227683, + "loss": 2.7527, + "theoretical_loss": 3.523900755122182, + "tokens_seen": 1462765568 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028120361083249754, + "loss": 2.7642, + "theoretical_loss": 3.5238867680582757, + "tokens_seen": 1462831104 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028119358074222666, + "loss": 2.7208, + "theoretical_loss": 3.5238727817964355, + "tokens_seen": 1462896640 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3464031, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5816643238067627, + "objective/train/theoretical_loss": 3.523858796336578, + "objective/train/tokens_used": 1483422176, + "theoretical_loss": 3.523858796336578, + "tokens_seen": 1462962176 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002811835506519559, + "loss": 2.5953, + "theoretical_loss": 3.523858796336578, + "tokens_seen": 1462962176 + }, + { + "epoch": 18.0, + "learning_rate": 0.000281173520561685, + "loss": 2.7305, + "theoretical_loss": 3.523844811678623, + "tokens_seen": 1463027712 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028116349047141426, + "loss": 2.7587, + "theoretical_loss": 3.5238308278224872, + "tokens_seen": 1463093248 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028115346038114344, + "loss": 2.7109, + "theoretical_loss": 3.5238168447680893, + "tokens_seen": 1463158784 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002811434302908726, + "loss": 2.6298, + "theoretical_loss": 3.523802862515348, + "tokens_seen": 1463224320 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002811334002006018, + "loss": 2.6084, + "theoretical_loss": 3.52378888106418, + "tokens_seen": 1463289856 + }, + { + "epoch": 18.0, + "learning_rate": 0.000281123370110331, + "loss": 2.6365, + "theoretical_loss": 3.5237749004145043, + "tokens_seen": 1463355392 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028111334002006017, + "loss": 2.7015, + "theoretical_loss": 3.5237609205662395, + "tokens_seen": 1463420928 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002811033099297894, + "loss": 2.7067, + "theoretical_loss": 3.5237469415193035, + "tokens_seen": 1463486464 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028109327983951853, + "loss": 2.6943, + "theoretical_loss": 3.5237329632736136, + "tokens_seen": 1463552000 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028108324974924776, + "loss": 2.7217, + "theoretical_loss": 3.523718985829089, + "tokens_seen": 1463617536 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028107321965897695, + "loss": 2.6489, + "theoretical_loss": 3.5237050091856474, + "tokens_seen": 1463683072 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002810631895687061, + "loss": 2.7116, + "theoretical_loss": 3.5236910333432077, + "tokens_seen": 1463748608 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002810531594784353, + "loss": 2.7869, + "theoretical_loss": 3.523677058301687, + "tokens_seen": 1463814144 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002810431293881645, + "loss": 2.7494, + "theoretical_loss": 3.5236630840610053, + "tokens_seen": 1463879680 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028103309929789367, + "loss": 2.711, + "theoretical_loss": 3.523649110621079, + "tokens_seen": 1463945216 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002810230692076229, + "loss": 2.701, + "theoretical_loss": 3.5236351379818274, + "tokens_seen": 1464010752 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028101303911735203, + "loss": 2.6789, + "theoretical_loss": 3.523621166143168, + "tokens_seen": 1464076288 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028100300902708127, + "loss": 2.6362, + "theoretical_loss": 3.5236071951050203, + "tokens_seen": 1464141824 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002809929789368104, + "loss": 2.6254, + "theoretical_loss": 3.5235932248673016, + "tokens_seen": 1464207360 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028098294884653963, + "loss": 2.656, + "theoretical_loss": 3.5235792554299303, + "tokens_seen": 1464272896 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002809729187562688, + "loss": 2.7569, + "theoretical_loss": 3.5235652867928255, + "tokens_seen": 1464338432 + }, + { + "epoch": 18.0, + "learning_rate": 0.000280962888665998, + "loss": 2.6743, + "theoretical_loss": 3.5235513189559047, + "tokens_seen": 1464403968 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028095285857572717, + "loss": 2.6922, + "theoretical_loss": 3.523537351919087, + "tokens_seen": 1464469504 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028094282848545635, + "loss": 2.6671, + "theoretical_loss": 3.5235233856822896, + "tokens_seen": 1464535040 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3466899, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.599944829940796, + "objective/train/theoretical_loss": 3.523509420245432, + "objective/train/tokens_used": 1485060576, + "theoretical_loss": 3.523509420245432, + "tokens_seen": 1464600576 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028093279839518553, + "loss": 2.7307, + "theoretical_loss": 3.523509420245432, + "tokens_seen": 1464600576 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028092276830491477, + "loss": 2.7251, + "theoretical_loss": 3.5234954556084324, + "tokens_seen": 1464666112 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002809127382146439, + "loss": 2.7018, + "theoretical_loss": 3.5234814917712085, + "tokens_seen": 1464731648 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028090270812437313, + "loss": 2.6261, + "theoretical_loss": 3.5234675287336796, + "tokens_seen": 1464797184 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002808926780341023, + "loss": 2.637, + "theoretical_loss": 3.5234535664957636, + "tokens_seen": 1464862720 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002808826479438315, + "loss": 2.6945, + "theoretical_loss": 3.523439605057379, + "tokens_seen": 1464928256 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002808726178535607, + "loss": 2.655, + "theoretical_loss": 3.5234256444184444, + "tokens_seen": 1464993792 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028086258776328986, + "loss": 2.7386, + "theoretical_loss": 3.5234116845788783, + "tokens_seen": 1465059328 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028085255767301904, + "loss": 2.676, + "theoretical_loss": 3.523397725538599, + "tokens_seen": 1465124864 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002808425275827483, + "loss": 2.7319, + "theoretical_loss": 3.5233837672975254, + "tokens_seen": 1465190400 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002808324974924774, + "loss": 2.6723, + "theoretical_loss": 3.5233698098555752, + "tokens_seen": 1465255936 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028082246740220664, + "loss": 2.7579, + "theoretical_loss": 3.523355853212668, + "tokens_seen": 1465321472 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028081243731193576, + "loss": 2.504, + "theoretical_loss": 3.523341897368721, + "tokens_seen": 1465387008 + }, + { + "epoch": 18.0, + "learning_rate": 0.000280802407221665, + "loss": 2.6265, + "theoretical_loss": 3.523327942323654, + "tokens_seen": 1465452544 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002807923771313942, + "loss": 2.6278, + "theoretical_loss": 3.5233139880773843, + "tokens_seen": 1465518080 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028078234704112336, + "loss": 2.6726, + "theoretical_loss": 3.5233000346298318, + "tokens_seen": 1465583616 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002807723169508526, + "loss": 2.7176, + "theoretical_loss": 3.523286081980914, + "tokens_seen": 1465649152 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002807622868605818, + "loss": 2.744, + "theoretical_loss": 3.5232721301305503, + "tokens_seen": 1465714688 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028075225677031096, + "loss": 2.6622, + "theoretical_loss": 3.5232581790786583, + "tokens_seen": 1465780224 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028074222668004014, + "loss": 2.6911, + "theoretical_loss": 3.5232442288251575, + "tokens_seen": 1465845760 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002807321965897693, + "loss": 2.6726, + "theoretical_loss": 3.523230279369966, + "tokens_seen": 1465911296 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002807221664994985, + "loss": 2.8139, + "theoretical_loss": 3.523216330713003, + "tokens_seen": 1465976832 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028071213640922774, + "loss": 2.6935, + "theoretical_loss": 3.5232023828541865, + "tokens_seen": 1466042368 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028070210631895686, + "loss": 2.6973, + "theoretical_loss": 3.5231884357934353, + "tokens_seen": 1466107904 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002806920762286861, + "loss": 2.7985, + "theoretical_loss": 3.5231744895306685, + "tokens_seen": 1466173440 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3471792, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.678370475769043, + "objective/train/theoretical_loss": 3.5231605440658043, + "objective/train/tokens_used": 1486698976, + "theoretical_loss": 3.5231605440658043, + "tokens_seen": 1466238976 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002806820461384152, + "loss": 2.6704, + "theoretical_loss": 3.5231605440658043, + "tokens_seen": 1466238976 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028067201604814446, + "loss": 2.6252, + "theoretical_loss": 3.5231465993987614, + "tokens_seen": 1466304512 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028066198595787364, + "loss": 2.6535, + "theoretical_loss": 3.523132655529459, + "tokens_seen": 1466370048 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002806519558676028, + "loss": 2.718, + "theoretical_loss": 3.5231187124578156, + "tokens_seen": 1466435584 + }, + { + "epoch": 18.0, + "learning_rate": 0.000280641925777332, + "loss": 2.6048, + "theoretical_loss": 3.5231047701837497, + "tokens_seen": 1466501120 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002806318956870612, + "loss": 2.7288, + "theoretical_loss": 3.5230908287071796, + "tokens_seen": 1466566656 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028062186559679037, + "loss": 2.6945, + "theoretical_loss": 3.523076888028025, + "tokens_seen": 1466632192 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002806118355065196, + "loss": 2.7865, + "theoretical_loss": 3.523062948146204, + "tokens_seen": 1466697728 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028060180541624873, + "loss": 2.8119, + "theoretical_loss": 3.523049009061636, + "tokens_seen": 1466763264 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028059177532597796, + "loss": 2.6148, + "theoretical_loss": 3.523035070774239, + "tokens_seen": 1466828800 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028058174523570715, + "loss": 2.7607, + "theoretical_loss": 3.5230211332839327, + "tokens_seen": 1466894336 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002805717151454363, + "loss": 2.5434, + "theoretical_loss": 3.523007196590635, + "tokens_seen": 1466959872 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002805616850551655, + "loss": 2.7979, + "theoretical_loss": 3.5229932606942653, + "tokens_seen": 1467025408 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002805516549648947, + "loss": 2.7478, + "theoretical_loss": 3.5229793255947426, + "tokens_seen": 1467090944 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028054162487462387, + "loss": 2.7604, + "theoretical_loss": 3.5229653912919847, + "tokens_seen": 1467156480 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002805315947843531, + "loss": 2.7088, + "theoretical_loss": 3.5229514577859113, + "tokens_seen": 1467222016 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028052156469408223, + "loss": 2.6727, + "theoretical_loss": 3.5229375250764416, + "tokens_seen": 1467287552 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028051153460381147, + "loss": 2.7918, + "theoretical_loss": 3.522923593163494, + "tokens_seen": 1467353088 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002805015045135406, + "loss": 2.5854, + "theoretical_loss": 3.5229096620469864, + "tokens_seen": 1467418624 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028049147442326983, + "loss": 2.7608, + "theoretical_loss": 3.5228957317268397, + "tokens_seen": 1467484160 + }, + { + "epoch": 18.0, + "learning_rate": 0.000280481444332999, + "loss": 2.7207, + "theoretical_loss": 3.5228818022029715, + "tokens_seen": 1467549696 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002804714142427282, + "loss": 2.5963, + "theoretical_loss": 3.5228678734753007, + "tokens_seen": 1467615232 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028046138415245737, + "loss": 2.7769, + "theoretical_loss": 3.5228539455437473, + "tokens_seen": 1467680768 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028045135406218655, + "loss": 2.7455, + "theoretical_loss": 3.522840018408229, + "tokens_seen": 1467746304 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028044132397191574, + "loss": 2.7461, + "theoretical_loss": 3.522826092068666, + "tokens_seen": 1467811840 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3475643, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.533113479614258, + "objective/train/theoretical_loss": 3.522812166524976, + "objective/train/tokens_used": 1488337376, + "theoretical_loss": 3.522812166524976, + "tokens_seen": 1467877376 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028043129388164497, + "loss": 2.6318, + "theoretical_loss": 3.522812166524976, + "tokens_seen": 1467877376 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002804212637913741, + "loss": 2.7926, + "theoretical_loss": 3.5227982417770782, + "tokens_seen": 1467942912 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028041123370110333, + "loss": 2.6759, + "theoretical_loss": 3.5227843178248923, + "tokens_seen": 1468008448 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002804012036108325, + "loss": 2.7419, + "theoretical_loss": 3.5227703946683375, + "tokens_seen": 1468073984 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002803911735205617, + "loss": 2.6927, + "theoretical_loss": 3.5227564723073317, + "tokens_seen": 1468139520 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002803811434302909, + "loss": 2.7326, + "theoretical_loss": 3.5227425507417944, + "tokens_seen": 1468205056 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028037111334002006, + "loss": 2.7286, + "theoretical_loss": 3.5227286299716454, + "tokens_seen": 1468270592 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028036108324974924, + "loss": 2.682, + "theoretical_loss": 3.5227147099968024, + "tokens_seen": 1468336128 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002803510531594785, + "loss": 2.699, + "theoretical_loss": 3.522700790817186, + "tokens_seen": 1468401664 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002803410230692076, + "loss": 2.8, + "theoretical_loss": 3.522686872432714, + "tokens_seen": 1468467200 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028033099297893684, + "loss": 2.6263, + "theoretical_loss": 3.522672954843306, + "tokens_seen": 1468532736 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028032096288866596, + "loss": 2.7072, + "theoretical_loss": 3.5226590380488814, + "tokens_seen": 1468598272 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002803109327983952, + "loss": 2.7413, + "theoretical_loss": 3.522645122049358, + "tokens_seen": 1468663808 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002803009027081244, + "loss": 2.6781, + "theoretical_loss": 3.5226312068446566, + "tokens_seen": 1468729344 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028029087261785356, + "loss": 2.6457, + "theoretical_loss": 3.5226172924346963, + "tokens_seen": 1468794880 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028028084252758274, + "loss": 2.7502, + "theoretical_loss": 3.5226033788193947, + "tokens_seen": 1468860416 + }, + { + "epoch": 18.0, + "learning_rate": 0.000280270812437312, + "loss": 2.7779, + "theoretical_loss": 3.5225894659986725, + "tokens_seen": 1468925952 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002802607823470411, + "loss": 2.7033, + "theoretical_loss": 3.5225755539724477, + "tokens_seen": 1468991488 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028025075225677034, + "loss": 2.6854, + "theoretical_loss": 3.52256164274064, + "tokens_seen": 1469057024 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028024072216649947, + "loss": 2.7392, + "theoretical_loss": 3.522547732303169, + "tokens_seen": 1469122560 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002802306920762287, + "loss": 2.7599, + "theoretical_loss": 3.5225338226599536, + "tokens_seen": 1469188096 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002802206619859579, + "loss": 2.7572, + "theoretical_loss": 3.522519913810913, + "tokens_seen": 1469253632 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028021063189568706, + "loss": 2.7162, + "theoretical_loss": 3.5225060057559663, + "tokens_seen": 1469319168 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028020060180541624, + "loss": 2.7358, + "theoretical_loss": 3.5224920984950328, + "tokens_seen": 1469384704 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002801905717151454, + "loss": 2.7716, + "theoretical_loss": 3.522478192028032, + "tokens_seen": 1469450240 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3478443, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.78225040435791, + "objective/train/theoretical_loss": 3.5224642863548823, + "objective/train/tokens_used": 1489975776, + "theoretical_loss": 3.5224642863548823, + "tokens_seen": 1469515776 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002801805416248746, + "loss": 2.7797, + "theoretical_loss": 3.5224642863548823, + "tokens_seen": 1469515776 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028017051153460384, + "loss": 2.7032, + "theoretical_loss": 3.5224503814755046, + "tokens_seen": 1469581312 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028016048144433297, + "loss": 2.6254, + "theoretical_loss": 3.522436477389817, + "tokens_seen": 1469646848 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002801504513540622, + "loss": 2.7406, + "theoretical_loss": 3.522422574097739, + "tokens_seen": 1469712384 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028014042126379133, + "loss": 2.678, + "theoretical_loss": 3.52240867159919, + "tokens_seen": 1469777920 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028013039117352057, + "loss": 2.7004, + "theoretical_loss": 3.522394769894089, + "tokens_seen": 1469843456 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028012036108324975, + "loss": 2.6571, + "theoretical_loss": 3.5223808689823564, + "tokens_seen": 1469908992 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028011033099297893, + "loss": 2.6476, + "theoretical_loss": 3.52236696886391, + "tokens_seen": 1469974528 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002801003009027081, + "loss": 2.6443, + "theoretical_loss": 3.5223530695386707, + "tokens_seen": 1470040064 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028009027081243735, + "loss": 2.8011, + "theoretical_loss": 3.5223391710065566, + "tokens_seen": 1470105600 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028008024072216647, + "loss": 2.7181, + "theoretical_loss": 3.522325273267488, + "tokens_seen": 1470171136 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002800702106318957, + "loss": 2.7269, + "theoretical_loss": 3.522311376321384, + "tokens_seen": 1470236672 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028006018054162483, + "loss": 2.5788, + "theoretical_loss": 3.5222974801681635, + "tokens_seen": 1470302208 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028005015045135407, + "loss": 2.6101, + "theoretical_loss": 3.5222835848077465, + "tokens_seen": 1470367744 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028004012036108325, + "loss": 2.651, + "theoretical_loss": 3.5222696902400523, + "tokens_seen": 1470433280 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028003009027081243, + "loss": 2.7666, + "theoretical_loss": 3.522255796465001, + "tokens_seen": 1470498816 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028002006018054167, + "loss": 2.8424, + "theoretical_loss": 3.522241903482511, + "tokens_seen": 1470564352 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002800100300902708, + "loss": 2.7039, + "theoretical_loss": 3.522228011292502, + "tokens_seen": 1470629888 + }, + { + "epoch": 18.0, + "learning_rate": 0.00028000000000000003, + "loss": 2.7761, + "theoretical_loss": 3.522214119894894, + "tokens_seen": 1470695424 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002799899699097292, + "loss": 2.7224, + "theoretical_loss": 3.5222002292896057, + "tokens_seen": 1470760960 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002799799398194584, + "loss": 2.7724, + "theoretical_loss": 3.5221863394765576, + "tokens_seen": 1470826496 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002799699097291876, + "loss": 2.7634, + "theoretical_loss": 3.5221724504556686, + "tokens_seen": 1470892032 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027995987963891675, + "loss": 2.6691, + "theoretical_loss": 3.522158562226858, + "tokens_seen": 1470957568 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027994984954864594, + "loss": 2.7236, + "theoretical_loss": 3.522144674790046, + "tokens_seen": 1471023104 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027993981945837517, + "loss": 2.7032, + "theoretical_loss": 3.5221307881451516, + "tokens_seen": 1471088640 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3483365, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8181629180908203, + "objective/train/theoretical_loss": 3.522116902292095, + "objective/train/tokens_used": 1491614176, + "theoretical_loss": 3.522116902292095, + "tokens_seen": 1471154176 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002799297893681043, + "loss": 2.7339, + "theoretical_loss": 3.522116902292095, + "tokens_seen": 1471154176 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027991975927783353, + "loss": 2.7056, + "theoretical_loss": 3.522103017230795, + "tokens_seen": 1471219712 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002799097291875627, + "loss": 2.666, + "theoretical_loss": 3.522089132961172, + "tokens_seen": 1471285248 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002798996990972919, + "loss": 2.669, + "theoretical_loss": 3.5220752494831444, + "tokens_seen": 1471350784 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002798896690070211, + "loss": 2.6883, + "theoretical_loss": 3.5220613667966334, + "tokens_seen": 1471416320 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027987963891675026, + "loss": 2.7551, + "theoretical_loss": 3.5220474849015577, + "tokens_seen": 1471481856 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027986960882647944, + "loss": 2.7555, + "theoretical_loss": 3.522033603797836, + "tokens_seen": 1471547392 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002798595787362087, + "loss": 2.7444, + "theoretical_loss": 3.52201972348539, + "tokens_seen": 1471612928 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002798495486459378, + "loss": 2.7265, + "theoretical_loss": 3.522005843964138, + "tokens_seen": 1471678464 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027983951855566704, + "loss": 2.8677, + "theoretical_loss": 3.521991965234, + "tokens_seen": 1471744000 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027982948846539616, + "loss": 2.6978, + "theoretical_loss": 3.521978087294896, + "tokens_seen": 1471809536 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002798194583751254, + "loss": 2.7462, + "theoretical_loss": 3.521964210146745, + "tokens_seen": 1471875072 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002798094282848546, + "loss": 2.7167, + "theoretical_loss": 3.5219503337894675, + "tokens_seen": 1471940608 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027979939819458376, + "loss": 2.6824, + "theoretical_loss": 3.5219364582229824, + "tokens_seen": 1472006144 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027978936810431294, + "loss": 2.6973, + "theoretical_loss": 3.52192258344721, + "tokens_seen": 1472071680 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002797793380140422, + "loss": 2.7487, + "theoretical_loss": 3.52190870946207, + "tokens_seen": 1472137216 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002797693079237713, + "loss": 2.6799, + "theoretical_loss": 3.5218948362674816, + "tokens_seen": 1472202752 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027975927783350054, + "loss": 2.7032, + "theoretical_loss": 3.5218809638633655, + "tokens_seen": 1472268288 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027974924774322967, + "loss": 2.522, + "theoretical_loss": 3.5218670922496407, + "tokens_seen": 1472333824 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002797392176529589, + "loss": 2.7913, + "theoretical_loss": 3.5218532214262273, + "tokens_seen": 1472399360 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002797291875626881, + "loss": 2.7573, + "theoretical_loss": 3.5218393513930453, + "tokens_seen": 1472464896 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027971915747241726, + "loss": 2.7253, + "theoretical_loss": 3.521825482150014, + "tokens_seen": 1472530432 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027970912738214644, + "loss": 2.7561, + "theoretical_loss": 3.5218116136970536, + "tokens_seen": 1472595968 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002796990972918756, + "loss": 2.7194, + "theoretical_loss": 3.5217977460340837, + "tokens_seen": 1472661504 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002796890672016048, + "loss": 2.6811, + "theoretical_loss": 3.521783879161024, + "tokens_seen": 1472727040 + }, + { + "epoch": 18.0, + "objective/train/docs_used": 3486183, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7544894218444824, + "objective/train/theoretical_loss": 3.5217700130777954, + "objective/train/tokens_used": 1493252576, + "theoretical_loss": 3.5217700130777954, + "tokens_seen": 1472792576 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027967903711133404, + "loss": 2.6794, + "theoretical_loss": 3.5217700130777954, + "tokens_seen": 1472792576 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027966900702106317, + "loss": 2.6946, + "theoretical_loss": 3.5217561477843167, + "tokens_seen": 1472858112 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002796589769307924, + "loss": 2.7071, + "theoretical_loss": 3.521742283280508, + "tokens_seen": 1472923648 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027964894684052153, + "loss": 2.6487, + "theoretical_loss": 3.5217284195662892, + "tokens_seen": 1472989184 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027963891675025077, + "loss": 2.6706, + "theoretical_loss": 3.52171455664158, + "tokens_seen": 1473054720 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027962888665997995, + "loss": 2.7664, + "theoretical_loss": 3.5217006945063014, + "tokens_seen": 1473120256 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027961885656970913, + "loss": 2.7194, + "theoretical_loss": 3.521686833160372, + "tokens_seen": 1473185792 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002796088264794383, + "loss": 2.708, + "theoretical_loss": 3.5216729726037124, + "tokens_seen": 1473251328 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027959879638916755, + "loss": 2.5663, + "theoretical_loss": 3.5216591128362422, + "tokens_seen": 1473316864 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027958876629889667, + "loss": 2.7933, + "theoretical_loss": 3.5216452538578817, + "tokens_seen": 1473382400 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002795787362086259, + "loss": 2.6867, + "theoretical_loss": 3.5216313956685514, + "tokens_seen": 1473447936 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027956870611835503, + "loss": 2.7733, + "theoretical_loss": 3.52161753826817, + "tokens_seen": 1473513472 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027955867602808427, + "loss": 2.6916, + "theoretical_loss": 3.521603681656658, + "tokens_seen": 1473579008 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027954864593781345, + "loss": 2.6534, + "theoretical_loss": 3.521589825833936, + "tokens_seen": 1473644544 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027953861584754263, + "loss": 2.7317, + "theoretical_loss": 3.5215759707999235, + "tokens_seen": 1473710080 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002795285857572718, + "loss": 2.5898, + "theoretical_loss": 3.5215621165545405, + "tokens_seen": 1473775616 + }, + { + "epoch": 18.0, + "learning_rate": 0.000279518555667001, + "loss": 2.7381, + "theoretical_loss": 3.5215482630977073, + "tokens_seen": 1473841152 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002795085255767302, + "loss": 2.7972, + "theoretical_loss": 3.521534410429344, + "tokens_seen": 1473906688 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002794984954864594, + "loss": 2.6509, + "theoretical_loss": 3.52152055854937, + "tokens_seen": 1473972224 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027948846539618854, + "loss": 2.7204, + "theoretical_loss": 3.521506707457706, + "tokens_seen": 1474037760 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002794784353059178, + "loss": 2.6602, + "theoretical_loss": 3.521492857154272, + "tokens_seen": 1474103296 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002794684052156469, + "loss": 2.7026, + "theoretical_loss": 3.5214790076389884, + "tokens_seen": 1474168832 + }, + { + "epoch": 18.0, + "learning_rate": 0.00027945837512537614, + "loss": 2.696, + "theoretical_loss": 3.5214651589117745, + "tokens_seen": 1474234368 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002794483450351053, + "loss": 2.8266, + "theoretical_loss": 3.5214513109725507, + "tokens_seen": 1474299904 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002794383149448345, + "loss": 2.7458, + "theoretical_loss": 3.521437463821238, + "tokens_seen": 1474365440 + }, + { + "debugging/Self-BLEU-5": 0.6897357198211433, + "debugging/distinct-1-grams": 0.7322734885092704, + "debugging/distinct-2-grams": 0.9352954277574638, + "debugging/entropy-1-grams": 6.432242911789563, + "debugging/entropy-2-grams": 7.79124981717318, + "debugging/length": 544.4444444444445, + "debugging/num_segments": 36, + "epoch": 18.0, + "objective/train/docs_used": 3489962, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5121731758117676, + "objective/train/theoretical_loss": 3.5214236174577556, + "objective/train/tokens_used": 1494890976, + "theoretical_loss": 3.5214236174577556, + "tokens_seen": 1474430976 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002794282848545637, + "loss": 2.7284, + "theoretical_loss": 3.5214236174577556, + "tokens_seen": 1474430976 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002794182547642929, + "loss": 2.6009, + "theoretical_loss": 3.521409771882024, + "tokens_seen": 1474496512 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027940822467402204, + "loss": 2.6412, + "theoretical_loss": 3.5213959270939634, + "tokens_seen": 1474562048 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002793981945837513, + "loss": 2.7968, + "theoretical_loss": 3.5213820830934934, + "tokens_seen": 1474627584 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002793881644934804, + "loss": 2.6228, + "theoretical_loss": 3.5213682398805353, + "tokens_seen": 1474693120 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027937813440320964, + "loss": 2.651, + "theoretical_loss": 3.5213543974550086, + "tokens_seen": 1474758656 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002793681043129388, + "loss": 2.7837, + "theoretical_loss": 3.521340555816834, + "tokens_seen": 1474824192 + }, + { + "epoch": 18.01, + "learning_rate": 0.000279358074222668, + "loss": 2.7253, + "theoretical_loss": 3.52132671496593, + "tokens_seen": 1474889728 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002793480441323972, + "loss": 2.6527, + "theoretical_loss": 3.5213128749022196, + "tokens_seen": 1474955264 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027933801404212636, + "loss": 2.6756, + "theoretical_loss": 3.521299035625621, + "tokens_seen": 1475020800 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027932798395185554, + "loss": 2.7774, + "theoretical_loss": 3.5212851971360557, + "tokens_seen": 1475086336 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002793179538615848, + "loss": 2.8934, + "theoretical_loss": 3.521271359433443, + "tokens_seen": 1475151872 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002793079237713139, + "loss": 2.7567, + "theoretical_loss": 3.5212575225177036, + "tokens_seen": 1475217408 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027929789368104314, + "loss": 2.7361, + "theoretical_loss": 3.5212436863887575, + "tokens_seen": 1475282944 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027928786359077227, + "loss": 2.6485, + "theoretical_loss": 3.521229851046526, + "tokens_seen": 1475348480 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002792778335005015, + "loss": 2.7559, + "theoretical_loss": 3.5212160164909285, + "tokens_seen": 1475414016 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027926780341023074, + "loss": 2.792, + "theoretical_loss": 3.5212021827218853, + "tokens_seen": 1475479552 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027925777331995987, + "loss": 2.6915, + "theoretical_loss": 3.5211883497393175, + "tokens_seen": 1475545088 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002792477432296891, + "loss": 2.7494, + "theoretical_loss": 3.5211745175431446, + "tokens_seen": 1475610624 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002792377131394183, + "loss": 2.7832, + "theoretical_loss": 3.5211606861332876, + "tokens_seen": 1475676160 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027922768304914746, + "loss": 2.7962, + "theoretical_loss": 3.5211468555096666, + "tokens_seen": 1475741696 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027921765295887664, + "loss": 2.7215, + "theoretical_loss": 3.521133025672202, + "tokens_seen": 1475807232 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002792076228686058, + "loss": 2.6912, + "theoretical_loss": 3.521119196620814, + "tokens_seen": 1475872768 + }, + { + "epoch": 18.01, + "learning_rate": 0.000279197592778335, + "loss": 2.6809, + "theoretical_loss": 3.5211053683554234, + "tokens_seen": 1475938304 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027918756268806424, + "loss": 2.7891, + "theoretical_loss": 3.5210915408759504, + "tokens_seen": 1476003840 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3494722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6725199222564697, + "objective/train/theoretical_loss": 3.521077714182316, + "objective/train/tokens_used": 1496529376, + "theoretical_loss": 3.521077714182316, + "tokens_seen": 1476069376 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027917753259779337, + "loss": 2.7195, + "theoretical_loss": 3.521077714182316, + "tokens_seen": 1476069376 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002791675025075226, + "loss": 2.7661, + "theoretical_loss": 3.5210638882744396, + "tokens_seen": 1476134912 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027915747241725173, + "loss": 2.6193, + "theoretical_loss": 3.5210500631522423, + "tokens_seen": 1476200448 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027914744232698097, + "loss": 2.7485, + "theoretical_loss": 3.5210362388156446, + "tokens_seen": 1476265984 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027913741223671015, + "loss": 2.7404, + "theoretical_loss": 3.521022415264567, + "tokens_seen": 1476331520 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027912738214643933, + "loss": 2.7599, + "theoretical_loss": 3.52100859249893, + "tokens_seen": 1476397056 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002791173520561685, + "loss": 2.5758, + "theoretical_loss": 3.520994770518654, + "tokens_seen": 1476462592 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027910732196589775, + "loss": 2.6568, + "theoretical_loss": 3.5209809493236595, + "tokens_seen": 1476528128 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027909729187562687, + "loss": 2.754, + "theoretical_loss": 3.520967128913867, + "tokens_seen": 1476593664 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002790872617853561, + "loss": 2.5987, + "theoretical_loss": 3.520953309289197, + "tokens_seen": 1476659200 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027907723169508523, + "loss": 2.7816, + "theoretical_loss": 3.52093949044957, + "tokens_seen": 1476724736 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027906720160481447, + "loss": 2.6976, + "theoretical_loss": 3.520925672394907, + "tokens_seen": 1476790272 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027905717151454365, + "loss": 2.768, + "theoretical_loss": 3.520911855125129, + "tokens_seen": 1476855808 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027904714142427283, + "loss": 2.7181, + "theoretical_loss": 3.520898038640155, + "tokens_seen": 1476921344 + }, + { + "epoch": 18.01, + "learning_rate": 0.000279037111334002, + "loss": 2.8139, + "theoretical_loss": 3.520884222939906, + "tokens_seen": 1476986880 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002790270812437312, + "loss": 2.632, + "theoretical_loss": 3.5208704080243045, + "tokens_seen": 1477052416 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002790170511534604, + "loss": 2.5921, + "theoretical_loss": 3.520856593893269, + "tokens_seen": 1477117952 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002790070210631896, + "loss": 2.6421, + "theoretical_loss": 3.5208427805467206, + "tokens_seen": 1477183488 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027899699097291874, + "loss": 2.7273, + "theoretical_loss": 3.520828967984581, + "tokens_seen": 1477249024 + }, + { + "epoch": 18.01, + "learning_rate": 0.000278986960882648, + "loss": 2.7778, + "theoretical_loss": 3.5208151562067695, + "tokens_seen": 1477314560 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002789769307923771, + "loss": 2.7399, + "theoretical_loss": 3.520801345213208, + "tokens_seen": 1477380096 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027896690070210634, + "loss": 2.721, + "theoretical_loss": 3.520787535003816, + "tokens_seen": 1477445632 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002789568706118355, + "loss": 2.7262, + "theoretical_loss": 3.5207737255785148, + "tokens_seen": 1477511168 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002789468405215647, + "loss": 2.5753, + "theoretical_loss": 3.520759916937225, + "tokens_seen": 1477576704 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002789368104312939, + "loss": 2.6966, + "theoretical_loss": 3.5207461090798677, + "tokens_seen": 1477642240 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3497712, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6407723426818848, + "objective/train/theoretical_loss": 3.520732302006363, + "objective/train/tokens_used": 1498167776, + "theoretical_loss": 3.520732302006363, + "tokens_seen": 1477707776 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002789267803410231, + "loss": 2.6498, + "theoretical_loss": 3.520732302006363, + "tokens_seen": 1477707776 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027891675025075224, + "loss": 2.6819, + "theoretical_loss": 3.5207184957166318, + "tokens_seen": 1477773312 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002789067201604815, + "loss": 2.6874, + "theoretical_loss": 3.520704690210595, + "tokens_seen": 1477838848 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002788966900702106, + "loss": 2.6195, + "theoretical_loss": 3.5206908854881735, + "tokens_seen": 1477904384 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027888665997993984, + "loss": 2.795, + "theoretical_loss": 3.5206770815492883, + "tokens_seen": 1477969920 + }, + { + "epoch": 18.01, + "learning_rate": 0.000278876629889669, + "loss": 2.7667, + "theoretical_loss": 3.5206632783938594, + "tokens_seen": 1478035456 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002788665997993982, + "loss": 2.7384, + "theoretical_loss": 3.5206494760218083, + "tokens_seen": 1478100992 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002788565697091274, + "loss": 2.7942, + "theoretical_loss": 3.520635674433055, + "tokens_seen": 1478166528 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027884653961885656, + "loss": 2.6612, + "theoretical_loss": 3.5206218736275217, + "tokens_seen": 1478232064 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027883650952858574, + "loss": 2.6786, + "theoretical_loss": 3.5206080736051275, + "tokens_seen": 1478297600 + }, + { + "epoch": 18.01, + "learning_rate": 0.000278826479438315, + "loss": 2.6961, + "theoretical_loss": 3.5205942743657945, + "tokens_seen": 1478363136 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002788164493480441, + "loss": 2.7668, + "theoretical_loss": 3.5205804759094432, + "tokens_seen": 1478428672 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027880641925777334, + "loss": 2.8015, + "theoretical_loss": 3.520566678235994, + "tokens_seen": 1478494208 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027879638916750247, + "loss": 2.7652, + "theoretical_loss": 3.520552881345369, + "tokens_seen": 1478559744 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002787863590772317, + "loss": 2.659, + "theoretical_loss": 3.520539085237488, + "tokens_seen": 1478625280 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002787763289869609, + "loss": 2.6576, + "theoretical_loss": 3.520525289912272, + "tokens_seen": 1478690816 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027876629889669007, + "loss": 2.7505, + "theoretical_loss": 3.5205114953696426, + "tokens_seen": 1478756352 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027875626880641925, + "loss": 2.7806, + "theoretical_loss": 3.5204977016095196, + "tokens_seen": 1478821888 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002787462387161485, + "loss": 2.7829, + "theoretical_loss": 3.5204839086318254, + "tokens_seen": 1478887424 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002787362086258776, + "loss": 2.7339, + "theoretical_loss": 3.5204701164364796, + "tokens_seen": 1478952960 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027872617853560685, + "loss": 2.625, + "theoretical_loss": 3.520456325023404, + "tokens_seen": 1479018496 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027871614844533597, + "loss": 2.6732, + "theoretical_loss": 3.5204425343925188, + "tokens_seen": 1479084032 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002787061183550652, + "loss": 2.7776, + "theoretical_loss": 3.5204287445437457, + "tokens_seen": 1479149568 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002786960882647944, + "loss": 2.7223, + "theoretical_loss": 3.5204149554770057, + "tokens_seen": 1479215104 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027868605817452357, + "loss": 2.7293, + "theoretical_loss": 3.5204011671922197, + "tokens_seen": 1479280640 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3502614, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6676740646362305, + "objective/train/theoretical_loss": 3.520387379689308, + "objective/train/tokens_used": 1499806176, + "theoretical_loss": 3.520387379689308, + "tokens_seen": 1479346176 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027867602808425275, + "loss": 2.7802, + "theoretical_loss": 3.520387379689308, + "tokens_seen": 1479346176 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027866599799398193, + "loss": 2.7534, + "theoretical_loss": 3.520373592968192, + "tokens_seen": 1479411712 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002786559679037111, + "loss": 2.7396, + "theoretical_loss": 3.520359807028794, + "tokens_seen": 1479477248 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027864593781344035, + "loss": 2.7648, + "theoretical_loss": 3.520346021871033, + "tokens_seen": 1479542784 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002786359077231695, + "loss": 2.7309, + "theoretical_loss": 3.5203322374948316, + "tokens_seen": 1479608320 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002786258776328987, + "loss": 2.8062, + "theoretical_loss": 3.52031845390011, + "tokens_seen": 1479673856 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002786158475426279, + "loss": 2.6782, + "theoretical_loss": 3.52030467108679, + "tokens_seen": 1479739392 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027860581745235707, + "loss": 2.7581, + "theoretical_loss": 3.520290889054792, + "tokens_seen": 1479804928 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027859578736208625, + "loss": 2.6483, + "theoretical_loss": 3.520277107804038, + "tokens_seen": 1479870464 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027858575727181544, + "loss": 2.7403, + "theoretical_loss": 3.520263327334448, + "tokens_seen": 1479936000 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002785757271815446, + "loss": 2.7021, + "theoretical_loss": 3.5202495476459434, + "tokens_seen": 1480001536 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027856569709127385, + "loss": 2.6413, + "theoretical_loss": 3.5202357687384462, + "tokens_seen": 1480067072 + }, + { + "epoch": 18.01, + "learning_rate": 0.000278555667001003, + "loss": 2.6842, + "theoretical_loss": 3.520221990611877, + "tokens_seen": 1480132608 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002785456369107322, + "loss": 2.6484, + "theoretical_loss": 3.5202082132661565, + "tokens_seen": 1480198144 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027853560682046134, + "loss": 2.6832, + "theoretical_loss": 3.520194436701207, + "tokens_seen": 1480263680 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002785255767301906, + "loss": 2.7068, + "theoretical_loss": 3.520180660916948, + "tokens_seen": 1480329216 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002785155466399198, + "loss": 2.6787, + "theoretical_loss": 3.5201668859133024, + "tokens_seen": 1480394752 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027850551654964894, + "loss": 2.7162, + "theoretical_loss": 3.5201531116901905, + "tokens_seen": 1480460288 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002784954864593782, + "loss": 2.7431, + "theoretical_loss": 3.520139338247534, + "tokens_seen": 1480525824 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002784854563691073, + "loss": 2.7205, + "theoretical_loss": 3.5201255655852535, + "tokens_seen": 1480591360 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027847542627883654, + "loss": 2.7326, + "theoretical_loss": 3.520111793703271, + "tokens_seen": 1480656896 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002784653961885657, + "loss": 2.6359, + "theoretical_loss": 3.5200980226015073, + "tokens_seen": 1480722432 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002784553660982949, + "loss": 2.7382, + "theoretical_loss": 3.5200842522798834, + "tokens_seen": 1480787968 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002784453360080241, + "loss": 2.6943, + "theoretical_loss": 3.5200704827383213, + "tokens_seen": 1480853504 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002784353059177533, + "loss": 2.6699, + "theoretical_loss": 3.520056713976742, + "tokens_seen": 1480919040 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3505509, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.629917621612549, + "objective/train/theoretical_loss": 3.5200429459950664, + "objective/train/tokens_used": 1501444576, + "theoretical_loss": 3.5200429459950664, + "tokens_seen": 1480984576 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027842527582748244, + "loss": 2.6941, + "theoretical_loss": 3.5200429459950664, + "tokens_seen": 1480984576 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002784152457372117, + "loss": 2.7071, + "theoretical_loss": 3.5200291787932163, + "tokens_seen": 1481050112 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002784052156469408, + "loss": 2.8252, + "theoretical_loss": 3.520015412371113, + "tokens_seen": 1481115648 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027839518555667004, + "loss": 2.7137, + "theoretical_loss": 3.5200016467286774, + "tokens_seen": 1481181184 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002783851554663992, + "loss": 2.6832, + "theoretical_loss": 3.5199878818658314, + "tokens_seen": 1481246720 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002783751253761284, + "loss": 2.6813, + "theoretical_loss": 3.519974117782496, + "tokens_seen": 1481312256 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002783650952858576, + "loss": 2.6595, + "theoretical_loss": 3.5199603544785925, + "tokens_seen": 1481377792 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027835506519558676, + "loss": 2.7124, + "theoretical_loss": 3.5199465919540427, + "tokens_seen": 1481443328 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027834503510531594, + "loss": 2.6969, + "theoretical_loss": 3.5199328302087682, + "tokens_seen": 1481508864 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002783350050150452, + "loss": 2.7198, + "theoretical_loss": 3.519919069242689, + "tokens_seen": 1481574400 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002783249749247743, + "loss": 2.7686, + "theoretical_loss": 3.519905309055728, + "tokens_seen": 1481639936 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027831494483450354, + "loss": 2.7537, + "theoretical_loss": 3.5198915496478067, + "tokens_seen": 1481705472 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027830491474423267, + "loss": 2.7558, + "theoretical_loss": 3.5198777910188452, + "tokens_seen": 1481771008 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002782948846539619, + "loss": 2.7198, + "theoretical_loss": 3.5198640331687656, + "tokens_seen": 1481836544 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002782848545636911, + "loss": 2.7858, + "theoretical_loss": 3.5198502760974897, + "tokens_seen": 1481902080 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027827482447342027, + "loss": 2.7744, + "theoretical_loss": 3.5198365198049393, + "tokens_seen": 1481967616 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027826479438314945, + "loss": 2.8189, + "theoretical_loss": 3.5198227642910345, + "tokens_seen": 1482033152 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002782547642928787, + "loss": 2.7393, + "theoretical_loss": 3.519809009555698, + "tokens_seen": 1482098688 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002782447342026078, + "loss": 2.744, + "theoretical_loss": 3.519795255598851, + "tokens_seen": 1482164224 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027823470411233705, + "loss": 2.769, + "theoretical_loss": 3.5197815024204147, + "tokens_seen": 1482229760 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027822467402206617, + "loss": 2.7339, + "theoretical_loss": 3.5197677500203115, + "tokens_seen": 1482295296 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002782146439317954, + "loss": 2.7129, + "theoretical_loss": 3.519753998398462, + "tokens_seen": 1482360832 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002782046138415246, + "loss": 2.7049, + "theoretical_loss": 3.519740247554788, + "tokens_seen": 1482426368 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027819458375125377, + "loss": 2.7687, + "theoretical_loss": 3.519726497489211, + "tokens_seen": 1482491904 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027818455366098295, + "loss": 2.7293, + "theoretical_loss": 3.519712748201653, + "tokens_seen": 1482557440 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3510422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.750730037689209, + "objective/train/theoretical_loss": 3.519698999692035, + "objective/train/tokens_used": 1503082976, + "theoretical_loss": 3.519698999692035, + "tokens_seen": 1482622976 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027817452357071213, + "loss": 2.7112, + "theoretical_loss": 3.519698999692035, + "tokens_seen": 1482622976 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002781644934804413, + "loss": 2.7287, + "theoretical_loss": 3.5196852519602793, + "tokens_seen": 1482688512 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027815446339017055, + "loss": 2.7358, + "theoretical_loss": 3.5196715050063068, + "tokens_seen": 1482754048 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002781444332998997, + "loss": 2.7309, + "theoretical_loss": 3.5196577588300393, + "tokens_seen": 1482819584 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002781344032096289, + "loss": 2.8207, + "theoretical_loss": 3.519644013431399, + "tokens_seen": 1482885120 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002781243731193581, + "loss": 2.7946, + "theoretical_loss": 3.519630268810307, + "tokens_seen": 1482950656 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002781143430290873, + "loss": 2.7128, + "theoretical_loss": 3.5196165249666853, + "tokens_seen": 1483016192 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027810431293881645, + "loss": 2.6702, + "theoretical_loss": 3.5196027819004554, + "tokens_seen": 1483081728 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027809428284854564, + "loss": 2.6566, + "theoretical_loss": 3.519589039611539, + "tokens_seen": 1483147264 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002780842527582748, + "loss": 2.6739, + "theoretical_loss": 3.519575298099857, + "tokens_seen": 1483212800 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027807422266800405, + "loss": 2.7634, + "theoretical_loss": 3.519561557365332, + "tokens_seen": 1483278336 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002780641925777332, + "loss": 2.6607, + "theoretical_loss": 3.519547817407886, + "tokens_seen": 1483343872 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002780541624874624, + "loss": 2.7313, + "theoretical_loss": 3.51953407822744, + "tokens_seen": 1483409408 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027804413239719154, + "loss": 2.7967, + "theoretical_loss": 3.519520339823916, + "tokens_seen": 1483474944 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002780341023069208, + "loss": 2.6637, + "theoretical_loss": 3.519506602197236, + "tokens_seen": 1483540480 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027802407221664996, + "loss": 2.7533, + "theoretical_loss": 3.5194928653473214, + "tokens_seen": 1483606016 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027801404212637914, + "loss": 2.776, + "theoretical_loss": 3.5194791292740937, + "tokens_seen": 1483671552 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002780040120361083, + "loss": 2.7674, + "theoretical_loss": 3.5194653939774754, + "tokens_seen": 1483737088 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002779939819458375, + "loss": 2.5927, + "theoretical_loss": 3.5194516594573875, + "tokens_seen": 1483802624 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002779839518555667, + "loss": 2.7462, + "theoretical_loss": 3.5194379257137527, + "tokens_seen": 1483868160 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002779739217652959, + "loss": 2.6847, + "theoretical_loss": 3.5194241927464924, + "tokens_seen": 1483933696 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027796389167502504, + "loss": 2.7781, + "theoretical_loss": 3.5194104605555285, + "tokens_seen": 1483999232 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002779538615847543, + "loss": 2.71, + "theoretical_loss": 3.519396729140783, + "tokens_seen": 1484064768 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027794383149448346, + "loss": 2.7035, + "theoretical_loss": 3.5193829985021763, + "tokens_seen": 1484130304 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027793380140421264, + "loss": 2.7671, + "theoretical_loss": 3.5193692686396325, + "tokens_seen": 1484195840 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3514219, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.839871406555176, + "objective/train/theoretical_loss": 3.519355539553072, + "objective/train/tokens_used": 1504721376, + "theoretical_loss": 3.519355539553072, + "tokens_seen": 1484261376 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002779237713139418, + "loss": 2.752, + "theoretical_loss": 3.519355539553072, + "tokens_seen": 1484261376 + }, + { + "epoch": 18.01, + "learning_rate": 0.000277913741223671, + "loss": 2.7177, + "theoretical_loss": 3.519341811242417, + "tokens_seen": 1484326912 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002779037111334002, + "loss": 2.6963, + "theoretical_loss": 3.51932808370759, + "tokens_seen": 1484392448 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002778936810431294, + "loss": 2.7114, + "theoretical_loss": 3.519314356948512, + "tokens_seen": 1484457984 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027788365095285855, + "loss": 2.6157, + "theoretical_loss": 3.5193006309651054, + "tokens_seen": 1484523520 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002778736208625878, + "loss": 2.7959, + "theoretical_loss": 3.519286905757292, + "tokens_seen": 1484589056 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002778635907723169, + "loss": 2.6892, + "theoretical_loss": 3.5192731813249933, + "tokens_seen": 1484654592 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027785356068204614, + "loss": 2.6749, + "theoretical_loss": 3.5192594576681326, + "tokens_seen": 1484720128 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002778435305917753, + "loss": 2.7175, + "theoretical_loss": 3.5192457347866304, + "tokens_seen": 1484785664 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002778335005015045, + "loss": 2.8396, + "theoretical_loss": 3.5192320126804097, + "tokens_seen": 1484851200 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002778234704112337, + "loss": 2.7423, + "theoretical_loss": 3.519218291349392, + "tokens_seen": 1484916736 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027781344032096287, + "loss": 2.7477, + "theoretical_loss": 3.519204570793499, + "tokens_seen": 1484982272 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027780341023069205, + "loss": 2.7792, + "theoretical_loss": 3.5191908510126533, + "tokens_seen": 1485047808 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002777933801404213, + "loss": 2.7797, + "theoretical_loss": 3.5191771320067766, + "tokens_seen": 1485113344 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002777833500501504, + "loss": 2.5983, + "theoretical_loss": 3.5191634137757912, + "tokens_seen": 1485178880 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027777331995987965, + "loss": 2.7133, + "theoretical_loss": 3.5191496963196185, + "tokens_seen": 1485244416 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002777632898696089, + "loss": 2.6684, + "theoretical_loss": 3.5191359796381816, + "tokens_seen": 1485309952 + }, + { + "epoch": 18.01, + "learning_rate": 0.000277753259779338, + "loss": 2.72, + "theoretical_loss": 3.5191222637314015, + "tokens_seen": 1485375488 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027774322968906725, + "loss": 2.596, + "theoretical_loss": 3.519108548599201, + "tokens_seen": 1485441024 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027773319959879637, + "loss": 2.7025, + "theoretical_loss": 3.5190948342415016, + "tokens_seen": 1485506560 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002777231695085256, + "loss": 2.7043, + "theoretical_loss": 3.5190811206582264, + "tokens_seen": 1485572096 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002777131394182548, + "loss": 2.5816, + "theoretical_loss": 3.5190674078492963, + "tokens_seen": 1485637632 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027770310932798397, + "loss": 2.6805, + "theoretical_loss": 3.519053695814634, + "tokens_seen": 1485703168 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027769307923771315, + "loss": 2.6863, + "theoretical_loss": 3.5190399845541616, + "tokens_seen": 1485768704 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027768304914744233, + "loss": 2.6988, + "theoretical_loss": 3.5190262740678016, + "tokens_seen": 1485834240 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3517182, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.496483087539673, + "objective/train/theoretical_loss": 3.5190125643554753, + "objective/train/tokens_used": 1506359776, + "theoretical_loss": 3.5190125643554753, + "tokens_seen": 1485899776 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002776730190571715, + "loss": 2.6791, + "theoretical_loss": 3.5190125643554753, + "tokens_seen": 1485899776 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027766298896690075, + "loss": 2.6279, + "theoretical_loss": 3.518998855417105, + "tokens_seen": 1485965312 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002776529588766299, + "loss": 2.6803, + "theoretical_loss": 3.518985147252614, + "tokens_seen": 1486030848 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002776429287863591, + "loss": 2.8469, + "theoretical_loss": 3.5189714398619234, + "tokens_seen": 1486096384 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002776328986960883, + "loss": 2.7541, + "theoretical_loss": 3.5189577332449558, + "tokens_seen": 1486161920 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002776228686058175, + "loss": 2.656, + "theoretical_loss": 3.5189440274016333, + "tokens_seen": 1486227456 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027761283851554665, + "loss": 2.5772, + "theoretical_loss": 3.5189303223318777, + "tokens_seen": 1486292992 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027760280842527584, + "loss": 2.7487, + "theoretical_loss": 3.5189166180356124, + "tokens_seen": 1486358528 + }, + { + "epoch": 18.01, + "learning_rate": 0.000277592778335005, + "loss": 2.8108, + "theoretical_loss": 3.5189029145127586, + "tokens_seen": 1486424064 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027758274824473425, + "loss": 2.8273, + "theoretical_loss": 3.5188892117632387, + "tokens_seen": 1486489600 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002775727181544634, + "loss": 2.7575, + "theoretical_loss": 3.5188755097869757, + "tokens_seen": 1486555136 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002775626880641926, + "loss": 2.7406, + "theoretical_loss": 3.5188618085838907, + "tokens_seen": 1486620672 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027755265797392174, + "loss": 2.8325, + "theoretical_loss": 3.518848108153907, + "tokens_seen": 1486686208 + }, + { + "epoch": 18.01, + "learning_rate": 0.000277542627883651, + "loss": 2.7224, + "theoretical_loss": 3.5188344084969465, + "tokens_seen": 1486751744 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027753259779338016, + "loss": 2.8533, + "theoretical_loss": 3.518820709612931, + "tokens_seen": 1486817280 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027752256770310934, + "loss": 2.7861, + "theoretical_loss": 3.5188070115017838, + "tokens_seen": 1486882816 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002775125376128385, + "loss": 2.6651, + "theoretical_loss": 3.5187933141634264, + "tokens_seen": 1486948352 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002775025075225677, + "loss": 2.6844, + "theoretical_loss": 3.518779617597782, + "tokens_seen": 1487013888 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002774924774322969, + "loss": 2.7309, + "theoretical_loss": 3.518765921804772, + "tokens_seen": 1487079424 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002774824473420261, + "loss": 2.7792, + "theoretical_loss": 3.51875222678432, + "tokens_seen": 1487144960 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027747241725175524, + "loss": 2.8009, + "theoretical_loss": 3.5187385325363474, + "tokens_seen": 1487210496 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002774623871614845, + "loss": 2.822, + "theoretical_loss": 3.518724839060776, + "tokens_seen": 1487276032 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027745235707121366, + "loss": 2.7386, + "theoretical_loss": 3.51871114635753, + "tokens_seen": 1487341568 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027744232698094284, + "loss": 2.6747, + "theoretical_loss": 3.51869745442653, + "tokens_seen": 1487407104 + }, + { + "epoch": 18.01, + "learning_rate": 0.000277432296890672, + "loss": 2.8195, + "theoretical_loss": 3.5186837632677, + "tokens_seen": 1487472640 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3521982, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.703190565109253, + "objective/train/theoretical_loss": 3.518670072880961, + "objective/train/tokens_used": 1507998176, + "theoretical_loss": 3.518670072880961, + "tokens_seen": 1487538176 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002774222668004012, + "loss": 2.6332, + "theoretical_loss": 3.518670072880961, + "tokens_seen": 1487538176 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002774122367101304, + "loss": 2.6872, + "theoretical_loss": 3.5186563832662365, + "tokens_seen": 1487603712 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002774022066198596, + "loss": 2.7228, + "theoretical_loss": 3.518642694423449, + "tokens_seen": 1487669248 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027739217652958875, + "loss": 2.807, + "theoretical_loss": 3.51862900635252, + "tokens_seen": 1487734784 + }, + { + "epoch": 18.01, + "learning_rate": 0.000277382146439318, + "loss": 2.8364, + "theoretical_loss": 3.5186153190533727, + "tokens_seen": 1487800320 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002773721163490471, + "loss": 2.8052, + "theoretical_loss": 3.5186016325259297, + "tokens_seen": 1487865856 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027736208625877634, + "loss": 2.75, + "theoretical_loss": 3.5185879467701127, + "tokens_seen": 1487931392 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002773520561685055, + "loss": 2.7539, + "theoretical_loss": 3.5185742617858446, + "tokens_seen": 1487996928 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002773420260782347, + "loss": 2.7252, + "theoretical_loss": 3.518560577573049, + "tokens_seen": 1488062464 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002773319959879639, + "loss": 2.7777, + "theoretical_loss": 3.5185468941316467, + "tokens_seen": 1488128000 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027732196589769307, + "loss": 2.7168, + "theoretical_loss": 3.5185332114615613, + "tokens_seen": 1488193536 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027731193580742225, + "loss": 2.7492, + "theoretical_loss": 3.5185195295627154, + "tokens_seen": 1488259072 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002773019057171515, + "loss": 2.7111, + "theoretical_loss": 3.518505848435031, + "tokens_seen": 1488324608 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002772918756268806, + "loss": 2.7522, + "theoretical_loss": 3.518492168078431, + "tokens_seen": 1488390144 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027728184553660985, + "loss": 2.8095, + "theoretical_loss": 3.5184784884928377, + "tokens_seen": 1488455680 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027727181544633903, + "loss": 2.7904, + "theoretical_loss": 3.518464809678174, + "tokens_seen": 1488521216 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002772617853560682, + "loss": 2.7268, + "theoretical_loss": 3.5184511316343627, + "tokens_seen": 1488586752 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002772517552657974, + "loss": 2.7214, + "theoretical_loss": 3.518437454361327, + "tokens_seen": 1488652288 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027724172517552657, + "loss": 2.7007, + "theoretical_loss": 3.5184237778589873, + "tokens_seen": 1488717824 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027723169508525575, + "loss": 2.7703, + "theoretical_loss": 3.5184101021272682, + "tokens_seen": 1488783360 + }, + { + "epoch": 18.01, + "learning_rate": 0.000277221664994985, + "loss": 2.8026, + "theoretical_loss": 3.518396427166092, + "tokens_seen": 1488848896 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002772116349047141, + "loss": 2.6638, + "theoretical_loss": 3.518382752975381, + "tokens_seen": 1488914432 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027720160481444335, + "loss": 2.7413, + "theoretical_loss": 3.5183690795550584, + "tokens_seen": 1488979968 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002771915747241725, + "loss": 2.6973, + "theoretical_loss": 3.5183554069050462, + "tokens_seen": 1489045504 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002771815446339017, + "loss": 2.6619, + "theoretical_loss": 3.5183417350252677, + "tokens_seen": 1489111040 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3525052, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5530121326446533, + "objective/train/theoretical_loss": 3.5183280639156456, + "objective/train/tokens_used": 1509636576, + "theoretical_loss": 3.5183280639156456, + "tokens_seen": 1489176576 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002771715145436309, + "loss": 2.7148, + "theoretical_loss": 3.5183280639156456, + "tokens_seen": 1489176576 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002771614844533601, + "loss": 2.6603, + "theoretical_loss": 3.518314393576102, + "tokens_seen": 1489242112 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027715145436308926, + "loss": 2.748, + "theoretical_loss": 3.5183007240065605, + "tokens_seen": 1489307648 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002771414242728185, + "loss": 2.7858, + "theoretical_loss": 3.518287055206943, + "tokens_seen": 1489373184 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002771313941825476, + "loss": 2.6986, + "theoretical_loss": 3.5182733871771728, + "tokens_seen": 1489438720 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027712136409227685, + "loss": 2.7671, + "theoretical_loss": 3.5182597199171726, + "tokens_seen": 1489504256 + }, + { + "epoch": 18.01, + "learning_rate": 0.000277111334002006, + "loss": 2.7019, + "theoretical_loss": 3.518246053426865, + "tokens_seen": 1489569792 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002771013039117352, + "loss": 2.8263, + "theoretical_loss": 3.518232387706173, + "tokens_seen": 1489635328 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002770912738214644, + "loss": 2.7395, + "theoretical_loss": 3.5182187227550195, + "tokens_seen": 1489700864 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002770812437311936, + "loss": 2.8287, + "theoretical_loss": 3.5182050585733267, + "tokens_seen": 1489766400 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027707121364092276, + "loss": 2.7552, + "theoretical_loss": 3.5181913951610184, + "tokens_seen": 1489831936 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027706118355065194, + "loss": 2.7673, + "theoretical_loss": 3.5181777325180166, + "tokens_seen": 1489897472 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002770511534603811, + "loss": 2.7017, + "theoretical_loss": 3.5181640706442443, + "tokens_seen": 1489963008 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027704112337011036, + "loss": 2.72, + "theoretical_loss": 3.5181504095396248, + "tokens_seen": 1490028544 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002770310932798395, + "loss": 2.775, + "theoretical_loss": 3.5181367492040803, + "tokens_seen": 1490094080 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002770210631895687, + "loss": 2.6877, + "theoretical_loss": 3.5181230896375344, + "tokens_seen": 1490159616 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002770110330992979, + "loss": 2.814, + "theoretical_loss": 3.5181094308399095, + "tokens_seen": 1490225152 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002770010030090271, + "loss": 2.6689, + "theoretical_loss": 3.5180957728111286, + "tokens_seen": 1490290688 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002769909729187563, + "loss": 2.7464, + "theoretical_loss": 3.5180821155511146, + "tokens_seen": 1490356224 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027698094282848544, + "loss": 2.743, + "theoretical_loss": 3.5180684590597906, + "tokens_seen": 1490421760 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002769709127382147, + "loss": 2.8116, + "theoretical_loss": 3.5180548033370798, + "tokens_seen": 1490487296 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027696088264794386, + "loss": 2.6656, + "theoretical_loss": 3.5180411483829044, + "tokens_seen": 1490552832 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027695085255767304, + "loss": 2.7178, + "theoretical_loss": 3.5180274941971876, + "tokens_seen": 1490618368 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002769408224674022, + "loss": 2.7831, + "theoretical_loss": 3.5180138407798527, + "tokens_seen": 1490683904 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002769307923771314, + "loss": 2.7025, + "theoretical_loss": 3.5180001881308227, + "tokens_seen": 1490749440 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3528972, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6309146881103516, + "objective/train/theoretical_loss": 3.51798653625002, + "objective/train/tokens_used": 1511274976, + "theoretical_loss": 3.51798653625002, + "tokens_seen": 1490814976 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002769207622868606, + "loss": 2.665, + "theoretical_loss": 3.51798653625002, + "tokens_seen": 1490814976 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002769107321965898, + "loss": 2.8057, + "theoretical_loss": 3.517972885137368, + "tokens_seen": 1490880512 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027690070210631895, + "loss": 2.6857, + "theoretical_loss": 3.51795923479279, + "tokens_seen": 1490946048 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002768906720160482, + "loss": 2.7816, + "theoretical_loss": 3.5179455852162085, + "tokens_seen": 1491011584 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002768806419257773, + "loss": 2.6362, + "theoretical_loss": 3.517931936407547, + "tokens_seen": 1491077120 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027687061183550655, + "loss": 2.7878, + "theoretical_loss": 3.517918288366728, + "tokens_seen": 1491142656 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002768605817452357, + "loss": 2.598, + "theoretical_loss": 3.517904641093675, + "tokens_seen": 1491208192 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002768505516549649, + "loss": 2.7012, + "theoretical_loss": 3.517890994588311, + "tokens_seen": 1491273728 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002768405215646941, + "loss": 2.716, + "theoretical_loss": 3.517877348850559, + "tokens_seen": 1491339264 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027683049147442327, + "loss": 2.7722, + "theoretical_loss": 3.517863703880342, + "tokens_seen": 1491404800 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027682046138415245, + "loss": 2.7048, + "theoretical_loss": 3.5178500596775835, + "tokens_seen": 1491470336 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002768104312938817, + "loss": 2.7458, + "theoretical_loss": 3.517836416242206, + "tokens_seen": 1491535872 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002768004012036108, + "loss": 2.6461, + "theoretical_loss": 3.5178227735741334, + "tokens_seen": 1491601408 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027679037111334005, + "loss": 2.8297, + "theoretical_loss": 3.517809131673288, + "tokens_seen": 1491666944 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027678034102306923, + "loss": 2.6826, + "theoretical_loss": 3.5177954905395934, + "tokens_seen": 1491732480 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002767703109327984, + "loss": 2.8247, + "theoretical_loss": 3.517781850172973, + "tokens_seen": 1491798016 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002767602808425276, + "loss": 2.6577, + "theoretical_loss": 3.517768210573349, + "tokens_seen": 1491863552 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027675025075225677, + "loss": 2.791, + "theoretical_loss": 3.517754571740646, + "tokens_seen": 1491929088 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027674022066198595, + "loss": 2.7533, + "theoretical_loss": 3.5177409336747862, + "tokens_seen": 1491994624 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002767301905717152, + "loss": 2.8062, + "theoretical_loss": 3.5177272963756927, + "tokens_seen": 1492060160 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002767201604814443, + "loss": 2.7347, + "theoretical_loss": 3.51771365984329, + "tokens_seen": 1492125696 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027671013039117355, + "loss": 2.7664, + "theoretical_loss": 3.5177000240774996, + "tokens_seen": 1492191232 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002767001003009027, + "loss": 2.6599, + "theoretical_loss": 3.5176863890782455, + "tokens_seen": 1492256768 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002766900702106319, + "loss": 2.8489, + "theoretical_loss": 3.517672754845451, + "tokens_seen": 1492322304 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002766800401203611, + "loss": 2.7092, + "theoretical_loss": 3.5176591213790394, + "tokens_seen": 1492387840 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3533455, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.761119842529297, + "objective/train/theoretical_loss": 3.5176454886789346, + "objective/train/tokens_used": 1512913376, + "theoretical_loss": 3.5176454886789346, + "tokens_seen": 1492453376 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002766700100300903, + "loss": 2.745, + "theoretical_loss": 3.5176454886789346, + "tokens_seen": 1492453376 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027665997993981946, + "loss": 2.6932, + "theoretical_loss": 3.5176318567450586, + "tokens_seen": 1492518912 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002766499498495487, + "loss": 2.8026, + "theoretical_loss": 3.517618225577335, + "tokens_seen": 1492584448 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002766399197592778, + "loss": 2.6641, + "theoretical_loss": 3.517604595175688, + "tokens_seen": 1492649984 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027662988966900705, + "loss": 2.7372, + "theoretical_loss": 3.5175909655400392, + "tokens_seen": 1492715520 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002766198595787362, + "loss": 2.7292, + "theoretical_loss": 3.517577336670314, + "tokens_seen": 1492781056 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002766098294884654, + "loss": 2.641, + "theoretical_loss": 3.517563708566435, + "tokens_seen": 1492846592 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002765997993981946, + "loss": 2.7393, + "theoretical_loss": 3.5175500812283245, + "tokens_seen": 1492912128 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002765897693079238, + "loss": 2.7096, + "theoretical_loss": 3.517536454655907, + "tokens_seen": 1492977664 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027657973921765296, + "loss": 2.8032, + "theoretical_loss": 3.5175228288491054, + "tokens_seen": 1493043200 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027656970912738214, + "loss": 2.7296, + "theoretical_loss": 3.5175092038078435, + "tokens_seen": 1493108736 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002765596790371113, + "loss": 2.7094, + "theoretical_loss": 3.517495579532044, + "tokens_seen": 1493174272 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027654964894684056, + "loss": 2.7441, + "theoretical_loss": 3.517481956021631, + "tokens_seen": 1493239808 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002765396188565697, + "loss": 2.6441, + "theoretical_loss": 3.5174683332765277, + "tokens_seen": 1493305344 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002765295887662989, + "loss": 2.6855, + "theoretical_loss": 3.517454711296657, + "tokens_seen": 1493370880 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027651955867602805, + "loss": 2.7048, + "theoretical_loss": 3.5174410900819435, + "tokens_seen": 1493436416 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002765095285857573, + "loss": 2.7857, + "theoretical_loss": 3.5174274696323096, + "tokens_seen": 1493501952 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027649949849548646, + "loss": 2.7079, + "theoretical_loss": 3.517413849947679, + "tokens_seen": 1493567488 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027648946840521564, + "loss": 2.7494, + "theoretical_loss": 3.517400231027975, + "tokens_seen": 1493633024 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002764794383149448, + "loss": 2.712, + "theoretical_loss": 3.5173866128731213, + "tokens_seen": 1493698560 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027646940822467406, + "loss": 2.743, + "theoretical_loss": 3.5173729954830417, + "tokens_seen": 1493764096 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002764593781344032, + "loss": 2.7592, + "theoretical_loss": 3.51735937885766, + "tokens_seen": 1493829632 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002764493480441324, + "loss": 2.8207, + "theoretical_loss": 3.5173457629968983, + "tokens_seen": 1493895168 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027643931795386155, + "loss": 2.7077, + "theoretical_loss": 3.5173321479006807, + "tokens_seen": 1493960704 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002764292878635908, + "loss": 2.7344, + "theoretical_loss": 3.5173185335689316, + "tokens_seen": 1494026240 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3536863, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.821821689605713, + "objective/train/theoretical_loss": 3.517304920001574, + "objective/train/tokens_used": 1514551776, + "theoretical_loss": 3.517304920001574, + "tokens_seen": 1494091776 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027641925777331997, + "loss": 2.6695, + "theoretical_loss": 3.517304920001574, + "tokens_seen": 1494091776 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027640922768304915, + "loss": 2.8055, + "theoretical_loss": 3.517291307198531, + "tokens_seen": 1494157312 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027639919759277833, + "loss": 2.7205, + "theoretical_loss": 3.517277695159727, + "tokens_seen": 1494222848 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002763891675025075, + "loss": 2.7543, + "theoretical_loss": 3.5172640838850846, + "tokens_seen": 1494288384 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002763791374122367, + "loss": 2.6392, + "theoretical_loss": 3.5172504733745287, + "tokens_seen": 1494353920 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002763691073219659, + "loss": 2.6914, + "theoretical_loss": 3.517236863627981, + "tokens_seen": 1494419456 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027635907723169505, + "loss": 2.688, + "theoretical_loss": 3.5172232546453674, + "tokens_seen": 1494484992 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002763490471414243, + "loss": 2.7829, + "theoretical_loss": 3.5172096464266094, + "tokens_seen": 1494550528 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002763390170511534, + "loss": 2.7165, + "theoretical_loss": 3.5171960389716324, + "tokens_seen": 1494616064 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027632898696088265, + "loss": 2.7523, + "theoretical_loss": 3.517182432280359, + "tokens_seen": 1494681600 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027631895687061183, + "loss": 2.6894, + "theoretical_loss": 3.5171688263527123, + "tokens_seen": 1494747136 + }, + { + "epoch": 18.01, + "learning_rate": 0.000276308926780341, + "loss": 2.732, + "theoretical_loss": 3.517155221188618, + "tokens_seen": 1494812672 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002762988966900702, + "loss": 2.814, + "theoretical_loss": 3.517141616787998, + "tokens_seen": 1494878208 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027628886659979943, + "loss": 2.7692, + "theoretical_loss": 3.517128013150776, + "tokens_seen": 1494943744 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027627883650952856, + "loss": 2.7362, + "theoretical_loss": 3.5171144102768768, + "tokens_seen": 1495009280 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002762688064192578, + "loss": 2.8528, + "theoretical_loss": 3.517100808166224, + "tokens_seen": 1495074816 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027625877632898697, + "loss": 2.7381, + "theoretical_loss": 3.51708720681874, + "tokens_seen": 1495140352 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027624874623871615, + "loss": 2.8837, + "theoretical_loss": 3.5170736062343497, + "tokens_seen": 1495205888 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002762387161484454, + "loss": 2.7573, + "theoretical_loss": 3.5170600064129767, + "tokens_seen": 1495271424 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002762286860581745, + "loss": 2.7732, + "theoretical_loss": 3.5170464073545444, + "tokens_seen": 1495336960 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027621865596790375, + "loss": 2.6791, + "theoretical_loss": 3.5170328090589766, + "tokens_seen": 1495402496 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002762086258776329, + "loss": 2.7041, + "theoretical_loss": 3.5170192115261973, + "tokens_seen": 1495468032 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002761985957873621, + "loss": 2.8345, + "theoretical_loss": 3.5170056147561306, + "tokens_seen": 1495533568 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002761885656970913, + "loss": 2.7823, + "theoretical_loss": 3.5169920187486996, + "tokens_seen": 1495599104 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002761785356068205, + "loss": 2.6578, + "theoretical_loss": 3.5169784235038284, + "tokens_seen": 1495664640 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3541573, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6440672874450684, + "objective/train/theoretical_loss": 3.5169648290214406, + "objective/train/tokens_used": 1516190176, + "theoretical_loss": 3.5169648290214406, + "tokens_seen": 1495730176 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027616850551654966, + "loss": 2.7185, + "theoretical_loss": 3.5169648290214406, + "tokens_seen": 1495730176 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002761584754262789, + "loss": 2.6572, + "theoretical_loss": 3.5169512353014607, + "tokens_seen": 1495795712 + }, + { + "epoch": 18.01, + "learning_rate": 0.000276148445336008, + "loss": 2.7687, + "theoretical_loss": 3.516937642343812, + "tokens_seen": 1495861248 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027613841524573725, + "loss": 2.7832, + "theoretical_loss": 3.5169240501484182, + "tokens_seen": 1495926784 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002761283851554664, + "loss": 2.6427, + "theoretical_loss": 3.516910458715204, + "tokens_seen": 1495992320 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002761183550651956, + "loss": 2.7613, + "theoretical_loss": 3.516896868044092, + "tokens_seen": 1496057856 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002761083249749248, + "loss": 2.7027, + "theoretical_loss": 3.5168832781350075, + "tokens_seen": 1496123392 + }, + { + "epoch": 18.01, + "learning_rate": 0.000276098294884654, + "loss": 2.7752, + "theoretical_loss": 3.5168696889878728, + "tokens_seen": 1496188928 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027608826479438316, + "loss": 2.7631, + "theoretical_loss": 3.516856100602613, + "tokens_seen": 1496254464 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027607823470411234, + "loss": 2.6817, + "theoretical_loss": 3.516842512979152, + "tokens_seen": 1496320000 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002760682046138415, + "loss": 2.77, + "theoretical_loss": 3.5168289261174137, + "tokens_seen": 1496385536 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027605817452357076, + "loss": 2.722, + "theoretical_loss": 3.516815340017321, + "tokens_seen": 1496451072 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002760481444332999, + "loss": 2.7088, + "theoretical_loss": 3.5168017546787995, + "tokens_seen": 1496516608 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002760381143430291, + "loss": 2.7671, + "theoretical_loss": 3.516788170101772, + "tokens_seen": 1496582144 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027602808425275825, + "loss": 2.6762, + "theoretical_loss": 3.5167745862861626, + "tokens_seen": 1496647680 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002760180541624875, + "loss": 2.7493, + "theoretical_loss": 3.5167610032318954, + "tokens_seen": 1496713216 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027600802407221666, + "loss": 2.7116, + "theoretical_loss": 3.516747420938895, + "tokens_seen": 1496778752 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027599799398194584, + "loss": 2.6903, + "theoretical_loss": 3.5167338394070837, + "tokens_seen": 1496844288 + }, + { + "epoch": 18.01, + "learning_rate": 0.000275987963891675, + "loss": 2.7551, + "theoretical_loss": 3.5167202586363873, + "tokens_seen": 1496909824 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027597793380140426, + "loss": 2.7782, + "theoretical_loss": 3.516706678626729, + "tokens_seen": 1496975360 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002759679037111334, + "loss": 2.7072, + "theoretical_loss": 3.516693099378034, + "tokens_seen": 1497040896 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002759578736208626, + "loss": 2.6947, + "theoretical_loss": 3.516679520890224, + "tokens_seen": 1497106432 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027594784353059175, + "loss": 2.7467, + "theoretical_loss": 3.5166659431632254, + "tokens_seen": 1497171968 + }, + { + "epoch": 18.01, + "learning_rate": 0.000275937813440321, + "loss": 2.7251, + "theoretical_loss": 3.516652366196961, + "tokens_seen": 1497237504 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027592778335005017, + "loss": 2.5842, + "theoretical_loss": 3.5166387899913554, + "tokens_seen": 1497303040 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3544513, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.685133457183838, + "objective/train/theoretical_loss": 3.516625214546332, + "objective/train/tokens_used": 1517828576, + "theoretical_loss": 3.516625214546332, + "tokens_seen": 1497368576 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027591775325977935, + "loss": 2.6706, + "theoretical_loss": 3.516625214546332, + "tokens_seen": 1497368576 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027590772316950853, + "loss": 2.8413, + "theoretical_loss": 3.5166116398618157, + "tokens_seen": 1497434112 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002758976930792377, + "loss": 2.6668, + "theoretical_loss": 3.51659806593773, + "tokens_seen": 1497499648 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002758876629889669, + "loss": 2.8023, + "theoretical_loss": 3.5165844927739998, + "tokens_seen": 1497565184 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002758776328986961, + "loss": 2.7133, + "theoretical_loss": 3.5165709203705484, + "tokens_seen": 1497630720 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027586760280842525, + "loss": 2.7603, + "theoretical_loss": 3.516557348727301, + "tokens_seen": 1497696256 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002758575727181545, + "loss": 2.6422, + "theoretical_loss": 3.51654377784418, + "tokens_seen": 1497761792 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002758475426278836, + "loss": 2.8334, + "theoretical_loss": 3.5165302077211114, + "tokens_seen": 1497827328 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027583751253761285, + "loss": 2.7276, + "theoretical_loss": 3.516516638358018, + "tokens_seen": 1497892864 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027582748244734203, + "loss": 2.7643, + "theoretical_loss": 3.516503069754825, + "tokens_seen": 1497958400 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002758174523570712, + "loss": 2.7799, + "theoretical_loss": 3.516489501911457, + "tokens_seen": 1498023936 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002758074222668004, + "loss": 2.6653, + "theoretical_loss": 3.5164759348278363, + "tokens_seen": 1498089472 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027579739217652963, + "loss": 2.7552, + "theoretical_loss": 3.5164623685038885, + "tokens_seen": 1498155008 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027578736208625876, + "loss": 2.6815, + "theoretical_loss": 3.516448802939538, + "tokens_seen": 1498220544 + }, + { + "epoch": 18.01, + "learning_rate": 0.000275777331995988, + "loss": 2.7626, + "theoretical_loss": 3.5164352381347084, + "tokens_seen": 1498286080 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002757673019057171, + "loss": 2.6784, + "theoretical_loss": 3.516421674089324, + "tokens_seen": 1498351616 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027575727181544635, + "loss": 2.6636, + "theoretical_loss": 3.5164081108033094, + "tokens_seen": 1498417152 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027574724172517554, + "loss": 2.7005, + "theoretical_loss": 3.5163945482765886, + "tokens_seen": 1498482688 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002757372116349047, + "loss": 2.7746, + "theoretical_loss": 3.516380986509086, + "tokens_seen": 1498548224 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002757271815446339, + "loss": 2.8169, + "theoretical_loss": 3.516367425500726, + "tokens_seen": 1498613760 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002757171514543631, + "loss": 2.7726, + "theoretical_loss": 3.516353865251433, + "tokens_seen": 1498679296 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027570712136409226, + "loss": 2.7737, + "theoretical_loss": 3.516340305761131, + "tokens_seen": 1498744832 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002756970912738215, + "loss": 2.7577, + "theoretical_loss": 3.5163267470297446, + "tokens_seen": 1498810368 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002756870611835506, + "loss": 2.7346, + "theoretical_loss": 3.5163131890571977, + "tokens_seen": 1498875904 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027567703109327986, + "loss": 2.7068, + "theoretical_loss": 3.5162996318434154, + "tokens_seen": 1498941440 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3548146, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6527886390686035, + "objective/train/theoretical_loss": 3.5162860753883214, + "objective/train/tokens_used": 1519466976, + "theoretical_loss": 3.5162860753883214, + "tokens_seen": 1499006976 + }, + { + "epoch": 18.01, + "learning_rate": 0.000275667001003009, + "loss": 2.8233, + "theoretical_loss": 3.5162860753883214, + "tokens_seen": 1499006976 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002756569709127382, + "loss": 2.7915, + "theoretical_loss": 3.5162725196918405, + "tokens_seen": 1499072512 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002756469408224674, + "loss": 2.7177, + "theoretical_loss": 3.516258964753897, + "tokens_seen": 1499138048 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002756369107321966, + "loss": 2.8457, + "theoretical_loss": 3.516245410574415, + "tokens_seen": 1499203584 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027562688064192576, + "loss": 2.7115, + "theoretical_loss": 3.5162318571533184, + "tokens_seen": 1499269120 + }, + { + "epoch": 18.01, + "learning_rate": 0.000275616850551655, + "loss": 2.8301, + "theoretical_loss": 3.5162183044905335, + "tokens_seen": 1499334656 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002756068204613841, + "loss": 2.769, + "theoretical_loss": 3.516204752585983, + "tokens_seen": 1499400192 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027559679037111336, + "loss": 2.726, + "theoretical_loss": 3.516191201439592, + "tokens_seen": 1499465728 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002755867602808425, + "loss": 2.7239, + "theoretical_loss": 3.5161776510512848, + "tokens_seen": 1499531264 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002755767301905717, + "loss": 2.7961, + "theoretical_loss": 3.516164101420986, + "tokens_seen": 1499596800 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002755667001003009, + "loss": 2.7367, + "theoretical_loss": 3.51615055254862, + "tokens_seen": 1499662336 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002755566700100301, + "loss": 2.7367, + "theoretical_loss": 3.5161370044341114, + "tokens_seen": 1499727872 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027554663991975927, + "loss": 2.7401, + "theoretical_loss": 3.5161234570773843, + "tokens_seen": 1499793408 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027553660982948845, + "loss": 2.6868, + "theoretical_loss": 3.516109910478363, + "tokens_seen": 1499858944 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027552657973921763, + "loss": 2.7637, + "theoretical_loss": 3.516096364636973, + "tokens_seen": 1499924480 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027551654964894686, + "loss": 2.7799, + "theoretical_loss": 3.5160828195531386, + "tokens_seen": 1499990016 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027550651955867604, + "loss": 2.6877, + "theoretical_loss": 3.516069275226784, + "tokens_seen": 1500055552 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002754964894684052, + "loss": 2.7461, + "theoretical_loss": 3.5160557316578336, + "tokens_seen": 1500121088 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027548645937813446, + "loss": 2.7005, + "theoretical_loss": 3.516042188846212, + "tokens_seen": 1500186624 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002754764292878636, + "loss": 2.7617, + "theoretical_loss": 3.516028646791844, + "tokens_seen": 1500252160 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002754663991975928, + "loss": 2.715, + "theoretical_loss": 3.5160151054946542, + "tokens_seen": 1500317696 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027545636910732195, + "loss": 2.8264, + "theoretical_loss": 3.516001564954567, + "tokens_seen": 1500383232 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002754463390170512, + "loss": 2.8153, + "theoretical_loss": 3.5159880251715077, + "tokens_seen": 1500448768 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027543630892678037, + "loss": 2.8083, + "theoretical_loss": 3.5159744861453994, + "tokens_seen": 1500514304 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027542627883650955, + "loss": 2.6005, + "theoretical_loss": 3.5159609478761684, + "tokens_seen": 1500579840 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3551189, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7603461742401123, + "objective/train/theoretical_loss": 3.5159474103637383, + "objective/train/tokens_used": 1521105376, + "theoretical_loss": 3.5159474103637383, + "tokens_seen": 1500645376 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027541624874623873, + "loss": 2.7067, + "theoretical_loss": 3.5159474103637383, + "tokens_seen": 1500645376 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002754062186559679, + "loss": 2.7186, + "theoretical_loss": 3.5159338736080334, + "tokens_seen": 1500710912 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002753961885656971, + "loss": 2.8608, + "theoretical_loss": 3.51592033760898, + "tokens_seen": 1500776448 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002753861584754263, + "loss": 2.6898, + "theoretical_loss": 3.515906802366501, + "tokens_seen": 1500841984 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027537612838515545, + "loss": 2.7411, + "theoretical_loss": 3.5158932678805224, + "tokens_seen": 1500907520 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002753660982948847, + "loss": 2.7467, + "theoretical_loss": 3.5158797341509676, + "tokens_seen": 1500973056 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002753560682046138, + "loss": 2.7974, + "theoretical_loss": 3.5158662011777624, + "tokens_seen": 1501038592 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027534603811434305, + "loss": 2.7458, + "theoretical_loss": 3.5158526689608314, + "tokens_seen": 1501104128 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027533600802407223, + "loss": 2.7222, + "theoretical_loss": 3.515839137500099, + "tokens_seen": 1501169664 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002753259779338014, + "loss": 2.6268, + "theoretical_loss": 3.51582560679549, + "tokens_seen": 1501235200 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002753159478435306, + "loss": 2.7465, + "theoretical_loss": 3.5158120768469288, + "tokens_seen": 1501300736 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027530591775325983, + "loss": 2.7156, + "theoretical_loss": 3.5157985476543407, + "tokens_seen": 1501366272 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027529588766298896, + "loss": 2.7371, + "theoretical_loss": 3.51578501921765, + "tokens_seen": 1501431808 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002752858575727182, + "loss": 2.6584, + "theoretical_loss": 3.515771491536782, + "tokens_seen": 1501497344 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002752758274824473, + "loss": 2.6785, + "theoretical_loss": 3.5157579646116615, + "tokens_seen": 1501562880 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027526579739217655, + "loss": 2.7052, + "theoretical_loss": 3.515744438442212, + "tokens_seen": 1501628416 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027525576730190574, + "loss": 2.7993, + "theoretical_loss": 3.51573091302836, + "tokens_seen": 1501693952 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002752457372116349, + "loss": 2.7089, + "theoretical_loss": 3.51571738837003, + "tokens_seen": 1501759488 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002752357071213641, + "loss": 2.594, + "theoretical_loss": 3.5157038644671457, + "tokens_seen": 1501825024 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002752256770310933, + "loss": 2.7477, + "theoretical_loss": 3.515690341319633, + "tokens_seen": 1501890560 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027521564694082246, + "loss": 2.7005, + "theoretical_loss": 3.5156768189274166, + "tokens_seen": 1501956096 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002752056168505517, + "loss": 2.7402, + "theoretical_loss": 3.5156632972904207, + "tokens_seen": 1502021632 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002751955867602808, + "loss": 2.7728, + "theoretical_loss": 3.515649776408571, + "tokens_seen": 1502087168 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027518555667001006, + "loss": 2.798, + "theoretical_loss": 3.5156362562817924, + "tokens_seen": 1502152704 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002751755265797392, + "loss": 2.7098, + "theoretical_loss": 3.515622736910009, + "tokens_seen": 1502218240 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3556140, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6103687286376953, + "objective/train/theoretical_loss": 3.5156092182931467, + "objective/train/tokens_used": 1522743776, + "theoretical_loss": 3.5156092182931467, + "tokens_seen": 1502283776 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002751654964894684, + "loss": 2.7344, + "theoretical_loss": 3.5156092182931467, + "tokens_seen": 1502283776 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002751554663991976, + "loss": 2.726, + "theoretical_loss": 3.5155957004311293, + "tokens_seen": 1502349312 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002751454363089268, + "loss": 2.8057, + "theoretical_loss": 3.5155821833238825, + "tokens_seen": 1502414848 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027513540621865596, + "loss": 2.7136, + "theoretical_loss": 3.515568666971331, + "tokens_seen": 1502480384 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002751253761283852, + "loss": 2.7141, + "theoretical_loss": 3.5155551513734, + "tokens_seen": 1502545920 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002751153460381143, + "loss": 2.8055, + "theoretical_loss": 3.515541636530014, + "tokens_seen": 1502611456 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027510531594784356, + "loss": 2.7171, + "theoretical_loss": 3.515528122441098, + "tokens_seen": 1502676992 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002750952858575727, + "loss": 2.8082, + "theoretical_loss": 3.5155146091065776, + "tokens_seen": 1502742528 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002750852557673019, + "loss": 2.7278, + "theoretical_loss": 3.5155010965263775, + "tokens_seen": 1502808064 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002750752256770311, + "loss": 2.6974, + "theoretical_loss": 3.515487584700422, + "tokens_seen": 1502873600 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002750651955867603, + "loss": 2.6751, + "theoretical_loss": 3.515474073628637, + "tokens_seen": 1502939136 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027505516549648947, + "loss": 2.7462, + "theoretical_loss": 3.5154605633109477, + "tokens_seen": 1503004672 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027504513540621865, + "loss": 2.6766, + "theoretical_loss": 3.515447053747278, + "tokens_seen": 1503070208 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027503510531594783, + "loss": 2.7567, + "theoretical_loss": 3.515433544937554, + "tokens_seen": 1503135744 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027502507522567706, + "loss": 2.7507, + "theoretical_loss": 3.5154200368817, + "tokens_seen": 1503201280 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002750150451354062, + "loss": 2.6693, + "theoretical_loss": 3.515406529579642, + "tokens_seen": 1503266816 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002750050150451354, + "loss": 2.7822, + "theoretical_loss": 3.5153930230313035, + "tokens_seen": 1503332352 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027499498495486455, + "loss": 2.784, + "theoretical_loss": 3.5153795172366116, + "tokens_seen": 1503397888 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002749849548645938, + "loss": 2.7136, + "theoretical_loss": 3.51536601219549, + "tokens_seen": 1503463424 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027497492477432297, + "loss": 2.7435, + "theoretical_loss": 3.5153525079078642, + "tokens_seen": 1503528960 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027496489468405215, + "loss": 2.7479, + "theoretical_loss": 3.515339004373659, + "tokens_seen": 1503594496 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027495486459378133, + "loss": 2.6473, + "theoretical_loss": 3.5153255015928, + "tokens_seen": 1503660032 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027494483450351057, + "loss": 2.8104, + "theoretical_loss": 3.5153119995652125, + "tokens_seen": 1503725568 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002749348044132397, + "loss": 2.7997, + "theoretical_loss": 3.515298498290821, + "tokens_seen": 1503791104 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027492477432296893, + "loss": 2.7845, + "theoretical_loss": 3.515284997769551, + "tokens_seen": 1503856640 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3559084, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.695509672164917, + "objective/train/theoretical_loss": 3.515271498001328, + "objective/train/tokens_used": 1524382176, + "theoretical_loss": 3.515271498001328, + "tokens_seen": 1503922176 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027491474423269806, + "loss": 2.675, + "theoretical_loss": 3.515271498001328, + "tokens_seen": 1503922176 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002749047141424273, + "loss": 2.8779, + "theoretical_loss": 3.5152579989860766, + "tokens_seen": 1503987712 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027489468405215647, + "loss": 2.7682, + "theoretical_loss": 3.515244500723722, + "tokens_seen": 1504053248 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027488465396188565, + "loss": 2.6505, + "theoretical_loss": 3.51523100321419, + "tokens_seen": 1504118784 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027487462387161483, + "loss": 2.7544, + "theoretical_loss": 3.5152175064574056, + "tokens_seen": 1504184320 + }, + { + "epoch": 18.01, + "learning_rate": 0.000274864593781344, + "loss": 2.7553, + "theoretical_loss": 3.515204010453293, + "tokens_seen": 1504249856 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002748545636910732, + "loss": 2.6308, + "theoretical_loss": 3.5151905152017795, + "tokens_seen": 1504315392 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027484453360080243, + "loss": 2.6833, + "theoretical_loss": 3.5151770207027884, + "tokens_seen": 1504380928 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027483450351053156, + "loss": 2.768, + "theoretical_loss": 3.515163526956246, + "tokens_seen": 1504446464 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002748244734202608, + "loss": 2.7624, + "theoretical_loss": 3.515150033962077, + "tokens_seen": 1504512000 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027481444332999, + "loss": 2.7682, + "theoretical_loss": 3.5151365417202074, + "tokens_seen": 1504577536 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027480441323971916, + "loss": 2.713, + "theoretical_loss": 3.515123050230562, + "tokens_seen": 1504643072 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027479438314944834, + "loss": 2.8558, + "theoretical_loss": 3.515109559493066, + "tokens_seen": 1504708608 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002747843530591775, + "loss": 2.6895, + "theoretical_loss": 3.5150960695076447, + "tokens_seen": 1504774144 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027477432296890675, + "loss": 2.7671, + "theoretical_loss": 3.515082580274224, + "tokens_seen": 1504839680 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027476429287863594, + "loss": 2.6089, + "theoretical_loss": 3.5150690917927285, + "tokens_seen": 1504905216 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002747542627883651, + "loss": 2.8096, + "theoretical_loss": 3.5150556040630843, + "tokens_seen": 1504970752 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002747442326980943, + "loss": 2.7072, + "theoretical_loss": 3.515042117085216, + "tokens_seen": 1505036288 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002747342026078235, + "loss": 2.7619, + "theoretical_loss": 3.5150286308590495, + "tokens_seen": 1505101824 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027472417251755266, + "loss": 2.7793, + "theoretical_loss": 3.5150151453845098, + "tokens_seen": 1505167360 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002747141424272819, + "loss": 2.7165, + "theoretical_loss": 3.515001660661522, + "tokens_seen": 1505232896 + }, + { + "epoch": 18.01, + "learning_rate": 0.000274704112337011, + "loss": 2.763, + "theoretical_loss": 3.5149881766900126, + "tokens_seen": 1505298432 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027469408224674026, + "loss": 2.7828, + "theoretical_loss": 3.514974693469906, + "tokens_seen": 1505363968 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002746840521564694, + "loss": 2.7474, + "theoretical_loss": 3.514961211001128, + "tokens_seen": 1505429504 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002746740220661986, + "loss": 2.7963, + "theoretical_loss": 3.5149477292836044, + "tokens_seen": 1505495040 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3562778, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.77960467338562, + "objective/train/theoretical_loss": 3.5149342483172603, + "objective/train/tokens_used": 1526020576, + "theoretical_loss": 3.5149342483172603, + "tokens_seen": 1505560576 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002746639919759278, + "loss": 2.7779, + "theoretical_loss": 3.5149342483172603, + "tokens_seen": 1505560576 + }, + { + "epoch": 18.01, + "learning_rate": 0.000274653961885657, + "loss": 2.8387, + "theoretical_loss": 3.5149207681020207, + "tokens_seen": 1505626112 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027464393179538616, + "loss": 2.7711, + "theoretical_loss": 3.5149072886378114, + "tokens_seen": 1505691648 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002746339017051154, + "loss": 2.7364, + "theoretical_loss": 3.5148938099245584, + "tokens_seen": 1505757184 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002746238716148445, + "loss": 2.7183, + "theoretical_loss": 3.5148803319621864, + "tokens_seen": 1505822720 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027461384152457376, + "loss": 2.7068, + "theoretical_loss": 3.514866854750621, + "tokens_seen": 1505888256 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002746038114343029, + "loss": 2.742, + "theoretical_loss": 3.514853378289788, + "tokens_seen": 1505953792 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002745937813440321, + "loss": 2.7394, + "theoretical_loss": 3.514839902579613, + "tokens_seen": 1506019328 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002745837512537613, + "loss": 2.783, + "theoretical_loss": 3.5148264276200214, + "tokens_seen": 1506084864 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002745737211634905, + "loss": 2.7323, + "theoretical_loss": 3.5148129534109382, + "tokens_seen": 1506150400 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027456369107321967, + "loss": 2.6862, + "theoretical_loss": 3.51479947995229, + "tokens_seen": 1506215936 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027455366098294885, + "loss": 2.6496, + "theoretical_loss": 3.5147860072440014, + "tokens_seen": 1506281472 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027454363089267803, + "loss": 2.8665, + "theoretical_loss": 3.5147725352859984, + "tokens_seen": 1506347008 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027453360080240726, + "loss": 2.6922, + "theoretical_loss": 3.5147590640782065, + "tokens_seen": 1506412544 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002745235707121364, + "loss": 2.7641, + "theoretical_loss": 3.5147455936205514, + "tokens_seen": 1506478080 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002745135406218656, + "loss": 2.8066, + "theoretical_loss": 3.5147321239129585, + "tokens_seen": 1506543616 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027450351053159475, + "loss": 2.6581, + "theoretical_loss": 3.514718654955354, + "tokens_seen": 1506609152 + }, + { + "epoch": 18.01, + "learning_rate": 0.000274493480441324, + "loss": 2.7946, + "theoretical_loss": 3.514705186747663, + "tokens_seen": 1506674688 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027448345035105317, + "loss": 2.7445, + "theoretical_loss": 3.5146917192898104, + "tokens_seen": 1506740224 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027447342026078235, + "loss": 2.7798, + "theoretical_loss": 3.5146782525817235, + "tokens_seen": 1506805760 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027446339017051153, + "loss": 2.8232, + "theoretical_loss": 3.5146647866233263, + "tokens_seen": 1506871296 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027445336008024077, + "loss": 2.7405, + "theoretical_loss": 3.5146513214145454, + "tokens_seen": 1506936832 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002744433299899699, + "loss": 2.7292, + "theoretical_loss": 3.5146378569553067, + "tokens_seen": 1507002368 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027443329989969913, + "loss": 2.6954, + "theoretical_loss": 3.514624393245535, + "tokens_seen": 1507067904 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027442326980942826, + "loss": 2.8083, + "theoretical_loss": 3.5146109302851567, + "tokens_seen": 1507133440 + }, + { + "epoch": 18.01, + "objective/train/docs_used": 3567766, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7845590114593506, + "objective/train/theoretical_loss": 3.5145974680740975, + "objective/train/tokens_used": 1527658976, + "theoretical_loss": 3.5145974680740975, + "tokens_seen": 1507198976 + }, + { + "epoch": 18.01, + "learning_rate": 0.0002744132397191575, + "loss": 2.8275, + "theoretical_loss": 3.5145974680740975, + "tokens_seen": 1507198976 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027440320962888667, + "loss": 2.7024, + "theoretical_loss": 3.5145840066122824, + "tokens_seen": 1507264512 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027439317953861585, + "loss": 2.8597, + "theoretical_loss": 3.5145705458996384, + "tokens_seen": 1507330048 + }, + { + "epoch": 18.01, + "learning_rate": 0.00027438314944834503, + "loss": 2.7521, + "theoretical_loss": 3.5145570859360897, + "tokens_seen": 1507395584 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002743731193580742, + "loss": 2.7796, + "theoretical_loss": 3.5145436267215633, + "tokens_seen": 1507461120 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002743630892678034, + "loss": 2.7148, + "theoretical_loss": 3.514530168255984, + "tokens_seen": 1507526656 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027435305917753263, + "loss": 2.6429, + "theoretical_loss": 3.5145167105392785, + "tokens_seen": 1507592192 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027434302908726176, + "loss": 2.7228, + "theoretical_loss": 3.5145032535713714, + "tokens_seen": 1507657728 + }, + { + "epoch": 18.02, + "learning_rate": 0.000274332998996991, + "loss": 2.763, + "theoretical_loss": 3.51448979735219, + "tokens_seen": 1507723264 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002743229689067202, + "loss": 2.7634, + "theoretical_loss": 3.514476341881659, + "tokens_seen": 1507788800 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027431293881644936, + "loss": 2.6895, + "theoretical_loss": 3.5144628871597043, + "tokens_seen": 1507854336 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027430290872617854, + "loss": 2.8156, + "theoretical_loss": 3.514449433186252, + "tokens_seen": 1507919872 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002742928786359077, + "loss": 2.7926, + "theoretical_loss": 3.5144359799612284, + "tokens_seen": 1507985408 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002742828485456369, + "loss": 2.7634, + "theoretical_loss": 3.514422527484558, + "tokens_seen": 1508050944 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027427281845536614, + "loss": 2.7443, + "theoretical_loss": 3.514409075756168, + "tokens_seen": 1508116480 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027426278836509526, + "loss": 2.7253, + "theoretical_loss": 3.5143956247759833, + "tokens_seen": 1508182016 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002742527582748245, + "loss": 2.6733, + "theoretical_loss": 3.5143821745439308, + "tokens_seen": 1508247552 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002742427281845536, + "loss": 2.6468, + "theoretical_loss": 3.5143687250599354, + "tokens_seen": 1508313088 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027423269809428286, + "loss": 2.725, + "theoretical_loss": 3.5143552763239234, + "tokens_seen": 1508378624 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027422266800401204, + "loss": 2.8204, + "theoretical_loss": 3.5143418283358208, + "tokens_seen": 1508444160 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002742126379137412, + "loss": 2.819, + "theoretical_loss": 3.5143283810955532, + "tokens_seen": 1508509696 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002742026078234704, + "loss": 2.736, + "theoretical_loss": 3.514314934603047, + "tokens_seen": 1508575232 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002741925777331996, + "loss": 2.6721, + "theoretical_loss": 3.514301488858228, + "tokens_seen": 1508640768 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027418254764292877, + "loss": 2.7017, + "theoretical_loss": 3.5142880438610216, + "tokens_seen": 1508706304 + }, + { + "epoch": 18.02, + "learning_rate": 0.000274172517552658, + "loss": 2.7689, + "theoretical_loss": 3.514274599611354, + "tokens_seen": 1508771840 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3570522, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.641066074371338, + "objective/train/theoretical_loss": 3.5142611561091517, + "objective/train/tokens_used": 1529297376, + "theoretical_loss": 3.5142611561091517, + "tokens_seen": 1508837376 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027416248746238713, + "loss": 2.7432, + "theoretical_loss": 3.5142611561091517, + "tokens_seen": 1508837376 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027415245737211636, + "loss": 2.741, + "theoretical_loss": 3.51424771335434, + "tokens_seen": 1508902912 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027414242728184554, + "loss": 2.7551, + "theoretical_loss": 3.5142342713468455, + "tokens_seen": 1508968448 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002741323971915747, + "loss": 2.6475, + "theoretical_loss": 3.514220830086594, + "tokens_seen": 1509033984 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002741223671013039, + "loss": 2.8241, + "theoretical_loss": 3.514207389573511, + "tokens_seen": 1509099520 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002741123370110331, + "loss": 2.6646, + "theoretical_loss": 3.514193949807524, + "tokens_seen": 1509165056 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027410230692076227, + "loss": 2.7035, + "theoretical_loss": 3.514180510788557, + "tokens_seen": 1509230592 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002740922768304915, + "loss": 2.7366, + "theoretical_loss": 3.5141670725165373, + "tokens_seen": 1509296128 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027408224674022063, + "loss": 2.7815, + "theoretical_loss": 3.5141536349913904, + "tokens_seen": 1509361664 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027407221664994987, + "loss": 2.7091, + "theoretical_loss": 3.5141401982130427, + "tokens_seen": 1509427200 + }, + { + "epoch": 18.02, + "learning_rate": 0.000274062186559679, + "loss": 2.7295, + "theoretical_loss": 3.5141267621814203, + "tokens_seen": 1509492736 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027405215646940823, + "loss": 2.753, + "theoretical_loss": 3.5141133268964495, + "tokens_seen": 1509558272 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002740421263791374, + "loss": 2.6932, + "theoretical_loss": 3.5140998923580558, + "tokens_seen": 1509623808 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002740320962888666, + "loss": 2.7179, + "theoretical_loss": 3.5140864585661657, + "tokens_seen": 1509689344 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002740220661985958, + "loss": 2.704, + "theoretical_loss": 3.5140730255207053, + "tokens_seen": 1509754880 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027401203610832495, + "loss": 2.8026, + "theoretical_loss": 3.5140595932216003, + "tokens_seen": 1509820416 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002740020060180542, + "loss": 2.8084, + "theoretical_loss": 3.5140461616687775, + "tokens_seen": 1509885952 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027399197592778337, + "loss": 2.7944, + "theoretical_loss": 3.514032730862162, + "tokens_seen": 1509951488 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027398194583751255, + "loss": 2.8293, + "theoretical_loss": 3.5140193008016816, + "tokens_seen": 1510017024 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027397191574724173, + "loss": 2.6926, + "theoretical_loss": 3.514005871487261, + "tokens_seen": 1510082560 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027396188565697097, + "loss": 2.7102, + "theoretical_loss": 3.513992442918828, + "tokens_seen": 1510148096 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002739518555667001, + "loss": 2.7919, + "theoretical_loss": 3.5139790150963064, + "tokens_seen": 1510213632 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027394182547642933, + "loss": 2.7385, + "theoretical_loss": 3.5139655880196243, + "tokens_seen": 1510279168 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027393179538615846, + "loss": 2.821, + "theoretical_loss": 3.5139521616887075, + "tokens_seen": 1510344704 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002739217652958877, + "loss": 2.7504, + "theoretical_loss": 3.5139387361034817, + "tokens_seen": 1510410240 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3575543, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8823704719543457, + "objective/train/theoretical_loss": 3.5139253112638733, + "objective/train/tokens_used": 1530935776, + "theoretical_loss": 3.5139253112638733, + "tokens_seen": 1510475776 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027391173520561687, + "loss": 2.7961, + "theoretical_loss": 3.5139253112638733, + "tokens_seen": 1510475776 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027390170511534605, + "loss": 2.7121, + "theoretical_loss": 3.513911887169809, + "tokens_seen": 1510541312 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027389167502507524, + "loss": 2.6854, + "theoretical_loss": 3.513898463821215, + "tokens_seen": 1510606848 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002738816449348044, + "loss": 2.7508, + "theoretical_loss": 3.513885041218017, + "tokens_seen": 1510672384 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002738716148445336, + "loss": 2.7935, + "theoretical_loss": 3.513871619360142, + "tokens_seen": 1510737920 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027386158475426283, + "loss": 2.761, + "theoretical_loss": 3.5138581982475157, + "tokens_seen": 1510803456 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027385155466399196, + "loss": 2.7386, + "theoretical_loss": 3.5138447778800646, + "tokens_seen": 1510868992 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002738415245737212, + "loss": 2.8177, + "theoretical_loss": 3.5138313582577148, + "tokens_seen": 1510934528 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002738314944834504, + "loss": 2.758, + "theoretical_loss": 3.5138179393803934, + "tokens_seen": 1511000064 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027382146439317956, + "loss": 2.6016, + "theoretical_loss": 3.5138045212480256, + "tokens_seen": 1511065600 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027381143430290874, + "loss": 2.7545, + "theoretical_loss": 3.5137911038605383, + "tokens_seen": 1511131136 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002738014042126379, + "loss": 2.7382, + "theoretical_loss": 3.5137776872178583, + "tokens_seen": 1511196672 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002737913741223671, + "loss": 2.7583, + "theoretical_loss": 3.513764271319911, + "tokens_seen": 1511262208 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027378134403209634, + "loss": 2.7135, + "theoretical_loss": 3.5137508561666237, + "tokens_seen": 1511327744 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027377131394182546, + "loss": 2.7893, + "theoretical_loss": 3.513737441757922, + "tokens_seen": 1511393280 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002737612838515547, + "loss": 2.7329, + "theoretical_loss": 3.5137240280937325, + "tokens_seen": 1511458816 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002737512537612838, + "loss": 2.7624, + "theoretical_loss": 3.5137106151739825, + "tokens_seen": 1511524352 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027374122367101306, + "loss": 2.7198, + "theoretical_loss": 3.513697202998597, + "tokens_seen": 1511589888 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027373119358074224, + "loss": 2.7562, + "theoretical_loss": 3.513683791567503, + "tokens_seen": 1511655424 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002737211634904714, + "loss": 2.7476, + "theoretical_loss": 3.5136703808806273, + "tokens_seen": 1511720960 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002737111334002006, + "loss": 2.716, + "theoretical_loss": 3.513656970937896, + "tokens_seen": 1511786496 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002737011033099298, + "loss": 2.7533, + "theoretical_loss": 3.513643561739235, + "tokens_seen": 1511852032 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027369107321965897, + "loss": 2.7504, + "theoretical_loss": 3.513630153284572, + "tokens_seen": 1511917568 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002736810431293882, + "loss": 2.7765, + "theoretical_loss": 3.5136167455738327, + "tokens_seen": 1511983104 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027367101303911733, + "loss": 2.6643, + "theoretical_loss": 3.513603338606944, + "tokens_seen": 1512048640 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3578466, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.885348081588745, + "objective/train/theoretical_loss": 3.513589932383832, + "objective/train/tokens_used": 1532574176, + "theoretical_loss": 3.513589932383832, + "tokens_seen": 1512114176 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027366098294884656, + "loss": 2.8023, + "theoretical_loss": 3.513589932383832, + "tokens_seen": 1512114176 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027365095285857574, + "loss": 2.7534, + "theoretical_loss": 3.513576526904423, + "tokens_seen": 1512179712 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002736409227683049, + "loss": 2.6792, + "theoretical_loss": 3.5135631221686436, + "tokens_seen": 1512245248 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002736308926780341, + "loss": 2.7427, + "theoretical_loss": 3.513549718176421, + "tokens_seen": 1512310784 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002736208625877633, + "loss": 2.8091, + "theoretical_loss": 3.5135363149276815, + "tokens_seen": 1512376320 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027361083249749247, + "loss": 2.7141, + "theoretical_loss": 3.5135229124223506, + "tokens_seen": 1512441856 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002736008024072217, + "loss": 2.6973, + "theoretical_loss": 3.513509510660356, + "tokens_seen": 1512507392 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027359077231695083, + "loss": 2.73, + "theoretical_loss": 3.5134961096416246, + "tokens_seen": 1512572928 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027358074222668007, + "loss": 2.7154, + "theoretical_loss": 3.5134827093660816, + "tokens_seen": 1512638464 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002735707121364092, + "loss": 2.6609, + "theoretical_loss": 3.5134693098336545, + "tokens_seen": 1512704000 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027356068204613843, + "loss": 2.6969, + "theoretical_loss": 3.51345591104427, + "tokens_seen": 1512769536 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002735506519558676, + "loss": 2.6256, + "theoretical_loss": 3.5134425129978544, + "tokens_seen": 1512835072 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002735406218655968, + "loss": 2.7715, + "theoretical_loss": 3.5134291156943336, + "tokens_seen": 1512900608 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027353059177532597, + "loss": 2.6377, + "theoretical_loss": 3.5134157191336355, + "tokens_seen": 1512966144 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027352056168505515, + "loss": 2.7566, + "theoretical_loss": 3.5134023233156864, + "tokens_seen": 1513031680 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027351053159478433, + "loss": 2.7088, + "theoretical_loss": 3.5133889282404125, + "tokens_seen": 1513097216 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027350050150451357, + "loss": 2.7518, + "theoretical_loss": 3.513375533907741, + "tokens_seen": 1513162752 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002734904714142427, + "loss": 2.8141, + "theoretical_loss": 3.5133621403175983, + "tokens_seen": 1513228288 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027348044132397193, + "loss": 2.6745, + "theoretical_loss": 3.5133487474699105, + "tokens_seen": 1513293824 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002734704112337011, + "loss": 2.7128, + "theoretical_loss": 3.5133353553646054, + "tokens_seen": 1513359360 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002734603811434303, + "loss": 2.7745, + "theoretical_loss": 3.5133219640016087, + "tokens_seen": 1513424896 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002734503510531595, + "loss": 2.6953, + "theoretical_loss": 3.513308573380848, + "tokens_seen": 1513490432 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027344032096288866, + "loss": 2.6795, + "theoretical_loss": 3.5132951835022497, + "tokens_seen": 1513555968 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027343029087261784, + "loss": 2.7094, + "theoretical_loss": 3.5132817943657404, + "tokens_seen": 1513621504 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002734202607823471, + "loss": 2.7251, + "theoretical_loss": 3.513268405971247, + "tokens_seen": 1513687040 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3582326, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.699887752532959, + "objective/train/theoretical_loss": 3.513255018318696, + "objective/train/tokens_used": 1534212576, + "theoretical_loss": 3.513255018318696, + "tokens_seen": 1513752576 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002734102306920762, + "loss": 2.7806, + "theoretical_loss": 3.513255018318696, + "tokens_seen": 1513752576 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027340020060180544, + "loss": 2.7933, + "theoretical_loss": 3.5132416314080137, + "tokens_seen": 1513818112 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027339017051153456, + "loss": 2.7311, + "theoretical_loss": 3.5132282452391284, + "tokens_seen": 1513883648 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002733801404212638, + "loss": 2.781, + "theoretical_loss": 3.5132148598119652, + "tokens_seen": 1513949184 + }, + { + "epoch": 18.02, + "learning_rate": 0.000273370110330993, + "loss": 2.7275, + "theoretical_loss": 3.5132014751264524, + "tokens_seen": 1514014720 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027336008024072216, + "loss": 2.8091, + "theoretical_loss": 3.5131880911825153, + "tokens_seen": 1514080256 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027335005015045134, + "loss": 2.763, + "theoretical_loss": 3.513174707980082, + "tokens_seen": 1514145792 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002733400200601806, + "loss": 2.7658, + "theoretical_loss": 3.5131613255190786, + "tokens_seen": 1514211328 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002733299899699097, + "loss": 2.8065, + "theoretical_loss": 3.5131479437994324, + "tokens_seen": 1514276864 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027331995987963894, + "loss": 2.7346, + "theoretical_loss": 3.5131345628210697, + "tokens_seen": 1514342400 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027330992978936807, + "loss": 2.7995, + "theoretical_loss": 3.513121182583918, + "tokens_seen": 1514407936 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002732998996990973, + "loss": 2.6792, + "theoretical_loss": 3.513107803087904, + "tokens_seen": 1514473472 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002732898696088265, + "loss": 2.6963, + "theoretical_loss": 3.513094424332954, + "tokens_seen": 1514539008 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027327983951855566, + "loss": 2.7149, + "theoretical_loss": 3.5130810463189954, + "tokens_seen": 1514604544 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002732698094282849, + "loss": 2.7468, + "theoretical_loss": 3.513067669045955, + "tokens_seen": 1514670080 + }, + { + "epoch": 18.02, + "learning_rate": 0.000273259779338014, + "loss": 2.7293, + "theoretical_loss": 3.5130542925137593, + "tokens_seen": 1514735616 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027324974924774326, + "loss": 2.7479, + "theoretical_loss": 3.5130409167223364, + "tokens_seen": 1514801152 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027323971915747244, + "loss": 2.726, + "theoretical_loss": 3.513027541671612, + "tokens_seen": 1514866688 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002732296890672016, + "loss": 2.7851, + "theoretical_loss": 3.5130141673615136, + "tokens_seen": 1514932224 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002732196589769308, + "loss": 2.7879, + "theoretical_loss": 3.513000793791968, + "tokens_seen": 1514997760 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027320962888666, + "loss": 2.6441, + "theoretical_loss": 3.5129874209629026, + "tokens_seen": 1515063296 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027319959879638917, + "loss": 2.6316, + "theoretical_loss": 3.512974048874243, + "tokens_seen": 1515128832 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002731895687061184, + "loss": 2.7087, + "theoretical_loss": 3.5129606775259186, + "tokens_seen": 1515194368 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027317953861584753, + "loss": 2.6448, + "theoretical_loss": 3.512947306917854, + "tokens_seen": 1515259904 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027316950852557676, + "loss": 2.7406, + "theoretical_loss": 3.5129339370499775, + "tokens_seen": 1515325440 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3586981, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.828657627105713, + "objective/train/theoretical_loss": 3.5129205679222153, + "objective/train/tokens_used": 1535850976, + "theoretical_loss": 3.5129205679222153, + "tokens_seen": 1515390976 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027315947843530594, + "loss": 2.7263, + "theoretical_loss": 3.5129205679222153, + "tokens_seen": 1515390976 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002731494483450351, + "loss": 2.7845, + "theoretical_loss": 3.512907199534496, + "tokens_seen": 1515456512 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002731394182547643, + "loss": 2.7123, + "theoretical_loss": 3.5128938318867444, + "tokens_seen": 1515522048 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002731293881644935, + "loss": 2.7006, + "theoretical_loss": 3.512880464978889, + "tokens_seen": 1515587584 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027311935807422267, + "loss": 2.7914, + "theoretical_loss": 3.512867098810857, + "tokens_seen": 1515653120 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002731093279839519, + "loss": 2.7507, + "theoretical_loss": 3.512853733382574, + "tokens_seen": 1515718656 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027309929789368103, + "loss": 2.8022, + "theoretical_loss": 3.5128403686939693, + "tokens_seen": 1515784192 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027308926780341027, + "loss": 2.7653, + "theoretical_loss": 3.512827004744968, + "tokens_seen": 1515849728 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002730792377131394, + "loss": 2.6027, + "theoretical_loss": 3.512813641535498, + "tokens_seen": 1515915264 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027306920762286863, + "loss": 2.8367, + "theoretical_loss": 3.5128002790654866, + "tokens_seen": 1515980800 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002730591775325978, + "loss": 2.6844, + "theoretical_loss": 3.5127869173348607, + "tokens_seen": 1516046336 + }, + { + "epoch": 18.02, + "learning_rate": 0.000273049147442327, + "loss": 2.698, + "theoretical_loss": 3.5127735563435474, + "tokens_seen": 1516111872 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027303911735205617, + "loss": 2.7062, + "theoretical_loss": 3.512760196091474, + "tokens_seen": 1516177408 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027302908726178535, + "loss": 2.7627, + "theoretical_loss": 3.5127468365785672, + "tokens_seen": 1516242944 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027301905717151453, + "loss": 2.8094, + "theoretical_loss": 3.5127334778047548, + "tokens_seen": 1516308480 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027300902708124377, + "loss": 2.6996, + "theoretical_loss": 3.5127201197699636, + "tokens_seen": 1516374016 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002729989969909729, + "loss": 2.7715, + "theoretical_loss": 3.512706762474121, + "tokens_seen": 1516439552 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027298896690070213, + "loss": 2.8764, + "theoretical_loss": 3.5126934059171533, + "tokens_seen": 1516505088 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002729789368104313, + "loss": 2.776, + "theoretical_loss": 3.512680050098989, + "tokens_seen": 1516570624 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002729689067201605, + "loss": 2.7938, + "theoretical_loss": 3.512666695019554, + "tokens_seen": 1516636160 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002729588766298897, + "loss": 2.6713, + "theoretical_loss": 3.512653340678777, + "tokens_seen": 1516701696 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027294884653961886, + "loss": 2.7881, + "theoretical_loss": 3.5126399870765845, + "tokens_seen": 1516767232 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027293881644934804, + "loss": 2.7886, + "theoretical_loss": 3.5126266342129036, + "tokens_seen": 1516832768 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002729287863590773, + "loss": 2.7199, + "theoretical_loss": 3.5126132820876617, + "tokens_seen": 1516898304 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002729187562688064, + "loss": 2.6576, + "theoretical_loss": 3.5125999307007856, + "tokens_seen": 1516963840 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3589747, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6903786659240723, + "objective/train/theoretical_loss": 3.5125865800522034, + "objective/train/tokens_used": 1537489376, + "theoretical_loss": 3.5125865800522034, + "tokens_seen": 1517029376 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027290872617853564, + "loss": 2.6891, + "theoretical_loss": 3.5125865800522034, + "tokens_seen": 1517029376 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027289869608826476, + "loss": 2.7707, + "theoretical_loss": 3.5125732301418418, + "tokens_seen": 1517094912 + }, + { + "epoch": 18.02, + "learning_rate": 0.000272888665997994, + "loss": 2.7676, + "theoretical_loss": 3.5125598809696283, + "tokens_seen": 1517160448 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002728786359077232, + "loss": 2.7306, + "theoretical_loss": 3.51254653253549, + "tokens_seen": 1517225984 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027286860581745236, + "loss": 2.8003, + "theoretical_loss": 3.512533184839355, + "tokens_seen": 1517291520 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027285857572718154, + "loss": 2.6715, + "theoretical_loss": 3.5125198378811495, + "tokens_seen": 1517357056 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002728485456369108, + "loss": 2.8158, + "theoretical_loss": 3.512506491660801, + "tokens_seen": 1517422592 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002728385155466399, + "loss": 2.7913, + "theoretical_loss": 3.512493146178238, + "tokens_seen": 1517488128 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027282848545636914, + "loss": 2.7647, + "theoretical_loss": 3.5124798014333867, + "tokens_seen": 1517553664 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027281845536609827, + "loss": 2.826, + "theoretical_loss": 3.512466457426175, + "tokens_seen": 1517619200 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002728084252758275, + "loss": 2.8139, + "theoretical_loss": 3.5124531141565294, + "tokens_seen": 1517684736 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002727983951855567, + "loss": 2.8183, + "theoretical_loss": 3.5124397716243787, + "tokens_seen": 1517750272 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027278836509528586, + "loss": 2.7306, + "theoretical_loss": 3.5124264298296497, + "tokens_seen": 1517815808 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027277833500501504, + "loss": 2.8483, + "theoretical_loss": 3.512413088772269, + "tokens_seen": 1517881344 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002727683049147442, + "loss": 2.7772, + "theoretical_loss": 3.5123997484521654, + "tokens_seen": 1517946880 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002727582748244734, + "loss": 2.7837, + "theoretical_loss": 3.512386408869265, + "tokens_seen": 1518012416 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027274824473420264, + "loss": 2.7886, + "theoretical_loss": 3.512373070023496, + "tokens_seen": 1518077952 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027273821464393177, + "loss": 2.8025, + "theoretical_loss": 3.5123597319147857, + "tokens_seen": 1518143488 + }, + { + "epoch": 18.02, + "learning_rate": 0.000272728184553661, + "loss": 2.6793, + "theoretical_loss": 3.5123463945430617, + "tokens_seen": 1518209024 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027271815446339013, + "loss": 2.7937, + "theoretical_loss": 3.5123330579082515, + "tokens_seen": 1518274560 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027270812437311937, + "loss": 2.752, + "theoretical_loss": 3.512319722010282, + "tokens_seen": 1518340096 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027269809428284855, + "loss": 2.7548, + "theoretical_loss": 3.5123063868490814, + "tokens_seen": 1518405632 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027268806419257773, + "loss": 2.7961, + "theoretical_loss": 3.512293052424577, + "tokens_seen": 1518471168 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002726780341023069, + "loss": 2.7866, + "theoretical_loss": 3.5122797187366954, + "tokens_seen": 1518536704 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027266800401203614, + "loss": 2.6193, + "theoretical_loss": 3.5122663857853658, + "tokens_seen": 1518602240 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.829399585723877, + "objective/train/theoretical_loss": 3.5122530535705145, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.5122530535705145, + "tokens_seen": 1518667776 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027265797392176527, + "loss": 2.7784, + "theoretical_loss": 3.5122530535705145, + "tokens_seen": 1518667776 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002726479438314945, + "loss": 2.7614, + "theoretical_loss": 3.51223972209207, + "tokens_seen": 1518733312 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027263791374122363, + "loss": 2.7537, + "theoretical_loss": 3.5122263913499583, + "tokens_seen": 1518798848 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027262788365095287, + "loss": 2.7882, + "theoretical_loss": 3.5122130613441085, + "tokens_seen": 1518864384 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027261785356068205, + "loss": 2.82, + "theoretical_loss": 3.512199732074447, + "tokens_seen": 1518929920 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027260782347041123, + "loss": 2.6494, + "theoretical_loss": 3.5121864035409027, + "tokens_seen": 1518995456 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002725977933801404, + "loss": 2.7572, + "theoretical_loss": 3.512173075743402, + "tokens_seen": 1519060992 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002725877632898696, + "loss": 2.7152, + "theoretical_loss": 3.512159748681873, + "tokens_seen": 1519126528 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002725777331995988, + "loss": 2.6616, + "theoretical_loss": 3.5121464223562433, + "tokens_seen": 1519192064 + }, + { + "epoch": 18.02, + "learning_rate": 0.000272567703109328, + "loss": 2.718, + "theoretical_loss": 3.5121330967664406, + "tokens_seen": 1519257600 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027255767301905714, + "loss": 2.7592, + "theoretical_loss": 3.512119771912392, + "tokens_seen": 1519323136 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027254764292878637, + "loss": 2.8666, + "theoretical_loss": 3.512106447794026, + "tokens_seen": 1519388672 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002725376128385155, + "loss": 2.7391, + "theoretical_loss": 3.51209312441127, + "tokens_seen": 1519454208 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027252758274824473, + "loss": 2.8299, + "theoretical_loss": 3.5120798017640507, + "tokens_seen": 1519519744 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027251755265797397, + "loss": 2.6748, + "theoretical_loss": 3.5120664798522974, + "tokens_seen": 1519585280 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002725075225677031, + "loss": 2.781, + "theoretical_loss": 3.5120531586759363, + "tokens_seen": 1519650816 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027249749247743233, + "loss": 2.7983, + "theoretical_loss": 3.5120398382348954, + "tokens_seen": 1519716352 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002724874623871615, + "loss": 2.7852, + "theoretical_loss": 3.5120265185291037, + "tokens_seen": 1519781888 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002724774322968907, + "loss": 2.8044, + "theoretical_loss": 3.5120131995584876, + "tokens_seen": 1519847424 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002724674022066199, + "loss": 2.6802, + "theoretical_loss": 3.511999881322975, + "tokens_seen": 1519912960 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027245737211634906, + "loss": 2.8148, + "theoretical_loss": 3.5119865638224934, + "tokens_seen": 1519978496 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027244734202607824, + "loss": 2.716, + "theoretical_loss": 3.5119732470569716, + "tokens_seen": 1520044032 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002724373119358075, + "loss": 2.7356, + "theoretical_loss": 3.511959931026336, + "tokens_seen": 1520109568 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002724272818455366, + "loss": 2.7593, + "theoretical_loss": 3.5119466157305155, + "tokens_seen": 1520175104 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027241725175526584, + "loss": 2.663, + "theoretical_loss": 3.511933301169438, + "tokens_seen": 1520240640 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6212265491485596, + "objective/train/theoretical_loss": 3.51191998734303, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.51191998734303, + "tokens_seen": 1520306176 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027240722166499496, + "loss": 2.6496, + "theoretical_loss": 3.51191998734303, + "tokens_seen": 1520306176 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002723971915747242, + "loss": 2.6461, + "theoretical_loss": 3.51190667425122, + "tokens_seen": 1520371712 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002723871614844534, + "loss": 2.7521, + "theoretical_loss": 3.511893361893936, + "tokens_seen": 1520437248 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027237713139418256, + "loss": 2.7581, + "theoretical_loss": 3.5118800502711056, + "tokens_seen": 1520502784 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027236710130391174, + "loss": 2.6632, + "theoretical_loss": 3.5118667393826564, + "tokens_seen": 1520568320 + }, + { + "epoch": 18.02, + "learning_rate": 0.000272357071213641, + "loss": 2.7707, + "theoretical_loss": 3.511853429228517, + "tokens_seen": 1520633856 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002723470411233701, + "loss": 2.603, + "theoretical_loss": 3.5118401198086144, + "tokens_seen": 1520699392 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027233701103309934, + "loss": 2.6916, + "theoretical_loss": 3.5118268111228765, + "tokens_seen": 1520764928 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027232698094282847, + "loss": 2.8386, + "theoretical_loss": 3.5118135031712323, + "tokens_seen": 1520830464 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002723169508525577, + "loss": 2.8287, + "theoretical_loss": 3.511800195953608, + "tokens_seen": 1520896000 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002723069207622869, + "loss": 2.7303, + "theoretical_loss": 3.511786889469933, + "tokens_seen": 1520961536 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027229689067201606, + "loss": 2.7309, + "theoretical_loss": 3.5117735837201343, + "tokens_seen": 1521027072 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027228686058174524, + "loss": 2.766, + "theoretical_loss": 3.51176027870414, + "tokens_seen": 1521092608 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002722768304914744, + "loss": 2.7028, + "theoretical_loss": 3.511746974421878, + "tokens_seen": 1521158144 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002722668004012036, + "loss": 2.8157, + "theoretical_loss": 3.5117336708732765, + "tokens_seen": 1521223680 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027225677031093284, + "loss": 2.764, + "theoretical_loss": 3.5117203680582634, + "tokens_seen": 1521289216 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027224674022066197, + "loss": 2.6436, + "theoretical_loss": 3.5117070659767657, + "tokens_seen": 1521354752 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002722367101303912, + "loss": 2.6954, + "theoretical_loss": 3.5116937646287125, + "tokens_seen": 1521420288 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027222668004012033, + "loss": 2.674, + "theoretical_loss": 3.5116804640140318, + "tokens_seen": 1521485824 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027221664994984957, + "loss": 2.7039, + "theoretical_loss": 3.5116671641326507, + "tokens_seen": 1521551360 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027220661985957875, + "loss": 2.7755, + "theoretical_loss": 3.5116538649844977, + "tokens_seen": 1521616896 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027219658976930793, + "loss": 2.783, + "theoretical_loss": 3.511640566569501, + "tokens_seen": 1521682432 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002721865596790371, + "loss": 2.8605, + "theoretical_loss": 3.5116272688875885, + "tokens_seen": 1521747968 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027217652958876635, + "loss": 2.749, + "theoretical_loss": 3.5116139719386874, + "tokens_seen": 1521813504 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027216649949849547, + "loss": 2.7452, + "theoretical_loss": 3.511600675722727, + "tokens_seen": 1521879040 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.775463819503784, + "objective/train/theoretical_loss": 3.511587380239635, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.511587380239635, + "tokens_seen": 1521944576 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002721564694082247, + "loss": 2.737, + "theoretical_loss": 3.511587380239635, + "tokens_seen": 1521944576 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027214643931795383, + "loss": 2.744, + "theoretical_loss": 3.5115740854893387, + "tokens_seen": 1522010112 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027213640922768307, + "loss": 2.7331, + "theoretical_loss": 3.5115607914717666, + "tokens_seen": 1522075648 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027212637913741225, + "loss": 2.8774, + "theoretical_loss": 3.511547498186847, + "tokens_seen": 1522141184 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027211634904714143, + "loss": 2.7922, + "theoretical_loss": 3.511534205634508, + "tokens_seen": 1522206720 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002721063189568706, + "loss": 2.7411, + "theoretical_loss": 3.5115209138146772, + "tokens_seen": 1522272256 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002720962888665998, + "loss": 2.8038, + "theoretical_loss": 3.5115076227272835, + "tokens_seen": 1522337792 + }, + { + "epoch": 18.02, + "learning_rate": 0.000272086258776329, + "loss": 2.812, + "theoretical_loss": 3.511494332372254, + "tokens_seen": 1522403328 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002720762286860582, + "loss": 2.6571, + "theoretical_loss": 3.5114810427495176, + "tokens_seen": 1522468864 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027206619859578734, + "loss": 2.7561, + "theoretical_loss": 3.5114677538590024, + "tokens_seen": 1522534400 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027205616850551657, + "loss": 2.9173, + "theoretical_loss": 3.511454465700636, + "tokens_seen": 1522599936 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002720461384152457, + "loss": 2.7549, + "theoretical_loss": 3.511441178274347, + "tokens_seen": 1522665472 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027203610832497493, + "loss": 2.7435, + "theoretical_loss": 3.5114278915800634, + "tokens_seen": 1522731008 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002720260782347041, + "loss": 2.788, + "theoretical_loss": 3.5114146056177136, + "tokens_seen": 1522796544 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002720160481444333, + "loss": 2.754, + "theoretical_loss": 3.5114013203872254, + "tokens_seen": 1522862080 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002720060180541625, + "loss": 2.812, + "theoretical_loss": 3.511388035888527, + "tokens_seen": 1522927616 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002719959879638917, + "loss": 2.7974, + "theoretical_loss": 3.5113747521215473, + "tokens_seen": 1522993152 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027198595787362084, + "loss": 2.7308, + "theoretical_loss": 3.5113614690862134, + "tokens_seen": 1523058688 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002719759277833501, + "loss": 2.7621, + "theoretical_loss": 3.5113481867824543, + "tokens_seen": 1523124224 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002719658976930792, + "loss": 2.6462, + "theoretical_loss": 3.511334905210198, + "tokens_seen": 1523189760 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027195586760280844, + "loss": 2.7583, + "theoretical_loss": 3.5113216243693732, + "tokens_seen": 1523255296 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002719458375125376, + "loss": 2.8063, + "theoretical_loss": 3.5113083442599073, + "tokens_seen": 1523320832 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002719358074222668, + "loss": 2.8376, + "theoretical_loss": 3.51129506488173, + "tokens_seen": 1523386368 + }, + { + "epoch": 18.02, + "learning_rate": 0.000271925777331996, + "loss": 2.7765, + "theoretical_loss": 3.5112817862347674, + "tokens_seen": 1523451904 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027191574724172516, + "loss": 2.7418, + "theoretical_loss": 3.511268508318949, + "tokens_seen": 1523517440 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6887712478637695, + "objective/train/theoretical_loss": 3.5112552311342036, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.5112552311342036, + "tokens_seen": 1523582976 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027190571715145434, + "loss": 2.7624, + "theoretical_loss": 3.5112552311342036, + "tokens_seen": 1523582976 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002718956870611836, + "loss": 2.7211, + "theoretical_loss": 3.5112419546804587, + "tokens_seen": 1523648512 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002718856569709127, + "loss": 2.776, + "theoretical_loss": 3.511228678957643, + "tokens_seen": 1523714048 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027187562688064194, + "loss": 2.81, + "theoretical_loss": 3.511215403965685, + "tokens_seen": 1523779584 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027186559679037107, + "loss": 2.77, + "theoretical_loss": 3.511202129704512, + "tokens_seen": 1523845120 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002718555667001003, + "loss": 2.7731, + "theoretical_loss": 3.511188856174053, + "tokens_seen": 1523910656 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002718455366098295, + "loss": 2.7924, + "theoretical_loss": 3.5111755833742375, + "tokens_seen": 1523976192 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027183550651955867, + "loss": 2.7425, + "theoretical_loss": 3.511162311304992, + "tokens_seen": 1524041728 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027182547642928785, + "loss": 2.7312, + "theoretical_loss": 3.511149039966246, + "tokens_seen": 1524107264 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002718154463390171, + "loss": 2.6728, + "theoretical_loss": 3.511135769357928, + "tokens_seen": 1524172800 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002718054162487462, + "loss": 2.76, + "theoretical_loss": 3.5111224994799652, + "tokens_seen": 1524238336 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027179538615847544, + "loss": 2.7553, + "theoretical_loss": 3.511109230332287, + "tokens_seen": 1524303872 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027178535606820457, + "loss": 2.6831, + "theoretical_loss": 3.511095961914821, + "tokens_seen": 1524369408 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002717753259779338, + "loss": 2.6862, + "theoretical_loss": 3.511082694227497, + "tokens_seen": 1524434944 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027176529588766304, + "loss": 2.6671, + "theoretical_loss": 3.5110694272702423, + "tokens_seen": 1524500480 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027175526579739217, + "loss": 2.7508, + "theoretical_loss": 3.511056161042986, + "tokens_seen": 1524566016 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002717452357071214, + "loss": 2.783, + "theoretical_loss": 3.511042895545656, + "tokens_seen": 1524631552 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027173520561685053, + "loss": 2.7707, + "theoretical_loss": 3.5110296307781805, + "tokens_seen": 1524697088 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027172517552657977, + "loss": 2.6138, + "theoretical_loss": 3.5110163667404892, + "tokens_seen": 1524762624 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027171514543630895, + "loss": 2.8008, + "theoretical_loss": 3.5110031034325093, + "tokens_seen": 1524828160 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027170511534603813, + "loss": 2.8225, + "theoretical_loss": 3.5109898408541707, + "tokens_seen": 1524893696 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002716950852557673, + "loss": 2.625, + "theoretical_loss": 3.5109765790054004, + "tokens_seen": 1524959232 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027168505516549655, + "loss": 2.8205, + "theoretical_loss": 3.5109633178861275, + "tokens_seen": 1525024768 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027167502507522567, + "loss": 2.8506, + "theoretical_loss": 3.510950057496281, + "tokens_seen": 1525090304 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002716649949849549, + "loss": 2.8358, + "theoretical_loss": 3.5109367978357886, + "tokens_seen": 1525155840 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8660082817077637, + "objective/train/theoretical_loss": 3.5109235389045796, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.5109235389045796, + "tokens_seen": 1525221376 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027165496489468403, + "loss": 2.8221, + "theoretical_loss": 3.5109235389045796, + "tokens_seen": 1525221376 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027164493480441327, + "loss": 2.7398, + "theoretical_loss": 3.510910280702582, + "tokens_seen": 1525286912 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027163490471414245, + "loss": 2.7709, + "theoretical_loss": 3.5108970232297247, + "tokens_seen": 1525352448 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027162487462387163, + "loss": 2.6947, + "theoretical_loss": 3.5108837664859367, + "tokens_seen": 1525417984 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002716148445336008, + "loss": 2.7812, + "theoretical_loss": 3.510870510471145, + "tokens_seen": 1525483520 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027160481444333, + "loss": 2.8183, + "theoretical_loss": 3.5108572551852806, + "tokens_seen": 1525549056 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002715947843530592, + "loss": 2.7292, + "theoretical_loss": 3.5108440006282695, + "tokens_seen": 1525614592 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002715847542627884, + "loss": 2.8047, + "theoretical_loss": 3.5108307468000426, + "tokens_seen": 1525680128 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027157472417251754, + "loss": 2.8176, + "theoretical_loss": 3.510817493700527, + "tokens_seen": 1525745664 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002715646940822468, + "loss": 2.7449, + "theoretical_loss": 3.5108042413296516, + "tokens_seen": 1525811200 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002715546639919759, + "loss": 2.8315, + "theoretical_loss": 3.5107909896873455, + "tokens_seen": 1525876736 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027154463390170514, + "loss": 2.7746, + "theoretical_loss": 3.5107777387735375, + "tokens_seen": 1525942272 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002715346038114343, + "loss": 2.6997, + "theoretical_loss": 3.5107644885881557, + "tokens_seen": 1526007808 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002715245737211635, + "loss": 2.6998, + "theoretical_loss": 3.5107512391311286, + "tokens_seen": 1526073344 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002715145436308927, + "loss": 2.6478, + "theoretical_loss": 3.5107379904023857, + "tokens_seen": 1526138880 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002715045135406219, + "loss": 2.862, + "theoretical_loss": 3.5107247424018553, + "tokens_seen": 1526204416 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027149448345035104, + "loss": 2.6327, + "theoretical_loss": 3.510711495129466, + "tokens_seen": 1526269952 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002714844533600803, + "loss": 2.6352, + "theoretical_loss": 3.5106982485851463, + "tokens_seen": 1526335488 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002714744232698094, + "loss": 2.7615, + "theoretical_loss": 3.5106850027688257, + "tokens_seen": 1526401024 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027146439317953864, + "loss": 2.6598, + "theoretical_loss": 3.5106717576804325, + "tokens_seen": 1526466560 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002714543630892678, + "loss": 2.789, + "theoretical_loss": 3.510658513319895, + "tokens_seen": 1526532096 + }, + { + "epoch": 18.02, + "learning_rate": 0.000271444332998997, + "loss": 2.709, + "theoretical_loss": 3.5106452696871426, + "tokens_seen": 1526597632 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002714343029087262, + "loss": 2.7474, + "theoretical_loss": 3.510632026782104, + "tokens_seen": 1526663168 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027142427281845536, + "loss": 2.7361, + "theoretical_loss": 3.5106187846047074, + "tokens_seen": 1526728704 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027141424272818454, + "loss": 2.7705, + "theoretical_loss": 3.510605543154882, + "tokens_seen": 1526794240 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.57084321975708, + "objective/train/theoretical_loss": 3.510592302432557, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.510592302432557, + "tokens_seen": 1526859776 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002714042126379138, + "loss": 2.7223, + "theoretical_loss": 3.510592302432557, + "tokens_seen": 1526859776 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002713941825476429, + "loss": 2.7373, + "theoretical_loss": 3.5105790624376607, + "tokens_seen": 1526925312 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027138415245737214, + "loss": 2.8359, + "theoretical_loss": 3.510565823170122, + "tokens_seen": 1526990848 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027137412236710127, + "loss": 2.6775, + "theoretical_loss": 3.5105525846298695, + "tokens_seen": 1527056384 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002713640922768305, + "loss": 2.7781, + "theoretical_loss": 3.5105393468168327, + "tokens_seen": 1527121920 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002713540621865597, + "loss": 2.7298, + "theoretical_loss": 3.5105261097309395, + "tokens_seen": 1527187456 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027134403209628887, + "loss": 2.6904, + "theoretical_loss": 3.51051287337212, + "tokens_seen": 1527252992 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027133400200601805, + "loss": 2.8427, + "theoretical_loss": 3.5104996377403017, + "tokens_seen": 1527318528 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002713239719157473, + "loss": 2.8096, + "theoretical_loss": 3.510486402835414, + "tokens_seen": 1527384064 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002713139418254764, + "loss": 2.7599, + "theoretical_loss": 3.510473168657387, + "tokens_seen": 1527449600 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027130391173520564, + "loss": 2.8427, + "theoretical_loss": 3.5104599352061476, + "tokens_seen": 1527515136 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027129388164493477, + "loss": 2.7529, + "theoretical_loss": 3.5104467024816257, + "tokens_seen": 1527580672 + }, + { + "epoch": 18.02, + "learning_rate": 0.000271283851554664, + "loss": 2.5805, + "theoretical_loss": 3.5104334704837505, + "tokens_seen": 1527646208 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002712738214643932, + "loss": 2.7834, + "theoretical_loss": 3.5104202392124506, + "tokens_seen": 1527711744 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027126379137412237, + "loss": 2.8118, + "theoretical_loss": 3.5104070086676544, + "tokens_seen": 1527777280 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027125376128385155, + "loss": 2.7425, + "theoretical_loss": 3.5103937788492914, + "tokens_seen": 1527842816 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027124373119358073, + "loss": 2.823, + "theoretical_loss": 3.510380549757291, + "tokens_seen": 1527908352 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002712337011033099, + "loss": 2.699, + "theoretical_loss": 3.5103673213915814, + "tokens_seen": 1527973888 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027122367101303915, + "loss": 2.7775, + "theoretical_loss": 3.510354093752092, + "tokens_seen": 1528039424 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002712136409227683, + "loss": 2.6929, + "theoretical_loss": 3.5103408668387512, + "tokens_seen": 1528104960 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002712036108324975, + "loss": 2.8563, + "theoretical_loss": 3.5103276406514894, + "tokens_seen": 1528170496 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002711935807422267, + "loss": 2.8085, + "theoretical_loss": 3.5103144151902335, + "tokens_seen": 1528236032 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027118355065195587, + "loss": 2.7764, + "theoretical_loss": 3.510301190454914, + "tokens_seen": 1528301568 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027117352056168505, + "loss": 2.702, + "theoretical_loss": 3.5102879664454605, + "tokens_seen": 1528367104 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027116349047141423, + "loss": 2.7748, + "theoretical_loss": 3.5102747431618004, + "tokens_seen": 1528432640 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7049484252929688, + "objective/train/theoretical_loss": 3.510261520603863, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.510261520603863, + "tokens_seen": 1528498176 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002711534603811434, + "loss": 2.8783, + "theoretical_loss": 3.510261520603863, + "tokens_seen": 1528498176 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027114343029087265, + "loss": 2.7139, + "theoretical_loss": 3.5102482987715784, + "tokens_seen": 1528563712 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002711334002006018, + "loss": 2.8542, + "theoretical_loss": 3.510235077664875, + "tokens_seen": 1528629248 + }, + { + "epoch": 18.02, + "learning_rate": 0.000271123370110331, + "loss": 2.7661, + "theoretical_loss": 3.510221857283682, + "tokens_seen": 1528694784 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027111334002006014, + "loss": 2.749, + "theoretical_loss": 3.5102086376279287, + "tokens_seen": 1528760320 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002711033099297894, + "loss": 2.6512, + "theoretical_loss": 3.5101954186975437, + "tokens_seen": 1528825856 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027109327983951856, + "loss": 2.8426, + "theoretical_loss": 3.5101822004924563, + "tokens_seen": 1528891392 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027108324974924774, + "loss": 2.8292, + "theoretical_loss": 3.510168983012596, + "tokens_seen": 1528956928 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002710732196589769, + "loss": 2.8241, + "theoretical_loss": 3.510155766257891, + "tokens_seen": 1529022464 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002710631895687061, + "loss": 2.7768, + "theoretical_loss": 3.510142550228272, + "tokens_seen": 1529088000 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002710531594784353, + "loss": 2.7673, + "theoretical_loss": 3.510129334923666, + "tokens_seen": 1529153536 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002710431293881645, + "loss": 2.7497, + "theoretical_loss": 3.510116120344004, + "tokens_seen": 1529219072 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027103309929789364, + "loss": 2.7695, + "theoretical_loss": 3.510102906489215, + "tokens_seen": 1529284608 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002710230692076229, + "loss": 2.6816, + "theoretical_loss": 3.510089693359227, + "tokens_seen": 1529350144 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002710130391173521, + "loss": 2.8228, + "theoretical_loss": 3.51007648095397, + "tokens_seen": 1529415680 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027100300902708124, + "loss": 2.6821, + "theoretical_loss": 3.5100632692733735, + "tokens_seen": 1529481216 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002709929789368105, + "loss": 2.8155, + "theoretical_loss": 3.5100500583173657, + "tokens_seen": 1529546752 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002709829488465396, + "loss": 2.7431, + "theoretical_loss": 3.510036848085877, + "tokens_seen": 1529612288 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027097291875626884, + "loss": 2.8532, + "theoretical_loss": 3.5100236385788355, + "tokens_seen": 1529677824 + }, + { + "epoch": 18.02, + "learning_rate": 0.000270962888665998, + "loss": 2.8226, + "theoretical_loss": 3.5100104297961714, + "tokens_seen": 1529743360 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002709528585757272, + "loss": 2.754, + "theoretical_loss": 3.5099972217378133, + "tokens_seen": 1529808896 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002709428284854564, + "loss": 2.77, + "theoretical_loss": 3.509984014403691, + "tokens_seen": 1529874432 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027093279839518556, + "loss": 2.8004, + "theoretical_loss": 3.5099708077937324, + "tokens_seen": 1529939968 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027092276830491474, + "loss": 2.7735, + "theoretical_loss": 3.509957601907869, + "tokens_seen": 1530005504 + }, + { + "epoch": 18.02, + "learning_rate": 0.000270912738214644, + "loss": 2.6368, + "theoretical_loss": 3.509944396746028, + "tokens_seen": 1530071040 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6399168968200684, + "objective/train/theoretical_loss": 3.50993119230814, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.50993119230814, + "tokens_seen": 1530136576 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002709027081243731, + "loss": 2.6761, + "theoretical_loss": 3.50993119230814, + "tokens_seen": 1530136576 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027089267803410234, + "loss": 2.7376, + "theoretical_loss": 3.5099179885941343, + "tokens_seen": 1530202112 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027088264794383147, + "loss": 2.7186, + "theoretical_loss": 3.509904785603939, + "tokens_seen": 1530267648 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002708726178535607, + "loss": 2.7902, + "theoretical_loss": 3.5098915833374846, + "tokens_seen": 1530333184 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002708625877632899, + "loss": 2.7832, + "theoretical_loss": 3.5098783817947004, + "tokens_seen": 1530398720 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027085255767301907, + "loss": 2.7657, + "theoretical_loss": 3.509865180975515, + "tokens_seen": 1530464256 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027084252758274825, + "loss": 2.7587, + "theoretical_loss": 3.5098519808798585, + "tokens_seen": 1530529792 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002708324974924775, + "loss": 2.7257, + "theoretical_loss": 3.5098387815076597, + "tokens_seen": 1530595328 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002708224674022066, + "loss": 2.7624, + "theoretical_loss": 3.5098255828588485, + "tokens_seen": 1530660864 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027081243731193584, + "loss": 2.6958, + "theoretical_loss": 3.509812384933354, + "tokens_seen": 1530726400 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027080240722166497, + "loss": 2.6902, + "theoretical_loss": 3.509799187731105, + "tokens_seen": 1530791936 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002707923771313942, + "loss": 2.8519, + "theoretical_loss": 3.5097859912520324, + "tokens_seen": 1530857472 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002707823470411234, + "loss": 2.7419, + "theoretical_loss": 3.5097727954960636, + "tokens_seen": 1530923008 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027077231695085257, + "loss": 2.7482, + "theoretical_loss": 3.5097596004631297, + "tokens_seen": 1530988544 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027076228686058175, + "loss": 2.8063, + "theoretical_loss": 3.50974640615316, + "tokens_seen": 1531054080 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027075225677031093, + "loss": 2.8137, + "theoretical_loss": 3.509733212566083, + "tokens_seen": 1531119616 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002707422266800401, + "loss": 2.7749, + "theoretical_loss": 3.5097200197018292, + "tokens_seen": 1531185152 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027073219658976935, + "loss": 2.898, + "theoretical_loss": 3.5097068275603274, + "tokens_seen": 1531250688 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002707221664994985, + "loss": 2.7942, + "theoretical_loss": 3.5096936361415065, + "tokens_seen": 1531316224 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002707121364092277, + "loss": 2.6378, + "theoretical_loss": 3.509680445445297, + "tokens_seen": 1531381760 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002707021063189569, + "loss": 2.7444, + "theoretical_loss": 3.5096672554716286, + "tokens_seen": 1531447296 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027069207622868607, + "loss": 2.8049, + "theoretical_loss": 3.5096540662204303, + "tokens_seen": 1531512832 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027068204613841525, + "loss": 2.7901, + "theoretical_loss": 3.5096408776916315, + "tokens_seen": 1531578368 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027067201604814443, + "loss": 2.8081, + "theoretical_loss": 3.5096276898851615, + "tokens_seen": 1531643904 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002706619859578736, + "loss": 2.8003, + "theoretical_loss": 3.50961450280095, + "tokens_seen": 1531709440 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8285000324249268, + "objective/train/theoretical_loss": 3.5096013164389275, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.5096013164389275, + "tokens_seen": 1531774976 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027065195586760285, + "loss": 2.8364, + "theoretical_loss": 3.5096013164389275, + "tokens_seen": 1531774976 + }, + { + "epoch": 18.02, + "learning_rate": 0.000270641925777332, + "loss": 2.6724, + "theoretical_loss": 3.509588130799022, + "tokens_seen": 1531840512 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002706318956870612, + "loss": 2.7424, + "theoretical_loss": 3.5095749458811643, + "tokens_seen": 1531906048 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027062186559679034, + "loss": 2.7951, + "theoretical_loss": 3.5095617616852834, + "tokens_seen": 1531971584 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002706118355065196, + "loss": 2.6982, + "theoretical_loss": 3.509548578211309, + "tokens_seen": 1532037120 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027060180541624876, + "loss": 2.8063, + "theoretical_loss": 3.509535395459171, + "tokens_seen": 1532102656 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027059177532597794, + "loss": 2.6645, + "theoretical_loss": 3.5095222134287978, + "tokens_seen": 1532168192 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002705817452357071, + "loss": 2.7942, + "theoretical_loss": 3.5095090321201208, + "tokens_seen": 1532233728 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002705717151454363, + "loss": 2.7603, + "theoretical_loss": 3.5094958515330683, + "tokens_seen": 1532299264 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002705616850551655, + "loss": 2.8413, + "theoretical_loss": 3.5094826716675707, + "tokens_seen": 1532364800 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002705516549648947, + "loss": 2.6802, + "theoretical_loss": 3.5094694925235568, + "tokens_seen": 1532430336 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027054162487462384, + "loss": 2.7652, + "theoretical_loss": 3.509456314100957, + "tokens_seen": 1532495872 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002705315947843531, + "loss": 2.8131, + "theoretical_loss": 3.5094431363997005, + "tokens_seen": 1532561408 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027052156469408226, + "loss": 2.6769, + "theoretical_loss": 3.509429959419718, + "tokens_seen": 1532626944 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027051153460381144, + "loss": 2.7048, + "theoretical_loss": 3.5094167831609373, + "tokens_seen": 1532692480 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002705015045135406, + "loss": 2.8973, + "theoretical_loss": 3.5094036076232893, + "tokens_seen": 1532758016 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002704914744232698, + "loss": 2.7762, + "theoretical_loss": 3.509390432806704, + "tokens_seen": 1532823552 + }, + { + "epoch": 18.02, + "learning_rate": 0.000270481444332999, + "loss": 2.7872, + "theoretical_loss": 3.509377258711111, + "tokens_seen": 1532889088 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002704714142427282, + "loss": 2.8554, + "theoretical_loss": 3.509364085336439, + "tokens_seen": 1532954624 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027046138415245735, + "loss": 2.8425, + "theoretical_loss": 3.509350912682619, + "tokens_seen": 1533020160 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002704513540621866, + "loss": 2.7868, + "theoretical_loss": 3.5093377407495794, + "tokens_seen": 1533085696 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002704413239719157, + "loss": 2.7658, + "theoretical_loss": 3.5093245695372515, + "tokens_seen": 1533151232 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027043129388164494, + "loss": 2.7165, + "theoretical_loss": 3.5093113990455636, + "tokens_seen": 1533216768 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002704212637913741, + "loss": 2.707, + "theoretical_loss": 3.509298229274447, + "tokens_seen": 1533282304 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002704112337011033, + "loss": 2.8294, + "theoretical_loss": 3.5092850602238297, + "tokens_seen": 1533347840 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8137941360473633, + "objective/train/theoretical_loss": 3.5092718918936425, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.5092718918936425, + "tokens_seen": 1533413376 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002704012036108325, + "loss": 2.828, + "theoretical_loss": 3.5092718918936425, + "tokens_seen": 1533413376 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027039117352056167, + "loss": 2.7837, + "theoretical_loss": 3.509258724283816, + "tokens_seen": 1533478912 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027038114343029085, + "loss": 2.8646, + "theoretical_loss": 3.5092455573942782, + "tokens_seen": 1533544448 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002703711133400201, + "loss": 2.7536, + "theoretical_loss": 3.509232391224961, + "tokens_seen": 1533609984 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002703610832497492, + "loss": 2.7209, + "theoretical_loss": 3.5092192257757917, + "tokens_seen": 1533675520 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027035105315947845, + "loss": 2.7643, + "theoretical_loss": 3.5092060610467026, + "tokens_seen": 1533741056 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027034102306920763, + "loss": 2.7398, + "theoretical_loss": 3.5091928970376216, + "tokens_seen": 1533806592 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002703309929789368, + "loss": 2.758, + "theoretical_loss": 3.5091797337484802, + "tokens_seen": 1533872128 + }, + { + "epoch": 18.02, + "learning_rate": 0.000270320962888666, + "loss": 2.8365, + "theoretical_loss": 3.5091665711792066, + "tokens_seen": 1533937664 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027031093279839517, + "loss": 2.7906, + "theoretical_loss": 3.5091534093297323, + "tokens_seen": 1534003200 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027030090270812435, + "loss": 2.8538, + "theoretical_loss": 3.5091402481999863, + "tokens_seen": 1534068736 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002702908726178536, + "loss": 2.7584, + "theoretical_loss": 3.5091270877898983, + "tokens_seen": 1534134272 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002702808425275827, + "loss": 2.7857, + "theoretical_loss": 3.5091139280993993, + "tokens_seen": 1534199808 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027027081243731195, + "loss": 2.8253, + "theoretical_loss": 3.509100769128418, + "tokens_seen": 1534265344 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027026078234704113, + "loss": 2.8422, + "theoretical_loss": 3.509087610876885, + "tokens_seen": 1534330880 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002702507522567703, + "loss": 2.77, + "theoretical_loss": 3.50907445334473, + "tokens_seen": 1534396416 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027024072216649955, + "loss": 2.7704, + "theoretical_loss": 3.509061296531883, + "tokens_seen": 1534461952 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002702306920762287, + "loss": 2.7887, + "theoretical_loss": 3.5090481404382743, + "tokens_seen": 1534527488 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002702206619859579, + "loss": 2.7524, + "theoretical_loss": 3.509034985063833, + "tokens_seen": 1534593024 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002702106318956871, + "loss": 2.8527, + "theoretical_loss": 3.5090218304084897, + "tokens_seen": 1534658560 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027020060180541627, + "loss": 2.8407, + "theoretical_loss": 3.5090086764721744, + "tokens_seen": 1534724096 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027019057171514545, + "loss": 2.8615, + "theoretical_loss": 3.508995523254817, + "tokens_seen": 1534789632 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027018054162487463, + "loss": 2.7735, + "theoretical_loss": 3.5089823707563474, + "tokens_seen": 1534855168 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002701705115346038, + "loss": 2.8169, + "theoretical_loss": 3.508969218976696, + "tokens_seen": 1534920704 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027016048144433305, + "loss": 2.7353, + "theoretical_loss": 3.5089560679157925, + "tokens_seen": 1534986240 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7684245109558105, + "objective/train/theoretical_loss": 3.5089429175735667, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.5089429175735667, + "tokens_seen": 1535051776 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002701504513540622, + "loss": 2.7158, + "theoretical_loss": 3.5089429175735667, + "tokens_seen": 1535051776 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002701404212637914, + "loss": 2.8218, + "theoretical_loss": 3.508929767949949, + "tokens_seen": 1535117312 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027013039117352054, + "loss": 2.7997, + "theoretical_loss": 3.5089166190448697, + "tokens_seen": 1535182848 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002701203610832498, + "loss": 2.7741, + "theoretical_loss": 3.5089034708582583, + "tokens_seen": 1535248384 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027011033099297896, + "loss": 2.7844, + "theoretical_loss": 3.5088903233900446, + "tokens_seen": 1535313920 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027010030090270814, + "loss": 2.7957, + "theoretical_loss": 3.5088771766401594, + "tokens_seen": 1535379456 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002700902708124373, + "loss": 2.6973, + "theoretical_loss": 3.5088640306085335, + "tokens_seen": 1535444992 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002700802407221665, + "loss": 2.7777, + "theoretical_loss": 3.5088508852950953, + "tokens_seen": 1535510528 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002700702106318957, + "loss": 2.7775, + "theoretical_loss": 3.508837740699776, + "tokens_seen": 1535576064 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002700601805416249, + "loss": 2.7805, + "theoretical_loss": 3.508824596822505, + "tokens_seen": 1535641600 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027005015045135404, + "loss": 2.7017, + "theoretical_loss": 3.5088114536632133, + "tokens_seen": 1535707136 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002700401203610833, + "loss": 2.7552, + "theoretical_loss": 3.5087983112218306, + "tokens_seen": 1535772672 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027003009027081246, + "loss": 2.7048, + "theoretical_loss": 3.508785169498287, + "tokens_seen": 1535838208 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027002006018054164, + "loss": 2.6559, + "theoretical_loss": 3.508772028492513, + "tokens_seen": 1535903744 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002700100300902708, + "loss": 2.6741, + "theoretical_loss": 3.508758888204438, + "tokens_seen": 1535969280 + }, + { + "epoch": 18.02, + "learning_rate": 0.00027, + "loss": 2.7277, + "theoretical_loss": 3.508745748633993, + "tokens_seen": 1536034816 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002699899699097292, + "loss": 2.7756, + "theoretical_loss": 3.5087326097811076, + "tokens_seen": 1536100352 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002699799398194584, + "loss": 2.8348, + "theoretical_loss": 3.5087194716457124, + "tokens_seen": 1536165888 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026996990972918755, + "loss": 2.8122, + "theoretical_loss": 3.508706334227738, + "tokens_seen": 1536231424 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002699598796389168, + "loss": 2.7933, + "theoretical_loss": 3.5086931975271134, + "tokens_seen": 1536296960 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002699498495486459, + "loss": 2.8002, + "theoretical_loss": 3.50868006154377, + "tokens_seen": 1536362496 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026993981945837514, + "loss": 2.7107, + "theoretical_loss": 3.5086669262776375, + "tokens_seen": 1536428032 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002699297893681043, + "loss": 2.8337, + "theoretical_loss": 3.508653791728646, + "tokens_seen": 1536493568 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002699197592778335, + "loss": 2.6793, + "theoretical_loss": 3.508640657896726, + "tokens_seen": 1536559104 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002699097291875627, + "loss": 2.7859, + "theoretical_loss": 3.508627524781808, + "tokens_seen": 1536624640 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9443881511688232, + "objective/train/theoretical_loss": 3.508614392383822, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.508614392383822, + "tokens_seen": 1536690176 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026989969909729187, + "loss": 2.7725, + "theoretical_loss": 3.508614392383822, + "tokens_seen": 1536690176 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026988966900702105, + "loss": 2.7801, + "theoretical_loss": 3.508601260702698, + "tokens_seen": 1536755712 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002698796389167503, + "loss": 2.7871, + "theoretical_loss": 3.5085881297383676, + "tokens_seen": 1536821248 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002698696088264794, + "loss": 2.8412, + "theoretical_loss": 3.5085749994907593, + "tokens_seen": 1536886784 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026985957873620865, + "loss": 2.7701, + "theoretical_loss": 3.508561869959805, + "tokens_seen": 1536952320 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026984954864593783, + "loss": 2.7692, + "theoretical_loss": 3.5085487411454332, + "tokens_seen": 1537017856 + }, + { + "epoch": 18.02, + "learning_rate": 0.000269839518555667, + "loss": 2.7881, + "theoretical_loss": 3.5085356130475764, + "tokens_seen": 1537083392 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002698294884653962, + "loss": 2.7652, + "theoretical_loss": 3.5085224856661634, + "tokens_seen": 1537148928 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026981945837512537, + "loss": 2.8094, + "theoretical_loss": 3.508509359001125, + "tokens_seen": 1537214464 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026980942828485455, + "loss": 2.6492, + "theoretical_loss": 3.5084962330523917, + "tokens_seen": 1537280000 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002697993981945838, + "loss": 2.83, + "theoretical_loss": 3.5084831078198935, + "tokens_seen": 1537345536 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002697893681043129, + "loss": 2.8335, + "theoretical_loss": 3.508469983303562, + "tokens_seen": 1537411072 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026977933801404215, + "loss": 2.7242, + "theoretical_loss": 3.508456859503326, + "tokens_seen": 1537476608 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002697693079237713, + "loss": 2.7925, + "theoretical_loss": 3.5084437364191166, + "tokens_seen": 1537542144 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002697592778335005, + "loss": 2.7406, + "theoretical_loss": 3.508430614050864, + "tokens_seen": 1537607680 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002697492477432297, + "loss": 2.7125, + "theoretical_loss": 3.5084174923984994, + "tokens_seen": 1537673216 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002697392176529589, + "loss": 2.8148, + "theoretical_loss": 3.5084043714619524, + "tokens_seen": 1537738752 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026972918756268806, + "loss": 2.7276, + "theoretical_loss": 3.5083912512411533, + "tokens_seen": 1537804288 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002697191574724173, + "loss": 2.6914, + "theoretical_loss": 3.508378131736033, + "tokens_seen": 1537869824 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002697091273821464, + "loss": 2.7245, + "theoretical_loss": 3.5083650129465225, + "tokens_seen": 1537935360 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026969909729187565, + "loss": 2.8591, + "theoretical_loss": 3.5083518948725514, + "tokens_seen": 1538000896 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002696890672016048, + "loss": 2.6853, + "theoretical_loss": 3.5083387775140507, + "tokens_seen": 1538066432 + }, + { + "epoch": 18.02, + "learning_rate": 0.000269679037111334, + "loss": 2.7376, + "theoretical_loss": 3.5083256608709505, + "tokens_seen": 1538131968 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002696690070210632, + "loss": 2.8079, + "theoretical_loss": 3.5083125449431813, + "tokens_seen": 1538197504 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002696589769307924, + "loss": 2.7833, + "theoretical_loss": 3.508299429730674, + "tokens_seen": 1538263040 + }, + { + "epoch": 18.02, + "objective/train/docs_used": 3592672, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6483657360076904, + "objective/train/theoretical_loss": 3.5082863152333594, + "objective/train/tokens_used": 1538891232, + "theoretical_loss": 3.5082863152333594, + "tokens_seen": 1538328576 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026964894684052156, + "loss": 2.6975, + "theoretical_loss": 3.5082863152333594, + "tokens_seen": 1538328576 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026963891675025074, + "loss": 2.7972, + "theoretical_loss": 3.508273201451167, + "tokens_seen": 1538394112 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002696288866599799, + "loss": 2.8078, + "theoretical_loss": 3.508260088384028, + "tokens_seen": 1538459648 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026961885656970916, + "loss": 2.7789, + "theoretical_loss": 3.508246976031873, + "tokens_seen": 1538525184 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002696088264794383, + "loss": 2.711, + "theoretical_loss": 3.508233864394632, + "tokens_seen": 1538590720 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002695987963891675, + "loss": 2.8124, + "theoretical_loss": 3.5082207534722363, + "tokens_seen": 1538656256 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026958876629889665, + "loss": 2.7033, + "theoretical_loss": 3.5082076432646163, + "tokens_seen": 1538721792 + }, + { + "epoch": 18.02, + "learning_rate": 0.0002695787362086259, + "loss": 2.7328, + "theoretical_loss": 3.5081945337717024, + "tokens_seen": 1538787328 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026956870611835506, + "loss": 2.8774, + "theoretical_loss": 3.508181424993425, + "tokens_seen": 1538852864 + }, + { + "epoch": 18.02, + "learning_rate": 0.00026955867602808424, + "loss": 2.6852, + "theoretical_loss": 3.508169750589382, + "tokens_seen": 1538911232 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002695486459378134, + "loss": 2.7694, + "theoretical_loss": 3.508156643162025, + "tokens_seen": 1538976768 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026953861584754266, + "loss": 2.6715, + "theoretical_loss": 3.5081435364491047, + "tokens_seen": 1539042304 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002695285857572718, + "loss": 2.7374, + "theoretical_loss": 3.508130430450551, + "tokens_seen": 1539107840 + }, + { + "epoch": 19.0, + "learning_rate": 0.000269518555667001, + "loss": 2.7217, + "theoretical_loss": 3.508117325166295, + "tokens_seen": 1539173376 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002695085255767302, + "loss": 2.5109, + "theoretical_loss": 3.508104220596267, + "tokens_seen": 1539238912 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002694984954864594, + "loss": 2.5057, + "theoretical_loss": 3.508091116740398, + "tokens_seen": 1539304448 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002694884653961886, + "loss": 2.731, + "theoretical_loss": 3.508078013598618, + "tokens_seen": 1539369984 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026947843530591775, + "loss": 2.7381, + "theoretical_loss": 3.508064911170859, + "tokens_seen": 1539435520 + }, + { + "epoch": 19.0, + "learning_rate": 0.000269468405215647, + "loss": 2.7784, + "theoretical_loss": 3.5080518094570503, + "tokens_seen": 1539501056 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002694583751253761, + "loss": 2.695, + "theoretical_loss": 3.508038708457123, + "tokens_seen": 1539566592 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026944834503510534, + "loss": 2.6233, + "theoretical_loss": 3.5080256081710086, + "tokens_seen": 1539632128 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002694383149448345, + "loss": 2.6884, + "theoretical_loss": 3.508012508598637, + "tokens_seen": 1539697664 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002694282848545637, + "loss": 2.6656, + "theoretical_loss": 3.507999409739939, + "tokens_seen": 1539763200 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002694182547642929, + "loss": 2.7144, + "theoretical_loss": 3.5079863115948458, + "tokens_seen": 1539828736 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026940822467402207, + "loss": 2.7262, + "theoretical_loss": 3.507973214163288, + "tokens_seen": 1539894272 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3643116, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6211864948272705, + "objective/train/theoretical_loss": 3.5079601174451955, + "objective/train/tokens_used": 1560419808, + "theoretical_loss": 3.5079601174451955, + "tokens_seen": 1539959808 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026939819458375125, + "loss": 2.673, + "theoretical_loss": 3.5079601174451955, + "tokens_seen": 1539959808 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002693881644934805, + "loss": 2.6711, + "theoretical_loss": 3.5079470214405006, + "tokens_seen": 1540025344 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002693781344032096, + "loss": 2.7136, + "theoretical_loss": 3.507933926149133, + "tokens_seen": 1540090880 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026936810431293885, + "loss": 2.6699, + "theoretical_loss": 3.507920831571024, + "tokens_seen": 1540156416 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026935807422266803, + "loss": 2.6123, + "theoretical_loss": 3.5079077377061036, + "tokens_seen": 1540221952 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002693480441323972, + "loss": 2.6793, + "theoretical_loss": 3.507894644554304, + "tokens_seen": 1540287488 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002693380140421264, + "loss": 2.6015, + "theoretical_loss": 3.5078815521155544, + "tokens_seen": 1540353024 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026932798395185557, + "loss": 2.6887, + "theoretical_loss": 3.507868460389787, + "tokens_seen": 1540418560 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026931795386158475, + "loss": 2.6398, + "theoretical_loss": 3.507855369376932, + "tokens_seen": 1540484096 + }, + { + "epoch": 19.0, + "learning_rate": 0.000269307923771314, + "loss": 2.6525, + "theoretical_loss": 3.507842279076921, + "tokens_seen": 1540549632 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002692978936810431, + "loss": 2.6435, + "theoretical_loss": 3.507829189489683, + "tokens_seen": 1540615168 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026928786359077235, + "loss": 2.5053, + "theoretical_loss": 3.507816100615151, + "tokens_seen": 1540680704 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002692778335005015, + "loss": 2.6767, + "theoretical_loss": 3.5078030124532544, + "tokens_seen": 1540746240 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002692678034102307, + "loss": 2.684, + "theoretical_loss": 3.507789925003925, + "tokens_seen": 1540811776 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002692577733199599, + "loss": 2.7298, + "theoretical_loss": 3.507776838267093, + "tokens_seen": 1540877312 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002692477432296891, + "loss": 2.6119, + "theoretical_loss": 3.50776375224269, + "tokens_seen": 1540942848 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026923771313941826, + "loss": 2.6929, + "theoretical_loss": 3.5077506669306464, + "tokens_seen": 1541008384 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002692276830491475, + "loss": 2.6064, + "theoretical_loss": 3.5077375823308934, + "tokens_seen": 1541073920 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002692176529588766, + "loss": 2.7731, + "theoretical_loss": 3.5077244984433618, + "tokens_seen": 1541139456 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026920762286860585, + "loss": 2.6706, + "theoretical_loss": 3.507711415267983, + "tokens_seen": 1541204992 + }, + { + "epoch": 19.0, + "learning_rate": 0.000269197592778335, + "loss": 2.6285, + "theoretical_loss": 3.5076983328046873, + "tokens_seen": 1541270528 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002691875626880642, + "loss": 2.7235, + "theoretical_loss": 3.507685251053406, + "tokens_seen": 1541336064 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002691775325977934, + "loss": 2.7029, + "theoretical_loss": 3.50767217001407, + "tokens_seen": 1541401600 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002691675025075226, + "loss": 2.6717, + "theoretical_loss": 3.5076590896866096, + "tokens_seen": 1541467136 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026915747241725176, + "loss": 2.6049, + "theoretical_loss": 3.507646010070957, + "tokens_seen": 1541532672 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3648081, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.865565776824951, + "objective/train/theoretical_loss": 3.507632931167043, + "objective/train/tokens_used": 1562058208, + "theoretical_loss": 3.507632931167043, + "tokens_seen": 1541598208 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026914744232698094, + "loss": 2.6518, + "theoretical_loss": 3.507632931167043, + "tokens_seen": 1541598208 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002691374122367101, + "loss": 2.662, + "theoretical_loss": 3.5076198529747984, + "tokens_seen": 1541663744 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026912738214643936, + "loss": 2.6027, + "theoretical_loss": 3.5076067754941533, + "tokens_seen": 1541729280 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002691173520561685, + "loss": 2.6097, + "theoretical_loss": 3.5075936987250405, + "tokens_seen": 1541794816 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002691073219658977, + "loss": 2.6522, + "theoretical_loss": 3.5075806226673896, + "tokens_seen": 1541860352 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026909729187562685, + "loss": 2.5254, + "theoretical_loss": 3.5075675473211323, + "tokens_seen": 1541925888 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002690872617853561, + "loss": 2.7731, + "theoretical_loss": 3.5075544726861994, + "tokens_seen": 1541991424 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026907723169508526, + "loss": 2.7641, + "theoretical_loss": 3.507541398762522, + "tokens_seen": 1542056960 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026906720160481444, + "loss": 2.7463, + "theoretical_loss": 3.507528325550031, + "tokens_seen": 1542122496 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002690571715145436, + "loss": 2.6962, + "theoretical_loss": 3.5075152530486586, + "tokens_seen": 1542188032 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026904714142427286, + "loss": 2.7179, + "theoretical_loss": 3.5075021812583347, + "tokens_seen": 1542253568 + }, + { + "epoch": 19.0, + "learning_rate": 0.000269037111334002, + "loss": 2.6376, + "theoretical_loss": 3.507489110178991, + "tokens_seen": 1542319104 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002690270812437312, + "loss": 2.6185, + "theoretical_loss": 3.507476039810558, + "tokens_seen": 1542384640 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026901705115346035, + "loss": 2.6972, + "theoretical_loss": 3.5074629701529676, + "tokens_seen": 1542450176 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002690070210631896, + "loss": 2.7501, + "theoretical_loss": 3.5074499012061504, + "tokens_seen": 1542515712 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026899699097291877, + "loss": 2.7096, + "theoretical_loss": 3.5074368329700376, + "tokens_seen": 1542581248 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026898696088264795, + "loss": 2.7622, + "theoretical_loss": 3.507423765444561, + "tokens_seen": 1542646784 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026897693079237713, + "loss": 2.6601, + "theoretical_loss": 3.5074106986296507, + "tokens_seen": 1542712320 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002689669007021063, + "loss": 2.6428, + "theoretical_loss": 3.5073976325252385, + "tokens_seen": 1542777856 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002689568706118355, + "loss": 2.6781, + "theoretical_loss": 3.507384567131256, + "tokens_seen": 1542843392 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002689468405215647, + "loss": 2.7138, + "theoretical_loss": 3.5073715024476337, + "tokens_seen": 1542908928 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026893681043129385, + "loss": 2.6402, + "theoretical_loss": 3.507358438474303, + "tokens_seen": 1542974464 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002689267803410231, + "loss": 2.6169, + "theoretical_loss": 3.507345375211195, + "tokens_seen": 1543040000 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002689167502507522, + "loss": 2.689, + "theoretical_loss": 3.507332312658241, + "tokens_seen": 1543105536 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026890672016048145, + "loss": 2.6431, + "theoretical_loss": 3.5073192508153728, + "tokens_seen": 1543171072 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3651199, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.699176549911499, + "objective/train/theoretical_loss": 3.5073061896825206, + "objective/train/tokens_used": 1563696608, + "theoretical_loss": 3.5073061896825206, + "tokens_seen": 1543236608 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026889669007021063, + "loss": 2.693, + "theoretical_loss": 3.5073061896825206, + "tokens_seen": 1543236608 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002688866599799398, + "loss": 2.6599, + "theoretical_loss": 3.5072931292596166, + "tokens_seen": 1543302144 + }, + { + "epoch": 19.0, + "learning_rate": 0.000268876629889669, + "loss": 2.593, + "theoretical_loss": 3.5072800695465913, + "tokens_seen": 1543367680 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026886659979939823, + "loss": 2.766, + "theoretical_loss": 3.5072670105433765, + "tokens_seen": 1543433216 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026885656970912736, + "loss": 2.513, + "theoretical_loss": 3.5072539522499038, + "tokens_seen": 1543498752 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002688465396188566, + "loss": 2.615, + "theoretical_loss": 3.5072408946661033, + "tokens_seen": 1543564288 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002688365095285857, + "loss": 2.7144, + "theoretical_loss": 3.5072278377919077, + "tokens_seen": 1543629824 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026882647943831495, + "loss": 2.6706, + "theoretical_loss": 3.5072147816272476, + "tokens_seen": 1543695360 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026881644934804413, + "loss": 2.6749, + "theoretical_loss": 3.5072017261720543, + "tokens_seen": 1543760896 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002688064192577733, + "loss": 2.6473, + "theoretical_loss": 3.507188671426259, + "tokens_seen": 1543826432 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002687963891675025, + "loss": 2.6647, + "theoretical_loss": 3.5071756173897928, + "tokens_seen": 1543891968 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002687863590772317, + "loss": 2.7786, + "theoretical_loss": 3.507162564062588, + "tokens_seen": 1543957504 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026877632898696086, + "loss": 2.682, + "theoretical_loss": 3.5071495114445757, + "tokens_seen": 1544023040 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002687662988966901, + "loss": 2.7099, + "theoretical_loss": 3.507136459535687, + "tokens_seen": 1544088576 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002687562688064193, + "loss": 2.6741, + "theoretical_loss": 3.5071234083358527, + "tokens_seen": 1544154112 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026874623871614846, + "loss": 2.6696, + "theoretical_loss": 3.507110357845005, + "tokens_seen": 1544219648 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002687362086258777, + "loss": 2.7929, + "theoretical_loss": 3.5070973080630754, + "tokens_seen": 1544285184 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002687261785356068, + "loss": 2.6034, + "theoretical_loss": 3.5070842589899947, + "tokens_seen": 1544350720 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026871614844533605, + "loss": 2.7359, + "theoretical_loss": 3.507071210625695, + "tokens_seen": 1544416256 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002687061183550652, + "loss": 2.6734, + "theoretical_loss": 3.5070581629701074, + "tokens_seen": 1544481792 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002686960882647944, + "loss": 2.6514, + "theoretical_loss": 3.5070451160231624, + "tokens_seen": 1544547328 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002686860581745236, + "loss": 2.7543, + "theoretical_loss": 3.507032069784793, + "tokens_seen": 1544612864 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002686760280842528, + "loss": 2.5898, + "theoretical_loss": 3.50701902425493, + "tokens_seen": 1544678400 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026866599799398196, + "loss": 2.7048, + "theoretical_loss": 3.507005979433505, + "tokens_seen": 1544743936 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026865596790371114, + "loss": 2.6458, + "theoretical_loss": 3.506992935320449, + "tokens_seen": 1544809472 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3655082, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8086628913879395, + "objective/train/theoretical_loss": 3.5069798919156936, + "objective/train/tokens_used": 1565335008, + "theoretical_loss": 3.5069798919156936, + "tokens_seen": 1544875008 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002686459378134403, + "loss": 2.6517, + "theoretical_loss": 3.5069798919156936, + "tokens_seen": 1544875008 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026863590772316956, + "loss": 2.7484, + "theoretical_loss": 3.5069668492191712, + "tokens_seen": 1544940544 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002686258776328987, + "loss": 2.7176, + "theoretical_loss": 3.506953807230812, + "tokens_seen": 1545006080 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002686158475426279, + "loss": 2.6308, + "theoretical_loss": 3.5069407659505485, + "tokens_seen": 1545071616 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026860581745235705, + "loss": 2.7224, + "theoretical_loss": 3.5069277253783118, + "tokens_seen": 1545137152 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002685957873620863, + "loss": 2.6872, + "theoretical_loss": 3.506914685514033, + "tokens_seen": 1545202688 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026858575727181546, + "loss": 2.6272, + "theoretical_loss": 3.506901646357645, + "tokens_seen": 1545268224 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026857572718154464, + "loss": 2.6893, + "theoretical_loss": 3.5068886079090777, + "tokens_seen": 1545333760 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002685656970912738, + "loss": 2.7443, + "theoretical_loss": 3.5068755701682637, + "tokens_seen": 1545399296 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026855566700100306, + "loss": 2.5864, + "theoretical_loss": 3.506862533135134, + "tokens_seen": 1545464832 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002685456369107322, + "loss": 2.5962, + "theoretical_loss": 3.506849496809621, + "tokens_seen": 1545530368 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002685356068204614, + "loss": 2.714, + "theoretical_loss": 3.506836461191656, + "tokens_seen": 1545595904 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026852557673019055, + "loss": 2.7728, + "theoretical_loss": 3.5068234262811697, + "tokens_seen": 1545661440 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002685155466399198, + "loss": 2.6893, + "theoretical_loss": 3.5068103920780946, + "tokens_seen": 1545726976 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026850551654964897, + "loss": 2.806, + "theoretical_loss": 3.506797358582362, + "tokens_seen": 1545792512 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026849548645937815, + "loss": 2.6514, + "theoretical_loss": 3.5067843257939035, + "tokens_seen": 1545858048 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026848545636910733, + "loss": 2.7486, + "theoretical_loss": 3.506771293712651, + "tokens_seen": 1545923584 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002684754262788365, + "loss": 2.7182, + "theoretical_loss": 3.5067582623385363, + "tokens_seen": 1545989120 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002684653961885657, + "loss": 2.672, + "theoretical_loss": 3.5067452316714904, + "tokens_seen": 1546054656 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002684553660982949, + "loss": 2.6824, + "theoretical_loss": 3.5067322017114457, + "tokens_seen": 1546120192 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026844533600802405, + "loss": 2.6351, + "theoretical_loss": 3.5067191724583333, + "tokens_seen": 1546185728 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002684353059177533, + "loss": 2.6196, + "theoretical_loss": 3.5067061439120852, + "tokens_seen": 1546251264 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002684252758274824, + "loss": 2.6351, + "theoretical_loss": 3.5066931160726327, + "tokens_seen": 1546316800 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026841524573721165, + "loss": 2.6884, + "theoretical_loss": 3.506680088939908, + "tokens_seen": 1546382336 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026840521564694083, + "loss": 2.6263, + "theoretical_loss": 3.5066670625138427, + "tokens_seen": 1546447872 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3659655, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7655017375946045, + "objective/train/theoretical_loss": 3.5066540367943686, + "objective/train/tokens_used": 1566973408, + "theoretical_loss": 3.5066540367943686, + "tokens_seen": 1546513408 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026839518555667, + "loss": 2.6046, + "theoretical_loss": 3.5066540367943686, + "tokens_seen": 1546513408 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002683851554663992, + "loss": 2.6547, + "theoretical_loss": 3.506641011781417, + "tokens_seen": 1546578944 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026837512537612843, + "loss": 2.6382, + "theoretical_loss": 3.5066279874749195, + "tokens_seen": 1546644480 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026836509528585756, + "loss": 2.6329, + "theoretical_loss": 3.5066149638748083, + "tokens_seen": 1546710016 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002683550651955868, + "loss": 2.7346, + "theoretical_loss": 3.5066019409810156, + "tokens_seen": 1546775552 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002683450351053159, + "loss": 2.6887, + "theoretical_loss": 3.5065889187934727, + "tokens_seen": 1546841088 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026833500501504515, + "loss": 2.6652, + "theoretical_loss": 3.5065758973121106, + "tokens_seen": 1546906624 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026832497492477433, + "loss": 2.7134, + "theoretical_loss": 3.506562876536863, + "tokens_seen": 1546972160 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002683149448345035, + "loss": 2.7549, + "theoretical_loss": 3.5065498564676596, + "tokens_seen": 1547037696 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002683049147442327, + "loss": 2.6705, + "theoretical_loss": 3.5065368371044334, + "tokens_seen": 1547103232 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002682948846539619, + "loss": 2.6918, + "theoretical_loss": 3.506523818447116, + "tokens_seen": 1547168768 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026828485456369106, + "loss": 2.7099, + "theoretical_loss": 3.506510800495639, + "tokens_seen": 1547234304 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002682748244734203, + "loss": 2.614, + "theoretical_loss": 3.5064977832499347, + "tokens_seen": 1547299840 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002682647943831494, + "loss": 2.7137, + "theoretical_loss": 3.506484766709935, + "tokens_seen": 1547365376 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026825476429287866, + "loss": 2.6735, + "theoretical_loss": 3.506471750875571, + "tokens_seen": 1547430912 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002682447342026078, + "loss": 2.6676, + "theoretical_loss": 3.506458735746775, + "tokens_seen": 1547496448 + }, + { + "epoch": 19.0, + "learning_rate": 0.000268234704112337, + "loss": 2.6733, + "theoretical_loss": 3.506445721323479, + "tokens_seen": 1547561984 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002682246740220662, + "loss": 2.8157, + "theoretical_loss": 3.5064327076056143, + "tokens_seen": 1547627520 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002682146439317954, + "loss": 2.7292, + "theoretical_loss": 3.506419694593114, + "tokens_seen": 1547693056 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026820461384152456, + "loss": 2.7137, + "theoretical_loss": 3.506406682285909, + "tokens_seen": 1547758592 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002681945837512538, + "loss": 2.7183, + "theoretical_loss": 3.5063936706839316, + "tokens_seen": 1547824128 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002681845536609829, + "loss": 2.7026, + "theoretical_loss": 3.506380659787113, + "tokens_seen": 1547889664 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026817452357071216, + "loss": 2.7018, + "theoretical_loss": 3.5063676495953864, + "tokens_seen": 1547955200 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002681644934804413, + "loss": 2.6322, + "theoretical_loss": 3.5063546401086825, + "tokens_seen": 1548020736 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002681544633901705, + "loss": 2.6106, + "theoretical_loss": 3.5063416313269338, + "tokens_seen": 1548086272 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3662840, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.72342848777771, + "objective/train/theoretical_loss": 3.5063286232500728, + "objective/train/tokens_used": 1568611808, + "theoretical_loss": 3.5063286232500728, + "tokens_seen": 1548151808 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002681444332998997, + "loss": 2.7624, + "theoretical_loss": 3.5063286232500728, + "tokens_seen": 1548151808 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002681344032096289, + "loss": 2.6129, + "theoretical_loss": 3.50631561587803, + "tokens_seen": 1548217344 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026812437311935807, + "loss": 2.8437, + "theoretical_loss": 3.5063026092107394, + "tokens_seen": 1548282880 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026811434302908725, + "loss": 2.7404, + "theoretical_loss": 3.506289603248131, + "tokens_seen": 1548348416 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026810431293881643, + "loss": 2.736, + "theoretical_loss": 3.5062765979901385, + "tokens_seen": 1548413952 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026809428284854566, + "loss": 2.682, + "theoretical_loss": 3.5062635934366924, + "tokens_seen": 1548479488 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002680842527582748, + "loss": 2.6947, + "theoretical_loss": 3.506250589587726, + "tokens_seen": 1548545024 + }, + { + "epoch": 19.0, + "learning_rate": 0.000268074222668004, + "loss": 2.6273, + "theoretical_loss": 3.50623758644317, + "tokens_seen": 1548610560 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026806419257773315, + "loss": 2.7484, + "theoretical_loss": 3.506224584002958, + "tokens_seen": 1548676096 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002680541624874624, + "loss": 2.7258, + "theoretical_loss": 3.506211582267021, + "tokens_seen": 1548741632 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026804413239719157, + "loss": 2.7713, + "theoretical_loss": 3.506198581235291, + "tokens_seen": 1548807168 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026803410230692075, + "loss": 2.7115, + "theoretical_loss": 3.5061855809077005, + "tokens_seen": 1548872704 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026802407221664993, + "loss": 2.7182, + "theoretical_loss": 3.5061725812841815, + "tokens_seen": 1548938240 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026801404212637917, + "loss": 2.5819, + "theoretical_loss": 3.506159582364666, + "tokens_seen": 1549003776 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026800401203610835, + "loss": 2.6629, + "theoretical_loss": 3.5061465841490858, + "tokens_seen": 1549069312 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026799398194583753, + "loss": 2.6231, + "theoretical_loss": 3.5061335866373735, + "tokens_seen": 1549134848 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002679839518555667, + "loss": 2.6683, + "theoretical_loss": 3.506120589829461, + "tokens_seen": 1549200384 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002679739217652959, + "loss": 2.6528, + "theoretical_loss": 3.5061075937252806, + "tokens_seen": 1549265920 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002679638916750251, + "loss": 2.6604, + "theoretical_loss": 3.5060945983247636, + "tokens_seen": 1549331456 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026795386158475425, + "loss": 2.7312, + "theoretical_loss": 3.5060816036278433, + "tokens_seen": 1549396992 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002679438314944835, + "loss": 2.5828, + "theoretical_loss": 3.5060686096344513, + "tokens_seen": 1549462528 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002679338014042126, + "loss": 2.6237, + "theoretical_loss": 3.5060556163445193, + "tokens_seen": 1549528064 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026792377131394185, + "loss": 2.6877, + "theoretical_loss": 3.5060426237579803, + "tokens_seen": 1549593600 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026791374122367103, + "loss": 2.6185, + "theoretical_loss": 3.5060296318747666, + "tokens_seen": 1549659136 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002679037111334002, + "loss": 2.6832, + "theoretical_loss": 3.5060166406948094, + "tokens_seen": 1549724672 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3667531, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8835830688476562, + "objective/train/theoretical_loss": 3.506003650218042, + "objective/train/tokens_used": 1570250208, + "theoretical_loss": 3.506003650218042, + "tokens_seen": 1549790208 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002678936810431294, + "loss": 2.7949, + "theoretical_loss": 3.506003650218042, + "tokens_seen": 1549790208 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026788365095285863, + "loss": 2.683, + "theoretical_loss": 3.505990660444395, + "tokens_seen": 1549855744 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026787362086258776, + "loss": 2.7251, + "theoretical_loss": 3.5059776713738025, + "tokens_seen": 1549921280 + }, + { + "epoch": 19.0, + "learning_rate": 0.000267863590772317, + "loss": 2.5788, + "theoretical_loss": 3.5059646830061952, + "tokens_seen": 1549986816 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002678535606820461, + "loss": 2.7176, + "theoretical_loss": 3.505951695341506, + "tokens_seen": 1550052352 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026784353059177535, + "loss": 2.6987, + "theoretical_loss": 3.505938708379668, + "tokens_seen": 1550117888 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026783350050150453, + "loss": 2.6612, + "theoretical_loss": 3.5059257221206117, + "tokens_seen": 1550183424 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002678234704112337, + "loss": 2.6771, + "theoretical_loss": 3.5059127365642704, + "tokens_seen": 1550248960 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002678134403209629, + "loss": 2.7612, + "theoretical_loss": 3.5058997517105768, + "tokens_seen": 1550314496 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002678034102306921, + "loss": 2.7475, + "theoretical_loss": 3.5058867675594616, + "tokens_seen": 1550380032 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026779338014042126, + "loss": 2.7241, + "theoretical_loss": 3.5058737841108587, + "tokens_seen": 1550445568 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002677833500501505, + "loss": 2.6829, + "theoretical_loss": 3.5058608013646997, + "tokens_seen": 1550511104 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002677733199598796, + "loss": 2.6978, + "theoretical_loss": 3.505847819320917, + "tokens_seen": 1550576640 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026776328986960886, + "loss": 2.7547, + "theoretical_loss": 3.505834837979443, + "tokens_seen": 1550642176 + }, + { + "epoch": 19.0, + "learning_rate": 0.000267753259779338, + "loss": 2.7441, + "theoretical_loss": 3.5058218573402096, + "tokens_seen": 1550707712 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002677432296890672, + "loss": 2.5971, + "theoretical_loss": 3.5058088774031493, + "tokens_seen": 1550773248 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002677331995987964, + "loss": 2.7254, + "theoretical_loss": 3.505795898168195, + "tokens_seen": 1550838784 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002677231695085256, + "loss": 2.679, + "theoretical_loss": 3.505782919635279, + "tokens_seen": 1550904320 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026771313941825476, + "loss": 2.6393, + "theoretical_loss": 3.505769941804333, + "tokens_seen": 1550969856 + }, + { + "epoch": 19.0, + "learning_rate": 0.000267703109327984, + "loss": 2.6877, + "theoretical_loss": 3.5057569646752893, + "tokens_seen": 1551035392 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002676930792377131, + "loss": 2.6408, + "theoretical_loss": 3.5057439882480814, + "tokens_seen": 1551100928 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026768304914744236, + "loss": 2.692, + "theoretical_loss": 3.50573101252264, + "tokens_seen": 1551166464 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002676730190571715, + "loss": 2.6776, + "theoretical_loss": 3.505718037498899, + "tokens_seen": 1551232000 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002676629889669007, + "loss": 2.7541, + "theoretical_loss": 3.5057050631767908, + "tokens_seen": 1551297536 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002676529588766299, + "loss": 2.6504, + "theoretical_loss": 3.5056920895562467, + "tokens_seen": 1551363072 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3670487, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5215158462524414, + "objective/train/theoretical_loss": 3.5056791166372, + "objective/train/tokens_used": 1571888608, + "theoretical_loss": 3.5056791166372, + "tokens_seen": 1551428608 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002676429287863591, + "loss": 2.7554, + "theoretical_loss": 3.5056791166372, + "tokens_seen": 1551428608 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026763289869608827, + "loss": 2.6554, + "theoretical_loss": 3.5056661444195827, + "tokens_seen": 1551494144 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026762286860581745, + "loss": 2.641, + "theoretical_loss": 3.5056531729033273, + "tokens_seen": 1551559680 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026761283851554663, + "loss": 2.6511, + "theoretical_loss": 3.5056402020883666, + "tokens_seen": 1551625216 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026760280842527586, + "loss": 2.6852, + "theoretical_loss": 3.505627231974633, + "tokens_seen": 1551690752 + }, + { + "epoch": 19.0, + "learning_rate": 0.000267592778335005, + "loss": 2.7611, + "theoretical_loss": 3.5056142625620583, + "tokens_seen": 1551756288 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002675827482447342, + "loss": 2.5901, + "theoretical_loss": 3.505601293850576, + "tokens_seen": 1551821824 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026757271815446335, + "loss": 2.6612, + "theoretical_loss": 3.505588325840118, + "tokens_seen": 1551887360 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002675626880641926, + "loss": 2.7124, + "theoretical_loss": 3.5055753585306166, + "tokens_seen": 1551952896 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026755265797392177, + "loss": 2.6629, + "theoretical_loss": 3.5055623919220045, + "tokens_seen": 1552018432 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026754262788365095, + "loss": 2.6943, + "theoretical_loss": 3.5055494260142153, + "tokens_seen": 1552083968 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026753259779338013, + "loss": 2.71, + "theoretical_loss": 3.5055364608071797, + "tokens_seen": 1552149504 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026752256770310937, + "loss": 2.7359, + "theoretical_loss": 3.5055234963008313, + "tokens_seen": 1552215040 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002675125376128385, + "loss": 2.7644, + "theoretical_loss": 3.5055105324951024, + "tokens_seen": 1552280576 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026750250752256773, + "loss": 2.6309, + "theoretical_loss": 3.505497569389926, + "tokens_seen": 1552346112 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026749247743229686, + "loss": 2.6788, + "theoretical_loss": 3.505484606985234, + "tokens_seen": 1552411648 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002674824473420261, + "loss": 2.6886, + "theoretical_loss": 3.5054716452809593, + "tokens_seen": 1552477184 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026747241725175527, + "loss": 2.7247, + "theoretical_loss": 3.505458684277035, + "tokens_seen": 1552542720 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026746238716148445, + "loss": 2.7104, + "theoretical_loss": 3.5054457239733923, + "tokens_seen": 1552608256 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026745235707121363, + "loss": 2.7621, + "theoretical_loss": 3.505432764369965, + "tokens_seen": 1552673792 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002674423269809428, + "loss": 2.6869, + "theoretical_loss": 3.505419805466685, + "tokens_seen": 1552739328 + }, + { + "epoch": 19.0, + "learning_rate": 0.000267432296890672, + "loss": 2.7225, + "theoretical_loss": 3.5054068472634863, + "tokens_seen": 1552804864 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026742226680040123, + "loss": 2.5925, + "theoretical_loss": 3.5053938897603, + "tokens_seen": 1552870400 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026741223671013036, + "loss": 2.6987, + "theoretical_loss": 3.505380932957059, + "tokens_seen": 1552935936 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002674022066198596, + "loss": 2.8172, + "theoretical_loss": 3.5053679768536967, + "tokens_seen": 1553001472 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3674190, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8504257202148438, + "objective/train/theoretical_loss": 3.505355021450145, + "objective/train/tokens_used": 1573527008, + "theoretical_loss": 3.505355021450145, + "tokens_seen": 1553067008 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002673921765295888, + "loss": 2.6621, + "theoretical_loss": 3.505355021450145, + "tokens_seen": 1553067008 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026738214643931796, + "loss": 2.6445, + "theoretical_loss": 3.505342066746337, + "tokens_seen": 1553132544 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026737211634904714, + "loss": 2.6866, + "theoretical_loss": 3.5053291127422055, + "tokens_seen": 1553198080 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002673620862587763, + "loss": 2.6574, + "theoretical_loss": 3.5053161594376827, + "tokens_seen": 1553263616 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002673520561685055, + "loss": 2.6949, + "theoretical_loss": 3.5053032068327017, + "tokens_seen": 1553329152 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026734202607823474, + "loss": 2.6861, + "theoretical_loss": 3.5052902549271945, + "tokens_seen": 1553394688 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026733199598796386, + "loss": 2.7268, + "theoretical_loss": 3.5052773037210954, + "tokens_seen": 1553460224 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002673219658976931, + "loss": 2.7296, + "theoretical_loss": 3.5052643532143355, + "tokens_seen": 1553525760 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002673119358074222, + "loss": 2.5592, + "theoretical_loss": 3.505251403406848, + "tokens_seen": 1553591296 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026730190571715146, + "loss": 2.7158, + "theoretical_loss": 3.505238454298566, + "tokens_seen": 1553656832 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026729187562688064, + "loss": 2.7468, + "theoretical_loss": 3.505225505889422, + "tokens_seen": 1553722368 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002672818455366098, + "loss": 2.7405, + "theoretical_loss": 3.505212558179349, + "tokens_seen": 1553787904 + }, + { + "epoch": 19.0, + "learning_rate": 0.000267271815446339, + "loss": 2.747, + "theoretical_loss": 3.5051996111682793, + "tokens_seen": 1553853440 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002672617853560682, + "loss": 2.7346, + "theoretical_loss": 3.5051866648561463, + "tokens_seen": 1553918976 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002672517552657974, + "loss": 2.8112, + "theoretical_loss": 3.5051737192428822, + "tokens_seen": 1553984512 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002672417251755266, + "loss": 2.6874, + "theoretical_loss": 3.50516077432842, + "tokens_seen": 1554050048 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002672316950852558, + "loss": 2.7375, + "theoretical_loss": 3.505147830112693, + "tokens_seen": 1554115584 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026722166499498496, + "loss": 2.6192, + "theoretical_loss": 3.5051348865956333, + "tokens_seen": 1554181120 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002672116349047142, + "loss": 2.6608, + "theoretical_loss": 3.505121943777174, + "tokens_seen": 1554246656 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002672016048144433, + "loss": 2.7635, + "theoretical_loss": 3.505109001657248, + "tokens_seen": 1554312192 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026719157472417256, + "loss": 2.7445, + "theoretical_loss": 3.5050960602357883, + "tokens_seen": 1554377728 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002671815446339017, + "loss": 2.7755, + "theoretical_loss": 3.5050831195127277, + "tokens_seen": 1554443264 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002671715145436309, + "loss": 2.6691, + "theoretical_loss": 3.5050701794879986, + "tokens_seen": 1554508800 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002671614844533601, + "loss": 2.6388, + "theoretical_loss": 3.505057240161534, + "tokens_seen": 1554574336 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002671514543630893, + "loss": 2.7197, + "theoretical_loss": 3.5050443015332675, + "tokens_seen": 1554639872 + }, + { + "epoch": 19.0, + "objective/train/docs_used": 3679050, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8632619380950928, + "objective/train/theoretical_loss": 3.5050313636031314, + "objective/train/tokens_used": 1575165408, + "theoretical_loss": 3.5050313636031314, + "tokens_seen": 1554705408 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026714142427281847, + "loss": 2.7767, + "theoretical_loss": 3.5050313636031314, + "tokens_seen": 1554705408 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026713139418254765, + "loss": 2.6891, + "theoretical_loss": 3.5050184263710586, + "tokens_seen": 1554770944 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026712136409227683, + "loss": 2.6776, + "theoretical_loss": 3.5050054898369822, + "tokens_seen": 1554836480 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026711133400200606, + "loss": 2.7673, + "theoretical_loss": 3.504992554000835, + "tokens_seen": 1554902016 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002671013039117352, + "loss": 2.6594, + "theoretical_loss": 3.50497961886255, + "tokens_seen": 1554967552 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002670912738214644, + "loss": 2.6415, + "theoretical_loss": 3.5049666844220604, + "tokens_seen": 1555033088 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026708124373119355, + "loss": 2.621, + "theoretical_loss": 3.5049537506792987, + "tokens_seen": 1555098624 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002670712136409228, + "loss": 2.7053, + "theoretical_loss": 3.504940817634198, + "tokens_seen": 1555164160 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026706118355065197, + "loss": 2.7252, + "theoretical_loss": 3.5049278852866914, + "tokens_seen": 1555229696 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026705115346038115, + "loss": 2.6778, + "theoretical_loss": 3.5049149536367112, + "tokens_seen": 1555295232 + }, + { + "epoch": 19.0, + "learning_rate": 0.00026704112337011033, + "loss": 2.7287, + "theoretical_loss": 3.504902022684192, + "tokens_seen": 1555360768 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026703109327983957, + "loss": 2.6194, + "theoretical_loss": 3.5048890924290648, + "tokens_seen": 1555426304 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002670210631895687, + "loss": 2.7013, + "theoretical_loss": 3.5048761628712644, + "tokens_seen": 1555491840 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026701103309929793, + "loss": 2.6522, + "theoretical_loss": 3.504863234010722, + "tokens_seen": 1555557376 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026700100300902706, + "loss": 2.7126, + "theoretical_loss": 3.5048503058473726, + "tokens_seen": 1555622912 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002669909729187563, + "loss": 2.7478, + "theoretical_loss": 3.504837378381148, + "tokens_seen": 1555688448 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026698094282848547, + "loss": 2.6442, + "theoretical_loss": 3.504824451611981, + "tokens_seen": 1555753984 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026697091273821465, + "loss": 2.7096, + "theoretical_loss": 3.5048115255398056, + "tokens_seen": 1555819520 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026696088264794383, + "loss": 2.6812, + "theoretical_loss": 3.5047986001645546, + "tokens_seen": 1555885056 + }, + { + "epoch": 19.01, + "learning_rate": 0.000266950852557673, + "loss": 2.6811, + "theoretical_loss": 3.5047856754861604, + "tokens_seen": 1555950592 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002669408224674022, + "loss": 2.6889, + "theoretical_loss": 3.504772751504557, + "tokens_seen": 1556016128 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026693079237713143, + "loss": 2.773, + "theoretical_loss": 3.504759828219677, + "tokens_seen": 1556081664 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026692076228686056, + "loss": 2.6429, + "theoretical_loss": 3.504746905631453, + "tokens_seen": 1556147200 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002669107321965898, + "loss": 2.7668, + "theoretical_loss": 3.5047339837398193, + "tokens_seen": 1556212736 + }, + { + "epoch": 19.01, + "learning_rate": 0.000266900702106319, + "loss": 2.6394, + "theoretical_loss": 3.5047210625447076, + "tokens_seen": 1556278272 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3681969, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8331093788146973, + "objective/train/theoretical_loss": 3.504708142046052, + "objective/train/tokens_used": 1576803808, + "theoretical_loss": 3.504708142046052, + "tokens_seen": 1556343808 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026689067201604816, + "loss": 2.7138, + "theoretical_loss": 3.504708142046052, + "tokens_seen": 1556343808 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026688064192577734, + "loss": 2.7016, + "theoretical_loss": 3.504695222243786, + "tokens_seen": 1556409344 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002668706118355065, + "loss": 2.8004, + "theoretical_loss": 3.5046823031378422, + "tokens_seen": 1556474880 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002668605817452357, + "loss": 2.6729, + "theoretical_loss": 3.5046693847281536, + "tokens_seen": 1556540416 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026685055165496494, + "loss": 2.7798, + "theoretical_loss": 3.5046564670146534, + "tokens_seen": 1556605952 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026684052156469406, + "loss": 2.5845, + "theoretical_loss": 3.504643549997275, + "tokens_seen": 1556671488 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002668304914744233, + "loss": 2.7129, + "theoretical_loss": 3.5046306336759514, + "tokens_seen": 1556737024 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002668204613841524, + "loss": 2.6737, + "theoretical_loss": 3.504617718050616, + "tokens_seen": 1556802560 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026681043129388166, + "loss": 2.7197, + "theoretical_loss": 3.504604803121201, + "tokens_seen": 1556868096 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026680040120361084, + "loss": 2.6353, + "theoretical_loss": 3.5045918888876413, + "tokens_seen": 1556933632 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026679037111334, + "loss": 2.6477, + "theoretical_loss": 3.5045789753498693, + "tokens_seen": 1556999168 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002667803410230692, + "loss": 2.6252, + "theoretical_loss": 3.504566062507818, + "tokens_seen": 1557064704 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002667703109327984, + "loss": 2.561, + "theoretical_loss": 3.5045531503614207, + "tokens_seen": 1557130240 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026676028084252757, + "loss": 2.6104, + "theoretical_loss": 3.504540238910611, + "tokens_seen": 1557195776 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002667502507522568, + "loss": 2.7785, + "theoretical_loss": 3.504527328155322, + "tokens_seen": 1557261312 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026674022066198593, + "loss": 2.6926, + "theoretical_loss": 3.5045144180954866, + "tokens_seen": 1557326848 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026673019057171516, + "loss": 2.5599, + "theoretical_loss": 3.504501508731039, + "tokens_seen": 1557392384 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026672016048144434, + "loss": 2.6161, + "theoretical_loss": 3.5044886000619115, + "tokens_seen": 1557457920 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002667101303911735, + "loss": 2.6898, + "theoretical_loss": 3.5044756920880378, + "tokens_seen": 1557523456 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002667001003009027, + "loss": 2.7782, + "theoretical_loss": 3.504462784809351, + "tokens_seen": 1557588992 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002666900702106319, + "loss": 2.7008, + "theoretical_loss": 3.5044498782257847, + "tokens_seen": 1557654528 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026668004012036107, + "loss": 2.7238, + "theoretical_loss": 3.504436972337272, + "tokens_seen": 1557720064 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002666700100300903, + "loss": 2.771, + "theoretical_loss": 3.5044240671437468, + "tokens_seen": 1557785600 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026665997993981943, + "loss": 2.5676, + "theoretical_loss": 3.5044111626451415, + "tokens_seen": 1557851136 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026664994984954867, + "loss": 2.803, + "theoretical_loss": 3.5043982588413902, + "tokens_seen": 1557916672 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3686800, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.799070358276367, + "objective/train/theoretical_loss": 3.504385355732426, + "objective/train/tokens_used": 1578442208, + "theoretical_loss": 3.504385355732426, + "tokens_seen": 1557982208 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002666399197592778, + "loss": 2.7097, + "theoretical_loss": 3.504385355732426, + "tokens_seen": 1557982208 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026662988966900703, + "loss": 2.7087, + "theoretical_loss": 3.504372453318182, + "tokens_seen": 1558047744 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002666198595787362, + "loss": 2.6917, + "theoretical_loss": 3.5043595515985917, + "tokens_seen": 1558113280 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002666098294884654, + "loss": 2.7397, + "theoretical_loss": 3.5043466505735887, + "tokens_seen": 1558178816 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026659979939819457, + "loss": 2.7221, + "theoretical_loss": 3.5043337502431067, + "tokens_seen": 1558244352 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026658976930792375, + "loss": 2.7251, + "theoretical_loss": 3.504320850607078, + "tokens_seen": 1558309888 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026657973921765293, + "loss": 2.7934, + "theoretical_loss": 3.5043079516654374, + "tokens_seen": 1558375424 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026656970912738217, + "loss": 2.6282, + "theoretical_loss": 3.5042950534181174, + "tokens_seen": 1558440960 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002665596790371113, + "loss": 2.7272, + "theoretical_loss": 3.5042821558650514, + "tokens_seen": 1558506496 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026654964894684053, + "loss": 2.7045, + "theoretical_loss": 3.5042692590061737, + "tokens_seen": 1558572032 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002665396188565697, + "loss": 2.7733, + "theoretical_loss": 3.5042563628414167, + "tokens_seen": 1558637568 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002665295887662989, + "loss": 2.6838, + "theoretical_loss": 3.5042434673707143, + "tokens_seen": 1558703104 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002665195586760281, + "loss": 2.6484, + "theoretical_loss": 3.5042305725940004, + "tokens_seen": 1558768640 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026650952858575726, + "loss": 2.6919, + "theoretical_loss": 3.504217678511208, + "tokens_seen": 1558834176 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002664994984954865, + "loss": 2.7086, + "theoretical_loss": 3.50420478512227, + "tokens_seen": 1558899712 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026648946840521567, + "loss": 2.6662, + "theoretical_loss": 3.504191892427121, + "tokens_seen": 1558965248 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026647943831494485, + "loss": 2.7345, + "theoretical_loss": 3.504179000425694, + "tokens_seen": 1559030784 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026646940822467403, + "loss": 2.705, + "theoretical_loss": 3.5041661091179224, + "tokens_seen": 1559096320 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002664593781344032, + "loss": 2.8051, + "theoretical_loss": 3.50415321850374, + "tokens_seen": 1559161856 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002664493480441324, + "loss": 2.6284, + "theoretical_loss": 3.5041403285830803, + "tokens_seen": 1559227392 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026643931795386163, + "loss": 2.7012, + "theoretical_loss": 3.5041274393558766, + "tokens_seen": 1559292928 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026642928786359076, + "loss": 2.8049, + "theoretical_loss": 3.504114550822063, + "tokens_seen": 1559358464 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026641925777332, + "loss": 2.6817, + "theoretical_loss": 3.504101662981572, + "tokens_seen": 1559424000 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002664092276830492, + "loss": 2.7279, + "theoretical_loss": 3.5040887758343384, + "tokens_seen": 1559489536 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026639919759277836, + "loss": 2.7301, + "theoretical_loss": 3.504075889380295, + "tokens_seen": 1559555072 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3689789, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7620508670806885, + "objective/train/theoretical_loss": 3.5040630036193754, + "objective/train/tokens_used": 1580080608, + "theoretical_loss": 3.5040630036193754, + "tokens_seen": 1559620608 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026638916750250754, + "loss": 2.7545, + "theoretical_loss": 3.5040630036193754, + "tokens_seen": 1559620608 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002663791374122367, + "loss": 2.6638, + "theoretical_loss": 3.504050118551514, + "tokens_seen": 1559686144 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002663691073219659, + "loss": 2.6875, + "theoretical_loss": 3.5040372341766433, + "tokens_seen": 1559751680 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026635907723169514, + "loss": 2.5408, + "theoretical_loss": 3.5040243504946975, + "tokens_seen": 1559817216 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026634904714142426, + "loss": 2.7148, + "theoretical_loss": 3.50401146750561, + "tokens_seen": 1559882752 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002663390170511535, + "loss": 2.7213, + "theoretical_loss": 3.503998585209315, + "tokens_seen": 1559948288 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002663289869608826, + "loss": 2.6235, + "theoretical_loss": 3.5039857036057453, + "tokens_seen": 1560013824 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026631895687061186, + "loss": 2.7693, + "theoretical_loss": 3.503972822694835, + "tokens_seen": 1560079360 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026630892678034104, + "loss": 2.7006, + "theoretical_loss": 3.5039599424765178, + "tokens_seen": 1560144896 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002662988966900702, + "loss": 2.7679, + "theoretical_loss": 3.5039470629507274, + "tokens_seen": 1560210432 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002662888665997994, + "loss": 2.6325, + "theoretical_loss": 3.503934184117397, + "tokens_seen": 1560275968 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002662788365095286, + "loss": 2.7792, + "theoretical_loss": 3.503921305976461, + "tokens_seen": 1560341504 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026626880641925777, + "loss": 2.6695, + "theoretical_loss": 3.5039084285278523, + "tokens_seen": 1560407040 + }, + { + "epoch": 19.01, + "learning_rate": 0.000266258776328987, + "loss": 2.6615, + "theoretical_loss": 3.503895551771506, + "tokens_seen": 1560472576 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026624874623871613, + "loss": 2.5949, + "theoretical_loss": 3.503882675707354, + "tokens_seen": 1560538112 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026623871614844536, + "loss": 2.7195, + "theoretical_loss": 3.5038698003353312, + "tokens_seen": 1560603648 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026622868605817454, + "loss": 2.6135, + "theoretical_loss": 3.503856925655371, + "tokens_seen": 1560669184 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002662186559679037, + "loss": 2.6153, + "theoretical_loss": 3.503844051667407, + "tokens_seen": 1560734720 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002662086258776329, + "loss": 2.6274, + "theoretical_loss": 3.5038311783713736, + "tokens_seen": 1560800256 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002661985957873621, + "loss": 2.6796, + "theoretical_loss": 3.5038183057672034, + "tokens_seen": 1560865792 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026618856569709127, + "loss": 2.6314, + "theoretical_loss": 3.5038054338548315, + "tokens_seen": 1560931328 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002661785356068205, + "loss": 2.7093, + "theoretical_loss": 3.5037925626341906, + "tokens_seen": 1560996864 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026616850551654963, + "loss": 2.6624, + "theoretical_loss": 3.503779692105215, + "tokens_seen": 1561062400 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026615847542627887, + "loss": 2.7338, + "theoretical_loss": 3.5037668222678384, + "tokens_seen": 1561127936 + }, + { + "epoch": 19.01, + "learning_rate": 0.000266148445336008, + "loss": 2.7279, + "theoretical_loss": 3.5037539531219952, + "tokens_seen": 1561193472 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3693635, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5874812602996826, + "objective/train/theoretical_loss": 3.503741084667618, + "objective/train/tokens_used": 1581719008, + "theoretical_loss": 3.503741084667618, + "tokens_seen": 1561259008 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026613841524573723, + "loss": 2.6675, + "theoretical_loss": 3.503741084667618, + "tokens_seen": 1561259008 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002661283851554664, + "loss": 2.632, + "theoretical_loss": 3.5037282169046415, + "tokens_seen": 1561324544 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002661183550651956, + "loss": 2.7636, + "theoretical_loss": 3.5037153498329987, + "tokens_seen": 1561390080 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026610832497492477, + "loss": 2.7354, + "theoretical_loss": 3.5037024834526247, + "tokens_seen": 1561455616 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026609829488465395, + "loss": 2.7155, + "theoretical_loss": 3.5036896177634524, + "tokens_seen": 1561521152 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026608826479438313, + "loss": 2.6695, + "theoretical_loss": 3.5036767527654162, + "tokens_seen": 1561586688 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026607823470411237, + "loss": 2.729, + "theoretical_loss": 3.5036638884584494, + "tokens_seen": 1561652224 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002660682046138415, + "loss": 2.6957, + "theoretical_loss": 3.503651024842487, + "tokens_seen": 1561717760 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026605817452357073, + "loss": 2.6305, + "theoretical_loss": 3.5036381619174612, + "tokens_seen": 1561783296 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002660481444332999, + "loss": 2.6382, + "theoretical_loss": 3.503625299683307, + "tokens_seen": 1561848832 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002660381143430291, + "loss": 2.7298, + "theoretical_loss": 3.5036124381399576, + "tokens_seen": 1561914368 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002660280842527583, + "loss": 2.6691, + "theoretical_loss": 3.503599577287348, + "tokens_seen": 1561979904 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026601805416248746, + "loss": 2.6893, + "theoretical_loss": 3.503586717125412, + "tokens_seen": 1562045440 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026600802407221664, + "loss": 2.7326, + "theoretical_loss": 3.5035738576540822, + "tokens_seen": 1562110976 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026599799398194587, + "loss": 2.7861, + "theoretical_loss": 3.5035609988732936, + "tokens_seen": 1562176512 + }, + { + "epoch": 19.01, + "learning_rate": 0.000265987963891675, + "loss": 2.6133, + "theoretical_loss": 3.5035481407829803, + "tokens_seen": 1562242048 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026597793380140423, + "loss": 2.6723, + "theoretical_loss": 3.5035352833830755, + "tokens_seen": 1562307584 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026596790371113336, + "loss": 2.7495, + "theoretical_loss": 3.5035224266735137, + "tokens_seen": 1562373120 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002659578736208626, + "loss": 2.6418, + "theoretical_loss": 3.503509570654229, + "tokens_seen": 1562438656 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002659478435305918, + "loss": 2.6945, + "theoretical_loss": 3.503496715325155, + "tokens_seen": 1562504192 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026593781344032096, + "loss": 2.6995, + "theoretical_loss": 3.5034838606862255, + "tokens_seen": 1562569728 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026592778335005014, + "loss": 2.7175, + "theoretical_loss": 3.5034710067373753, + "tokens_seen": 1562635264 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002659177532597794, + "loss": 2.5903, + "theoretical_loss": 3.5034581534785376, + "tokens_seen": 1562700800 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002659077231695085, + "loss": 2.6821, + "theoretical_loss": 3.503445300909647, + "tokens_seen": 1562766336 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026589769307923774, + "loss": 2.6837, + "theoretical_loss": 3.5034324490306377, + "tokens_seen": 1562831872 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3698388, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7703516483306885, + "objective/train/theoretical_loss": 3.503419597841443, + "objective/train/tokens_used": 1583357408, + "theoretical_loss": 3.503419597841443, + "tokens_seen": 1562897408 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026588766298896686, + "loss": 2.6641, + "theoretical_loss": 3.503419597841443, + "tokens_seen": 1562897408 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002658776328986961, + "loss": 2.7998, + "theoretical_loss": 3.503406747341997, + "tokens_seen": 1562962944 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002658676028084253, + "loss": 2.683, + "theoretical_loss": 3.5033938975322343, + "tokens_seen": 1563028480 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026585757271815446, + "loss": 2.7444, + "theoretical_loss": 3.5033810484120895, + "tokens_seen": 1563094016 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026584754262788364, + "loss": 2.6852, + "theoretical_loss": 3.5033681999814945, + "tokens_seen": 1563159552 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002658375125376128, + "loss": 2.727, + "theoretical_loss": 3.503355352240386, + "tokens_seen": 1563225088 + }, + { + "epoch": 19.01, + "learning_rate": 0.000265827482447342, + "loss": 2.6914, + "theoretical_loss": 3.5033425051886966, + "tokens_seen": 1563290624 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026581745235707124, + "loss": 2.6751, + "theoretical_loss": 3.5033296588263605, + "tokens_seen": 1563356160 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026580742226680037, + "loss": 2.6689, + "theoretical_loss": 3.5033168131533126, + "tokens_seen": 1563421696 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002657973921765296, + "loss": 2.58, + "theoretical_loss": 3.5033039681694857, + "tokens_seen": 1563487232 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026578736208625873, + "loss": 2.7713, + "theoretical_loss": 3.5032911238748152, + "tokens_seen": 1563552768 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026577733199598797, + "loss": 2.5873, + "theoretical_loss": 3.5032782802692344, + "tokens_seen": 1563618304 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026576730190571715, + "loss": 2.6762, + "theoretical_loss": 3.503265437352678, + "tokens_seen": 1563683840 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026575727181544633, + "loss": 2.7665, + "theoretical_loss": 3.5032525951250797, + "tokens_seen": 1563749376 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026574724172517556, + "loss": 2.6793, + "theoretical_loss": 3.5032397535863744, + "tokens_seen": 1563814912 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026573721163490474, + "loss": 2.6603, + "theoretical_loss": 3.503226912736495, + "tokens_seen": 1563880448 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002657271815446339, + "loss": 2.691, + "theoretical_loss": 3.503214072575377, + "tokens_seen": 1563945984 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002657171514543631, + "loss": 2.7475, + "theoretical_loss": 3.5032012331029545, + "tokens_seen": 1564011520 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002657071213640923, + "loss": 2.6541, + "theoretical_loss": 3.5031883943191606, + "tokens_seen": 1564077056 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026569709127382147, + "loss": 2.6108, + "theoretical_loss": 3.5031755562239306, + "tokens_seen": 1564142592 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002656870611835507, + "loss": 2.6587, + "theoretical_loss": 3.503162718817198, + "tokens_seen": 1564208128 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026567703109327983, + "loss": 2.7106, + "theoretical_loss": 3.5031498820988975, + "tokens_seen": 1564273664 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026566700100300907, + "loss": 2.6839, + "theoretical_loss": 3.503137046068963, + "tokens_seen": 1564339200 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002656569709127382, + "loss": 2.7371, + "theoretical_loss": 3.5031242107273295, + "tokens_seen": 1564404736 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026564694082246743, + "loss": 2.7752, + "theoretical_loss": 3.50311137607393, + "tokens_seen": 1564470272 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3701468, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8056583404541016, + "objective/train/theoretical_loss": 3.5030985421086998, + "objective/train/tokens_used": 1584995808, + "theoretical_loss": 3.5030985421086998, + "tokens_seen": 1564535808 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002656369107321966, + "loss": 2.8823, + "theoretical_loss": 3.5030985421086998, + "tokens_seen": 1564535808 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002656268806419258, + "loss": 2.8062, + "theoretical_loss": 3.503085708831573, + "tokens_seen": 1564601344 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026561685055165497, + "loss": 2.694, + "theoretical_loss": 3.5030728762424834, + "tokens_seen": 1564666880 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026560682046138415, + "loss": 2.6305, + "theoretical_loss": 3.5030600443413658, + "tokens_seen": 1564732416 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026559679037111333, + "loss": 2.669, + "theoretical_loss": 3.5030472131281547, + "tokens_seen": 1564797952 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026558676028084257, + "loss": 2.8282, + "theoretical_loss": 3.5030343826027837, + "tokens_seen": 1564863488 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002655767301905717, + "loss": 2.6448, + "theoretical_loss": 3.503021552765187, + "tokens_seen": 1564929024 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026556670010030093, + "loss": 2.61, + "theoretical_loss": 3.5030087236153, + "tokens_seen": 1564994560 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002655566700100301, + "loss": 2.6901, + "theoretical_loss": 3.5029958951530564, + "tokens_seen": 1565060096 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002655466399197593, + "loss": 2.7059, + "theoretical_loss": 3.5029830673783904, + "tokens_seen": 1565125632 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002655366098294885, + "loss": 2.7087, + "theoretical_loss": 3.502970240291237, + "tokens_seen": 1565191168 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026552657973921766, + "loss": 2.6817, + "theoretical_loss": 3.5029574138915294, + "tokens_seen": 1565256704 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026551654964894684, + "loss": 2.6753, + "theoretical_loss": 3.5029445881792034, + "tokens_seen": 1565322240 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026550651955867607, + "loss": 2.7096, + "theoretical_loss": 3.5029317631541925, + "tokens_seen": 1565387776 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002654964894684052, + "loss": 2.7475, + "theoretical_loss": 3.502918938816431, + "tokens_seen": 1565453312 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026548645937813443, + "loss": 2.7832, + "theoretical_loss": 3.5029061151658536, + "tokens_seen": 1565518848 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026547642928786356, + "loss": 2.7111, + "theoretical_loss": 3.502893292202395, + "tokens_seen": 1565584384 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002654663991975928, + "loss": 2.7063, + "theoretical_loss": 3.502880469925989, + "tokens_seen": 1565649920 + }, + { + "epoch": 19.01, + "learning_rate": 0.000265456369107322, + "loss": 2.6315, + "theoretical_loss": 3.5028676483365704, + "tokens_seen": 1565715456 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026544633901705116, + "loss": 2.6487, + "theoretical_loss": 3.502854827434074, + "tokens_seen": 1565780992 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026543630892678034, + "loss": 2.6474, + "theoretical_loss": 3.502842007218433, + "tokens_seen": 1565846528 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002654262788365096, + "loss": 2.6536, + "theoretical_loss": 3.5028291876895836, + "tokens_seen": 1565912064 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002654162487462387, + "loss": 2.7394, + "theoretical_loss": 3.502816368847459, + "tokens_seen": 1565977600 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026540621865596794, + "loss": 2.6426, + "theoretical_loss": 3.5028035506919943, + "tokens_seen": 1566043136 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026539618856569707, + "loss": 2.6624, + "theoretical_loss": 3.502790733223123, + "tokens_seen": 1566108672 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3706270, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8114006519317627, + "objective/train/theoretical_loss": 3.502777916440781, + "objective/train/tokens_used": 1586634208, + "theoretical_loss": 3.502777916440781, + "tokens_seen": 1566174208 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002653861584754263, + "loss": 2.739, + "theoretical_loss": 3.502777916440781, + "tokens_seen": 1566174208 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002653761283851555, + "loss": 2.7344, + "theoretical_loss": 3.5027651003449023, + "tokens_seen": 1566239744 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026536609829488466, + "loss": 2.6637, + "theoretical_loss": 3.5027522849354202, + "tokens_seen": 1566305280 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026535606820461384, + "loss": 2.6133, + "theoretical_loss": 3.5027394702122714, + "tokens_seen": 1566370816 + }, + { + "epoch": 19.01, + "learning_rate": 0.000265346038114343, + "loss": 2.703, + "theoretical_loss": 3.5027266561753887, + "tokens_seen": 1566436352 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002653360080240722, + "loss": 2.6232, + "theoretical_loss": 3.502713842824707, + "tokens_seen": 1566501888 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026532597793380144, + "loss": 2.7597, + "theoretical_loss": 3.5027010301601615, + "tokens_seen": 1566567424 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026531594784353057, + "loss": 2.7527, + "theoretical_loss": 3.502688218181686, + "tokens_seen": 1566632960 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002653059177532598, + "loss": 2.6865, + "theoretical_loss": 3.502675406889216, + "tokens_seen": 1566698496 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026529588766298893, + "loss": 2.7046, + "theoretical_loss": 3.502662596282685, + "tokens_seen": 1566764032 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026528585757271817, + "loss": 2.696, + "theoretical_loss": 3.5026497863620283, + "tokens_seen": 1566829568 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026527582748244735, + "loss": 2.7545, + "theoretical_loss": 3.50263697712718, + "tokens_seen": 1566895104 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026526579739217653, + "loss": 2.781, + "theoretical_loss": 3.502624168578075, + "tokens_seen": 1566960640 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002652557673019057, + "loss": 2.7375, + "theoretical_loss": 3.502611360714648, + "tokens_seen": 1567026176 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026524573721163494, + "loss": 2.8334, + "theoretical_loss": 3.502598553536833, + "tokens_seen": 1567091712 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026523570712136407, + "loss": 2.7097, + "theoretical_loss": 3.5025857470445656, + "tokens_seen": 1567157248 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002652256770310933, + "loss": 2.7819, + "theoretical_loss": 3.50257294123778, + "tokens_seen": 1567222784 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026521564694082243, + "loss": 2.7147, + "theoretical_loss": 3.5025601361164105, + "tokens_seen": 1567288320 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026520561685055167, + "loss": 2.7278, + "theoretical_loss": 3.5025473316803923, + "tokens_seen": 1567353856 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026519558676028085, + "loss": 2.624, + "theoretical_loss": 3.50253452792966, + "tokens_seen": 1567419392 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026518555667001003, + "loss": 2.7813, + "theoretical_loss": 3.502521724864147, + "tokens_seen": 1567484928 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002651755265797392, + "loss": 2.7115, + "theoretical_loss": 3.5025089224837904, + "tokens_seen": 1567550464 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002651654964894684, + "loss": 2.715, + "theoretical_loss": 3.5024961207885226, + "tokens_seen": 1567616000 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002651554663991976, + "loss": 2.7375, + "theoretical_loss": 3.5024833197782796, + "tokens_seen": 1567681536 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002651454363089268, + "loss": 2.7559, + "theoretical_loss": 3.502470519452996, + "tokens_seen": 1567747072 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3709243, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8501839637756348, + "objective/train/theoretical_loss": 3.5024577198126057, + "objective/train/tokens_used": 1588272608, + "theoretical_loss": 3.5024577198126057, + "tokens_seen": 1567812608 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026513540621865594, + "loss": 2.7497, + "theoretical_loss": 3.5024577198126057, + "tokens_seen": 1567812608 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026512537612838517, + "loss": 2.6973, + "theoretical_loss": 3.5024449208570445, + "tokens_seen": 1567878144 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002651153460381143, + "loss": 2.7936, + "theoretical_loss": 3.5024321225862467, + "tokens_seen": 1567943680 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026510531594784353, + "loss": 2.7665, + "theoretical_loss": 3.5024193250001465, + "tokens_seen": 1568009216 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002650952858575727, + "loss": 2.7394, + "theoretical_loss": 3.50240652809868, + "tokens_seen": 1568074752 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002650852557673019, + "loss": 2.6836, + "theoretical_loss": 3.5023937318817806, + "tokens_seen": 1568140288 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002650752256770311, + "loss": 2.5335, + "theoretical_loss": 3.5023809363493834, + "tokens_seen": 1568205824 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002650651955867603, + "loss": 2.6906, + "theoretical_loss": 3.5023681415014236, + "tokens_seen": 1568271360 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026505516549648944, + "loss": 2.7532, + "theoretical_loss": 3.5023553473378355, + "tokens_seen": 1568336896 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002650451354062187, + "loss": 2.7201, + "theoretical_loss": 3.502342553858554, + "tokens_seen": 1568402432 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002650351053159478, + "loss": 2.6981, + "theoretical_loss": 3.502329761063515, + "tokens_seen": 1568467968 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026502507522567704, + "loss": 2.6889, + "theoretical_loss": 3.502316968952652, + "tokens_seen": 1568533504 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002650150451354062, + "loss": 2.7035, + "theoretical_loss": 3.5023041775258994, + "tokens_seen": 1568599040 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002650050150451354, + "loss": 2.7538, + "theoretical_loss": 3.5022913867831935, + "tokens_seen": 1568664576 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026499498495486464, + "loss": 2.6566, + "theoretical_loss": 3.5022785967244685, + "tokens_seen": 1568730112 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026498495486459376, + "loss": 2.6715, + "theoretical_loss": 3.502265807349659, + "tokens_seen": 1568795648 + }, + { + "epoch": 19.01, + "learning_rate": 0.000264974924774323, + "loss": 2.6829, + "theoretical_loss": 3.5022530186587004, + "tokens_seen": 1568861184 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002649648946840522, + "loss": 2.7274, + "theoretical_loss": 3.5022402306515272, + "tokens_seen": 1568926720 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026495486459378136, + "loss": 2.6434, + "theoretical_loss": 3.5022274433280742, + "tokens_seen": 1568992256 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026494483450351054, + "loss": 2.7427, + "theoretical_loss": 3.5022146566882766, + "tokens_seen": 1569057792 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002649348044132398, + "loss": 2.6474, + "theoretical_loss": 3.502201870732069, + "tokens_seen": 1569123328 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002649247743229689, + "loss": 2.5385, + "theoretical_loss": 3.5021890854593867, + "tokens_seen": 1569188864 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026491474423269814, + "loss": 2.6688, + "theoretical_loss": 3.502176300870164, + "tokens_seen": 1569254400 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026490471414242727, + "loss": 2.6077, + "theoretical_loss": 3.5021635169643357, + "tokens_seen": 1569319936 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002648946840521565, + "loss": 2.7306, + "theoretical_loss": 3.5021507337418383, + "tokens_seen": 1569385472 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3712912, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.74835467338562, + "objective/train/theoretical_loss": 3.502137951202605, + "objective/train/tokens_used": 1589911008, + "theoretical_loss": 3.502137951202605, + "tokens_seen": 1569451008 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002648846539618857, + "loss": 2.725, + "theoretical_loss": 3.502137951202605, + "tokens_seen": 1569451008 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026487462387161486, + "loss": 2.7453, + "theoretical_loss": 3.5021251693465714, + "tokens_seen": 1569516544 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026486459378134404, + "loss": 2.7547, + "theoretical_loss": 3.5021123881736727, + "tokens_seen": 1569582080 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002648545636910732, + "loss": 2.7233, + "theoretical_loss": 3.502099607683843, + "tokens_seen": 1569647616 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002648445336008024, + "loss": 2.7554, + "theoretical_loss": 3.5020868278770187, + "tokens_seen": 1569713152 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026483450351053164, + "loss": 2.7012, + "theoretical_loss": 3.5020740487531334, + "tokens_seen": 1569778688 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026482447342026077, + "loss": 2.6386, + "theoretical_loss": 3.502061270312123, + "tokens_seen": 1569844224 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026481444332999, + "loss": 2.801, + "theoretical_loss": 3.5020484925539224, + "tokens_seen": 1569909760 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026480441323971913, + "loss": 2.752, + "theoretical_loss": 3.502035715478466, + "tokens_seen": 1569975296 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026479438314944837, + "loss": 2.7115, + "theoretical_loss": 3.5020229390856894, + "tokens_seen": 1570040832 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026478435305917755, + "loss": 2.7469, + "theoretical_loss": 3.5020101633755276, + "tokens_seen": 1570106368 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026477432296890673, + "loss": 2.7683, + "theoretical_loss": 3.501997388347915, + "tokens_seen": 1570171904 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002647642928786359, + "loss": 2.668, + "theoretical_loss": 3.5019846140027875, + "tokens_seen": 1570237440 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026475426278836514, + "loss": 2.7551, + "theoretical_loss": 3.5019718403400804, + "tokens_seen": 1570302976 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026474423269809427, + "loss": 2.6449, + "theoretical_loss": 3.501959067359727, + "tokens_seen": 1570368512 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002647342026078235, + "loss": 2.7037, + "theoretical_loss": 3.501946295061664, + "tokens_seen": 1570434048 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026472417251755263, + "loss": 2.8074, + "theoretical_loss": 3.5019335234458264, + "tokens_seen": 1570499584 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026471414242728187, + "loss": 2.5998, + "theoretical_loss": 3.5019207525121487, + "tokens_seen": 1570565120 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026470411233701105, + "loss": 2.6889, + "theoretical_loss": 3.501907982260566, + "tokens_seen": 1570630656 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026469408224674023, + "loss": 2.7828, + "theoretical_loss": 3.5018952126910134, + "tokens_seen": 1570696192 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002646840521564694, + "loss": 2.7888, + "theoretical_loss": 3.5018824438034266, + "tokens_seen": 1570761728 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002646740220661986, + "loss": 2.6679, + "theoretical_loss": 3.5018696755977405, + "tokens_seen": 1570827264 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002646639919759278, + "loss": 2.8001, + "theoretical_loss": 3.50185690807389, + "tokens_seen": 1570892800 + }, + { + "epoch": 19.01, + "learning_rate": 0.000264653961885657, + "loss": 2.704, + "theoretical_loss": 3.5018441412318104, + "tokens_seen": 1570958336 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026464393179538614, + "loss": 2.6871, + "theoretical_loss": 3.5018313750714363, + "tokens_seen": 1571023872 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3716116, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8979320526123047, + "objective/train/theoretical_loss": 3.501818609592704, + "objective/train/tokens_used": 1591549408, + "theoretical_loss": 3.501818609592704, + "tokens_seen": 1571089408 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026463390170511537, + "loss": 2.7808, + "theoretical_loss": 3.501818609592704, + "tokens_seen": 1571089408 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002646238716148445, + "loss": 2.7068, + "theoretical_loss": 3.5018058447955482, + "tokens_seen": 1571154944 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026461384152457373, + "loss": 2.7162, + "theoretical_loss": 3.5017930806799034, + "tokens_seen": 1571220480 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002646038114343029, + "loss": 2.7293, + "theoretical_loss": 3.5017803172457054, + "tokens_seen": 1571286016 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002645937813440321, + "loss": 2.6932, + "theoretical_loss": 3.5017675544928895, + "tokens_seen": 1571351552 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002645837512537613, + "loss": 2.7045, + "theoretical_loss": 3.5017547924213908, + "tokens_seen": 1571417088 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002645737211634905, + "loss": 2.7048, + "theoretical_loss": 3.501742031031144, + "tokens_seen": 1571482624 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026456369107321964, + "loss": 2.7281, + "theoretical_loss": 3.5017292703220853, + "tokens_seen": 1571548160 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002645536609829489, + "loss": 2.7469, + "theoretical_loss": 3.501716510294149, + "tokens_seen": 1571613696 + }, + { + "epoch": 19.01, + "learning_rate": 0.000264543630892678, + "loss": 2.6557, + "theoretical_loss": 3.501703750947271, + "tokens_seen": 1571679232 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026453360080240724, + "loss": 2.6419, + "theoretical_loss": 3.501690992281386, + "tokens_seen": 1571744768 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002645235707121364, + "loss": 2.7612, + "theoretical_loss": 3.5016782342964303, + "tokens_seen": 1571810304 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002645135406218656, + "loss": 2.7776, + "theoretical_loss": 3.5016654769923377, + "tokens_seen": 1571875840 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002645035105315948, + "loss": 2.6672, + "theoretical_loss": 3.5016527203690444, + "tokens_seen": 1571941376 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026449348044132396, + "loss": 2.7318, + "theoretical_loss": 3.5016399644264857, + "tokens_seen": 1572006912 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026448345035105314, + "loss": 2.6297, + "theoretical_loss": 3.5016272091645964, + "tokens_seen": 1572072448 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002644734202607824, + "loss": 2.7538, + "theoretical_loss": 3.5016144545833123, + "tokens_seen": 1572137984 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002644633901705115, + "loss": 2.8517, + "theoretical_loss": 3.5016017006825684, + "tokens_seen": 1572203520 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026445336008024074, + "loss": 2.7318, + "theoretical_loss": 3.5015889474623005, + "tokens_seen": 1572269056 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026444332998996987, + "loss": 2.6823, + "theoretical_loss": 3.501576194922443, + "tokens_seen": 1572334592 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002644332998996991, + "loss": 2.6901, + "theoretical_loss": 3.501563443062932, + "tokens_seen": 1572400128 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002644232698094283, + "loss": 2.6585, + "theoretical_loss": 3.501550691883703, + "tokens_seen": 1572465664 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026441323971915747, + "loss": 2.7589, + "theoretical_loss": 3.501537941384691, + "tokens_seen": 1572531200 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026440320962888665, + "loss": 2.7, + "theoretical_loss": 3.5015251915658308, + "tokens_seen": 1572596736 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002643931795386159, + "loss": 2.714, + "theoretical_loss": 3.5015124424270585, + "tokens_seen": 1572662272 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3720745, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7419846057891846, + "objective/train/theoretical_loss": 3.50149969396831, + "objective/train/tokens_used": 1593187808, + "theoretical_loss": 3.50149969396831, + "tokens_seen": 1572727808 + }, + { + "epoch": 19.01, + "learning_rate": 0.000264383149448345, + "loss": 2.7402, + "theoretical_loss": 3.50149969396831, + "tokens_seen": 1572727808 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026437311935807424, + "loss": 2.7307, + "theoretical_loss": 3.5014869461895195, + "tokens_seen": 1572793344 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026436308926780337, + "loss": 2.7753, + "theoretical_loss": 3.501474199090623, + "tokens_seen": 1572858880 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002643530591775326, + "loss": 2.6741, + "theoretical_loss": 3.501461452671556, + "tokens_seen": 1572924416 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002643430290872618, + "loss": 2.7481, + "theoretical_loss": 3.501448706932254, + "tokens_seen": 1572989952 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026433299899699097, + "loss": 2.7885, + "theoretical_loss": 3.5014359618726516, + "tokens_seen": 1573055488 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026432296890672015, + "loss": 2.8397, + "theoretical_loss": 3.501423217492685, + "tokens_seen": 1573121024 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026431293881644933, + "loss": 2.6851, + "theoretical_loss": 3.5014104737922898, + "tokens_seen": 1573186560 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002643029087261785, + "loss": 2.769, + "theoretical_loss": 3.501397730771401, + "tokens_seen": 1573252096 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026429287863590775, + "loss": 2.7551, + "theoretical_loss": 3.5013849884299546, + "tokens_seen": 1573317632 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002642828485456369, + "loss": 2.5876, + "theoretical_loss": 3.501372246767885, + "tokens_seen": 1573383168 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002642728184553661, + "loss": 2.7298, + "theoretical_loss": 3.5013595057851283, + "tokens_seen": 1573448704 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026426278836509534, + "loss": 2.7108, + "theoretical_loss": 3.501346765481621, + "tokens_seen": 1573514240 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026425275827482447, + "loss": 2.7285, + "theoretical_loss": 3.5013340258572967, + "tokens_seen": 1573579776 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002642427281845537, + "loss": 2.7395, + "theoretical_loss": 3.501321286912092, + "tokens_seen": 1573645312 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026423269809428283, + "loss": 2.7191, + "theoretical_loss": 3.5013085486459428, + "tokens_seen": 1573710848 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026422266800401207, + "loss": 2.7574, + "theoretical_loss": 3.501295811058784, + "tokens_seen": 1573776384 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026421263791374125, + "loss": 2.7011, + "theoretical_loss": 3.5012830741505514, + "tokens_seen": 1573841920 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026420260782347043, + "loss": 2.6, + "theoretical_loss": 3.50127033792118, + "tokens_seen": 1573907456 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002641925777331996, + "loss": 2.6807, + "theoretical_loss": 3.5012576023706057, + "tokens_seen": 1573972992 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002641825476429288, + "loss": 2.7591, + "theoretical_loss": 3.501244867498764, + "tokens_seen": 1574038528 + }, + { + "epoch": 19.01, + "learning_rate": 0.000264172517552658, + "loss": 2.6499, + "theoretical_loss": 3.5012321333055914, + "tokens_seen": 1574104064 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002641624874623872, + "loss": 2.6634, + "theoretical_loss": 3.501219399791022, + "tokens_seen": 1574169600 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026415245737211634, + "loss": 2.6946, + "theoretical_loss": 3.501206666954992, + "tokens_seen": 1574235136 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026414242728184557, + "loss": 2.6526, + "theoretical_loss": 3.5011939347974375, + "tokens_seen": 1574300672 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3724011, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.750674247741699, + "objective/train/theoretical_loss": 3.5011812033182927, + "objective/train/tokens_used": 1594826208, + "theoretical_loss": 3.5011812033182927, + "tokens_seen": 1574366208 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002641323971915747, + "loss": 2.7416, + "theoretical_loss": 3.5011812033182927, + "tokens_seen": 1574366208 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026412236710130393, + "loss": 2.6551, + "theoretical_loss": 3.501168472517495, + "tokens_seen": 1574431744 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002641123370110331, + "loss": 2.715, + "theoretical_loss": 3.501155742394979, + "tokens_seen": 1574497280 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002641023069207623, + "loss": 2.731, + "theoretical_loss": 3.5011430129506804, + "tokens_seen": 1574562816 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002640922768304915, + "loss": 2.6568, + "theoretical_loss": 3.5011302841845353, + "tokens_seen": 1574628352 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002640822467402207, + "loss": 2.751, + "theoretical_loss": 3.5011175560964785, + "tokens_seen": 1574693888 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026407221664994984, + "loss": 2.7948, + "theoretical_loss": 3.5011048286864463, + "tokens_seen": 1574759424 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002640621865596791, + "loss": 2.821, + "theoretical_loss": 3.5010921019543746, + "tokens_seen": 1574824960 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002640521564694082, + "loss": 2.7748, + "theoretical_loss": 3.501079375900198, + "tokens_seen": 1574890496 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026404212637913744, + "loss": 2.7916, + "theoretical_loss": 3.5010666505238532, + "tokens_seen": 1574956032 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002640320962888666, + "loss": 2.7243, + "theoretical_loss": 3.5010539258252757, + "tokens_seen": 1575021568 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002640220661985958, + "loss": 2.7204, + "theoretical_loss": 3.5010412018044015, + "tokens_seen": 1575087104 + }, + { + "epoch": 19.01, + "learning_rate": 0.000264012036108325, + "loss": 2.7587, + "theoretical_loss": 3.501028478461165, + "tokens_seen": 1575152640 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026400200601805416, + "loss": 2.7123, + "theoretical_loss": 3.5010157557955033, + "tokens_seen": 1575218176 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026399197592778334, + "loss": 2.6659, + "theoretical_loss": 3.5010030338073514, + "tokens_seen": 1575283712 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002639819458375126, + "loss": 2.6984, + "theoretical_loss": 3.500990312496646, + "tokens_seen": 1575349248 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002639719157472417, + "loss": 2.7255, + "theoretical_loss": 3.5009775918633217, + "tokens_seen": 1575414784 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026396188565697094, + "loss": 2.7859, + "theoretical_loss": 3.500964871907314, + "tokens_seen": 1575480320 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026395185556670007, + "loss": 2.7211, + "theoretical_loss": 3.5009521526285603, + "tokens_seen": 1575545856 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002639418254764293, + "loss": 2.7101, + "theoretical_loss": 3.500939434026995, + "tokens_seen": 1575611392 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002639317953861585, + "loss": 2.7125, + "theoretical_loss": 3.500926716102555, + "tokens_seen": 1575676928 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026392176529588767, + "loss": 2.7455, + "theoretical_loss": 3.5009139988551743, + "tokens_seen": 1575742464 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026391173520561685, + "loss": 2.6742, + "theoretical_loss": 3.50090128228479, + "tokens_seen": 1575808000 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002639017051153461, + "loss": 2.8105, + "theoretical_loss": 3.500888566391338, + "tokens_seen": 1575873536 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002638916750250752, + "loss": 2.774, + "theoretical_loss": 3.5008758511747535, + "tokens_seen": 1575939072 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3727798, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.478684186935425, + "objective/train/theoretical_loss": 3.500863136634973, + "objective/train/tokens_used": 1596464608, + "theoretical_loss": 3.500863136634973, + "tokens_seen": 1576004608 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026388164493480444, + "loss": 2.6545, + "theoretical_loss": 3.500863136634973, + "tokens_seen": 1576004608 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026387161484453357, + "loss": 2.8102, + "theoretical_loss": 3.500850422771932, + "tokens_seen": 1576070144 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002638615847542628, + "loss": 2.724, + "theoretical_loss": 3.500837709585566, + "tokens_seen": 1576135680 + }, + { + "epoch": 19.01, + "learning_rate": 0.000263851554663992, + "loss": 2.7589, + "theoretical_loss": 3.500824997075811, + "tokens_seen": 1576201216 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026384152457372117, + "loss": 2.7656, + "theoretical_loss": 3.5008122852426036, + "tokens_seen": 1576266752 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026383149448345035, + "loss": 2.7444, + "theoretical_loss": 3.500799574085879, + "tokens_seen": 1576332288 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026382146439317953, + "loss": 2.721, + "theoretical_loss": 3.5007868636055726, + "tokens_seen": 1576397824 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002638114343029087, + "loss": 2.675, + "theoretical_loss": 3.5007741538016215, + "tokens_seen": 1576463360 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026380140421263795, + "loss": 2.664, + "theoretical_loss": 3.5007614446739606, + "tokens_seen": 1576528896 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002637913741223671, + "loss": 2.6358, + "theoretical_loss": 3.5007487362225262, + "tokens_seen": 1576594432 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002637813440320963, + "loss": 2.7469, + "theoretical_loss": 3.500736028447254, + "tokens_seen": 1576659968 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002637713139418255, + "loss": 2.7383, + "theoretical_loss": 3.5007233213480804, + "tokens_seen": 1576725504 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026376128385155467, + "loss": 2.7613, + "theoretical_loss": 3.500710614924941, + "tokens_seen": 1576791040 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026375125376128385, + "loss": 2.6998, + "theoretical_loss": 3.5006979091777715, + "tokens_seen": 1576856576 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026374122367101303, + "loss": 2.7645, + "theoretical_loss": 3.5006852041065084, + "tokens_seen": 1576922112 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002637311935807422, + "loss": 2.596, + "theoretical_loss": 3.500672499711087, + "tokens_seen": 1576987648 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026372116349047145, + "loss": 2.7322, + "theoretical_loss": 3.5006597959914436, + "tokens_seen": 1577053184 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002637111334002006, + "loss": 2.6712, + "theoretical_loss": 3.5006470929475144, + "tokens_seen": 1577118720 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002637011033099298, + "loss": 2.6936, + "theoretical_loss": 3.5006343905792354, + "tokens_seen": 1577184256 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026369107321965894, + "loss": 2.6536, + "theoretical_loss": 3.5006216888865422, + "tokens_seen": 1577249792 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002636810431293882, + "loss": 2.6904, + "theoretical_loss": 3.5006089878693705, + "tokens_seen": 1577315328 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026367101303911736, + "loss": 2.7344, + "theoretical_loss": 3.5005962875276575, + "tokens_seen": 1577380864 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026366098294884654, + "loss": 2.7628, + "theoretical_loss": 3.500583587861338, + "tokens_seen": 1577446400 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002636509528585757, + "loss": 2.7456, + "theoretical_loss": 3.5005708888703486, + "tokens_seen": 1577511936 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002636409227683049, + "loss": 2.7294, + "theoretical_loss": 3.5005581905546252, + "tokens_seen": 1577577472 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3732301, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6585962772369385, + "objective/train/theoretical_loss": 3.500545492914104, + "objective/train/tokens_used": 1598103008, + "theoretical_loss": 3.500545492914104, + "tokens_seen": 1577643008 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002636308926780341, + "loss": 2.6906, + "theoretical_loss": 3.500545492914104, + "tokens_seen": 1577643008 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002636208625877633, + "loss": 2.7256, + "theoretical_loss": 3.500532795948721, + "tokens_seen": 1577708544 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026361083249749244, + "loss": 2.6938, + "theoretical_loss": 3.500520099658412, + "tokens_seen": 1577774080 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002636008024072217, + "loss": 2.6877, + "theoretical_loss": 3.5005074040431134, + "tokens_seen": 1577839616 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026359077231695086, + "loss": 2.7669, + "theoretical_loss": 3.500494709102761, + "tokens_seen": 1577905152 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026358074222668004, + "loss": 2.731, + "theoretical_loss": 3.5004820148372913, + "tokens_seen": 1577970688 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002635707121364092, + "loss": 2.6536, + "theoretical_loss": 3.5004693212466402, + "tokens_seen": 1578036224 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002635606820461384, + "loss": 2.7723, + "theoretical_loss": 3.5004566283307437, + "tokens_seen": 1578101760 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002635506519558676, + "loss": 2.6399, + "theoretical_loss": 3.500443936089537, + "tokens_seen": 1578167296 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002635406218655968, + "loss": 2.6956, + "theoretical_loss": 3.5004312445229586, + "tokens_seen": 1578232832 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026353059177532595, + "loss": 2.7142, + "theoretical_loss": 3.5004185536309427, + "tokens_seen": 1578298368 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002635205616850552, + "loss": 2.7484, + "theoretical_loss": 3.500405863413426, + "tokens_seen": 1578363904 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026351053159478436, + "loss": 2.705, + "theoretical_loss": 3.500393173870344, + "tokens_seen": 1578429440 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026350050150451354, + "loss": 2.7968, + "theoretical_loss": 3.5003804850016342, + "tokens_seen": 1578494976 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002634904714142428, + "loss": 2.7206, + "theoretical_loss": 3.5003677968072315, + "tokens_seen": 1578560512 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002634804413239719, + "loss": 2.7189, + "theoretical_loss": 3.5003551092870726, + "tokens_seen": 1578626048 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026347041123370114, + "loss": 2.7615, + "theoretical_loss": 3.500342422441094, + "tokens_seen": 1578691584 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026346038114343027, + "loss": 2.7439, + "theoretical_loss": 3.500329736269231, + "tokens_seen": 1578757120 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002634503510531595, + "loss": 2.813, + "theoretical_loss": 3.500317050771421, + "tokens_seen": 1578822656 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002634403209628887, + "loss": 2.6625, + "theoretical_loss": 3.500304365947599, + "tokens_seen": 1578888192 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026343029087261787, + "loss": 2.6712, + "theoretical_loss": 3.5002916817977026, + "tokens_seen": 1578953728 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026342026078234705, + "loss": 2.7816, + "theoretical_loss": 3.5002789983216664, + "tokens_seen": 1579019264 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002634102306920763, + "loss": 2.7115, + "theoretical_loss": 3.500266315519428, + "tokens_seen": 1579084800 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002634002006018054, + "loss": 2.6712, + "theoretical_loss": 3.5002536333909227, + "tokens_seen": 1579150336 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026339017051153464, + "loss": 2.7058, + "theoretical_loss": 3.500240951936087, + "tokens_seen": 1579215872 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3735436, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9497697353363037, + "objective/train/theoretical_loss": 3.5002282711548576, + "objective/train/tokens_used": 1599741408, + "theoretical_loss": 3.5002282711548576, + "tokens_seen": 1579281408 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026338014042126377, + "loss": 2.7753, + "theoretical_loss": 3.5002282711548576, + "tokens_seen": 1579281408 + }, + { + "epoch": 19.01, + "learning_rate": 0.000263370110330993, + "loss": 2.7954, + "theoretical_loss": 3.5002155910471706, + "tokens_seen": 1579346944 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002633600802407222, + "loss": 2.7807, + "theoretical_loss": 3.500202911612962, + "tokens_seen": 1579412480 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026335005015045137, + "loss": 2.6728, + "theoretical_loss": 3.5001902328521677, + "tokens_seen": 1579478016 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026334002006018055, + "loss": 2.7422, + "theoretical_loss": 3.5001775547647256, + "tokens_seen": 1579543552 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026332998996990973, + "loss": 2.6301, + "theoretical_loss": 3.50016487735057, + "tokens_seen": 1579609088 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002633199598796389, + "loss": 2.6927, + "theoretical_loss": 3.5001522006096386, + "tokens_seen": 1579674624 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026330992978936815, + "loss": 2.719, + "theoretical_loss": 3.500139524541867, + "tokens_seen": 1579740160 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002632998996990973, + "loss": 2.8082, + "theoretical_loss": 3.500126849147192, + "tokens_seen": 1579805696 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002632898696088265, + "loss": 2.682, + "theoretical_loss": 3.5001141744255495, + "tokens_seen": 1579871232 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002632798395185557, + "loss": 2.6452, + "theoretical_loss": 3.5001015003768767, + "tokens_seen": 1579936768 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026326980942828487, + "loss": 2.7634, + "theoretical_loss": 3.5000888270011083, + "tokens_seen": 1580002304 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026325977933801405, + "loss": 2.6734, + "theoretical_loss": 3.5000761542981826, + "tokens_seen": 1580067840 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026324974924774323, + "loss": 2.6651, + "theoretical_loss": 3.5000634822680348, + "tokens_seen": 1580133376 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002632397191574724, + "loss": 2.6634, + "theoretical_loss": 3.5000508109106017, + "tokens_seen": 1580198912 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026322968906720165, + "loss": 2.7251, + "theoretical_loss": 3.5000381402258194, + "tokens_seen": 1580264448 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002632196589769308, + "loss": 2.6854, + "theoretical_loss": 3.500025470213624, + "tokens_seen": 1580329984 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026320962888666, + "loss": 2.7394, + "theoretical_loss": 3.500012800873953, + "tokens_seen": 1580395520 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026319959879638914, + "loss": 2.7732, + "theoretical_loss": 3.5000001322067416, + "tokens_seen": 1580461056 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002631895687061184, + "loss": 2.715, + "theoretical_loss": 3.4999874642119275, + "tokens_seen": 1580526592 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026317953861584756, + "loss": 2.6045, + "theoretical_loss": 3.499974796889446, + "tokens_seen": 1580592128 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026316950852557674, + "loss": 2.7566, + "theoretical_loss": 3.4999621302392336, + "tokens_seen": 1580657664 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002631594784353059, + "loss": 2.7192, + "theoretical_loss": 3.4999494642612277, + "tokens_seen": 1580723200 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002631494483450351, + "loss": 2.6976, + "theoretical_loss": 3.499936798955364, + "tokens_seen": 1580788736 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002631394182547643, + "loss": 2.714, + "theoretical_loss": 3.4999241343215792, + "tokens_seen": 1580854272 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3740277, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8479647636413574, + "objective/train/theoretical_loss": 3.4999114703598098, + "objective/train/tokens_used": 1601379808, + "theoretical_loss": 3.4999114703598098, + "tokens_seen": 1580919808 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002631293881644935, + "loss": 2.7023, + "theoretical_loss": 3.4999114703598098, + "tokens_seen": 1580919808 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026311935807422264, + "loss": 2.7731, + "theoretical_loss": 3.499898807069992, + "tokens_seen": 1580985344 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002631093279839519, + "loss": 2.7567, + "theoretical_loss": 3.499886144452063, + "tokens_seen": 1581050880 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026309929789368106, + "loss": 2.6541, + "theoretical_loss": 3.499873482505958, + "tokens_seen": 1581116416 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026308926780341024, + "loss": 2.6993, + "theoretical_loss": 3.4998608212316147, + "tokens_seen": 1581181952 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002630792377131394, + "loss": 2.7339, + "theoretical_loss": 3.4998481606289693, + "tokens_seen": 1581247488 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002630692076228686, + "loss": 2.7185, + "theoretical_loss": 3.499835500697958, + "tokens_seen": 1581313024 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002630591775325978, + "loss": 2.7298, + "theoretical_loss": 3.499822841438518, + "tokens_seen": 1581378560 + }, + { + "epoch": 19.01, + "learning_rate": 0.000263049147442327, + "loss": 2.7596, + "theoretical_loss": 3.4998101828505854, + "tokens_seen": 1581444096 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026303911735205615, + "loss": 2.7746, + "theoretical_loss": 3.4997975249340962, + "tokens_seen": 1581509632 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002630290872617854, + "loss": 2.8132, + "theoretical_loss": 3.499784867688988, + "tokens_seen": 1581575168 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002630190571715145, + "loss": 2.8132, + "theoretical_loss": 3.4997722111151974, + "tokens_seen": 1581640704 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026300902708124374, + "loss": 2.7483, + "theoretical_loss": 3.49975955521266, + "tokens_seen": 1581706240 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002629989969909729, + "loss": 2.7709, + "theoretical_loss": 3.499746899981313, + "tokens_seen": 1581771776 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002629889669007021, + "loss": 2.6774, + "theoretical_loss": 3.4997342454210933, + "tokens_seen": 1581837312 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002629789368104313, + "loss": 2.852, + "theoretical_loss": 3.4997215915319364, + "tokens_seen": 1581902848 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026296890672016047, + "loss": 2.7941, + "theoretical_loss": 3.49970893831378, + "tokens_seen": 1581968384 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026295887662988965, + "loss": 2.7051, + "theoretical_loss": 3.4996962857665603, + "tokens_seen": 1582033920 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002629488465396189, + "loss": 2.7581, + "theoretical_loss": 3.499683633890214, + "tokens_seen": 1582099456 + }, + { + "epoch": 19.01, + "learning_rate": 0.000262938816449348, + "loss": 2.744, + "theoretical_loss": 3.4996709826846777, + "tokens_seen": 1582164992 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026292878635907725, + "loss": 2.7909, + "theoretical_loss": 3.499658332149888, + "tokens_seen": 1582230528 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026291875626880643, + "loss": 2.7194, + "theoretical_loss": 3.4996456822857818, + "tokens_seen": 1582296064 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002629087261785356, + "loss": 2.6883, + "theoretical_loss": 3.499633033092296, + "tokens_seen": 1582361600 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002628986960882648, + "loss": 2.6355, + "theoretical_loss": 3.499620384569366, + "tokens_seen": 1582427136 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026288866599799397, + "loss": 2.6493, + "theoretical_loss": 3.49960773671693, + "tokens_seen": 1582492672 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3743280, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.713395118713379, + "objective/train/theoretical_loss": 3.4995950895349237, + "objective/train/tokens_used": 1603018208, + "theoretical_loss": 3.4995950895349237, + "tokens_seen": 1582558208 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026287863590772315, + "loss": 2.6981, + "theoretical_loss": 3.4995950895349237, + "tokens_seen": 1582558208 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002628686058174524, + "loss": 2.7975, + "theoretical_loss": 3.499582443023284, + "tokens_seen": 1582623744 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002628585757271815, + "loss": 2.735, + "theoretical_loss": 3.499569797181948, + "tokens_seen": 1582689280 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026284854563691075, + "loss": 2.7056, + "theoretical_loss": 3.499557152010852, + "tokens_seen": 1582754816 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002628385155466399, + "loss": 2.6848, + "theoretical_loss": 3.499544507509934, + "tokens_seen": 1582820352 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002628284854563691, + "loss": 2.75, + "theoretical_loss": 3.4995318636791284, + "tokens_seen": 1582885888 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002628184553660983, + "loss": 2.813, + "theoretical_loss": 3.4995192205183736, + "tokens_seen": 1582951424 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002628084252758275, + "loss": 2.7613, + "theoretical_loss": 3.4995065780276056, + "tokens_seen": 1583016960 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026279839518555666, + "loss": 2.749, + "theoretical_loss": 3.499493936206762, + "tokens_seen": 1583082496 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002627883650952859, + "loss": 2.5745, + "theoretical_loss": 3.4994812950557788, + "tokens_seen": 1583148032 + }, + { + "epoch": 19.01, + "learning_rate": 0.000262778335005015, + "loss": 2.7177, + "theoretical_loss": 3.499468654574593, + "tokens_seen": 1583213568 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026276830491474425, + "loss": 2.7473, + "theoretical_loss": 3.4994560147631413, + "tokens_seen": 1583279104 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026275827482447343, + "loss": 2.8097, + "theoretical_loss": 3.499443375621361, + "tokens_seen": 1583344640 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002627482447342026, + "loss": 2.8288, + "theoretical_loss": 3.4994307371491886, + "tokens_seen": 1583410176 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026273821464393185, + "loss": 2.689, + "theoretical_loss": 3.4994180993465607, + "tokens_seen": 1583475712 + }, + { + "epoch": 19.01, + "learning_rate": 0.000262728184553661, + "loss": 2.8665, + "theoretical_loss": 3.499405462213414, + "tokens_seen": 1583541248 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002627181544633902, + "loss": 2.7555, + "theoretical_loss": 3.499392825749686, + "tokens_seen": 1583606784 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026270812437311934, + "loss": 2.7146, + "theoretical_loss": 3.499380189955313, + "tokens_seen": 1583672320 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002626980942828486, + "loss": 2.7689, + "theoretical_loss": 3.499367554830232, + "tokens_seen": 1583737856 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026268806419257776, + "loss": 2.6901, + "theoretical_loss": 3.4993549203743797, + "tokens_seen": 1583803392 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026267803410230694, + "loss": 2.6842, + "theoretical_loss": 3.4993422865876926, + "tokens_seen": 1583868928 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002626680040120361, + "loss": 2.7168, + "theoretical_loss": 3.499329653470109, + "tokens_seen": 1583934464 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002626579739217653, + "loss": 2.6234, + "theoretical_loss": 3.4993170210215645, + "tokens_seen": 1584000000 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002626479438314945, + "loss": 2.7314, + "theoretical_loss": 3.4993043892419964, + "tokens_seen": 1584065536 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002626379137412237, + "loss": 2.6861, + "theoretical_loss": 3.499291758131341, + "tokens_seen": 1584131072 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3747172, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6886274814605713, + "objective/train/theoretical_loss": 3.4992791276895368, + "objective/train/tokens_used": 1604656608, + "theoretical_loss": 3.4992791276895368, + "tokens_seen": 1584196608 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026262788365095284, + "loss": 2.7215, + "theoretical_loss": 3.4992791276895368, + "tokens_seen": 1584196608 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002626178535606821, + "loss": 2.7026, + "theoretical_loss": 3.499266497916519, + "tokens_seen": 1584262144 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026260782347041126, + "loss": 2.7064, + "theoretical_loss": 3.499253868812225, + "tokens_seen": 1584327680 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026259779338014044, + "loss": 2.6868, + "theoretical_loss": 3.4992412403765925, + "tokens_seen": 1584393216 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002625877632898696, + "loss": 2.6199, + "theoretical_loss": 3.4992286126095573, + "tokens_seen": 1584458752 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002625777331995988, + "loss": 2.6653, + "theoretical_loss": 3.499215985511057, + "tokens_seen": 1584524288 + }, + { + "epoch": 19.01, + "learning_rate": 0.000262567703109328, + "loss": 2.7599, + "theoretical_loss": 3.4992033590810285, + "tokens_seen": 1584589824 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002625576730190572, + "loss": 2.7706, + "theoretical_loss": 3.499190733319409, + "tokens_seen": 1584655360 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026254764292878635, + "loss": 2.7011, + "theoretical_loss": 3.4991781082261353, + "tokens_seen": 1584720896 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002625376128385156, + "loss": 2.774, + "theoretical_loss": 3.499165483801144, + "tokens_seen": 1584786432 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002625275827482447, + "loss": 2.7233, + "theoretical_loss": 3.4991528600443726, + "tokens_seen": 1584851968 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026251755265797394, + "loss": 2.8382, + "theoretical_loss": 3.499140236955758, + "tokens_seen": 1584917504 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002625075225677031, + "loss": 2.6822, + "theoretical_loss": 3.4991276145352366, + "tokens_seen": 1584983040 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002624974924774323, + "loss": 2.6424, + "theoretical_loss": 3.4991149927827463, + "tokens_seen": 1585048576 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002624874623871615, + "loss": 2.7527, + "theoretical_loss": 3.499102371698224, + "tokens_seen": 1585114112 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026247743229689067, + "loss": 2.7401, + "theoretical_loss": 3.4990897512816055, + "tokens_seen": 1585179648 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026246740220661985, + "loss": 2.6434, + "theoretical_loss": 3.4990771315328297, + "tokens_seen": 1585245184 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002624573721163491, + "loss": 2.7359, + "theoretical_loss": 3.4990645124518327, + "tokens_seen": 1585310720 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002624473420260782, + "loss": 2.7696, + "theoretical_loss": 3.4990518940385513, + "tokens_seen": 1585376256 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026243731193580745, + "loss": 2.7931, + "theoretical_loss": 3.499039276292923, + "tokens_seen": 1585441792 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026242728184553663, + "loss": 2.6978, + "theoretical_loss": 3.499026659214885, + "tokens_seen": 1585507328 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002624172517552658, + "loss": 2.7498, + "theoretical_loss": 3.499014042804374, + "tokens_seen": 1585572864 + }, + { + "epoch": 19.01, + "learning_rate": 0.000262407221664995, + "loss": 2.7209, + "theoretical_loss": 3.4990014270613274, + "tokens_seen": 1585638400 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026239719157472417, + "loss": 2.7204, + "theoretical_loss": 3.498988811985682, + "tokens_seen": 1585703936 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026238716148445335, + "loss": 2.8106, + "theoretical_loss": 3.4989761975773748, + "tokens_seen": 1585769472 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3749996, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.527653932571411, + "objective/train/theoretical_loss": 3.4989635838363435, + "objective/train/tokens_used": 1606295008, + "theoretical_loss": 3.4989635838363435, + "tokens_seen": 1585835008 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002623771313941826, + "loss": 2.6801, + "theoretical_loss": 3.4989635838363435, + "tokens_seen": 1585835008 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002623671013039117, + "loss": 2.8172, + "theoretical_loss": 3.4989509707625253, + "tokens_seen": 1585900544 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026235707121364095, + "loss": 2.6515, + "theoretical_loss": 3.4989383583558564, + "tokens_seen": 1585966080 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002623470411233701, + "loss": 2.7441, + "theoretical_loss": 3.498925746616275, + "tokens_seen": 1586031616 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002623370110330993, + "loss": 2.6044, + "theoretical_loss": 3.4989131355437175, + "tokens_seen": 1586097152 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002623269809428285, + "loss": 2.7671, + "theoretical_loss": 3.4989005251381213, + "tokens_seen": 1586162688 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002623169508525577, + "loss": 2.7464, + "theoretical_loss": 3.4988879153994237, + "tokens_seen": 1586228224 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026230692076228686, + "loss": 2.6841, + "theoretical_loss": 3.4988753063275615, + "tokens_seen": 1586293760 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002622968906720161, + "loss": 2.7105, + "theoretical_loss": 3.4988626979224726, + "tokens_seen": 1586359296 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002622868605817452, + "loss": 2.729, + "theoretical_loss": 3.498850090184093, + "tokens_seen": 1586424832 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026227683049147445, + "loss": 2.7461, + "theoretical_loss": 3.4988374831123616, + "tokens_seen": 1586490368 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002622668004012036, + "loss": 2.6101, + "theoretical_loss": 3.4988248767072143, + "tokens_seen": 1586555904 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002622567703109328, + "loss": 2.7085, + "theoretical_loss": 3.498812270968589, + "tokens_seen": 1586621440 + }, + { + "epoch": 19.01, + "learning_rate": 0.000262246740220662, + "loss": 2.6764, + "theoretical_loss": 3.4987996658964224, + "tokens_seen": 1586686976 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002622367101303912, + "loss": 2.685, + "theoretical_loss": 3.4987870614906518, + "tokens_seen": 1586752512 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026222668004012036, + "loss": 2.7063, + "theoretical_loss": 3.4987744577512148, + "tokens_seen": 1586818048 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026221664994984954, + "loss": 2.6898, + "theoretical_loss": 3.4987618546780483, + "tokens_seen": 1586883584 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002622066198595787, + "loss": 2.7655, + "theoretical_loss": 3.4987492522710903, + "tokens_seen": 1586949120 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026219658976930796, + "loss": 2.7149, + "theoretical_loss": 3.498736650530277, + "tokens_seen": 1587014656 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002621865596790371, + "loss": 2.6337, + "theoretical_loss": 3.4987240494555465, + "tokens_seen": 1587080192 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002621765295887663, + "loss": 2.7293, + "theoretical_loss": 3.4987114490468354, + "tokens_seen": 1587145728 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026216649949849545, + "loss": 2.8285, + "theoretical_loss": 3.4986988493040814, + "tokens_seen": 1587211264 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002621564694082247, + "loss": 2.8429, + "theoretical_loss": 3.4986862502272222, + "tokens_seen": 1587276800 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026214643931795386, + "loss": 2.6652, + "theoretical_loss": 3.498673651816195, + "tokens_seen": 1587342336 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026213640922768304, + "loss": 2.7432, + "theoretical_loss": 3.4986610540709364, + "tokens_seen": 1587407872 + }, + { + "epoch": 19.01, + "objective/train/docs_used": 3754902, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8833093643188477, + "objective/train/theoretical_loss": 3.4986484569913836, + "objective/train/tokens_used": 1607933408, + "theoretical_loss": 3.4986484569913836, + "tokens_seen": 1587473408 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002621263791374122, + "loss": 2.8193, + "theoretical_loss": 3.4986484569913836, + "tokens_seen": 1587473408 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026211634904714146, + "loss": 2.7509, + "theoretical_loss": 3.498635860577475, + "tokens_seen": 1587538944 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002621063189568706, + "loss": 2.5571, + "theoretical_loss": 3.498623264829148, + "tokens_seen": 1587604480 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002620962888665998, + "loss": 2.7453, + "theoretical_loss": 3.498610669746339, + "tokens_seen": 1587670016 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026208625877632895, + "loss": 2.7048, + "theoretical_loss": 3.498598075328986, + "tokens_seen": 1587735552 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002620762286860582, + "loss": 2.7427, + "theoretical_loss": 3.4985854815770256, + "tokens_seen": 1587801088 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026206619859578737, + "loss": 2.7177, + "theoretical_loss": 3.498572888490396, + "tokens_seen": 1587866624 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026205616850551655, + "loss": 2.7676, + "theoretical_loss": 3.498560296069035, + "tokens_seen": 1587932160 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026204613841524573, + "loss": 2.6979, + "theoretical_loss": 3.4985477043128785, + "tokens_seen": 1587997696 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002620361083249749, + "loss": 2.6365, + "theoretical_loss": 3.498535113221865, + "tokens_seen": 1588063232 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002620260782347041, + "loss": 2.6797, + "theoretical_loss": 3.4985225227959322, + "tokens_seen": 1588128768 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002620160481444333, + "loss": 2.735, + "theoretical_loss": 3.4985099330350162, + "tokens_seen": 1588194304 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002620060180541625, + "loss": 2.6488, + "theoretical_loss": 3.498497343939056, + "tokens_seen": 1588259840 + }, + { + "epoch": 19.01, + "learning_rate": 0.0002619959879638917, + "loss": 2.7199, + "theoretical_loss": 3.498484755507988, + "tokens_seen": 1588325376 + }, + { + "epoch": 19.01, + "learning_rate": 0.00026198595787362087, + "loss": 2.6534, + "theoretical_loss": 3.4984721677417494, + "tokens_seen": 1588390912 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026197592778335005, + "loss": 2.7739, + "theoretical_loss": 3.4984595806402785, + "tokens_seen": 1588456448 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002619658976930793, + "loss": 2.6089, + "theoretical_loss": 3.498446994203513, + "tokens_seen": 1588521984 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002619558676028084, + "loss": 2.6581, + "theoretical_loss": 3.498434408431389, + "tokens_seen": 1588587520 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026194583751253765, + "loss": 2.774, + "theoretical_loss": 3.498421823323845, + "tokens_seen": 1588653056 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026193580742226683, + "loss": 2.8332, + "theoretical_loss": 3.498409238880819, + "tokens_seen": 1588718592 + }, + { + "epoch": 19.02, + "learning_rate": 0.000261925777331996, + "loss": 2.7351, + "theoretical_loss": 3.4983966551022476, + "tokens_seen": 1588784128 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002619157472417252, + "loss": 2.7461, + "theoretical_loss": 3.498384071988068, + "tokens_seen": 1588849664 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026190571715145437, + "loss": 2.676, + "theoretical_loss": 3.4983714895382185, + "tokens_seen": 1588915200 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026189568706118355, + "loss": 2.7785, + "theoretical_loss": 3.4983589077526362, + "tokens_seen": 1588980736 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002618856569709128, + "loss": 2.7516, + "theoretical_loss": 3.4983463266312596, + "tokens_seen": 1589046272 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3757728, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8163645267486572, + "objective/train/theoretical_loss": 3.4983337461740245, + "objective/train/tokens_used": 1609571808, + "theoretical_loss": 3.4983337461740245, + "tokens_seen": 1589111808 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002618756268806419, + "loss": 2.8023, + "theoretical_loss": 3.4983337461740245, + "tokens_seen": 1589111808 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026186559679037115, + "loss": 2.7363, + "theoretical_loss": 3.4983211663808698, + "tokens_seen": 1589177344 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002618555667001003, + "loss": 2.7495, + "theoretical_loss": 3.4983085872517328, + "tokens_seen": 1589242880 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002618455366098295, + "loss": 2.731, + "theoretical_loss": 3.498296008786551, + "tokens_seen": 1589308416 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002618355065195587, + "loss": 2.73, + "theoretical_loss": 3.4982834309852615, + "tokens_seen": 1589373952 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002618254764292879, + "loss": 2.691, + "theoretical_loss": 3.498270853847803, + "tokens_seen": 1589439488 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026181544633901706, + "loss": 2.7962, + "theoretical_loss": 3.4982582773741115, + "tokens_seen": 1589505024 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002618054162487463, + "loss": 2.6786, + "theoretical_loss": 3.498245701564126, + "tokens_seen": 1589570560 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002617953861584754, + "loss": 2.7431, + "theoretical_loss": 3.4982331264177837, + "tokens_seen": 1589636096 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026178535606820465, + "loss": 2.7473, + "theoretical_loss": 3.498220551935022, + "tokens_seen": 1589701632 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002617753259779338, + "loss": 2.8133, + "theoretical_loss": 3.4982079781157784, + "tokens_seen": 1589767168 + }, + { + "epoch": 19.02, + "learning_rate": 0.000261765295887663, + "loss": 2.7603, + "theoretical_loss": 3.4981954049599917, + "tokens_seen": 1589832704 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002617552657973922, + "loss": 2.689, + "theoretical_loss": 3.498182832467598, + "tokens_seen": 1589898240 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002617452357071214, + "loss": 2.7212, + "theoretical_loss": 3.4981702606385356, + "tokens_seen": 1589963776 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026173520561685056, + "loss": 2.7517, + "theoretical_loss": 3.498157689472742, + "tokens_seen": 1590029312 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026172517552657974, + "loss": 2.7013, + "theoretical_loss": 3.4981451189701556, + "tokens_seen": 1590094848 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002617151454363089, + "loss": 2.7765, + "theoretical_loss": 3.4981325491307134, + "tokens_seen": 1590160384 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026170511534603816, + "loss": 2.7722, + "theoretical_loss": 3.4981199799543528, + "tokens_seen": 1590225920 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002616950852557673, + "loss": 2.6079, + "theoretical_loss": 3.498107411441012, + "tokens_seen": 1590291456 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002616850551654965, + "loss": 2.7593, + "theoretical_loss": 3.4980948435906285, + "tokens_seen": 1590356992 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026167502507522565, + "loss": 2.7465, + "theoretical_loss": 3.4980822764031405, + "tokens_seen": 1590422528 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002616649949849549, + "loss": 2.6814, + "theoretical_loss": 3.4980697098784854, + "tokens_seen": 1590488064 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026165496489468406, + "loss": 2.8159, + "theoretical_loss": 3.4980571440166006, + "tokens_seen": 1590553600 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026164493480441324, + "loss": 2.7064, + "theoretical_loss": 3.4980445788174244, + "tokens_seen": 1590619136 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002616349047141424, + "loss": 2.7482, + "theoretical_loss": 3.498032014280894, + "tokens_seen": 1590684672 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3761363, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8401925563812256, + "objective/train/theoretical_loss": 3.4980194504069475, + "objective/train/tokens_used": 1611210208, + "theoretical_loss": 3.4980194504069475, + "tokens_seen": 1590750208 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026162487462387166, + "loss": 2.7501, + "theoretical_loss": 3.4980194504069475, + "tokens_seen": 1590750208 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002616148445336008, + "loss": 2.7107, + "theoretical_loss": 3.4980068871955226, + "tokens_seen": 1590815744 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026160481444333, + "loss": 2.6017, + "theoretical_loss": 3.497994324646557, + "tokens_seen": 1590881280 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026159478435305915, + "loss": 2.7797, + "theoretical_loss": 3.4979817627599887, + "tokens_seen": 1590946816 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002615847542627884, + "loss": 2.5661, + "theoretical_loss": 3.497969201535755, + "tokens_seen": 1591012352 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026157472417251757, + "loss": 2.7336, + "theoretical_loss": 3.4979566409737943, + "tokens_seen": 1591077888 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026156469408224675, + "loss": 2.8078, + "theoretical_loss": 3.497944081074044, + "tokens_seen": 1591143424 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026155466399197593, + "loss": 2.8355, + "theoretical_loss": 3.497931521836442, + "tokens_seen": 1591208960 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002615446339017051, + "loss": 2.7469, + "theoretical_loss": 3.4979189632609264, + "tokens_seen": 1591274496 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002615346038114343, + "loss": 2.6923, + "theoretical_loss": 3.4979064053474347, + "tokens_seen": 1591340032 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002615245737211635, + "loss": 2.654, + "theoretical_loss": 3.497893848095905, + "tokens_seen": 1591405568 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026151454363089265, + "loss": 2.7563, + "theoretical_loss": 3.4978812915062747, + "tokens_seen": 1591471104 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002615045135406219, + "loss": 2.7919, + "theoretical_loss": 3.497868735578482, + "tokens_seen": 1591536640 + }, + { + "epoch": 19.02, + "learning_rate": 0.000261494483450351, + "loss": 2.7301, + "theoretical_loss": 3.4978561803124646, + "tokens_seen": 1591602176 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026148445336008025, + "loss": 2.6944, + "theoretical_loss": 3.4978436257081604, + "tokens_seen": 1591667712 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026147442326980943, + "loss": 2.6633, + "theoretical_loss": 3.497831071765508, + "tokens_seen": 1591733248 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002614643931795386, + "loss": 2.6766, + "theoretical_loss": 3.4978185184844444, + "tokens_seen": 1591798784 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002614543630892678, + "loss": 2.7233, + "theoretical_loss": 3.497805965864907, + "tokens_seen": 1591864320 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026144433299899703, + "loss": 2.7983, + "theoretical_loss": 3.4977934139068356, + "tokens_seen": 1591929856 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026143430290872616, + "loss": 2.768, + "theoretical_loss": 3.497780862610166, + "tokens_seen": 1591995392 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002614242728184554, + "loss": 2.7114, + "theoretical_loss": 3.4977683119748377, + "tokens_seen": 1592060928 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002614142427281845, + "loss": 2.7205, + "theoretical_loss": 3.4977557620007875, + "tokens_seen": 1592126464 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026140421263791375, + "loss": 2.6877, + "theoretical_loss": 3.497743212687954, + "tokens_seen": 1592192000 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026139418254764293, + "loss": 2.7742, + "theoretical_loss": 3.4977306640362755, + "tokens_seen": 1592257536 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002613841524573721, + "loss": 2.6817, + "theoretical_loss": 3.4977181160456894, + "tokens_seen": 1592323072 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3766619, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.509535312652588, + "objective/train/theoretical_loss": 3.4977055687161336, + "objective/train/tokens_used": 1612848608, + "theoretical_loss": 3.4977055687161336, + "tokens_seen": 1592388608 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002613741223671013, + "loss": 2.6897, + "theoretical_loss": 3.4977055687161336, + "tokens_seen": 1592388608 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002613640922768305, + "loss": 2.698, + "theoretical_loss": 3.497693022047546, + "tokens_seen": 1592454144 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026135406218655966, + "loss": 2.7395, + "theoretical_loss": 3.4976804760398648, + "tokens_seen": 1592519680 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002613440320962889, + "loss": 2.7449, + "theoretical_loss": 3.4976679306930283, + "tokens_seen": 1592585216 + }, + { + "epoch": 19.02, + "learning_rate": 0.000261334002006018, + "loss": 2.7052, + "theoretical_loss": 3.497655386006974, + "tokens_seen": 1592650752 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026132397191574726, + "loss": 2.6624, + "theoretical_loss": 3.49764284198164, + "tokens_seen": 1592716288 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002613139418254764, + "loss": 2.7965, + "theoretical_loss": 3.497630298616964, + "tokens_seen": 1592781824 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002613039117352056, + "loss": 2.7286, + "theoretical_loss": 3.497617755912885, + "tokens_seen": 1592847360 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002612938816449348, + "loss": 2.714, + "theoretical_loss": 3.497605213869341, + "tokens_seen": 1592912896 + }, + { + "epoch": 19.02, + "learning_rate": 0.000261283851554664, + "loss": 2.6833, + "theoretical_loss": 3.4975926724862685, + "tokens_seen": 1592978432 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026127382146439316, + "loss": 2.7179, + "theoretical_loss": 3.497580131763607, + "tokens_seen": 1593043968 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002612637913741224, + "loss": 2.6774, + "theoretical_loss": 3.497567591701294, + "tokens_seen": 1593109504 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002612537612838516, + "loss": 2.7297, + "theoretical_loss": 3.4975550522992678, + "tokens_seen": 1593175040 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026124373119358076, + "loss": 2.7564, + "theoretical_loss": 3.497542513557466, + "tokens_seen": 1593240576 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026123370110330994, + "loss": 2.6559, + "theoretical_loss": 3.4975299754758273, + "tokens_seen": 1593306112 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002612236710130391, + "loss": 2.7664, + "theoretical_loss": 3.4975174380542895, + "tokens_seen": 1593371648 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026121364092276836, + "loss": 2.7208, + "theoretical_loss": 3.497504901292791, + "tokens_seen": 1593437184 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002612036108324975, + "loss": 2.7852, + "theoretical_loss": 3.4974923651912695, + "tokens_seen": 1593502720 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002611935807422267, + "loss": 2.7822, + "theoretical_loss": 3.497479829749663, + "tokens_seen": 1593568256 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026118355065195585, + "loss": 2.7456, + "theoretical_loss": 3.49746729496791, + "tokens_seen": 1593633792 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002611735205616851, + "loss": 2.7967, + "theoretical_loss": 3.4974547608459483, + "tokens_seen": 1593699328 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026116349047141426, + "loss": 2.7592, + "theoretical_loss": 3.4974422273837162, + "tokens_seen": 1593764864 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026115346038114344, + "loss": 2.72, + "theoretical_loss": 3.497429694581152, + "tokens_seen": 1593830400 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002611434302908726, + "loss": 2.7721, + "theoretical_loss": 3.497417162438194, + "tokens_seen": 1593895936 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026113340020060186, + "loss": 2.791, + "theoretical_loss": 3.4974046309547804, + "tokens_seen": 1593961472 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3769417, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.919240713119507, + "objective/train/theoretical_loss": 3.497392100130848, + "objective/train/tokens_used": 1614487008, + "theoretical_loss": 3.497392100130848, + "tokens_seen": 1594027008 + }, + { + "epoch": 19.02, + "learning_rate": 0.000261123370110331, + "loss": 2.8658, + "theoretical_loss": 3.497392100130848, + "tokens_seen": 1594027008 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002611133400200602, + "loss": 2.7905, + "theoretical_loss": 3.4973795699663373, + "tokens_seen": 1594092544 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026110330992978935, + "loss": 2.7535, + "theoretical_loss": 3.4973670404611843, + "tokens_seen": 1594158080 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002610932798395186, + "loss": 2.7351, + "theoretical_loss": 3.497354511615329, + "tokens_seen": 1594223616 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026108324974924777, + "loss": 2.7966, + "theoretical_loss": 3.497341983428708, + "tokens_seen": 1594289152 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026107321965897695, + "loss": 2.7682, + "theoretical_loss": 3.4973294559012604, + "tokens_seen": 1594354688 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026106318956870613, + "loss": 2.8539, + "theoretical_loss": 3.497316929032925, + "tokens_seen": 1594420224 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002610531594784353, + "loss": 2.785, + "theoretical_loss": 3.4973044028236386, + "tokens_seen": 1594485760 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002610431293881645, + "loss": 2.7362, + "theoretical_loss": 3.497291877273341, + "tokens_seen": 1594551296 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002610330992978937, + "loss": 2.7594, + "theoretical_loss": 3.4972793523819687, + "tokens_seen": 1594616832 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026102306920762285, + "loss": 2.5307, + "theoretical_loss": 3.4972668281494608, + "tokens_seen": 1594682368 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002610130391173521, + "loss": 2.7933, + "theoretical_loss": 3.4972543045757565, + "tokens_seen": 1594747904 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002610030090270812, + "loss": 2.7415, + "theoretical_loss": 3.497241781660793, + "tokens_seen": 1594813440 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026099297893681045, + "loss": 2.7134, + "theoretical_loss": 3.4972292594045085, + "tokens_seen": 1594878976 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026098294884653963, + "loss": 2.6554, + "theoretical_loss": 3.4972167378068417, + "tokens_seen": 1594944512 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002609729187562688, + "loss": 2.6884, + "theoretical_loss": 3.497204216867731, + "tokens_seen": 1595010048 + }, + { + "epoch": 19.02, + "learning_rate": 0.000260962888665998, + "loss": 2.6602, + "theoretical_loss": 3.497191696587114, + "tokens_seen": 1595075584 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026095285857572723, + "loss": 2.7409, + "theoretical_loss": 3.49717917696493, + "tokens_seen": 1595141120 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026094282848545636, + "loss": 2.7363, + "theoretical_loss": 3.4971666580011167, + "tokens_seen": 1595206656 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002609327983951856, + "loss": 2.8252, + "theoretical_loss": 3.4971541396956125, + "tokens_seen": 1595272192 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002609227683049147, + "loss": 2.6876, + "theoretical_loss": 3.497141622048356, + "tokens_seen": 1595337728 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026091273821464395, + "loss": 2.7559, + "theoretical_loss": 3.497129105059286, + "tokens_seen": 1595403264 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026090270812437313, + "loss": 2.6822, + "theoretical_loss": 3.497116588728339, + "tokens_seen": 1595468800 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002608926780341023, + "loss": 2.6983, + "theoretical_loss": 3.4971040730554552, + "tokens_seen": 1595534336 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002608826479438315, + "loss": 2.7784, + "theoretical_loss": 3.497091558040572, + "tokens_seen": 1595599872 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3774213, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7892284393310547, + "objective/train/theoretical_loss": 3.4970790436836285, + "objective/train/tokens_used": 1616125408, + "theoretical_loss": 3.4970790436836285, + "tokens_seen": 1595665408 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002608726178535607, + "loss": 2.7623, + "theoretical_loss": 3.4970790436836285, + "tokens_seen": 1595665408 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026086258776328986, + "loss": 2.747, + "theoretical_loss": 3.497066529984563, + "tokens_seen": 1595730944 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002608525576730191, + "loss": 2.7158, + "theoretical_loss": 3.4970540169433133, + "tokens_seen": 1595796480 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002608425275827482, + "loss": 2.7493, + "theoretical_loss": 3.497041504559818, + "tokens_seen": 1595862016 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026083249749247746, + "loss": 2.7418, + "theoretical_loss": 3.497028992834016, + "tokens_seen": 1595927552 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002608224674022066, + "loss": 2.7588, + "theoretical_loss": 3.4970164817658453, + "tokens_seen": 1595993088 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002608124373119358, + "loss": 2.7809, + "theoretical_loss": 3.4970039713552445, + "tokens_seen": 1596058624 + }, + { + "epoch": 19.02, + "learning_rate": 0.000260802407221665, + "loss": 2.7615, + "theoretical_loss": 3.4969914616021516, + "tokens_seen": 1596124160 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002607923771313942, + "loss": 2.6479, + "theoretical_loss": 3.496978952506506, + "tokens_seen": 1596189696 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026078234704112336, + "loss": 2.7543, + "theoretical_loss": 3.496966444068245, + "tokens_seen": 1596255232 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002607723169508526, + "loss": 2.7345, + "theoretical_loss": 3.4969539362873077, + "tokens_seen": 1596320768 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002607622868605817, + "loss": 2.7985, + "theoretical_loss": 3.4969414291636327, + "tokens_seen": 1596386304 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026075225677031096, + "loss": 2.621, + "theoretical_loss": 3.4969289226971583, + "tokens_seen": 1596451840 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002607422266800401, + "loss": 2.7816, + "theoretical_loss": 3.4969164168878226, + "tokens_seen": 1596517376 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002607321965897693, + "loss": 2.6776, + "theoretical_loss": 3.496903911735565, + "tokens_seen": 1596582912 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002607221664994985, + "loss": 2.6848, + "theoretical_loss": 3.496891407240323, + "tokens_seen": 1596648448 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002607121364092277, + "loss": 2.7653, + "theoretical_loss": 3.4968789034020364, + "tokens_seen": 1596713984 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026070210631895687, + "loss": 2.7834, + "theoretical_loss": 3.496866400220642, + "tokens_seen": 1596779520 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026069207622868605, + "loss": 2.813, + "theoretical_loss": 3.49685389769608, + "tokens_seen": 1596845056 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026068204613841523, + "loss": 2.8023, + "theoretical_loss": 3.4968413958282873, + "tokens_seen": 1596910592 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026067201604814446, + "loss": 2.7988, + "theoretical_loss": 3.496828894617204, + "tokens_seen": 1596976128 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002606619859578736, + "loss": 2.6683, + "theoretical_loss": 3.4968163940627677, + "tokens_seen": 1597041664 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002606519558676028, + "loss": 2.5864, + "theoretical_loss": 3.496803894164917, + "tokens_seen": 1597107200 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026064192577733195, + "loss": 2.8189, + "theoretical_loss": 3.4967913949235916, + "tokens_seen": 1597172736 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002606318956870612, + "loss": 2.7917, + "theoretical_loss": 3.4967788963387285, + "tokens_seen": 1597238272 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3777029, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7662336826324463, + "objective/train/theoretical_loss": 3.4967663984102675, + "objective/train/tokens_used": 1617763808, + "theoretical_loss": 3.4967663984102675, + "tokens_seen": 1597303808 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026062186559679037, + "loss": 2.786, + "theoretical_loss": 3.4967663984102675, + "tokens_seen": 1597303808 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026061183550651955, + "loss": 2.7057, + "theoretical_loss": 3.4967539011381463, + "tokens_seen": 1597369344 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026060180541624873, + "loss": 2.7973, + "theoretical_loss": 3.4967414045223038, + "tokens_seen": 1597434880 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026059177532597797, + "loss": 2.6943, + "theoretical_loss": 3.4967289085626794, + "tokens_seen": 1597500416 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002605817452357071, + "loss": 2.7342, + "theoretical_loss": 3.49671641325921, + "tokens_seen": 1597565952 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026057171514543633, + "loss": 2.6489, + "theoretical_loss": 3.496703918611836, + "tokens_seen": 1597631488 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026056168505516546, + "loss": 2.7855, + "theoretical_loss": 3.4966914246204954, + "tokens_seen": 1597697024 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002605516549648947, + "loss": 2.7265, + "theoretical_loss": 3.4966789312851265, + "tokens_seen": 1597762560 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026054162487462387, + "loss": 2.7136, + "theoretical_loss": 3.4966664386056685, + "tokens_seen": 1597828096 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026053159478435305, + "loss": 2.679, + "theoretical_loss": 3.496653946582059, + "tokens_seen": 1597893632 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026052156469408223, + "loss": 2.7015, + "theoretical_loss": 3.496641455214238, + "tokens_seen": 1597959168 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002605115346038114, + "loss": 2.6851, + "theoretical_loss": 3.4966289645021438, + "tokens_seen": 1598024704 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026050150451354065, + "loss": 2.7172, + "theoretical_loss": 3.4966164744457147, + "tokens_seen": 1598090240 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026049147442326983, + "loss": 2.8136, + "theoretical_loss": 3.4966039850448896, + "tokens_seen": 1598155776 + }, + { + "epoch": 19.02, + "learning_rate": 0.000260481444332999, + "loss": 2.7076, + "theoretical_loss": 3.496591496299607, + "tokens_seen": 1598221312 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002604714142427282, + "loss": 2.7598, + "theoretical_loss": 3.4965790082098063, + "tokens_seen": 1598286848 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026046138415245743, + "loss": 2.6125, + "theoretical_loss": 3.4965665207754255, + "tokens_seen": 1598352384 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026045135406218656, + "loss": 2.8139, + "theoretical_loss": 3.4965540339964036, + "tokens_seen": 1598417920 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002604413239719158, + "loss": 2.7189, + "theoretical_loss": 3.4965415478726793, + "tokens_seen": 1598483456 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002604312938816449, + "loss": 2.6774, + "theoretical_loss": 3.4965290624041914, + "tokens_seen": 1598548992 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026042126379137415, + "loss": 2.7949, + "theoretical_loss": 3.496516577590879, + "tokens_seen": 1598614528 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026041123370110333, + "loss": 2.826, + "theoretical_loss": 3.49650409343268, + "tokens_seen": 1598680064 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002604012036108325, + "loss": 2.7567, + "theoretical_loss": 3.4964916099295333, + "tokens_seen": 1598745600 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002603911735205617, + "loss": 2.6247, + "theoretical_loss": 3.4964791270813786, + "tokens_seen": 1598811136 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002603811434302909, + "loss": 2.6678, + "theoretical_loss": 3.496466644888154, + "tokens_seen": 1598876672 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3780805, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.791797161102295, + "objective/train/theoretical_loss": 3.4964541633497985, + "objective/train/tokens_used": 1619402208, + "theoretical_loss": 3.4964541633497985, + "tokens_seen": 1598942208 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026037111334002006, + "loss": 2.7948, + "theoretical_loss": 3.4964541633497985, + "tokens_seen": 1598942208 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002603610832497493, + "loss": 2.6496, + "theoretical_loss": 3.4964416824662505, + "tokens_seen": 1599007744 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002603510531594784, + "loss": 2.6962, + "theoretical_loss": 3.4964292022374495, + "tokens_seen": 1599073280 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026034102306920766, + "loss": 2.7887, + "theoretical_loss": 3.4964167226633336, + "tokens_seen": 1599138816 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002603309929789368, + "loss": 2.6349, + "theoretical_loss": 3.496404243743842, + "tokens_seen": 1599204352 + }, + { + "epoch": 19.02, + "learning_rate": 0.000260320962888666, + "loss": 2.7116, + "theoretical_loss": 3.496391765478914, + "tokens_seen": 1599269888 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002603109327983952, + "loss": 2.6937, + "theoretical_loss": 3.496379287868488, + "tokens_seen": 1599335424 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002603009027081244, + "loss": 2.6936, + "theoretical_loss": 3.4963668109125026, + "tokens_seen": 1599400960 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026029087261785356, + "loss": 2.7333, + "theoretical_loss": 3.4963543346108965, + "tokens_seen": 1599466496 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002602808425275828, + "loss": 2.7633, + "theoretical_loss": 3.496341858963609, + "tokens_seen": 1599532032 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002602708124373119, + "loss": 2.7774, + "theoretical_loss": 3.496329383970579, + "tokens_seen": 1599597568 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026026078234704116, + "loss": 2.7511, + "theoretical_loss": 3.4963169096317457, + "tokens_seen": 1599663104 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002602507522567703, + "loss": 2.7427, + "theoretical_loss": 3.4963044359470477, + "tokens_seen": 1599728640 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002602407221664995, + "loss": 2.667, + "theoretical_loss": 3.4962919629164233, + "tokens_seen": 1599794176 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002602306920762287, + "loss": 2.6454, + "theoretical_loss": 3.496279490539812, + "tokens_seen": 1599859712 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002602206619859579, + "loss": 2.6757, + "theoretical_loss": 3.4962670188171527, + "tokens_seen": 1599925248 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026021063189568707, + "loss": 2.7256, + "theoretical_loss": 3.496254547748384, + "tokens_seen": 1599990784 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026020060180541625, + "loss": 2.8173, + "theoretical_loss": 3.4962420773334455, + "tokens_seen": 1600056320 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026019057171514543, + "loss": 2.7479, + "theoretical_loss": 3.4962296075722756, + "tokens_seen": 1600121856 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026018054162487466, + "loss": 2.692, + "theoretical_loss": 3.4962171384648135, + "tokens_seen": 1600187392 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002601705115346038, + "loss": 2.6463, + "theoretical_loss": 3.496204670010998, + "tokens_seen": 1600252928 + }, + { + "epoch": 19.02, + "learning_rate": 0.000260160481444333, + "loss": 2.7308, + "theoretical_loss": 3.496192202210768, + "tokens_seen": 1600318464 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026015045135406215, + "loss": 2.7496, + "theoretical_loss": 3.4961797350640627, + "tokens_seen": 1600384000 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002601404212637914, + "loss": 2.6512, + "theoretical_loss": 3.4961672685708205, + "tokens_seen": 1600449536 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026013039117352057, + "loss": 2.7108, + "theoretical_loss": 3.496154802730982, + "tokens_seen": 1600515072 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.63750958442688, + "objective/train/theoretical_loss": 3.496142337544484, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.496142337544484, + "tokens_seen": 1600580608 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026012036108324975, + "loss": 2.6824, + "theoretical_loss": 3.496142337544484, + "tokens_seen": 1600580608 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026011033099297893, + "loss": 2.6943, + "theoretical_loss": 3.4961298730112667, + "tokens_seen": 1600646144 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026010030090270817, + "loss": 2.728, + "theoretical_loss": 3.4961174091312692, + "tokens_seen": 1600711680 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002600902708124373, + "loss": 2.775, + "theoretical_loss": 3.4961049459044307, + "tokens_seen": 1600777216 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026008024072216653, + "loss": 2.6933, + "theoretical_loss": 3.4960924833306892, + "tokens_seen": 1600842752 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026007021063189566, + "loss": 2.7793, + "theoretical_loss": 3.496080021409985, + "tokens_seen": 1600908288 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002600601805416249, + "loss": 2.7823, + "theoretical_loss": 3.496067560142256, + "tokens_seen": 1600973824 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026005015045135407, + "loss": 2.6766, + "theoretical_loss": 3.4960550995274424, + "tokens_seen": 1601039360 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026004012036108325, + "loss": 2.7374, + "theoretical_loss": 3.4960426395654824, + "tokens_seen": 1601104896 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026003009027081243, + "loss": 2.6137, + "theoretical_loss": 3.4960301802563154, + "tokens_seen": 1601170432 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002600200601805416, + "loss": 2.7237, + "theoretical_loss": 3.4960177215998804, + "tokens_seen": 1601235968 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002600100300902708, + "loss": 2.6932, + "theoretical_loss": 3.4960052635961163, + "tokens_seen": 1601301504 + }, + { + "epoch": 19.02, + "learning_rate": 0.00026000000000000003, + "loss": 2.7107, + "theoretical_loss": 3.4959928062449626, + "tokens_seen": 1601367040 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025998996990972916, + "loss": 2.7005, + "theoretical_loss": 3.4959803495463584, + "tokens_seen": 1601432576 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002599799398194584, + "loss": 2.7862, + "theoretical_loss": 3.495967893500242, + "tokens_seen": 1601498112 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002599699097291876, + "loss": 2.7396, + "theoretical_loss": 3.495955438106554, + "tokens_seen": 1601563648 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025995987963891676, + "loss": 2.6456, + "theoretical_loss": 3.4959429833652322, + "tokens_seen": 1601629184 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025994984954864594, + "loss": 2.7878, + "theoretical_loss": 3.495930529276216, + "tokens_seen": 1601694720 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002599398194583751, + "loss": 2.7003, + "theoretical_loss": 3.4959180758394455, + "tokens_seen": 1601760256 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002599297893681043, + "loss": 2.7436, + "theoretical_loss": 3.495905623054859, + "tokens_seen": 1601825792 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025991975927783353, + "loss": 2.7369, + "theoretical_loss": 3.4958931709223955, + "tokens_seen": 1601891328 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025990972918756266, + "loss": 2.7736, + "theoretical_loss": 3.4958807194419945, + "tokens_seen": 1601956864 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002598996990972919, + "loss": 2.715, + "theoretical_loss": 3.4958682686135956, + "tokens_seen": 1602022400 + }, + { + "epoch": 19.02, + "learning_rate": 0.000259889669007021, + "loss": 2.7922, + "theoretical_loss": 3.495855818437137, + "tokens_seen": 1602087936 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025987963891675026, + "loss": 2.7307, + "theoretical_loss": 3.495843368912559, + "tokens_seen": 1602153472 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6949098110198975, + "objective/train/theoretical_loss": 3.4958309200397997, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.4958309200397997, + "tokens_seen": 1602219008 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025986960882647944, + "loss": 2.7156, + "theoretical_loss": 3.4958309200397997, + "tokens_seen": 1602219008 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002598595787362086, + "loss": 2.7683, + "theoretical_loss": 3.495818471818799, + "tokens_seen": 1602284544 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002598495486459378, + "loss": 2.7721, + "theoretical_loss": 3.4958060242494957, + "tokens_seen": 1602350080 + }, + { + "epoch": 19.02, + "learning_rate": 0.000259839518555667, + "loss": 2.6094, + "theoretical_loss": 3.49579357733183, + "tokens_seen": 1602415616 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025982948846539616, + "loss": 2.7839, + "theoretical_loss": 3.49578113106574, + "tokens_seen": 1602481152 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002598194583751254, + "loss": 2.7462, + "theoretical_loss": 3.495768685451165, + "tokens_seen": 1602546688 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002598094282848545, + "loss": 2.8142, + "theoretical_loss": 3.4957562404880456, + "tokens_seen": 1602612224 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025979939819458376, + "loss": 2.7279, + "theoretical_loss": 3.4957437961763196, + "tokens_seen": 1602677760 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025978936810431294, + "loss": 2.7283, + "theoretical_loss": 3.495731352515927, + "tokens_seen": 1602743296 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002597793380140421, + "loss": 2.7866, + "theoretical_loss": 3.495718909506807, + "tokens_seen": 1602808832 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002597693079237713, + "loss": 2.6461, + "theoretical_loss": 3.4957064671488984, + "tokens_seen": 1602874368 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002597592778335005, + "loss": 2.7732, + "theoretical_loss": 3.4956940254421407, + "tokens_seen": 1602939904 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002597492477432297, + "loss": 2.8069, + "theoretical_loss": 3.4956815843864737, + "tokens_seen": 1603005440 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002597392176529589, + "loss": 2.8023, + "theoretical_loss": 3.495669143981836, + "tokens_seen": 1603070976 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002597291875626881, + "loss": 2.6416, + "theoretical_loss": 3.4956567042281677, + "tokens_seen": 1603136512 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025971915747241727, + "loss": 2.7018, + "theoretical_loss": 3.4956442651254074, + "tokens_seen": 1603202048 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025970912738214645, + "loss": 2.7654, + "theoretical_loss": 3.495631826673495, + "tokens_seen": 1603267584 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025969909729187563, + "loss": 2.6813, + "theoretical_loss": 3.4956193888723694, + "tokens_seen": 1603333120 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025968906720160486, + "loss": 2.6894, + "theoretical_loss": 3.49560695172197, + "tokens_seen": 1603398656 + }, + { + "epoch": 19.02, + "learning_rate": 0.000259679037111334, + "loss": 2.7036, + "theoretical_loss": 3.4955945152222365, + "tokens_seen": 1603464192 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002596690070210632, + "loss": 2.6539, + "theoretical_loss": 3.495582079373108, + "tokens_seen": 1603529728 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025965897693079235, + "loss": 2.7462, + "theoretical_loss": 3.495569644174524, + "tokens_seen": 1603595264 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002596489468405216, + "loss": 2.7088, + "theoretical_loss": 3.4955572096264236, + "tokens_seen": 1603660800 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025963891675025077, + "loss": 2.733, + "theoretical_loss": 3.495544775728747, + "tokens_seen": 1603726336 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025962888665997995, + "loss": 2.6651, + "theoretical_loss": 3.4955323424814324, + "tokens_seen": 1603791872 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.750046730041504, + "objective/train/theoretical_loss": 3.49551990988442, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.49551990988442, + "tokens_seen": 1603857408 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025961885656970913, + "loss": 2.7183, + "theoretical_loss": 3.49551990988442, + "tokens_seen": 1603857408 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025960882647943837, + "loss": 2.658, + "theoretical_loss": 3.4955074779376494, + "tokens_seen": 1603922944 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002595987963891675, + "loss": 2.7651, + "theoretical_loss": 3.495495046641059, + "tokens_seen": 1603988480 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025958876629889673, + "loss": 2.6791, + "theoretical_loss": 3.4954826159945895, + "tokens_seen": 1604054016 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025957873620862586, + "loss": 2.7494, + "theoretical_loss": 3.4954701859981796, + "tokens_seen": 1604119552 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002595687061183551, + "loss": 2.7144, + "theoretical_loss": 3.495457756651769, + "tokens_seen": 1604185088 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025955867602808427, + "loss": 2.7614, + "theoretical_loss": 3.495445327955297, + "tokens_seen": 1604250624 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025954864593781345, + "loss": 2.7812, + "theoretical_loss": 3.4954328999087023, + "tokens_seen": 1604316160 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025953861584754263, + "loss": 2.7768, + "theoretical_loss": 3.4954204725119262, + "tokens_seen": 1604381696 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002595285857572718, + "loss": 2.6633, + "theoretical_loss": 3.495408045764907, + "tokens_seen": 1604447232 + }, + { + "epoch": 19.02, + "learning_rate": 0.000259518555667001, + "loss": 2.7046, + "theoretical_loss": 3.495395619667584, + "tokens_seen": 1604512768 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025950852557673023, + "loss": 2.6912, + "theoretical_loss": 3.4953831942198974, + "tokens_seen": 1604578304 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025949849548645936, + "loss": 2.69, + "theoretical_loss": 3.495370769421786, + "tokens_seen": 1604643840 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002594884653961886, + "loss": 2.7588, + "theoretical_loss": 3.49535834527319, + "tokens_seen": 1604709376 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002594784353059178, + "loss": 2.7545, + "theoretical_loss": 3.4953459217740486, + "tokens_seen": 1604774912 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025946840521564696, + "loss": 2.7072, + "theoretical_loss": 3.4953334989243006, + "tokens_seen": 1604840448 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025945837512537614, + "loss": 2.7092, + "theoretical_loss": 3.4953210767238874, + "tokens_seen": 1604905984 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002594483450351053, + "loss": 2.7213, + "theoretical_loss": 3.495308655172747, + "tokens_seen": 1604971520 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002594383149448345, + "loss": 2.7216, + "theoretical_loss": 3.4952962342708185, + "tokens_seen": 1605037056 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025942828485456373, + "loss": 2.7774, + "theoretical_loss": 3.4952838140180433, + "tokens_seen": 1605102592 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025941825476429286, + "loss": 2.7121, + "theoretical_loss": 3.4952713944143596, + "tokens_seen": 1605168128 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002594082246740221, + "loss": 2.7373, + "theoretical_loss": 3.4952589754597074, + "tokens_seen": 1605233664 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002593981945837512, + "loss": 2.6626, + "theoretical_loss": 3.4952465571540268, + "tokens_seen": 1605299200 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025938816449348046, + "loss": 2.8092, + "theoretical_loss": 3.4952341394972564, + "tokens_seen": 1605364736 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025937813440320964, + "loss": 2.766, + "theoretical_loss": 3.4952217224893367, + "tokens_seen": 1605430272 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.721799373626709, + "objective/train/theoretical_loss": 3.4952093061302065, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.4952093061302065, + "tokens_seen": 1605495808 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002593681043129388, + "loss": 2.8196, + "theoretical_loss": 3.4952093061302065, + "tokens_seen": 1605495808 + }, + { + "epoch": 19.02, + "learning_rate": 0.000259358074222668, + "loss": 2.5789, + "theoretical_loss": 3.4951968904198054, + "tokens_seen": 1605561344 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002593480441323972, + "loss": 2.8027, + "theoretical_loss": 3.495184475358074, + "tokens_seen": 1605626880 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025933801404212636, + "loss": 2.7005, + "theoretical_loss": 3.4951720609449515, + "tokens_seen": 1605692416 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002593279839518556, + "loss": 2.6743, + "theoretical_loss": 3.495159647180377, + "tokens_seen": 1605757952 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002593179538615847, + "loss": 2.622, + "theoretical_loss": 3.495147234064291, + "tokens_seen": 1605823488 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025930792377131396, + "loss": 2.7717, + "theoretical_loss": 3.495134821596632, + "tokens_seen": 1605889024 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025929789368104314, + "loss": 2.7252, + "theoretical_loss": 3.495122409777341, + "tokens_seen": 1605954560 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002592878635907723, + "loss": 2.7819, + "theoretical_loss": 3.4951099986063565, + "tokens_seen": 1606020096 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002592778335005015, + "loss": 2.63, + "theoretical_loss": 3.495097588083619, + "tokens_seen": 1606085632 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002592678034102307, + "loss": 2.6617, + "theoretical_loss": 3.4950851782090684, + "tokens_seen": 1606151168 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025925777331995987, + "loss": 2.7242, + "theoretical_loss": 3.4950727689826433, + "tokens_seen": 1606216704 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002592477432296891, + "loss": 2.6512, + "theoretical_loss": 3.4950603604042842, + "tokens_seen": 1606282240 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025923771313941823, + "loss": 2.7367, + "theoretical_loss": 3.4950479524739304, + "tokens_seen": 1606347776 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025922768304914747, + "loss": 2.7523, + "theoretical_loss": 3.4950355451915227, + "tokens_seen": 1606413312 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002592176529588766, + "loss": 2.7964, + "theoretical_loss": 3.4950231385569994, + "tokens_seen": 1606478848 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025920762286860583, + "loss": 2.7681, + "theoretical_loss": 3.495010732570301, + "tokens_seen": 1606544384 + }, + { + "epoch": 19.02, + "learning_rate": 0.000259197592778335, + "loss": 2.7852, + "theoretical_loss": 3.494998327231367, + "tokens_seen": 1606609920 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002591875626880642, + "loss": 2.6776, + "theoretical_loss": 3.4949859225401374, + "tokens_seen": 1606675456 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025917753259779337, + "loss": 2.6747, + "theoretical_loss": 3.4949735184965514, + "tokens_seen": 1606740992 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025916750250752255, + "loss": 2.732, + "theoretical_loss": 3.4949611151005495, + "tokens_seen": 1606806528 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025915747241725173, + "loss": 2.7304, + "theoretical_loss": 3.4949487123520715, + "tokens_seen": 1606872064 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025914744232698097, + "loss": 2.707, + "theoretical_loss": 3.4949363102510564, + "tokens_seen": 1606937600 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002591374122367101, + "loss": 2.6469, + "theoretical_loss": 3.4949239087974444, + "tokens_seen": 1607003136 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025912738214643933, + "loss": 2.7273, + "theoretical_loss": 3.494911507991176, + "tokens_seen": 1607068672 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.807612180709839, + "objective/train/theoretical_loss": 3.49489910783219, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.49489910783219, + "tokens_seen": 1607134208 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002591173520561685, + "loss": 2.7274, + "theoretical_loss": 3.49489910783219, + "tokens_seen": 1607134208 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002591073219658977, + "loss": 2.6443, + "theoretical_loss": 3.4948867083204265, + "tokens_seen": 1607199744 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002590972918756269, + "loss": 2.755, + "theoretical_loss": 3.4948743094558257, + "tokens_seen": 1607265280 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025908726178535606, + "loss": 2.6515, + "theoretical_loss": 3.494861911238327, + "tokens_seen": 1607330816 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025907723169508524, + "loss": 2.7268, + "theoretical_loss": 3.4948495136678703, + "tokens_seen": 1607396352 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025906720160481447, + "loss": 2.7467, + "theoretical_loss": 3.494837116744396, + "tokens_seen": 1607461888 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002590571715145436, + "loss": 2.7855, + "theoretical_loss": 3.494824720467843, + "tokens_seen": 1607527424 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025904714142427283, + "loss": 2.6943, + "theoretical_loss": 3.4948123248381515, + "tokens_seen": 1607592960 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025903711133400196, + "loss": 2.6894, + "theoretical_loss": 3.4947999298552626, + "tokens_seen": 1607658496 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002590270812437312, + "loss": 2.8028, + "theoretical_loss": 3.494787535519114, + "tokens_seen": 1607724032 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002590170511534604, + "loss": 2.7013, + "theoretical_loss": 3.4947751418296473, + "tokens_seen": 1607789568 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025900702106318956, + "loss": 2.7064, + "theoretical_loss": 3.494762748786802, + "tokens_seen": 1607855104 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002589969909729188, + "loss": 2.795, + "theoretical_loss": 3.494750356390518, + "tokens_seen": 1607920640 + }, + { + "epoch": 19.02, + "learning_rate": 0.000258986960882648, + "loss": 2.8226, + "theoretical_loss": 3.4947379646407346, + "tokens_seen": 1607986176 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025897693079237716, + "loss": 2.6537, + "theoretical_loss": 3.4947255735373925, + "tokens_seen": 1608051712 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025896690070210634, + "loss": 2.7901, + "theoretical_loss": 3.4947131830804308, + "tokens_seen": 1608117248 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002589568706118355, + "loss": 2.7303, + "theoretical_loss": 3.49470079326979, + "tokens_seen": 1608182784 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002589468405215647, + "loss": 2.6974, + "theoretical_loss": 3.4946884041054114, + "tokens_seen": 1608248320 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025893681043129393, + "loss": 2.6794, + "theoretical_loss": 3.494676015587232, + "tokens_seen": 1608313856 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025892678034102306, + "loss": 2.7646, + "theoretical_loss": 3.494663627715194, + "tokens_seen": 1608379392 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002589167502507523, + "loss": 2.699, + "theoretical_loss": 3.4946512404892367, + "tokens_seen": 1608444928 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002589067201604814, + "loss": 2.7416, + "theoretical_loss": 3.4946388539093003, + "tokens_seen": 1608510464 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025889669007021066, + "loss": 2.8436, + "theoretical_loss": 3.4946264679753245, + "tokens_seen": 1608576000 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025888665997993984, + "loss": 2.629, + "theoretical_loss": 3.4946140826872494, + "tokens_seen": 1608641536 + }, + { + "epoch": 19.02, + "learning_rate": 0.000258876629889669, + "loss": 2.6505, + "theoretical_loss": 3.494601698045015, + "tokens_seen": 1608707072 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.701335906982422, + "objective/train/theoretical_loss": 3.494589314048561, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.494589314048561, + "tokens_seen": 1608772608 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002588665997993982, + "loss": 2.7458, + "theoretical_loss": 3.494589314048561, + "tokens_seen": 1608772608 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002588565697091274, + "loss": 2.8194, + "theoretical_loss": 3.4945769306978276, + "tokens_seen": 1608838144 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025884653961885657, + "loss": 2.6561, + "theoretical_loss": 3.4945645479927556, + "tokens_seen": 1608903680 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002588365095285858, + "loss": 2.7796, + "theoretical_loss": 3.494552165933284, + "tokens_seen": 1608969216 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025882647943831493, + "loss": 2.8088, + "theoretical_loss": 3.494539784519353, + "tokens_seen": 1609034752 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025881644934804416, + "loss": 2.7411, + "theoretical_loss": 3.4945274037509035, + "tokens_seen": 1609100288 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025880641925777334, + "loss": 2.7962, + "theoretical_loss": 3.494515023627875, + "tokens_seen": 1609165824 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002587963891675025, + "loss": 2.6079, + "theoretical_loss": 3.4945026441502076, + "tokens_seen": 1609231360 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002587863590772317, + "loss": 2.7401, + "theoretical_loss": 3.4944902653178405, + "tokens_seen": 1609296896 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002587763289869609, + "loss": 2.777, + "theoretical_loss": 3.4944778871307154, + "tokens_seen": 1609362432 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025876629889669007, + "loss": 2.7744, + "theoretical_loss": 3.494465509588771, + "tokens_seen": 1609427968 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002587562688064193, + "loss": 2.6651, + "theoretical_loss": 3.4944531326919486, + "tokens_seen": 1609493504 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025874623871614843, + "loss": 2.7521, + "theoretical_loss": 3.494440756440187, + "tokens_seen": 1609559040 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025873620862587767, + "loss": 2.7265, + "theoretical_loss": 3.4944283808334275, + "tokens_seen": 1609624576 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002587261785356068, + "loss": 2.8175, + "theoretical_loss": 3.49441600587161, + "tokens_seen": 1609690112 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025871614844533603, + "loss": 2.7766, + "theoretical_loss": 3.4944036315546736, + "tokens_seen": 1609755648 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002587061183550652, + "loss": 2.7927, + "theoretical_loss": 3.49439125788256, + "tokens_seen": 1609821184 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002586960882647944, + "loss": 2.7096, + "theoretical_loss": 3.494378884855208, + "tokens_seen": 1609886720 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025868605817452357, + "loss": 2.8057, + "theoretical_loss": 3.494366512472559, + "tokens_seen": 1609952256 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025867602808425275, + "loss": 2.783, + "theoretical_loss": 3.4943541407345515, + "tokens_seen": 1610017792 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025866599799398193, + "loss": 2.7143, + "theoretical_loss": 3.4943417696411276, + "tokens_seen": 1610083328 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025865596790371117, + "loss": 2.7754, + "theoretical_loss": 3.494329399192226, + "tokens_seen": 1610148864 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002586459378134403, + "loss": 2.7303, + "theoretical_loss": 3.494317029387788, + "tokens_seen": 1610214400 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025863590772316953, + "loss": 2.8341, + "theoretical_loss": 3.4943046602277525, + "tokens_seen": 1610279936 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002586258776328987, + "loss": 2.6821, + "theoretical_loss": 3.494292291712061, + "tokens_seen": 1610345472 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.784374237060547, + "objective/train/theoretical_loss": 3.4942799238406526, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.4942799238406526, + "tokens_seen": 1610411008 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002586158475426279, + "loss": 2.7502, + "theoretical_loss": 3.4942799238406526, + "tokens_seen": 1610411008 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002586058174523571, + "loss": 2.6671, + "theoretical_loss": 3.494267556613469, + "tokens_seen": 1610476544 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025859578736208626, + "loss": 2.8524, + "theoretical_loss": 3.494255190030449, + "tokens_seen": 1610542080 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025858575727181544, + "loss": 2.7272, + "theoretical_loss": 3.494242824091533, + "tokens_seen": 1610607616 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025857572718154467, + "loss": 2.8735, + "theoretical_loss": 3.494230458796662, + "tokens_seen": 1610673152 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002585656970912738, + "loss": 2.719, + "theoretical_loss": 3.4942180941457757, + "tokens_seen": 1610738688 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025855566700100303, + "loss": 2.6888, + "theoretical_loss": 3.4942057301388143, + "tokens_seen": 1610804224 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025854563691073216, + "loss": 2.8069, + "theoretical_loss": 3.494193366775719, + "tokens_seen": 1610869760 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002585356068204614, + "loss": 2.7474, + "theoretical_loss": 3.4941810040564287, + "tokens_seen": 1610935296 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002585255767301906, + "loss": 2.6375, + "theoretical_loss": 3.4941686419808846, + "tokens_seen": 1611000832 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025851554663991976, + "loss": 2.8029, + "theoretical_loss": 3.4941562805490265, + "tokens_seen": 1611066368 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025850551654964894, + "loss": 2.6849, + "theoretical_loss": 3.4941439197607953, + "tokens_seen": 1611131904 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002584954864593782, + "loss": 2.83, + "theoretical_loss": 3.4941315596161306, + "tokens_seen": 1611197440 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002584854563691073, + "loss": 2.6644, + "theoretical_loss": 3.494119200114973, + "tokens_seen": 1611262976 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025847542627883654, + "loss": 2.6721, + "theoretical_loss": 3.4941068412572633, + "tokens_seen": 1611328512 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025846539618856566, + "loss": 2.6574, + "theoretical_loss": 3.4940944830429412, + "tokens_seen": 1611394048 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002584553660982949, + "loss": 2.7516, + "theoretical_loss": 3.494082125471947, + "tokens_seen": 1611459584 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002584453360080241, + "loss": 2.8225, + "theoretical_loss": 3.494069768544222, + "tokens_seen": 1611525120 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025843530591775326, + "loss": 2.5918, + "theoretical_loss": 3.4940574122597057, + "tokens_seen": 1611590656 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025842527582748244, + "loss": 2.8298, + "theoretical_loss": 3.4940450566183383, + "tokens_seen": 1611656192 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002584152457372116, + "loss": 2.7153, + "theoretical_loss": 3.4940327016200605, + "tokens_seen": 1611721728 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002584052156469408, + "loss": 2.7463, + "theoretical_loss": 3.494020347264813, + "tokens_seen": 1611787264 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025839518555667004, + "loss": 2.705, + "theoretical_loss": 3.4940079935525357, + "tokens_seen": 1611852800 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025838515546639917, + "loss": 2.6965, + "theoretical_loss": 3.493995640483169, + "tokens_seen": 1611918336 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002583751253761284, + "loss": 2.6641, + "theoretical_loss": 3.4939832880566537, + "tokens_seen": 1611983872 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.867887258529663, + "objective/train/theoretical_loss": 3.49397093627293, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.49397093627293, + "tokens_seen": 1612049408 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025836509528585753, + "loss": 2.786, + "theoretical_loss": 3.49397093627293, + "tokens_seen": 1612049408 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025835506519558677, + "loss": 2.6613, + "theoretical_loss": 3.493958585131938, + "tokens_seen": 1612114944 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025834503510531595, + "loss": 2.7526, + "theoretical_loss": 3.4939462346336185, + "tokens_seen": 1612180480 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025833500501504513, + "loss": 2.7909, + "theoretical_loss": 3.4939338847779124, + "tokens_seen": 1612246016 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002583249749247743, + "loss": 2.6699, + "theoretical_loss": 3.493921535564759, + "tokens_seen": 1612311552 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025831494483450354, + "loss": 2.7043, + "theoretical_loss": 3.4939091869940997, + "tokens_seen": 1612377088 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025830491474423267, + "loss": 2.6841, + "theoretical_loss": 3.493896839065875, + "tokens_seen": 1612442624 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002582948846539619, + "loss": 2.7892, + "theoretical_loss": 3.4938844917800242, + "tokens_seen": 1612508160 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025828485456369103, + "loss": 2.7802, + "theoretical_loss": 3.493872145136489, + "tokens_seen": 1612573696 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025827482447342027, + "loss": 2.7946, + "theoretical_loss": 3.4938597991352096, + "tokens_seen": 1612639232 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025826479438314945, + "loss": 2.8381, + "theoretical_loss": 3.493847453776126, + "tokens_seen": 1612704768 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025825476429287863, + "loss": 2.6827, + "theoretical_loss": 3.4938351090591793, + "tokens_seen": 1612770304 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025824473420260787, + "loss": 2.7132, + "theoretical_loss": 3.49382276498431, + "tokens_seen": 1612835840 + }, + { + "epoch": 19.02, + "learning_rate": 0.000258234704112337, + "loss": 2.7112, + "theoretical_loss": 3.493810421551458, + "tokens_seen": 1612901376 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025822467402206623, + "loss": 2.6648, + "theoretical_loss": 3.4937980787605643, + "tokens_seen": 1612966912 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002582146439317954, + "loss": 2.6625, + "theoretical_loss": 3.493785736611569, + "tokens_seen": 1613032448 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002582046138415246, + "loss": 2.7869, + "theoretical_loss": 3.4937733951044136, + "tokens_seen": 1613097984 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025819458375125377, + "loss": 2.8154, + "theoretical_loss": 3.493761054239038, + "tokens_seen": 1613163520 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025818455366098295, + "loss": 2.7543, + "theoretical_loss": 3.493748714015382, + "tokens_seen": 1613229056 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025817452357071213, + "loss": 2.7336, + "theoretical_loss": 3.4937363744333876, + "tokens_seen": 1613294592 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025816449348044137, + "loss": 2.8041, + "theoretical_loss": 3.4937240354929946, + "tokens_seen": 1613360128 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002581544633901705, + "loss": 2.6376, + "theoretical_loss": 3.4937116971941435, + "tokens_seen": 1613425664 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025814443329989973, + "loss": 2.8036, + "theoretical_loss": 3.493699359536776, + "tokens_seen": 1613491200 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002581344032096289, + "loss": 2.6925, + "theoretical_loss": 3.4936870225208305, + "tokens_seen": 1613556736 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002581243731193581, + "loss": 2.7855, + "theoretical_loss": 3.4936746861462495, + "tokens_seen": 1613622272 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5522429943084717, + "objective/train/theoretical_loss": 3.493662350412973, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.493662350412973, + "tokens_seen": 1613687808 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002581143430290873, + "loss": 2.6685, + "theoretical_loss": 3.493662350412973, + "tokens_seen": 1613687808 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025810431293881646, + "loss": 2.8627, + "theoretical_loss": 3.493650015320941, + "tokens_seen": 1613753344 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025809428284854564, + "loss": 2.8228, + "theoretical_loss": 3.493637680870095, + "tokens_seen": 1613818880 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025808425275827487, + "loss": 2.6777, + "theoretical_loss": 3.4936253470603758, + "tokens_seen": 1613884416 + }, + { + "epoch": 19.02, + "learning_rate": 0.000258074222668004, + "loss": 2.6066, + "theoretical_loss": 3.4936130138917236, + "tokens_seen": 1613949952 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025806419257773323, + "loss": 2.7631, + "theoretical_loss": 3.4936006813640788, + "tokens_seen": 1614015488 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025805416248746236, + "loss": 2.7643, + "theoretical_loss": 3.4935883494773825, + "tokens_seen": 1614081024 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002580441323971916, + "loss": 2.6948, + "theoretical_loss": 3.4935760182315754, + "tokens_seen": 1614146560 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002580341023069208, + "loss": 2.6589, + "theoretical_loss": 3.4935636876265974, + "tokens_seen": 1614212096 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025802407221664996, + "loss": 2.6792, + "theoretical_loss": 3.4935513576623896, + "tokens_seen": 1614277632 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025801404212637914, + "loss": 2.7835, + "theoretical_loss": 3.4935390283388936, + "tokens_seen": 1614343168 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002580040120361084, + "loss": 2.7291, + "theoretical_loss": 3.493526699656049, + "tokens_seen": 1614408704 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002579939819458375, + "loss": 2.7434, + "theoretical_loss": 3.493514371613797, + "tokens_seen": 1614474240 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025798395185556674, + "loss": 2.7516, + "theoretical_loss": 3.4935020442120774, + "tokens_seen": 1614539776 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025797392176529586, + "loss": 2.7394, + "theoretical_loss": 3.4934897174508324, + "tokens_seen": 1614605312 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002579638916750251, + "loss": 2.7354, + "theoretical_loss": 3.4934773913300017, + "tokens_seen": 1614670848 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002579538615847543, + "loss": 2.677, + "theoretical_loss": 3.4934650658495263, + "tokens_seen": 1614736384 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025794383149448346, + "loss": 2.7169, + "theoretical_loss": 3.493452741009347, + "tokens_seen": 1614801920 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025793380140421264, + "loss": 2.8368, + "theoretical_loss": 3.4934404168094053, + "tokens_seen": 1614867456 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002579237713139418, + "loss": 2.7859, + "theoretical_loss": 3.49342809324964, + "tokens_seen": 1614932992 + }, + { + "epoch": 19.02, + "learning_rate": 0.000257913741223671, + "loss": 2.7217, + "theoretical_loss": 3.493415770329994, + "tokens_seen": 1614998528 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025790371113340024, + "loss": 2.7294, + "theoretical_loss": 3.493403448050407, + "tokens_seen": 1615064064 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025789368104312937, + "loss": 2.7582, + "theoretical_loss": 3.4933911264108195, + "tokens_seen": 1615129600 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002578836509528586, + "loss": 2.7482, + "theoretical_loss": 3.493378805411173, + "tokens_seen": 1615195136 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025787362086258773, + "loss": 2.7423, + "theoretical_loss": 3.493366485051408, + "tokens_seen": 1615260672 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.683225393295288, + "objective/train/theoretical_loss": 3.493354165331465, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.493354165331465, + "tokens_seen": 1615326208 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025786359077231697, + "loss": 2.6752, + "theoretical_loss": 3.493354165331465, + "tokens_seen": 1615326208 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025785356068204615, + "loss": 2.8309, + "theoretical_loss": 3.493341846251286, + "tokens_seen": 1615391744 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025784353059177533, + "loss": 2.7876, + "theoretical_loss": 3.4933295278108103, + "tokens_seen": 1615457280 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002578335005015045, + "loss": 2.7332, + "theoretical_loss": 3.4933172100099794, + "tokens_seen": 1615522816 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025782347041123374, + "loss": 2.7267, + "theoretical_loss": 3.493304892848734, + "tokens_seen": 1615588352 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025781344032096287, + "loss": 2.709, + "theoretical_loss": 3.4932925763270157, + "tokens_seen": 1615653888 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002578034102306921, + "loss": 2.6684, + "theoretical_loss": 3.493280260444764, + "tokens_seen": 1615719424 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025779338014042123, + "loss": 2.7344, + "theoretical_loss": 3.493267945201921, + "tokens_seen": 1615784960 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025778335005015047, + "loss": 2.762, + "theoretical_loss": 3.4932556305984273, + "tokens_seen": 1615850496 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025777331995987965, + "loss": 2.7599, + "theoretical_loss": 3.493243316634223, + "tokens_seen": 1615916032 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025776328986960883, + "loss": 2.758, + "theoretical_loss": 3.49323100330925, + "tokens_seen": 1615981568 + }, + { + "epoch": 19.02, + "learning_rate": 0.000257753259779338, + "loss": 2.7789, + "theoretical_loss": 3.4932186906234484, + "tokens_seen": 1616047104 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002577432296890672, + "loss": 2.8376, + "theoretical_loss": 3.4932063785767595, + "tokens_seen": 1616112640 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002577331995987964, + "loss": 2.7062, + "theoretical_loss": 3.4931940671691244, + "tokens_seen": 1616178176 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002577231695085256, + "loss": 2.7271, + "theoretical_loss": 3.4931817564004835, + "tokens_seen": 1616243712 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025771313941825474, + "loss": 2.708, + "theoretical_loss": 3.493169446270778, + "tokens_seen": 1616309248 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025770310932798397, + "loss": 2.7513, + "theoretical_loss": 3.493157136779949, + "tokens_seen": 1616374784 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002576930792377131, + "loss": 2.7233, + "theoretical_loss": 3.4931448279279373, + "tokens_seen": 1616440320 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025768304914744233, + "loss": 2.7016, + "theoretical_loss": 3.493132519714684, + "tokens_seen": 1616505856 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002576730190571715, + "loss": 2.6745, + "theoretical_loss": 3.4931202121401297, + "tokens_seen": 1616571392 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002576629889669007, + "loss": 2.7423, + "theoretical_loss": 3.493107905204215, + "tokens_seen": 1616636928 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002576529588766299, + "loss": 2.722, + "theoretical_loss": 3.493095598906882, + "tokens_seen": 1616702464 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002576429287863591, + "loss": 2.6967, + "theoretical_loss": 3.4930832932480715, + "tokens_seen": 1616768000 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025763289869608824, + "loss": 2.8059, + "theoretical_loss": 3.493070988227723, + "tokens_seen": 1616833536 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002576228686058175, + "loss": 2.7758, + "theoretical_loss": 3.4930586838457796, + "tokens_seen": 1616899072 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7942371368408203, + "objective/train/theoretical_loss": 3.493046380102181, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.493046380102181, + "tokens_seen": 1616964608 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002576128385155466, + "loss": 2.7367, + "theoretical_loss": 3.493046380102181, + "tokens_seen": 1616964608 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025760280842527584, + "loss": 2.7181, + "theoretical_loss": 3.493034076996868, + "tokens_seen": 1617030144 + }, + { + "epoch": 19.02, + "learning_rate": 0.000257592778335005, + "loss": 2.7396, + "theoretical_loss": 3.493021774529783, + "tokens_seen": 1617095680 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002575827482447342, + "loss": 2.7942, + "theoretical_loss": 3.493009472700866, + "tokens_seen": 1617161216 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002575727181544634, + "loss": 2.6534, + "theoretical_loss": 3.4929971715100576, + "tokens_seen": 1617226752 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025756268806419256, + "loss": 2.7878, + "theoretical_loss": 3.4929848709572995, + "tokens_seen": 1617292288 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025755265797392174, + "loss": 2.6158, + "theoretical_loss": 3.4929725710425332, + "tokens_seen": 1617357824 + }, + { + "epoch": 19.02, + "learning_rate": 0.000257542627883651, + "loss": 2.7499, + "theoretical_loss": 3.492960271765699, + "tokens_seen": 1617423360 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002575325977933801, + "loss": 2.701, + "theoretical_loss": 3.4929479731267383, + "tokens_seen": 1617488896 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025752256770310934, + "loss": 2.7909, + "theoretical_loss": 3.492935675125592, + "tokens_seen": 1617554432 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025751253761283847, + "loss": 2.8069, + "theoretical_loss": 3.4929233777622013, + "tokens_seen": 1617619968 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002575025075225677, + "loss": 2.7639, + "theoretical_loss": 3.4929110810365067, + "tokens_seen": 1617685504 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025749247743229694, + "loss": 2.6798, + "theoretical_loss": 3.4928987849484505, + "tokens_seen": 1617751040 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025748244734202606, + "loss": 2.7341, + "theoretical_loss": 3.492886489497973, + "tokens_seen": 1617816576 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002574724172517553, + "loss": 2.7202, + "theoretical_loss": 3.4928741946850157, + "tokens_seen": 1617882112 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002574623871614845, + "loss": 2.7046, + "theoretical_loss": 3.4928619005095194, + "tokens_seen": 1617947648 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025745235707121366, + "loss": 2.7339, + "theoretical_loss": 3.492849606971425, + "tokens_seen": 1618013184 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025744232698094284, + "loss": 2.7369, + "theoretical_loss": 3.4928373140706745, + "tokens_seen": 1618078720 + }, + { + "epoch": 19.02, + "learning_rate": 0.000257432296890672, + "loss": 2.7469, + "theoretical_loss": 3.4928250218072083, + "tokens_seen": 1618144256 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002574222668004012, + "loss": 2.7465, + "theoretical_loss": 3.492812730180968, + "tokens_seen": 1618209792 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025741223671013044, + "loss": 2.7779, + "theoretical_loss": 3.4928004391918943, + "tokens_seen": 1618275328 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025740220661985957, + "loss": 2.7824, + "theoretical_loss": 3.492788148839929, + "tokens_seen": 1618340864 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002573921765295888, + "loss": 2.7467, + "theoretical_loss": 3.492775859125012, + "tokens_seen": 1618406400 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025738214643931793, + "loss": 2.8589, + "theoretical_loss": 3.492763570047086, + "tokens_seen": 1618471936 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025737211634904717, + "loss": 2.7743, + "theoretical_loss": 3.4927512816060915, + "tokens_seen": 1618537472 + }, + { + "epoch": 19.02, + "objective/train/docs_used": 3781760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.835365056991577, + "objective/train/theoretical_loss": 3.4927389938019697, + "objective/train/tokens_used": 1619886560, + "theoretical_loss": 3.4927389938019697, + "tokens_seen": 1618603008 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025736208625877635, + "loss": 2.7928, + "theoretical_loss": 3.4927389938019697, + "tokens_seen": 1618603008 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025735205616850553, + "loss": 2.8325, + "theoretical_loss": 3.492726706634662, + "tokens_seen": 1618668544 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002573420260782347, + "loss": 2.7514, + "theoretical_loss": 3.492714420104109, + "tokens_seen": 1618734080 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025733199598796394, + "loss": 2.8225, + "theoretical_loss": 3.492702134210253, + "tokens_seen": 1618799616 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025732196589769307, + "loss": 2.7574, + "theoretical_loss": 3.4926898489530345, + "tokens_seen": 1618865152 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002573119358074223, + "loss": 2.7593, + "theoretical_loss": 3.4926775643323946, + "tokens_seen": 1618930688 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025730190571715143, + "loss": 2.7965, + "theoretical_loss": 3.492665280348275, + "tokens_seen": 1618996224 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025729187562688067, + "loss": 2.6543, + "theoretical_loss": 3.492652997000617, + "tokens_seen": 1619061760 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025728184553660985, + "loss": 2.5218, + "theoretical_loss": 3.4926407142893616, + "tokens_seen": 1619127296 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025727181544633903, + "loss": 2.7031, + "theoretical_loss": 3.49262843221445, + "tokens_seen": 1619192832 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002572617853560682, + "loss": 2.7116, + "theoretical_loss": 3.4926161507758238, + "tokens_seen": 1619258368 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002572517552657974, + "loss": 2.7959, + "theoretical_loss": 3.4926038699734234, + "tokens_seen": 1619323904 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002572417251755266, + "loss": 2.6842, + "theoretical_loss": 3.4925915898071915, + "tokens_seen": 1619389440 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002572316950852558, + "loss": 2.7301, + "theoretical_loss": 3.492579310277068, + "tokens_seen": 1619454976 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025722166499498494, + "loss": 2.7038, + "theoretical_loss": 3.492567031382996, + "tokens_seen": 1619520512 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025721163490471417, + "loss": 2.6389, + "theoretical_loss": 3.4925547531249146, + "tokens_seen": 1619586048 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002572016048144433, + "loss": 2.7702, + "theoretical_loss": 3.4925424755027668, + "tokens_seen": 1619651584 + }, + { + "epoch": 19.02, + "learning_rate": 0.00025719157472417253, + "loss": 2.7811, + "theoretical_loss": 3.4925301985164934, + "tokens_seen": 1619717120 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002571815446339017, + "loss": 2.6279, + "theoretical_loss": 3.4925179221660354, + "tokens_seen": 1619782656 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002571715145436309, + "loss": 2.7342, + "theoretical_loss": 3.4925056464513347, + "tokens_seen": 1619848192 + }, + { + "epoch": 19.02, + "learning_rate": 0.0002571614844533601, + "loss": 2.7934, + "theoretical_loss": 3.4924947139281377, + "tokens_seen": 1619906560 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002571514543630893, + "loss": 2.6586, + "theoretical_loss": 3.4924824394152543, + "tokens_seen": 1619972096 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025714142427281844, + "loss": 2.6679, + "theoretical_loss": 3.4924701655379593, + "tokens_seen": 1620037632 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002571313941825477, + "loss": 2.6352, + "theoretical_loss": 3.4924578922961933, + "tokens_seen": 1620103168 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002571213640922768, + "loss": 2.6937, + "theoretical_loss": 3.4924456196898968, + "tokens_seen": 1620168704 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3830435, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8196909427642822, + "objective/train/theoretical_loss": 3.492433347719013, + "objective/train/tokens_used": 1640694240, + "theoretical_loss": 3.492433347719013, + "tokens_seen": 1620234240 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025711133400200604, + "loss": 2.6403, + "theoretical_loss": 3.492433347719013, + "tokens_seen": 1620234240 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002571013039117352, + "loss": 2.6368, + "theoretical_loss": 3.492421076383481, + "tokens_seen": 1620299776 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002570912738214644, + "loss": 2.5574, + "theoretical_loss": 3.4924088056832447, + "tokens_seen": 1620365312 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002570812437311936, + "loss": 2.5569, + "theoretical_loss": 3.4923965356182443, + "tokens_seen": 1620430848 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025707121364092276, + "loss": 2.5515, + "theoretical_loss": 3.4923842661884206, + "tokens_seen": 1620496384 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025706118355065194, + "loss": 2.6938, + "theoretical_loss": 3.4923719973937164, + "tokens_seen": 1620561920 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002570511534603812, + "loss": 2.6371, + "theoretical_loss": 3.492359729234072, + "tokens_seen": 1620627456 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002570411233701103, + "loss": 2.7446, + "theoretical_loss": 3.4923474617094294, + "tokens_seen": 1620692992 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025703109327983954, + "loss": 2.6382, + "theoretical_loss": 3.49233519481973, + "tokens_seen": 1620758528 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025702106318956867, + "loss": 2.6609, + "theoretical_loss": 3.4923229285649153, + "tokens_seen": 1620824064 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002570110330992979, + "loss": 2.649, + "theoretical_loss": 3.4923106629449263, + "tokens_seen": 1620889600 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002570010030090271, + "loss": 2.6803, + "theoretical_loss": 3.492298397959705, + "tokens_seen": 1620955136 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025699097291875626, + "loss": 2.6185, + "theoretical_loss": 3.492286133609193, + "tokens_seen": 1621020672 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025698094282848545, + "loss": 2.6182, + "theoretical_loss": 3.4922738698933316, + "tokens_seen": 1621086208 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002569709127382147, + "loss": 2.5694, + "theoretical_loss": 3.492261606812062, + "tokens_seen": 1621151744 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002569608826479438, + "loss": 2.6123, + "theoretical_loss": 3.492249344365326, + "tokens_seen": 1621217280 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025695085255767304, + "loss": 2.6134, + "theoretical_loss": 3.4922370825530655, + "tokens_seen": 1621282816 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025694082246740217, + "loss": 2.5985, + "theoretical_loss": 3.4922248213752214, + "tokens_seen": 1621348352 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002569307923771314, + "loss": 2.6291, + "theoretical_loss": 3.492212560831735, + "tokens_seen": 1621413888 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002569207622868606, + "loss": 2.5998, + "theoretical_loss": 3.4922003009225486, + "tokens_seen": 1621479424 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025691073219658977, + "loss": 2.652, + "theoretical_loss": 3.4921880416476037, + "tokens_seen": 1621544960 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025690070210631895, + "loss": 2.6917, + "theoretical_loss": 3.492175783006841, + "tokens_seen": 1621610496 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025689067201604813, + "loss": 2.4731, + "theoretical_loss": 3.4921635250002034, + "tokens_seen": 1621676032 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002568806419257773, + "loss": 2.6443, + "theoretical_loss": 3.4921512676276314, + "tokens_seen": 1621741568 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025687061183550655, + "loss": 2.6913, + "theoretical_loss": 3.492139010889067, + "tokens_seen": 1621807104 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3834090, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.545135259628296, + "objective/train/theoretical_loss": 3.4921267547844517, + "objective/train/tokens_used": 1642332640, + "theoretical_loss": 3.4921267547844517, + "tokens_seen": 1621872640 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002568605817452357, + "loss": 2.595, + "theoretical_loss": 3.4921267547844517, + "tokens_seen": 1621872640 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002568505516549649, + "loss": 2.7226, + "theoretical_loss": 3.4921144993137267, + "tokens_seen": 1621938176 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025684052156469404, + "loss": 2.6055, + "theoretical_loss": 3.4921022444768344, + "tokens_seen": 1622003712 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025683049147442327, + "loss": 2.7444, + "theoretical_loss": 3.492089990273716, + "tokens_seen": 1622069248 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025682046138415245, + "loss": 2.6726, + "theoretical_loss": 3.4920777367043128, + "tokens_seen": 1622134784 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025681043129388163, + "loss": 2.6808, + "theoretical_loss": 3.492065483768567, + "tokens_seen": 1622200320 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002568004012036108, + "loss": 2.7273, + "theoretical_loss": 3.49205323146642, + "tokens_seen": 1622265856 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025679037111334005, + "loss": 2.6918, + "theoretical_loss": 3.492040979797814, + "tokens_seen": 1622331392 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002567803410230692, + "loss": 2.7316, + "theoretical_loss": 3.492028728762689, + "tokens_seen": 1622396928 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002567703109327984, + "loss": 2.5378, + "theoretical_loss": 3.4920164783609886, + "tokens_seen": 1622462464 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025676028084252754, + "loss": 2.6643, + "theoretical_loss": 3.492004228592653, + "tokens_seen": 1622528000 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002567502507522568, + "loss": 2.5386, + "theoretical_loss": 3.491991979457625, + "tokens_seen": 1622593536 + }, + { + "epoch": 20.0, + "learning_rate": 0.000256740220661986, + "loss": 2.6585, + "theoretical_loss": 3.491979730955846, + "tokens_seen": 1622659072 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025673019057171514, + "loss": 2.6193, + "theoretical_loss": 3.4919674830872567, + "tokens_seen": 1622724608 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025672016048144437, + "loss": 2.6227, + "theoretical_loss": 3.4919552358518, + "tokens_seen": 1622790144 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002567101303911735, + "loss": 2.5432, + "theoretical_loss": 3.491942989249417, + "tokens_seen": 1622855680 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025670010030090273, + "loss": 2.7186, + "theoretical_loss": 3.4919307432800495, + "tokens_seen": 1622921216 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002566900702106319, + "loss": 2.5713, + "theoretical_loss": 3.4919184979436393, + "tokens_seen": 1622986752 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002566800401203611, + "loss": 2.7025, + "theoretical_loss": 3.4919062532401286, + "tokens_seen": 1623052288 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002566700100300903, + "loss": 2.5741, + "theoretical_loss": 3.4918940091694584, + "tokens_seen": 1623117824 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002566599799398195, + "loss": 2.5996, + "theoretical_loss": 3.49188176573157, + "tokens_seen": 1623183360 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025664994984954864, + "loss": 2.6587, + "theoretical_loss": 3.4918695229264065, + "tokens_seen": 1623248896 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002566399197592779, + "loss": 2.7204, + "theoretical_loss": 3.4918572807539094, + "tokens_seen": 1623314432 + }, + { + "epoch": 20.0, + "learning_rate": 0.000256629889669007, + "loss": 2.6427, + "theoretical_loss": 3.4918450392140192, + "tokens_seen": 1623379968 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025661985957873624, + "loss": 2.6194, + "theoretical_loss": 3.491832798306679, + "tokens_seen": 1623445504 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3839023, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8694212436676025, + "objective/train/theoretical_loss": 3.4918205580318302, + "objective/train/tokens_used": 1643971040, + "theoretical_loss": 3.4918205580318302, + "tokens_seen": 1623511040 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002566098294884654, + "loss": 2.6154, + "theoretical_loss": 3.4918205580318302, + "tokens_seen": 1623511040 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002565997993981946, + "loss": 2.5991, + "theoretical_loss": 3.491808318389414, + "tokens_seen": 1623576576 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002565897693079238, + "loss": 2.5699, + "theoretical_loss": 3.4917960793793736, + "tokens_seen": 1623642112 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025657973921765296, + "loss": 2.778, + "theoretical_loss": 3.4917838410016495, + "tokens_seen": 1623707648 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025656970912738214, + "loss": 2.6847, + "theoretical_loss": 3.4917716032561836, + "tokens_seen": 1623773184 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002565596790371114, + "loss": 2.6053, + "theoretical_loss": 3.4917593661429187, + "tokens_seen": 1623838720 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002565496489468405, + "loss": 2.7833, + "theoretical_loss": 3.4917471296617952, + "tokens_seen": 1623904256 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025653961885656974, + "loss": 2.6255, + "theoretical_loss": 3.4917348938127564, + "tokens_seen": 1623969792 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025652958876629887, + "loss": 2.5941, + "theoretical_loss": 3.491722658595743, + "tokens_seen": 1624035328 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002565195586760281, + "loss": 2.6386, + "theoretical_loss": 3.4917104240106975, + "tokens_seen": 1624100864 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002565095285857573, + "loss": 2.6947, + "theoretical_loss": 3.4916981900575617, + "tokens_seen": 1624166400 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025649949849548647, + "loss": 2.6313, + "theoretical_loss": 3.4916859567362772, + "tokens_seen": 1624231936 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025648946840521565, + "loss": 2.5825, + "theoretical_loss": 3.491673724046786, + "tokens_seen": 1624297472 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002564794383149449, + "loss": 2.7353, + "theoretical_loss": 3.49166149198903, + "tokens_seen": 1624363008 + }, + { + "epoch": 20.0, + "learning_rate": 0.000256469408224674, + "loss": 2.6567, + "theoretical_loss": 3.491649260562951, + "tokens_seen": 1624428544 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025645937813440324, + "loss": 2.6912, + "theoretical_loss": 3.491637029768491, + "tokens_seen": 1624494080 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025644934804413237, + "loss": 2.6546, + "theoretical_loss": 3.4916247996055922, + "tokens_seen": 1624559616 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002564393179538616, + "loss": 2.6852, + "theoretical_loss": 3.491612570074196, + "tokens_seen": 1624625152 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002564292878635908, + "loss": 2.5626, + "theoretical_loss": 3.4916003411742444, + "tokens_seen": 1624690688 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025641925777331997, + "loss": 2.6103, + "theoretical_loss": 3.491588112905679, + "tokens_seen": 1624756224 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025640922768304915, + "loss": 2.6963, + "theoretical_loss": 3.4915758852684426, + "tokens_seen": 1624821760 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025639919759277833, + "loss": 2.6362, + "theoretical_loss": 3.491563658262477, + "tokens_seen": 1624887296 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002563891675025075, + "loss": 2.6133, + "theoretical_loss": 3.491551431887724, + "tokens_seen": 1624952832 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025637913741223675, + "loss": 2.7214, + "theoretical_loss": 3.491539206144125, + "tokens_seen": 1625018368 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002563691073219659, + "loss": 2.8254, + "theoretical_loss": 3.4915269810316225, + "tokens_seen": 1625083904 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3842207, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6156601905822754, + "objective/train/theoretical_loss": 3.491514756550158, + "objective/train/tokens_used": 1645609440, + "theoretical_loss": 3.491514756550158, + "tokens_seen": 1625149440 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002563590772316951, + "loss": 2.6285, + "theoretical_loss": 3.491514756550158, + "tokens_seen": 1625149440 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025634904714142424, + "loss": 2.7104, + "theoretical_loss": 3.4915025326996743, + "tokens_seen": 1625214976 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025633901705115347, + "loss": 2.6279, + "theoretical_loss": 3.4914903094801133, + "tokens_seen": 1625280512 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025632898696088265, + "loss": 2.6702, + "theoretical_loss": 3.4914780868914157, + "tokens_seen": 1625346048 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025631895687061183, + "loss": 2.7015, + "theoretical_loss": 3.491465864933525, + "tokens_seen": 1625411584 + }, + { + "epoch": 20.0, + "learning_rate": 0.000256308926780341, + "loss": 2.7107, + "theoretical_loss": 3.4914536436063823, + "tokens_seen": 1625477120 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025629889669007025, + "loss": 2.6341, + "theoretical_loss": 3.4914414229099306, + "tokens_seen": 1625542656 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002562888665997994, + "loss": 2.7405, + "theoretical_loss": 3.491429202844111, + "tokens_seen": 1625608192 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002562788365095286, + "loss": 2.7326, + "theoretical_loss": 3.4914169834088655, + "tokens_seen": 1625673728 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025626880641925774, + "loss": 2.6196, + "theoretical_loss": 3.491404764604137, + "tokens_seen": 1625739264 + }, + { + "epoch": 20.0, + "learning_rate": 0.000256258776328987, + "loss": 2.5415, + "theoretical_loss": 3.491392546429867, + "tokens_seen": 1625804800 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025624874623871616, + "loss": 2.5559, + "theoretical_loss": 3.491380328885997, + "tokens_seen": 1625870336 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025623871614844534, + "loss": 2.6995, + "theoretical_loss": 3.49136811197247, + "tokens_seen": 1625935872 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002562286860581745, + "loss": 2.5528, + "theoretical_loss": 3.491355895689228, + "tokens_seen": 1626001408 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002562186559679037, + "loss": 2.5703, + "theoretical_loss": 3.4913436800362128, + "tokens_seen": 1626066944 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002562086258776329, + "loss": 2.6414, + "theoretical_loss": 3.491331465013366, + "tokens_seen": 1626132480 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002561985957873621, + "loss": 2.5845, + "theoretical_loss": 3.4913192506206308, + "tokens_seen": 1626198016 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025618856569709124, + "loss": 2.6481, + "theoretical_loss": 3.491307036857948, + "tokens_seen": 1626263552 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002561785356068205, + "loss": 2.6026, + "theoretical_loss": 3.4912948237252612, + "tokens_seen": 1626329088 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025616850551654966, + "loss": 2.683, + "theoretical_loss": 3.491282611222511, + "tokens_seen": 1626394624 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025615847542627884, + "loss": 2.6805, + "theoretical_loss": 3.4912703993496406, + "tokens_seen": 1626460160 + }, + { + "epoch": 20.0, + "learning_rate": 0.000256148445336008, + "loss": 2.6553, + "theoretical_loss": 3.491258188106592, + "tokens_seen": 1626525696 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002561384152457372, + "loss": 2.6569, + "theoretical_loss": 3.491245977493307, + "tokens_seen": 1626591232 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002561283851554664, + "loss": 2.6188, + "theoretical_loss": 3.4912337675097276, + "tokens_seen": 1626656768 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002561183550651956, + "loss": 2.6983, + "theoretical_loss": 3.4912215581557966, + "tokens_seen": 1626722304 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3846867, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6463375091552734, + "objective/train/theoretical_loss": 3.4912093494314553, + "objective/train/tokens_used": 1647247840, + "theoretical_loss": 3.4912093494314553, + "tokens_seen": 1626787840 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025610832497492475, + "loss": 2.6462, + "theoretical_loss": 3.4912093494314553, + "tokens_seen": 1626787840 + }, + { + "epoch": 20.0, + "learning_rate": 0.000256098294884654, + "loss": 2.5992, + "theoretical_loss": 3.491197141336647, + "tokens_seen": 1626853376 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002560882647943831, + "loss": 2.6414, + "theoretical_loss": 3.491184933871313, + "tokens_seen": 1626918912 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025607823470411234, + "loss": 2.6074, + "theoretical_loss": 3.4911727270353956, + "tokens_seen": 1626984448 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002560682046138415, + "loss": 2.648, + "theoretical_loss": 3.491160520828837, + "tokens_seen": 1627049984 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002560581745235707, + "loss": 2.6486, + "theoretical_loss": 3.4911483152515803, + "tokens_seen": 1627115520 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002560481444332999, + "loss": 2.6977, + "theoretical_loss": 3.4911361103035663, + "tokens_seen": 1627181056 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025603811434302907, + "loss": 2.7203, + "theoretical_loss": 3.491123905984738, + "tokens_seen": 1627246592 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025602808425275825, + "loss": 2.6364, + "theoretical_loss": 3.4911117022950373, + "tokens_seen": 1627312128 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002560180541624875, + "loss": 2.7519, + "theoretical_loss": 3.4910994992344073, + "tokens_seen": 1627377664 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002560080240722166, + "loss": 2.6616, + "theoretical_loss": 3.491087296802789, + "tokens_seen": 1627443200 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025599799398194585, + "loss": 2.6068, + "theoretical_loss": 3.4910750950001255, + "tokens_seen": 1627508736 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002559879638916751, + "loss": 2.6952, + "theoretical_loss": 3.4910628938263586, + "tokens_seen": 1627574272 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002559779338014042, + "loss": 2.7289, + "theoretical_loss": 3.491050693281431, + "tokens_seen": 1627639808 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025596790371113344, + "loss": 2.6158, + "theoretical_loss": 3.4910384933652843, + "tokens_seen": 1627705344 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025595787362086257, + "loss": 2.642, + "theoretical_loss": 3.491026294077862, + "tokens_seen": 1627770880 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002559478435305918, + "loss": 2.7469, + "theoretical_loss": 3.491014095419105, + "tokens_seen": 1627836416 + }, + { + "epoch": 20.0, + "learning_rate": 0.000255937813440321, + "loss": 2.625, + "theoretical_loss": 3.491001897388956, + "tokens_seen": 1627901952 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025592778335005017, + "loss": 2.5796, + "theoretical_loss": 3.4909896999873578, + "tokens_seen": 1627967488 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025591775325977935, + "loss": 2.6787, + "theoretical_loss": 3.4909775032142525, + "tokens_seen": 1628033024 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025590772316950853, + "loss": 2.7234, + "theoretical_loss": 3.4909653070695823, + "tokens_seen": 1628098560 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002558976930792377, + "loss": 2.5939, + "theoretical_loss": 3.4909531115532895, + "tokens_seen": 1628164096 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025588766298896695, + "loss": 2.6637, + "theoretical_loss": 3.4909409166653163, + "tokens_seen": 1628229632 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002558776328986961, + "loss": 2.6307, + "theoretical_loss": 3.4909287224056054, + "tokens_seen": 1628295168 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002558676028084253, + "loss": 2.6814, + "theoretical_loss": 3.4909165287740986, + "tokens_seen": 1628360704 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3849968, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.72721266746521, + "objective/train/theoretical_loss": 3.4909043357707388, + "objective/train/tokens_used": 1648886240, + "theoretical_loss": 3.4909043357707388, + "tokens_seen": 1628426240 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025585757271815444, + "loss": 2.6474, + "theoretical_loss": 3.4909043357707388, + "tokens_seen": 1628426240 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025584754262788367, + "loss": 2.7004, + "theoretical_loss": 3.4908921433954685, + "tokens_seen": 1628491776 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025583751253761285, + "loss": 2.6876, + "theoretical_loss": 3.4908799516482296, + "tokens_seen": 1628557312 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025582748244734203, + "loss": 2.5499, + "theoretical_loss": 3.490867760528965, + "tokens_seen": 1628622848 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002558174523570712, + "loss": 2.6269, + "theoretical_loss": 3.490855570037616, + "tokens_seen": 1628688384 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002558174523570712, + "loss": 2.6689, + "theoretical_loss": 3.490843380174126, + "tokens_seen": 1628753920 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025580742226680045, + "loss": 2.8001, + "theoretical_loss": 3.490831190938437, + "tokens_seen": 1628819456 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002557973921765296, + "loss": 2.6863, + "theoretical_loss": 3.4908190023304915, + "tokens_seen": 1628884992 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002557873620862588, + "loss": 2.6908, + "theoretical_loss": 3.4908068143502327, + "tokens_seen": 1628950528 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025577733199598794, + "loss": 2.5808, + "theoretical_loss": 3.4907946269976016, + "tokens_seen": 1629016064 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002557673019057172, + "loss": 2.6793, + "theoretical_loss": 3.4907824402725414, + "tokens_seen": 1629081600 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025575727181544636, + "loss": 2.7538, + "theoretical_loss": 3.4907702541749943, + "tokens_seen": 1629147136 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025574724172517554, + "loss": 2.6581, + "theoretical_loss": 3.4907580687049027, + "tokens_seen": 1629212672 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002557372116349047, + "loss": 2.687, + "theoretical_loss": 3.4907458838622096, + "tokens_seen": 1629278208 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002557271815446339, + "loss": 2.6556, + "theoretical_loss": 3.4907336996468574, + "tokens_seen": 1629343744 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002557171514543631, + "loss": 2.6078, + "theoretical_loss": 3.4907215160587874, + "tokens_seen": 1629409280 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002557071213640923, + "loss": 2.7008, + "theoretical_loss": 3.4907093330979437, + "tokens_seen": 1629474816 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025569709127382144, + "loss": 2.6904, + "theoretical_loss": 3.4906971507642677, + "tokens_seen": 1629540352 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002556870611835507, + "loss": 2.6921, + "theoretical_loss": 3.4906849690577024, + "tokens_seen": 1629605888 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025567703109327986, + "loss": 2.6534, + "theoretical_loss": 3.4906727879781903, + "tokens_seen": 1629671424 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025566700100300904, + "loss": 2.6396, + "theoretical_loss": 3.4906606075256734, + "tokens_seen": 1629736960 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002556569709127382, + "loss": 2.6013, + "theoretical_loss": 3.490648427700095, + "tokens_seen": 1629802496 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002556469408224674, + "loss": 2.6768, + "theoretical_loss": 3.4906362485013966, + "tokens_seen": 1629868032 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002556369107321966, + "loss": 2.6586, + "theoretical_loss": 3.4906240699295217, + "tokens_seen": 1629933568 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002556268806419258, + "loss": 2.6722, + "theoretical_loss": 3.4906118919844125, + "tokens_seen": 1629999104 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3853819, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6481683254241943, + "objective/train/theoretical_loss": 3.4905997146660113, + "objective/train/tokens_used": 1650524640, + "theoretical_loss": 3.4905997146660113, + "tokens_seen": 1630064640 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025561685055165495, + "loss": 2.7613, + "theoretical_loss": 3.4905997146660113, + "tokens_seen": 1630064640 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002556068204613842, + "loss": 2.6201, + "theoretical_loss": 3.490587537974261, + "tokens_seen": 1630130176 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002555967903711133, + "loss": 2.6524, + "theoretical_loss": 3.4905753619091033, + "tokens_seen": 1630195712 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025558676028084254, + "loss": 2.6797, + "theoretical_loss": 3.4905631864704825, + "tokens_seen": 1630261248 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002555767301905717, + "loss": 2.6303, + "theoretical_loss": 3.4905510116583396, + "tokens_seen": 1630326784 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002555667001003009, + "loss": 2.6623, + "theoretical_loss": 3.490538837472618, + "tokens_seen": 1630392320 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002555566700100301, + "loss": 2.6239, + "theoretical_loss": 3.4905266639132595, + "tokens_seen": 1630457856 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025554663991975927, + "loss": 2.8109, + "theoretical_loss": 3.4905144909802077, + "tokens_seen": 1630523392 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025553660982948845, + "loss": 2.6757, + "theoretical_loss": 3.4905023186734043, + "tokens_seen": 1630588928 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002555265797392177, + "loss": 2.7031, + "theoretical_loss": 3.490490146992793, + "tokens_seen": 1630654464 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002555165496489468, + "loss": 2.6607, + "theoretical_loss": 3.4904779759383153, + "tokens_seen": 1630720000 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025550651955867605, + "loss": 2.7296, + "theoretical_loss": 3.4904658055099143, + "tokens_seen": 1630785536 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025549648946840523, + "loss": 2.6384, + "theoretical_loss": 3.490453635707533, + "tokens_seen": 1630851072 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002554864593781344, + "loss": 2.707, + "theoretical_loss": 3.4904414665311134, + "tokens_seen": 1630916608 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002554764292878636, + "loss": 2.6636, + "theoretical_loss": 3.490429297980598, + "tokens_seen": 1630982144 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025546639919759277, + "loss": 2.7053, + "theoretical_loss": 3.490417130055931, + "tokens_seen": 1631047680 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025545636910732195, + "loss": 2.7527, + "theoretical_loss": 3.490404962757053, + "tokens_seen": 1631113216 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002554463390170512, + "loss": 2.7084, + "theoretical_loss": 3.490392796083908, + "tokens_seen": 1631178752 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002554363089267803, + "loss": 2.742, + "theoretical_loss": 3.490380630036438, + "tokens_seen": 1631244288 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025542627883650955, + "loss": 2.7751, + "theoretical_loss": 3.4903684646145865, + "tokens_seen": 1631309824 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002554162487462387, + "loss": 2.6126, + "theoretical_loss": 3.4903562998182958, + "tokens_seen": 1631375360 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002554062186559679, + "loss": 2.7221, + "theoretical_loss": 3.490344135647508, + "tokens_seen": 1631440896 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002553961885656971, + "loss": 2.623, + "theoretical_loss": 3.490331972102166, + "tokens_seen": 1631506432 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002553861584754263, + "loss": 2.6513, + "theoretical_loss": 3.4903198091822136, + "tokens_seen": 1631571968 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025537612838515546, + "loss": 2.6702, + "theoretical_loss": 3.4903076468875924, + "tokens_seen": 1631637504 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3858419, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.734097480773926, + "objective/train/theoretical_loss": 3.4902954852182457, + "objective/train/tokens_used": 1652163040, + "theoretical_loss": 3.4902954852182457, + "tokens_seen": 1631703040 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025536609829488464, + "loss": 2.6323, + "theoretical_loss": 3.4902954852182457, + "tokens_seen": 1631703040 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002553560682046138, + "loss": 2.6399, + "theoretical_loss": 3.490283324174116, + "tokens_seen": 1631768576 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025534603811434305, + "loss": 2.6757, + "theoretical_loss": 3.490271163755146, + "tokens_seen": 1631834112 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002553360080240722, + "loss": 2.6654, + "theoretical_loss": 3.4902590039612784, + "tokens_seen": 1631899648 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002553259779338014, + "loss": 2.6934, + "theoretical_loss": 3.4902468447924564, + "tokens_seen": 1631965184 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002553159478435306, + "loss": 2.7357, + "theoretical_loss": 3.4902346862486224, + "tokens_seen": 1632030720 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002553059177532598, + "loss": 2.7684, + "theoretical_loss": 3.490222528329719, + "tokens_seen": 1632096256 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025529588766298896, + "loss": 2.5823, + "theoretical_loss": 3.4902103710356895, + "tokens_seen": 1632161792 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025528585757271814, + "loss": 2.6982, + "theoretical_loss": 3.490198214366476, + "tokens_seen": 1632227328 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002552758274824473, + "loss": 2.7899, + "theoretical_loss": 3.4901860583220223, + "tokens_seen": 1632292864 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025526579739217656, + "loss": 2.6191, + "theoretical_loss": 3.4901739029022707, + "tokens_seen": 1632358400 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002552557673019057, + "loss": 2.7585, + "theoretical_loss": 3.4901617481071634, + "tokens_seen": 1632423936 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002552457372116349, + "loss": 2.6629, + "theoretical_loss": 3.4901495939366445, + "tokens_seen": 1632489472 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002552357071213641, + "loss": 2.5314, + "theoretical_loss": 3.4901374403906558, + "tokens_seen": 1632555008 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002552256770310933, + "loss": 2.6752, + "theoretical_loss": 3.4901252874691404, + "tokens_seen": 1632620544 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002552156469408225, + "loss": 2.5995, + "theoretical_loss": 3.4901131351720416, + "tokens_seen": 1632686080 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025520561685055164, + "loss": 2.6948, + "theoretical_loss": 3.4901009834993015, + "tokens_seen": 1632751616 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002551955867602809, + "loss": 2.642, + "theoretical_loss": 3.4900888324508634, + "tokens_seen": 1632817152 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025518555667001006, + "loss": 2.6444, + "theoretical_loss": 3.49007668202667, + "tokens_seen": 1632882688 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025517552657973924, + "loss": 2.4748, + "theoretical_loss": 3.490064532226665, + "tokens_seen": 1632948224 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002551654964894684, + "loss": 2.7537, + "theoretical_loss": 3.4900523830507897, + "tokens_seen": 1633013760 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002551554663991976, + "loss": 2.6934, + "theoretical_loss": 3.4900402344989883, + "tokens_seen": 1633079296 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002551454363089268, + "loss": 2.585, + "theoretical_loss": 3.4900280865712032, + "tokens_seen": 1633144832 + }, + { + "epoch": 20.0, + "learning_rate": 0.000255135406218656, + "loss": 2.7177, + "theoretical_loss": 3.4900159392673773, + "tokens_seen": 1633210368 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025512537612838515, + "loss": 2.7188, + "theoretical_loss": 3.4900037925874536, + "tokens_seen": 1633275904 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3861541, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8097035884857178, + "objective/train/theoretical_loss": 3.489991646531375, + "objective/train/tokens_used": 1653801440, + "theoretical_loss": 3.489991646531375, + "tokens_seen": 1633341440 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002551153460381144, + "loss": 2.6435, + "theoretical_loss": 3.489991646531375, + "tokens_seen": 1633341440 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002551053159478435, + "loss": 2.6367, + "theoretical_loss": 3.4899795010990844, + "tokens_seen": 1633406976 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025509528585757274, + "loss": 2.6682, + "theoretical_loss": 3.489967356290525, + "tokens_seen": 1633472512 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002550852557673019, + "loss": 2.6436, + "theoretical_loss": 3.4899552121056394, + "tokens_seen": 1633538048 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002550752256770311, + "loss": 2.6894, + "theoretical_loss": 3.489943068544371, + "tokens_seen": 1633603584 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002550651955867603, + "loss": 2.588, + "theoretical_loss": 3.4899309256066617, + "tokens_seen": 1633669120 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025505516549648947, + "loss": 2.6196, + "theoretical_loss": 3.489918783292456, + "tokens_seen": 1633734656 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025504513540621865, + "loss": 2.7231, + "theoretical_loss": 3.4899066416016957, + "tokens_seen": 1633800192 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002550351053159479, + "loss": 2.8223, + "theoretical_loss": 3.4898945005343243, + "tokens_seen": 1633865728 + }, + { + "epoch": 20.0, + "learning_rate": 0.000255025075225677, + "loss": 2.6943, + "theoretical_loss": 3.4898823600902844, + "tokens_seen": 1633931264 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025501504513540625, + "loss": 2.5854, + "theoretical_loss": 3.4898702202695198, + "tokens_seen": 1633996800 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025500501504513543, + "loss": 2.6782, + "theoretical_loss": 3.489858081071972, + "tokens_seen": 1634062336 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002549949849548646, + "loss": 2.7943, + "theoretical_loss": 3.489845942497586, + "tokens_seen": 1634127872 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002549849548645938, + "loss": 2.6951, + "theoretical_loss": 3.4898338045463033, + "tokens_seen": 1634193408 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025497492477432297, + "loss": 2.6157, + "theoretical_loss": 3.4898216672180675, + "tokens_seen": 1634258944 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025496489468405215, + "loss": 2.5468, + "theoretical_loss": 3.489809530512822, + "tokens_seen": 1634324480 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002549548645937814, + "loss": 2.6786, + "theoretical_loss": 3.489797394430509, + "tokens_seen": 1634390016 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002549448345035105, + "loss": 2.7047, + "theoretical_loss": 3.4897852589710716, + "tokens_seen": 1634455552 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025493480441323975, + "loss": 2.7036, + "theoretical_loss": 3.4897731241344534, + "tokens_seen": 1634521088 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002549247743229689, + "loss": 2.7569, + "theoretical_loss": 3.489760989920598, + "tokens_seen": 1634586624 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002549147442326981, + "loss": 2.6094, + "theoretical_loss": 3.489748856329447, + "tokens_seen": 1634652160 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002549047141424273, + "loss": 2.7255, + "theoretical_loss": 3.489736723360944, + "tokens_seen": 1634717696 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002548946840521565, + "loss": 2.559, + "theoretical_loss": 3.489724591015033, + "tokens_seen": 1634783232 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025488465396188566, + "loss": 2.6496, + "theoretical_loss": 3.489712459291656, + "tokens_seen": 1634848768 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025487462387161484, + "loss": 2.6393, + "theoretical_loss": 3.4897003281907573, + "tokens_seen": 1634914304 + }, + { + "epoch": 20.0, + "objective/train/docs_used": 3866182, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.590050220489502, + "objective/train/theoretical_loss": 3.4896881977122787, + "objective/train/tokens_used": 1655439840, + "theoretical_loss": 3.4896881977122787, + "tokens_seen": 1634979840 + }, + { + "epoch": 20.0, + "learning_rate": 0.000254864593781344, + "loss": 2.6736, + "theoretical_loss": 3.4896881977122787, + "tokens_seen": 1634979840 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025485456369107325, + "loss": 2.6839, + "theoretical_loss": 3.4896760678561636, + "tokens_seen": 1635045376 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002548445336008024, + "loss": 2.7133, + "theoretical_loss": 3.4896639386223556, + "tokens_seen": 1635110912 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002548345035105316, + "loss": 2.7456, + "theoretical_loss": 3.4896518100107974, + "tokens_seen": 1635176448 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002548244734202608, + "loss": 2.6072, + "theoretical_loss": 3.4896396820214326, + "tokens_seen": 1635241984 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025481444332999, + "loss": 2.6902, + "theoretical_loss": 3.489627554654204, + "tokens_seen": 1635307520 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025480441323971916, + "loss": 2.625, + "theoretical_loss": 3.489615427909055, + "tokens_seen": 1635373056 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025479438314944834, + "loss": 2.6367, + "theoretical_loss": 3.4896033017859285, + "tokens_seen": 1635438592 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002547843530591775, + "loss": 2.7052, + "theoretical_loss": 3.489591176284768, + "tokens_seen": 1635504128 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025477432296890676, + "loss": 2.6772, + "theoretical_loss": 3.4895790514055163, + "tokens_seen": 1635569664 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002547642928786359, + "loss": 2.7389, + "theoretical_loss": 3.4895669271481164, + "tokens_seen": 1635635200 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002547542627883651, + "loss": 2.6492, + "theoretical_loss": 3.489554803512512, + "tokens_seen": 1635700736 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025474423269809425, + "loss": 2.6173, + "theoretical_loss": 3.489542680498646, + "tokens_seen": 1635766272 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002547342026078235, + "loss": 2.6608, + "theoretical_loss": 3.4895305581064626, + "tokens_seen": 1635831808 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025472417251755266, + "loss": 2.6857, + "theoretical_loss": 3.4895184363359033, + "tokens_seen": 1635897344 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025471414242728184, + "loss": 2.678, + "theoretical_loss": 3.4895063151869126, + "tokens_seen": 1635962880 + }, + { + "epoch": 20.0, + "learning_rate": 0.000254704112337011, + "loss": 2.5892, + "theoretical_loss": 3.489494194659433, + "tokens_seen": 1636028416 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025469408224674026, + "loss": 2.778, + "theoretical_loss": 3.4894820747534085, + "tokens_seen": 1636093952 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002546840521564694, + "loss": 2.6818, + "theoretical_loss": 3.489469955468781, + "tokens_seen": 1636159488 + }, + { + "epoch": 20.0, + "learning_rate": 0.0002546740220661986, + "loss": 2.7741, + "theoretical_loss": 3.489457836805496, + "tokens_seen": 1636225024 + }, + { + "epoch": 20.0, + "learning_rate": 0.00025466399197592775, + "loss": 2.6478, + "theoretical_loss": 3.4894457187634944, + "tokens_seen": 1636290560 + }, + { + "epoch": 20.0, + "learning_rate": 0.000254653961885657, + "loss": 2.7302, + "theoretical_loss": 3.4894336013427205, + "tokens_seen": 1636356096 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025464393179538616, + "loss": 2.6781, + "theoretical_loss": 3.489421484543118, + "tokens_seen": 1636421632 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025463390170511535, + "loss": 2.6329, + "theoretical_loss": 3.4894093683646297, + "tokens_seen": 1636487168 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025462387161484453, + "loss": 2.7068, + "theoretical_loss": 3.4893972528071986, + "tokens_seen": 1636552704 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3869168, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6550180912017822, + "objective/train/theoretical_loss": 3.4893851378707685, + "objective/train/tokens_used": 1657078240, + "theoretical_loss": 3.4893851378707685, + "tokens_seen": 1636618240 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002546138415245737, + "loss": 2.7401, + "theoretical_loss": 3.4893851378707685, + "tokens_seen": 1636618240 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002546038114343029, + "loss": 2.5935, + "theoretical_loss": 3.4893730235552827, + "tokens_seen": 1636683776 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002545937813440321, + "loss": 2.6666, + "theoretical_loss": 3.489360909860684, + "tokens_seen": 1636749312 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025458375125376125, + "loss": 2.7755, + "theoretical_loss": 3.4893487967869166, + "tokens_seen": 1636814848 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002545737211634905, + "loss": 2.7274, + "theoretical_loss": 3.4893366843339226, + "tokens_seen": 1636880384 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002545636910732196, + "loss": 2.6385, + "theoretical_loss": 3.489324572501647, + "tokens_seen": 1636945920 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025455366098294885, + "loss": 2.7057, + "theoretical_loss": 3.489312461290031, + "tokens_seen": 1637011456 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025454363089267803, + "loss": 2.6391, + "theoretical_loss": 3.4893003506990197, + "tokens_seen": 1637076992 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002545336008024072, + "loss": 2.6137, + "theoretical_loss": 3.4892882407285555, + "tokens_seen": 1637142528 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002545235707121364, + "loss": 2.6253, + "theoretical_loss": 3.4892761313785825, + "tokens_seen": 1637208064 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025451354062186563, + "loss": 2.6611, + "theoretical_loss": 3.489264022649044, + "tokens_seen": 1637273600 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002545035105315948, + "loss": 2.6797, + "theoretical_loss": 3.4892519145398824, + "tokens_seen": 1637339136 + }, + { + "epoch": 20.01, + "learning_rate": 0.000254493480441324, + "loss": 2.6217, + "theoretical_loss": 3.489239807051042, + "tokens_seen": 1637404672 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025448345035105317, + "loss": 2.6536, + "theoretical_loss": 3.4892277001824663, + "tokens_seen": 1637470208 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025447342026078235, + "loss": 2.7324, + "theoretical_loss": 3.489215593934098, + "tokens_seen": 1637535744 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002544633901705116, + "loss": 2.7124, + "theoretical_loss": 3.489203488305881, + "tokens_seen": 1637601280 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002544533600802407, + "loss": 2.6835, + "theoretical_loss": 3.4891913832977584, + "tokens_seen": 1637666816 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025444332998996995, + "loss": 2.6425, + "theoretical_loss": 3.489179278909674, + "tokens_seen": 1637732352 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002544332998996991, + "loss": 2.7688, + "theoretical_loss": 3.4891671751415707, + "tokens_seen": 1637797888 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002544232698094283, + "loss": 2.6329, + "theoretical_loss": 3.489155071993393, + "tokens_seen": 1637863424 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002544132397191575, + "loss": 2.6333, + "theoretical_loss": 3.4891429694650826, + "tokens_seen": 1637928960 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002544032096288867, + "loss": 2.6868, + "theoretical_loss": 3.489130867556585, + "tokens_seen": 1637994496 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025439317953861586, + "loss": 2.6557, + "theoretical_loss": 3.489118766267842, + "tokens_seen": 1638060032 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025438314944834504, + "loss": 2.7132, + "theoretical_loss": 3.489106665598798, + "tokens_seen": 1638125568 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002543731193580742, + "loss": 2.6362, + "theoretical_loss": 3.4890945655493963, + "tokens_seen": 1638191104 + }, + { + "debugging/Self-BLEU-5": 0.6837916758787553, + "debugging/distinct-1-grams": 0.7188505772981949, + "debugging/distinct-2-grams": 0.9410880568834508, + "debugging/entropy-1-grams": 6.355549341143374, + "debugging/entropy-2-grams": 7.691561370662277, + "debugging/length": 609.0689655172414, + "debugging/num_segments": 29, + "epoch": 20.01, + "objective/train/docs_used": 3872898, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9250667095184326, + "objective/train/theoretical_loss": 3.4890824661195796, + "objective/train/tokens_used": 1658716640, + "theoretical_loss": 3.4890824661195796, + "tokens_seen": 1638256640 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025436308926780345, + "loss": 2.8043, + "theoretical_loss": 3.4890824661195796, + "tokens_seen": 1638256640 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002543530591775326, + "loss": 2.7674, + "theoretical_loss": 3.4890703673092927, + "tokens_seen": 1638322176 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002543430290872618, + "loss": 2.6555, + "theoretical_loss": 3.489058269118478, + "tokens_seen": 1638387712 + }, + { + "epoch": 20.01, + "learning_rate": 0.000254332998996991, + "loss": 2.7158, + "theoretical_loss": 3.4890461715470797, + "tokens_seen": 1638453248 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002543229689067202, + "loss": 2.7332, + "theoretical_loss": 3.4890340745950414, + "tokens_seen": 1638518784 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025431293881644936, + "loss": 2.6562, + "theoretical_loss": 3.489021978262306, + "tokens_seen": 1638584320 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025430290872617854, + "loss": 2.6433, + "theoretical_loss": 3.4890098825488174, + "tokens_seen": 1638649856 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002542928786359077, + "loss": 2.731, + "theoretical_loss": 3.488997787454519, + "tokens_seen": 1638715392 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025428284854563696, + "loss": 2.5947, + "theoretical_loss": 3.4889856929793543, + "tokens_seen": 1638780928 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002542728184553661, + "loss": 2.7253, + "theoretical_loss": 3.488973599123267, + "tokens_seen": 1638846464 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002542627883650953, + "loss": 2.7229, + "theoretical_loss": 3.4889615058862002, + "tokens_seen": 1638912000 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025425275827482445, + "loss": 2.6594, + "theoretical_loss": 3.4889494132680987, + "tokens_seen": 1638977536 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002542427281845537, + "loss": 2.6168, + "theoretical_loss": 3.4889373212689048, + "tokens_seen": 1639043072 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025423269809428286, + "loss": 2.6612, + "theoretical_loss": 3.4889252298885625, + "tokens_seen": 1639108608 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025422266800401204, + "loss": 2.746, + "theoretical_loss": 3.488913139127015, + "tokens_seen": 1639174144 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002542126379137412, + "loss": 2.7377, + "theoretical_loss": 3.488901048984207, + "tokens_seen": 1639239680 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025420260782347046, + "loss": 2.7152, + "theoretical_loss": 3.488888959460081, + "tokens_seen": 1639305216 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002541925777331996, + "loss": 2.7283, + "theoretical_loss": 3.488876870554581, + "tokens_seen": 1639370752 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002541825476429288, + "loss": 2.7268, + "theoretical_loss": 3.488864782267651, + "tokens_seen": 1639436288 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025417251755265795, + "loss": 2.6332, + "theoretical_loss": 3.4888526945992338, + "tokens_seen": 1639501824 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002541624874623872, + "loss": 2.6874, + "theoretical_loss": 3.4888406075492737, + "tokens_seen": 1639567360 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025415245737211637, + "loss": 2.6742, + "theoretical_loss": 3.488828521117714, + "tokens_seen": 1639632896 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025414242728184555, + "loss": 2.7259, + "theoretical_loss": 3.488816435304499, + "tokens_seen": 1639698432 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025413239719157473, + "loss": 2.7407, + "theoretical_loss": 3.488804350109571, + "tokens_seen": 1639763968 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002541223671013039, + "loss": 2.6909, + "theoretical_loss": 3.488792265532875, + "tokens_seen": 1639829504 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3877808, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8019001483917236, + "objective/train/theoretical_loss": 3.488780181574354, + "objective/train/tokens_used": 1660355040, + "theoretical_loss": 3.488780181574354, + "tokens_seen": 1639895040 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002541123370110331, + "loss": 2.6629, + "theoretical_loss": 3.488780181574354, + "tokens_seen": 1639895040 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002541023069207623, + "loss": 2.6392, + "theoretical_loss": 3.488768098233952, + "tokens_seen": 1639960576 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025409227683049145, + "loss": 2.6795, + "theoretical_loss": 3.488756015511612, + "tokens_seen": 1640026112 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002540822467402207, + "loss": 2.6315, + "theoretical_loss": 3.4887439334072785, + "tokens_seen": 1640091648 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002540722166499498, + "loss": 2.6214, + "theoretical_loss": 3.488731851920895, + "tokens_seen": 1640157184 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025406218655967905, + "loss": 2.8353, + "theoretical_loss": 3.4887197710524047, + "tokens_seen": 1640222720 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025405215646940823, + "loss": 2.657, + "theoretical_loss": 3.4887076908017525, + "tokens_seen": 1640288256 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002540421263791374, + "loss": 2.7086, + "theoretical_loss": 3.4886956111688807, + "tokens_seen": 1640353792 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002540320962888666, + "loss": 2.5204, + "theoretical_loss": 3.4886835321537335, + "tokens_seen": 1640419328 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025402206619859583, + "loss": 2.7451, + "theoretical_loss": 3.4886714537562553, + "tokens_seen": 1640484864 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025401203610832495, + "loss": 2.63, + "theoretical_loss": 3.488659375976389, + "tokens_seen": 1640550400 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002540020060180542, + "loss": 2.7276, + "theoretical_loss": 3.4886472988140786, + "tokens_seen": 1640615936 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002539919759277833, + "loss": 2.7556, + "theoretical_loss": 3.4886352222692683, + "tokens_seen": 1640681472 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025398194583751255, + "loss": 2.6688, + "theoretical_loss": 3.488623146341901, + "tokens_seen": 1640747008 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025397191574724173, + "loss": 2.6606, + "theoretical_loss": 3.4886110710319214, + "tokens_seen": 1640812544 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002539618856569709, + "loss": 2.7854, + "theoretical_loss": 3.488598996339273, + "tokens_seen": 1640878080 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002539518555667001, + "loss": 2.6703, + "theoretical_loss": 3.4885869222638988, + "tokens_seen": 1640943616 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002539418254764293, + "loss": 2.6282, + "theoretical_loss": 3.4885748488057438, + "tokens_seen": 1641009152 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025393179538615846, + "loss": 2.7124, + "theoretical_loss": 3.488562775964751, + "tokens_seen": 1641074688 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002539217652958877, + "loss": 2.7621, + "theoretical_loss": 3.4885507037408643, + "tokens_seen": 1641140224 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002539117352056168, + "loss": 2.6037, + "theoretical_loss": 3.488538632134028, + "tokens_seen": 1641205760 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025390170511534606, + "loss": 2.6905, + "theoretical_loss": 3.488526561144185, + "tokens_seen": 1641271296 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002538916750250752, + "loss": 2.6065, + "theoretical_loss": 3.48851449077128, + "tokens_seen": 1641336832 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002538816449348044, + "loss": 2.6272, + "theoretical_loss": 3.488502421015257, + "tokens_seen": 1641402368 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002538716148445336, + "loss": 2.7152, + "theoretical_loss": 3.4884903518760586, + "tokens_seen": 1641467904 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3880790, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.576399326324463, + "objective/train/theoretical_loss": 3.48847828335363, + "objective/train/tokens_used": 1661993440, + "theoretical_loss": 3.48847828335363, + "tokens_seen": 1641533440 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002538615847542628, + "loss": 2.614, + "theoretical_loss": 3.48847828335363, + "tokens_seen": 1641533440 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025385155466399196, + "loss": 2.6505, + "theoretical_loss": 3.4884662154479145, + "tokens_seen": 1641598976 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002538415245737212, + "loss": 2.7928, + "theoretical_loss": 3.488454148158856, + "tokens_seen": 1641664512 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002538314944834503, + "loss": 2.7232, + "theoretical_loss": 3.4884420814863977, + "tokens_seen": 1641730048 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025382146439317956, + "loss": 2.6602, + "theoretical_loss": 3.4884300154304846, + "tokens_seen": 1641795584 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002538114343029087, + "loss": 2.6134, + "theoretical_loss": 3.48841794999106, + "tokens_seen": 1641861120 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002538014042126379, + "loss": 2.7718, + "theoretical_loss": 3.488405885168068, + "tokens_seen": 1641926656 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002537913741223671, + "loss": 2.6726, + "theoretical_loss": 3.4883938209614525, + "tokens_seen": 1641992192 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002537813440320963, + "loss": 2.6622, + "theoretical_loss": 3.4883817573711573, + "tokens_seen": 1642057728 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025377131394182546, + "loss": 2.7098, + "theoretical_loss": 3.488369694397126, + "tokens_seen": 1642123264 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025376128385155465, + "loss": 2.574, + "theoretical_loss": 3.4883576320393033, + "tokens_seen": 1642188800 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002537512537612839, + "loss": 2.5489, + "theoretical_loss": 3.488345570297632, + "tokens_seen": 1642254336 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025374122367101306, + "loss": 2.7655, + "theoretical_loss": 3.4883335091720573, + "tokens_seen": 1642319872 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025373119358074224, + "loss": 2.6477, + "theoretical_loss": 3.4883214486625227, + "tokens_seen": 1642385408 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002537211634904714, + "loss": 2.8151, + "theoretical_loss": 3.488309388768972, + "tokens_seen": 1642450944 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025371113340020066, + "loss": 2.6277, + "theoretical_loss": 3.488297329491349, + "tokens_seen": 1642516480 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002537011033099298, + "loss": 2.6091, + "theoretical_loss": 3.4882852708295977, + "tokens_seen": 1642582016 + }, + { + "epoch": 20.01, + "learning_rate": 0.000253691073219659, + "loss": 2.736, + "theoretical_loss": 3.488273212783662, + "tokens_seen": 1642647552 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025368104312938815, + "loss": 2.6946, + "theoretical_loss": 3.488261155353487, + "tokens_seen": 1642713088 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002536710130391174, + "loss": 2.6939, + "theoretical_loss": 3.4882490985390153, + "tokens_seen": 1642778624 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025366098294884657, + "loss": 2.6725, + "theoretical_loss": 3.4882370423401916, + "tokens_seen": 1642844160 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025365095285857575, + "loss": 2.723, + "theoretical_loss": 3.4882249867569595, + "tokens_seen": 1642909696 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025364092276830493, + "loss": 2.6257, + "theoretical_loss": 3.4882129317892634, + "tokens_seen": 1642975232 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002536308926780341, + "loss": 2.6587, + "theoretical_loss": 3.4882008774370474, + "tokens_seen": 1643040768 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002536208625877633, + "loss": 2.7098, + "theoretical_loss": 3.4881888237002547, + "tokens_seen": 1643106304 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3885700, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.638127088546753, + "objective/train/theoretical_loss": 3.4881767705788307, + "objective/train/tokens_used": 1663631840, + "theoretical_loss": 3.4881767705788307, + "tokens_seen": 1643171840 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002536108324974925, + "loss": 2.6999, + "theoretical_loss": 3.4881767705788307, + "tokens_seen": 1643171840 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025360080240722165, + "loss": 2.672, + "theoretical_loss": 3.488164718072718, + "tokens_seen": 1643237376 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002535907723169509, + "loss": 2.6703, + "theoretical_loss": 3.488152666181862, + "tokens_seen": 1643302912 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025358074222668, + "loss": 2.6936, + "theoretical_loss": 3.4881406149062055, + "tokens_seen": 1643368448 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025357071213640925, + "loss": 2.6448, + "theoretical_loss": 3.4881285642456934, + "tokens_seen": 1643433984 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025356068204613843, + "loss": 2.6826, + "theoretical_loss": 3.4881165142002692, + "tokens_seen": 1643499520 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002535506519558676, + "loss": 2.6911, + "theoretical_loss": 3.4881044647698776, + "tokens_seen": 1643565056 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002535406218655968, + "loss": 2.6975, + "theoretical_loss": 3.488092415954462, + "tokens_seen": 1643630592 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025353059177532603, + "loss": 2.7031, + "theoretical_loss": 3.488080367753967, + "tokens_seen": 1643696128 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025352056168505516, + "loss": 2.6675, + "theoretical_loss": 3.488068320168337, + "tokens_seen": 1643761664 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002535105315947844, + "loss": 2.6599, + "theoretical_loss": 3.4880562731975155, + "tokens_seen": 1643827200 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002535005015045135, + "loss": 2.7501, + "theoretical_loss": 3.4880442268414464, + "tokens_seen": 1643892736 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025349047141424275, + "loss": 2.7032, + "theoretical_loss": 3.488032181100075, + "tokens_seen": 1643958272 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025348044132397193, + "loss": 2.6893, + "theoretical_loss": 3.488020135973344, + "tokens_seen": 1644023808 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002534704112337011, + "loss": 2.6141, + "theoretical_loss": 3.4880080914611984, + "tokens_seen": 1644089344 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002534603811434303, + "loss": 2.788, + "theoretical_loss": 3.487996047563582, + "tokens_seen": 1644154880 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002534503510531595, + "loss": 2.7406, + "theoretical_loss": 3.487984004280439, + "tokens_seen": 1644220416 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025344032096288866, + "loss": 2.6484, + "theoretical_loss": 3.4879719616117137, + "tokens_seen": 1644285952 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002534302908726179, + "loss": 2.7075, + "theoretical_loss": 3.4879599195573507, + "tokens_seen": 1644351488 + }, + { + "epoch": 20.01, + "learning_rate": 0.000253420260782347, + "loss": 2.7088, + "theoretical_loss": 3.4879478781172932, + "tokens_seen": 1644417024 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025341023069207626, + "loss": 2.6046, + "theoretical_loss": 3.487935837291486, + "tokens_seen": 1644482560 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002534002006018054, + "loss": 2.75, + "theoretical_loss": 3.4879237970798735, + "tokens_seen": 1644548096 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002533901705115346, + "loss": 2.6507, + "theoretical_loss": 3.487911757482399, + "tokens_seen": 1644613632 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002533801404212638, + "loss": 2.6711, + "theoretical_loss": 3.4878997184990075, + "tokens_seen": 1644679168 + }, + { + "epoch": 20.01, + "learning_rate": 0.000253370110330993, + "loss": 2.6657, + "theoretical_loss": 3.487887680129643, + "tokens_seen": 1644744704 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3888598, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6939826011657715, + "objective/train/theoretical_loss": 3.4878756423742496, + "objective/train/tokens_used": 1665270240, + "theoretical_loss": 3.4878756423742496, + "tokens_seen": 1644810240 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025336008024072216, + "loss": 2.6241, + "theoretical_loss": 3.4878756423742496, + "tokens_seen": 1644810240 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002533500501504514, + "loss": 2.5842, + "theoretical_loss": 3.4878636052327714, + "tokens_seen": 1644875776 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002533400200601805, + "loss": 2.7336, + "theoretical_loss": 3.4878515687051532, + "tokens_seen": 1644941312 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025332998996990976, + "loss": 2.7521, + "theoretical_loss": 3.487839532791339, + "tokens_seen": 1645006848 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002533199598796389, + "loss": 2.73, + "theoretical_loss": 3.487827497491273, + "tokens_seen": 1645072384 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002533099297893681, + "loss": 2.7372, + "theoretical_loss": 3.4878154628048987, + "tokens_seen": 1645137920 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002532998996990973, + "loss": 2.6662, + "theoretical_loss": 3.4878034287321618, + "tokens_seen": 1645203456 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002532898696088265, + "loss": 2.6899, + "theoretical_loss": 3.4877913952730055, + "tokens_seen": 1645268992 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025327983951855566, + "loss": 2.6915, + "theoretical_loss": 3.487779362427374, + "tokens_seen": 1645334528 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025326980942828485, + "loss": 2.6514, + "theoretical_loss": 3.487767330195213, + "tokens_seen": 1645400064 + }, + { + "epoch": 20.01, + "learning_rate": 0.000253259779338014, + "loss": 2.7187, + "theoretical_loss": 3.487755298576465, + "tokens_seen": 1645465600 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025324974924774326, + "loss": 2.6171, + "theoretical_loss": 3.4877432675710756, + "tokens_seen": 1645531136 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002532397191574724, + "loss": 2.5578, + "theoretical_loss": 3.4877312371789877, + "tokens_seen": 1645596672 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002532296890672016, + "loss": 2.7463, + "theoretical_loss": 3.487719207400147, + "tokens_seen": 1645662208 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025321965897693075, + "loss": 2.6902, + "theoretical_loss": 3.4877071782344977, + "tokens_seen": 1645727744 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025320962888666, + "loss": 2.6162, + "theoretical_loss": 3.4876951496819832, + "tokens_seen": 1645793280 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025319959879638917, + "loss": 2.6935, + "theoretical_loss": 3.4876831217425486, + "tokens_seen": 1645858816 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025318956870611835, + "loss": 2.7403, + "theoretical_loss": 3.4876710944161378, + "tokens_seen": 1645924352 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025317953861584753, + "loss": 2.724, + "theoretical_loss": 3.4876590677026957, + "tokens_seen": 1645989888 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025316950852557677, + "loss": 2.6883, + "theoretical_loss": 3.487647041602166, + "tokens_seen": 1646055424 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002531594784353059, + "loss": 2.6699, + "theoretical_loss": 3.487635016114494, + "tokens_seen": 1646120960 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025314944834503513, + "loss": 2.7841, + "theoretical_loss": 3.4876229912396224, + "tokens_seen": 1646186496 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025313941825476425, + "loss": 2.7669, + "theoretical_loss": 3.4876109669774973, + "tokens_seen": 1646252032 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002531293881644935, + "loss": 2.6374, + "theoretical_loss": 3.487598943328062, + "tokens_seen": 1646317568 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025311935807422267, + "loss": 2.6974, + "theoretical_loss": 3.487586920291262, + "tokens_seen": 1646383104 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3892395, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.607754945755005, + "objective/train/theoretical_loss": 3.4875748978670407, + "objective/train/tokens_used": 1666908640, + "theoretical_loss": 3.4875748978670407, + "tokens_seen": 1646448640 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025310932798395185, + "loss": 2.7447, + "theoretical_loss": 3.4875748978670407, + "tokens_seen": 1646448640 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025309929789368103, + "loss": 2.6951, + "theoretical_loss": 3.4875628760553425, + "tokens_seen": 1646514176 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002530892678034102, + "loss": 2.6321, + "theoretical_loss": 3.4875508548561127, + "tokens_seen": 1646579712 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002530792377131394, + "loss": 2.6837, + "theoretical_loss": 3.4875388342692943, + "tokens_seen": 1646645248 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025306920762286863, + "loss": 2.7159, + "theoretical_loss": 3.4875268142948332, + "tokens_seen": 1646710784 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025305917753259776, + "loss": 2.7215, + "theoretical_loss": 3.4875147949326735, + "tokens_seen": 1646776320 + }, + { + "epoch": 20.01, + "learning_rate": 0.000253049147442327, + "loss": 2.6546, + "theoretical_loss": 3.487502776182759, + "tokens_seen": 1646841856 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002530391173520562, + "loss": 2.6787, + "theoretical_loss": 3.487490758045034, + "tokens_seen": 1646907392 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025302908726178536, + "loss": 2.6847, + "theoretical_loss": 3.4874787405194443, + "tokens_seen": 1646972928 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025301905717151454, + "loss": 2.6884, + "theoretical_loss": 3.4874667236059333, + "tokens_seen": 1647038464 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002530090270812437, + "loss": 2.6542, + "theoretical_loss": 3.4874547073044457, + "tokens_seen": 1647104000 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025299899699097295, + "loss": 2.7582, + "theoretical_loss": 3.487442691614926, + "tokens_seen": 1647169536 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025298896690070213, + "loss": 2.6667, + "theoretical_loss": 3.4874306765373184, + "tokens_seen": 1647235072 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002529789368104313, + "loss": 2.6776, + "theoretical_loss": 3.487418662071568, + "tokens_seen": 1647300608 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002529689067201605, + "loss": 2.6992, + "theoretical_loss": 3.487406648217619, + "tokens_seen": 1647366144 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002529588766298897, + "loss": 2.6987, + "theoretical_loss": 3.4873946349754155, + "tokens_seen": 1647431680 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025294884653961886, + "loss": 2.7018, + "theoretical_loss": 3.487382622344903, + "tokens_seen": 1647497216 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002529388164493481, + "loss": 2.576, + "theoretical_loss": 3.4873706103260247, + "tokens_seen": 1647562752 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002529287863590772, + "loss": 2.8247, + "theoretical_loss": 3.4873585989187266, + "tokens_seen": 1647628288 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025291875626880646, + "loss": 2.6643, + "theoretical_loss": 3.4873465881229517, + "tokens_seen": 1647693824 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002529087261785356, + "loss": 2.6591, + "theoretical_loss": 3.487334577938646, + "tokens_seen": 1647759360 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002528986960882648, + "loss": 2.6628, + "theoretical_loss": 3.4873225683657534, + "tokens_seen": 1647824896 + }, + { + "epoch": 20.01, + "learning_rate": 0.000252888665997994, + "loss": 2.5815, + "theoretical_loss": 3.487310559404218, + "tokens_seen": 1647890432 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002528786359077232, + "loss": 2.7145, + "theoretical_loss": 3.4872985510539847, + "tokens_seen": 1647955968 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025286860581745236, + "loss": 2.784, + "theoretical_loss": 3.4872865433149984, + "tokens_seen": 1648021504 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3897190, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.703402280807495, + "objective/train/theoretical_loss": 3.4872745361872037, + "objective/train/tokens_used": 1668547040, + "theoretical_loss": 3.4872745361872037, + "tokens_seen": 1648087040 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002528585757271816, + "loss": 2.6549, + "theoretical_loss": 3.4872745361872037, + "tokens_seen": 1648087040 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002528485456369107, + "loss": 2.5656, + "theoretical_loss": 3.487262529670545, + "tokens_seen": 1648152576 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025283851554663996, + "loss": 2.6845, + "theoretical_loss": 3.4872505237649665, + "tokens_seen": 1648218112 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002528284854563691, + "loss": 2.7409, + "theoretical_loss": 3.4872385184704133, + "tokens_seen": 1648283648 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002528184553660983, + "loss": 2.6171, + "theoretical_loss": 3.4872265137868297, + "tokens_seen": 1648349184 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002528084252758275, + "loss": 2.6789, + "theoretical_loss": 3.487214509714161, + "tokens_seen": 1648414720 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002527983951855567, + "loss": 2.7332, + "theoretical_loss": 3.487202506252351, + "tokens_seen": 1648480256 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025278836509528586, + "loss": 2.6048, + "theoretical_loss": 3.4871905034013446, + "tokens_seen": 1648545792 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025277833500501505, + "loss": 2.6867, + "theoretical_loss": 3.487178501161086, + "tokens_seen": 1648611328 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002527683049147442, + "loss": 2.794, + "theoretical_loss": 3.4871664995315212, + "tokens_seen": 1648676864 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025275827482447346, + "loss": 2.7259, + "theoretical_loss": 3.4871544985125933, + "tokens_seen": 1648742400 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002527482447342026, + "loss": 2.6687, + "theoretical_loss": 3.487142498104248, + "tokens_seen": 1648807936 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002527382146439318, + "loss": 2.6453, + "theoretical_loss": 3.4871304983064295, + "tokens_seen": 1648873472 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025272818455366095, + "loss": 2.6754, + "theoretical_loss": 3.4871184991190827, + "tokens_seen": 1648939008 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002527181544633902, + "loss": 2.7416, + "theoretical_loss": 3.4871065005421524, + "tokens_seen": 1649004544 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025270812437311937, + "loss": 2.6597, + "theoretical_loss": 3.4870945025755824, + "tokens_seen": 1649070080 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025269809428284855, + "loss": 2.7491, + "theoretical_loss": 3.4870825052193184, + "tokens_seen": 1649135616 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025268806419257773, + "loss": 2.6072, + "theoretical_loss": 3.4870705084733045, + "tokens_seen": 1649201152 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025267803410230697, + "loss": 2.79, + "theoretical_loss": 3.487058512337486, + "tokens_seen": 1649266688 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002526680040120361, + "loss": 2.6779, + "theoretical_loss": 3.487046516811807, + "tokens_seen": 1649332224 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025265797392176533, + "loss": 2.8122, + "theoretical_loss": 3.487034521896213, + "tokens_seen": 1649397760 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025264794383149445, + "loss": 2.7494, + "theoretical_loss": 3.487022527590648, + "tokens_seen": 1649463296 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002526379137412237, + "loss": 2.6056, + "theoretical_loss": 3.4870105338950568, + "tokens_seen": 1649528832 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025262788365095287, + "loss": 2.6719, + "theoretical_loss": 3.4869985408093847, + "tokens_seen": 1649594368 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025261785356068205, + "loss": 2.705, + "theoretical_loss": 3.486986548333576, + "tokens_seen": 1649659904 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3900158, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7404942512512207, + "objective/train/theoretical_loss": 3.486974556467575, + "objective/train/tokens_used": 1670185440, + "theoretical_loss": 3.486974556467575, + "tokens_seen": 1649725440 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025260782347041123, + "loss": 2.6853, + "theoretical_loss": 3.486974556467575, + "tokens_seen": 1649725440 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002525977933801404, + "loss": 2.6995, + "theoretical_loss": 3.486962565211327, + "tokens_seen": 1649790976 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002525877632898696, + "loss": 2.6796, + "theoretical_loss": 3.4869505745647773, + "tokens_seen": 1649856512 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025257773319959883, + "loss": 2.6654, + "theoretical_loss": 3.48693858452787, + "tokens_seen": 1649922048 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025256770310932796, + "loss": 2.6752, + "theoretical_loss": 3.48692659510055, + "tokens_seen": 1649987584 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002525576730190572, + "loss": 2.6784, + "theoretical_loss": 3.4869146062827623, + "tokens_seen": 1650053120 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002525476429287864, + "loss": 2.7623, + "theoretical_loss": 3.486902618074452, + "tokens_seen": 1650118656 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025253761283851556, + "loss": 2.7334, + "theoretical_loss": 3.4868906304755627, + "tokens_seen": 1650184192 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025252758274824474, + "loss": 2.6841, + "theoretical_loss": 3.4868786434860404, + "tokens_seen": 1650249728 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002525175526579739, + "loss": 2.7119, + "theoretical_loss": 3.4868666571058293, + "tokens_seen": 1650315264 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002525075225677031, + "loss": 2.6786, + "theoretical_loss": 3.4868546713348745, + "tokens_seen": 1650380800 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025249749247743233, + "loss": 2.6686, + "theoretical_loss": 3.4868426861731208, + "tokens_seen": 1650446336 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025248746238716146, + "loss": 2.6723, + "theoretical_loss": 3.486830701620513, + "tokens_seen": 1650511872 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002524774322968907, + "loss": 2.6314, + "theoretical_loss": 3.4868187176769965, + "tokens_seen": 1650577408 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002524674022066198, + "loss": 2.7134, + "theoretical_loss": 3.486806734342515, + "tokens_seen": 1650642944 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025245737211634906, + "loss": 2.629, + "theoretical_loss": 3.486794751617014, + "tokens_seen": 1650708480 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025244734202607824, + "loss": 2.6629, + "theoretical_loss": 3.4867827695004388, + "tokens_seen": 1650774016 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002524373119358074, + "loss": 2.7252, + "theoretical_loss": 3.486770787992734, + "tokens_seen": 1650839552 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002524272818455366, + "loss": 2.6635, + "theoretical_loss": 3.4867588070938442, + "tokens_seen": 1650905088 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002524172517552658, + "loss": 2.6823, + "theoretical_loss": 3.4867468268037145, + "tokens_seen": 1650970624 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025240722166499496, + "loss": 2.6961, + "theoretical_loss": 3.4867348471222894, + "tokens_seen": 1651036160 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002523971915747242, + "loss": 2.6612, + "theoretical_loss": 3.4867228680495144, + "tokens_seen": 1651101696 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002523871614844533, + "loss": 2.7068, + "theoretical_loss": 3.4867108895853347, + "tokens_seen": 1651167232 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025237713139418256, + "loss": 2.6293, + "theoretical_loss": 3.486698911729694, + "tokens_seen": 1651232768 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025236710130391174, + "loss": 2.6731, + "theoretical_loss": 3.4866869344825386, + "tokens_seen": 1651298304 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3905204, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.768002986907959, + "objective/train/theoretical_loss": 3.486674957843812, + "objective/train/tokens_used": 1671823840, + "theoretical_loss": 3.486674957843812, + "tokens_seen": 1651363840 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002523570712136409, + "loss": 2.8038, + "theoretical_loss": 3.486674957843812, + "tokens_seen": 1651363840 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002523470411233701, + "loss": 2.6663, + "theoretical_loss": 3.4866629818134607, + "tokens_seen": 1651429376 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002523370110330993, + "loss": 2.6826, + "theoretical_loss": 3.486651006391429, + "tokens_seen": 1651494912 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025232698094282847, + "loss": 2.8023, + "theoretical_loss": 3.4866390315776616, + "tokens_seen": 1651560448 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002523169508525577, + "loss": 2.7326, + "theoretical_loss": 3.4866270573721034, + "tokens_seen": 1651625984 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025230692076228683, + "loss": 2.6138, + "theoretical_loss": 3.4866150837746996, + "tokens_seen": 1651691520 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025229689067201606, + "loss": 2.7638, + "theoretical_loss": 3.4866031107853956, + "tokens_seen": 1651757056 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002522868605817452, + "loss": 2.6875, + "theoretical_loss": 3.486591138404136, + "tokens_seen": 1651822592 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025227683049147443, + "loss": 2.693, + "theoretical_loss": 3.4865791666308654, + "tokens_seen": 1651888128 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002522668004012036, + "loss": 2.7425, + "theoretical_loss": 3.4865671954655295, + "tokens_seen": 1651953664 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002522567703109328, + "loss": 2.7061, + "theoretical_loss": 3.4865552249080727, + "tokens_seen": 1652019200 + }, + { + "epoch": 20.01, + "learning_rate": 0.000252246740220662, + "loss": 2.716, + "theoretical_loss": 3.486543254958441, + "tokens_seen": 1652084736 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025223671013039115, + "loss": 2.7297, + "theoretical_loss": 3.486531285616578, + "tokens_seen": 1652150272 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002522266800401204, + "loss": 2.6199, + "theoretical_loss": 3.48651931688243, + "tokens_seen": 1652215808 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025221664994984957, + "loss": 2.7293, + "theoretical_loss": 3.4865073487559415, + "tokens_seen": 1652281344 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025220661985957875, + "loss": 2.6346, + "theoretical_loss": 3.4864953812370576, + "tokens_seen": 1652346880 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025219658976930793, + "loss": 2.6935, + "theoretical_loss": 3.486483414325723, + "tokens_seen": 1652412416 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025218655967903717, + "loss": 2.6397, + "theoretical_loss": 3.4864714480218835, + "tokens_seen": 1652477952 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002521765295887663, + "loss": 2.7267, + "theoretical_loss": 3.4864594823254835, + "tokens_seen": 1652543488 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025216649949849553, + "loss": 2.7817, + "theoretical_loss": 3.486447517236469, + "tokens_seen": 1652609024 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025215646940822465, + "loss": 2.7323, + "theoretical_loss": 3.4864355527547835, + "tokens_seen": 1652674560 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002521464393179539, + "loss": 2.656, + "theoretical_loss": 3.4864235888803736, + "tokens_seen": 1652740096 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025213640922768307, + "loss": 2.7185, + "theoretical_loss": 3.486411625613184, + "tokens_seen": 1652805632 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025212637913741225, + "loss": 2.7468, + "theoretical_loss": 3.486399662953159, + "tokens_seen": 1652871168 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025211634904714143, + "loss": 2.6507, + "theoretical_loss": 3.4863877009002446, + "tokens_seen": 1652936704 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3908134, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7059168815612793, + "objective/train/theoretical_loss": 3.4863757394543855, + "objective/train/tokens_used": 1673462240, + "theoretical_loss": 3.4863757394543855, + "tokens_seen": 1653002240 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002521063189568706, + "loss": 2.7205, + "theoretical_loss": 3.4863757394543855, + "tokens_seen": 1653002240 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002520962888665998, + "loss": 2.7032, + "theoretical_loss": 3.4863637786155275, + "tokens_seen": 1653067776 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025208625877632903, + "loss": 2.6914, + "theoretical_loss": 3.486351818383615, + "tokens_seen": 1653133312 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025207622868605816, + "loss": 2.725, + "theoretical_loss": 3.486339858758593, + "tokens_seen": 1653198848 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002520661985957874, + "loss": 2.7, + "theoretical_loss": 3.4863278997404077, + "tokens_seen": 1653264384 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002520561685055166, + "loss": 2.6571, + "theoretical_loss": 3.4863159413290035, + "tokens_seen": 1653329920 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025204613841524576, + "loss": 2.5838, + "theoretical_loss": 3.486303983524325, + "tokens_seen": 1653395456 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025203610832497494, + "loss": 2.6361, + "theoretical_loss": 3.4862920263263186, + "tokens_seen": 1653460992 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002520260782347041, + "loss": 2.6652, + "theoretical_loss": 3.4862800697349288, + "tokens_seen": 1653526528 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002520160481444333, + "loss": 2.5903, + "theoretical_loss": 3.486268113750101, + "tokens_seen": 1653592064 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025200601805416253, + "loss": 2.6983, + "theoretical_loss": 3.4862561583717797, + "tokens_seen": 1653657600 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025199598796389166, + "loss": 2.642, + "theoretical_loss": 3.486244203599911, + "tokens_seen": 1653723136 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002519859578736209, + "loss": 2.5985, + "theoretical_loss": 3.4862322494344395, + "tokens_seen": 1653788672 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025197592778335, + "loss": 2.724, + "theoretical_loss": 3.4862202958753112, + "tokens_seen": 1653854208 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025196589769307926, + "loss": 2.6761, + "theoretical_loss": 3.4862083429224704, + "tokens_seen": 1653919744 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025195586760280844, + "loss": 2.7944, + "theoretical_loss": 3.486196390575863, + "tokens_seen": 1653985280 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002519458375125376, + "loss": 2.7272, + "theoretical_loss": 3.4861844388354335, + "tokens_seen": 1654050816 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002519358074222668, + "loss": 2.6169, + "theoretical_loss": 3.4861724877011278, + "tokens_seen": 1654116352 + }, + { + "epoch": 20.01, + "learning_rate": 0.000251925777331996, + "loss": 2.8008, + "theoretical_loss": 3.486160537172891, + "tokens_seen": 1654181888 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025191574724172516, + "loss": 2.7443, + "theoretical_loss": 3.4861485872506686, + "tokens_seen": 1654247424 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002519057171514544, + "loss": 2.6367, + "theoretical_loss": 3.486136637934405, + "tokens_seen": 1654312960 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002518956870611835, + "loss": 2.7823, + "theoretical_loss": 3.486124689224046, + "tokens_seen": 1654378496 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025188565697091276, + "loss": 2.7051, + "theoretical_loss": 3.4861127411195376, + "tokens_seen": 1654444032 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025187562688064194, + "loss": 2.6137, + "theoretical_loss": 3.4861007936208237, + "tokens_seen": 1654509568 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002518655967903711, + "loss": 2.6769, + "theoretical_loss": 3.486088846727851, + "tokens_seen": 1654575104 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3911631, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8841426372528076, + "objective/train/theoretical_loss": 3.4860769004405636, + "objective/train/tokens_used": 1675100640, + "theoretical_loss": 3.4860769004405636, + "tokens_seen": 1654640640 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002518555667001003, + "loss": 2.7656, + "theoretical_loss": 3.4860769004405636, + "tokens_seen": 1654640640 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002518455366098295, + "loss": 2.7941, + "theoretical_loss": 3.4860649547589073, + "tokens_seen": 1654706176 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025183550651955867, + "loss": 2.6505, + "theoretical_loss": 3.4860530096828275, + "tokens_seen": 1654771712 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002518254764292879, + "loss": 2.6892, + "theoretical_loss": 3.4860410652122695, + "tokens_seen": 1654837248 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025181544633901703, + "loss": 2.6387, + "theoretical_loss": 3.4860291213471784, + "tokens_seen": 1654902784 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025180541624874627, + "loss": 2.6956, + "theoretical_loss": 3.4860171780875, + "tokens_seen": 1654968320 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002517953861584754, + "loss": 2.859, + "theoretical_loss": 3.4860052354331788, + "tokens_seen": 1655033856 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025178535606820463, + "loss": 2.7089, + "theoretical_loss": 3.485993293384161, + "tokens_seen": 1655099392 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002517753259779338, + "loss": 2.6991, + "theoretical_loss": 3.4859813519403913, + "tokens_seen": 1655164928 + }, + { + "epoch": 20.01, + "learning_rate": 0.000251765295887663, + "loss": 2.6866, + "theoretical_loss": 3.4859694111018156, + "tokens_seen": 1655230464 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025175526579739217, + "loss": 2.6987, + "theoretical_loss": 3.485957470868379, + "tokens_seen": 1655296000 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025174523570712135, + "loss": 2.6118, + "theoretical_loss": 3.485945531240027, + "tokens_seen": 1655361536 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025173520561685053, + "loss": 2.6779, + "theoretical_loss": 3.485933592216705, + "tokens_seen": 1655427072 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025172517552657977, + "loss": 2.7114, + "theoretical_loss": 3.485921653798358, + "tokens_seen": 1655492608 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002517151454363089, + "loss": 2.674, + "theoretical_loss": 3.485909715984932, + "tokens_seen": 1655558144 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025170511534603813, + "loss": 2.6448, + "theoretical_loss": 3.485897778776372, + "tokens_seen": 1655623680 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002516950852557673, + "loss": 2.7846, + "theoretical_loss": 3.4858858421726238, + "tokens_seen": 1655689216 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002516850551654965, + "loss": 2.6222, + "theoretical_loss": 3.485873906173632, + "tokens_seen": 1655754752 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002516750250752257, + "loss": 2.7018, + "theoretical_loss": 3.485861970779343, + "tokens_seen": 1655820288 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025166499498495486, + "loss": 2.7259, + "theoretical_loss": 3.485850035989701, + "tokens_seen": 1655885824 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025165496489468404, + "loss": 2.6881, + "theoretical_loss": 3.485838101804653, + "tokens_seen": 1655951360 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025164493480441327, + "loss": 2.7366, + "theoretical_loss": 3.485826168224144, + "tokens_seen": 1656016896 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002516349047141424, + "loss": 2.574, + "theoretical_loss": 3.4858142352481183, + "tokens_seen": 1656082432 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025162487462387163, + "loss": 2.6005, + "theoretical_loss": 3.4858023028765226, + "tokens_seen": 1656147968 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025161484453360076, + "loss": 2.7102, + "theoretical_loss": 3.4857903711093017, + "tokens_seen": 1656213504 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3915039, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.601741313934326, + "objective/train/theoretical_loss": 3.485778439946402, + "objective/train/tokens_used": 1676739040, + "theoretical_loss": 3.485778439946402, + "tokens_seen": 1656279040 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025160481444333, + "loss": 2.7285, + "theoretical_loss": 3.485778439946402, + "tokens_seen": 1656279040 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002515947843530592, + "loss": 2.6926, + "theoretical_loss": 3.4857665093877674, + "tokens_seen": 1656344576 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025158475426278836, + "loss": 2.7037, + "theoretical_loss": 3.485754579433345, + "tokens_seen": 1656410112 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025157472417251754, + "loss": 2.6289, + "theoretical_loss": 3.485742650083079, + "tokens_seen": 1656475648 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002515646940822468, + "loss": 2.6691, + "theoretical_loss": 3.4857307213369157, + "tokens_seen": 1656541184 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002515546639919759, + "loss": 2.7499, + "theoretical_loss": 3.485718793194801, + "tokens_seen": 1656606720 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025154463390170514, + "loss": 2.6771, + "theoretical_loss": 3.485706865656679, + "tokens_seen": 1656672256 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025153460381143426, + "loss": 2.7796, + "theoretical_loss": 3.4856949387224967, + "tokens_seen": 1656737792 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002515245737211635, + "loss": 2.7589, + "theoretical_loss": 3.485683012392199, + "tokens_seen": 1656803328 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002515145436308927, + "loss": 2.72, + "theoretical_loss": 3.485671086665731, + "tokens_seen": 1656868864 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025150451354062186, + "loss": 2.6571, + "theoretical_loss": 3.4856591615430386, + "tokens_seen": 1656934400 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002514944834503511, + "loss": 2.6481, + "theoretical_loss": 3.485647237024068, + "tokens_seen": 1656999936 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002514844533600802, + "loss": 2.769, + "theoretical_loss": 3.485635313108764, + "tokens_seen": 1657065472 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025147442326980946, + "loss": 2.6233, + "theoretical_loss": 3.4856233897970723, + "tokens_seen": 1657131008 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025146439317953864, + "loss": 2.6452, + "theoretical_loss": 3.4856114670889387, + "tokens_seen": 1657196544 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002514543630892678, + "loss": 2.7286, + "theoretical_loss": 3.485599544984308, + "tokens_seen": 1657262080 + }, + { + "epoch": 20.01, + "learning_rate": 0.000251444332998997, + "loss": 2.7331, + "theoretical_loss": 3.485587623483127, + "tokens_seen": 1657327616 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002514343029087262, + "loss": 2.764, + "theoretical_loss": 3.4855757025853404, + "tokens_seen": 1657393152 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025142427281845536, + "loss": 2.666, + "theoretical_loss": 3.4855637822908943, + "tokens_seen": 1657458688 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002514142427281846, + "loss": 2.6335, + "theoretical_loss": 3.485551862599734, + "tokens_seen": 1657524224 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002514042126379137, + "loss": 2.7473, + "theoretical_loss": 3.485539943511805, + "tokens_seen": 1657589760 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025139418254764296, + "loss": 2.7478, + "theoretical_loss": 3.4855280250270537, + "tokens_seen": 1657655296 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025138415245737214, + "loss": 2.7177, + "theoretical_loss": 3.4855161071454246, + "tokens_seen": 1657720832 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002513741223671013, + "loss": 2.7077, + "theoretical_loss": 3.485504189866864, + "tokens_seen": 1657786368 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002513640922768305, + "loss": 2.7194, + "theoretical_loss": 3.485492273191318, + "tokens_seen": 1657851904 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3919749, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7703750133514404, + "objective/train/theoretical_loss": 3.485480357118731, + "objective/train/tokens_used": 1678377440, + "theoretical_loss": 3.485480357118731, + "tokens_seen": 1657917440 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002513540621865597, + "loss": 2.7183, + "theoretical_loss": 3.485480357118731, + "tokens_seen": 1657917440 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025134403209628887, + "loss": 2.6445, + "theoretical_loss": 3.48546844164905, + "tokens_seen": 1657982976 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002513340020060181, + "loss": 2.609, + "theoretical_loss": 3.4854565267822197, + "tokens_seen": 1658048512 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025132397191574723, + "loss": 2.7682, + "theoretical_loss": 3.485444612518186, + "tokens_seen": 1658114048 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025131394182547647, + "loss": 2.7157, + "theoretical_loss": 3.4854326988568953, + "tokens_seen": 1658179584 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002513039117352056, + "loss": 2.6145, + "theoretical_loss": 3.4854207857982917, + "tokens_seen": 1658245120 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025129388164493483, + "loss": 2.7206, + "theoretical_loss": 3.4854088733423225, + "tokens_seen": 1658310656 + }, + { + "epoch": 20.01, + "learning_rate": 0.000251283851554664, + "loss": 2.7612, + "theoretical_loss": 3.4853969614889326, + "tokens_seen": 1658376192 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002512738214643932, + "loss": 2.7387, + "theoretical_loss": 3.485385050238068, + "tokens_seen": 1658441728 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025126379137412237, + "loss": 2.6726, + "theoretical_loss": 3.485373139589674, + "tokens_seen": 1658507264 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025125376128385155, + "loss": 2.7359, + "theoretical_loss": 3.485361229543697, + "tokens_seen": 1658572800 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025124373119358073, + "loss": 2.7606, + "theoretical_loss": 3.4853493201000827, + "tokens_seen": 1658638336 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025123370110330997, + "loss": 2.6804, + "theoretical_loss": 3.485337411258776, + "tokens_seen": 1658703872 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002512236710130391, + "loss": 2.7849, + "theoretical_loss": 3.485325503019723, + "tokens_seen": 1658769408 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025121364092276833, + "loss": 2.7462, + "theoretical_loss": 3.4853135953828698, + "tokens_seen": 1658834944 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002512036108324975, + "loss": 2.6607, + "theoretical_loss": 3.485301688348162, + "tokens_seen": 1658900480 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002511935807422267, + "loss": 2.657, + "theoretical_loss": 3.4852897819155446, + "tokens_seen": 1658966016 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002511835506519559, + "loss": 2.6697, + "theoretical_loss": 3.4852778760849645, + "tokens_seen": 1659031552 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025117352056168506, + "loss": 2.709, + "theoretical_loss": 3.4852659708563674, + "tokens_seen": 1659097088 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025116349047141424, + "loss": 2.659, + "theoretical_loss": 3.4852540662296985, + "tokens_seen": 1659162624 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025115346038114347, + "loss": 2.6553, + "theoretical_loss": 3.485242162204904, + "tokens_seen": 1659228160 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002511434302908726, + "loss": 2.731, + "theoretical_loss": 3.4852302587819293, + "tokens_seen": 1659293696 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025113340020060183, + "loss": 2.7456, + "theoretical_loss": 3.4852183559607206, + "tokens_seen": 1659359232 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025112337011033096, + "loss": 2.7425, + "theoretical_loss": 3.485206453741223, + "tokens_seen": 1659424768 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002511133400200602, + "loss": 2.654, + "theoretical_loss": 3.4851945521233834, + "tokens_seen": 1659490304 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3922689, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5741422176361084, + "objective/train/theoretical_loss": 3.485182651107147, + "objective/train/tokens_used": 1680015840, + "theoretical_loss": 3.485182651107147, + "tokens_seen": 1659555840 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002511033099297894, + "loss": 2.5996, + "theoretical_loss": 3.485182651107147, + "tokens_seen": 1659555840 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025109327983951856, + "loss": 2.7608, + "theoretical_loss": 3.48517075069246, + "tokens_seen": 1659621376 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025108324974924774, + "loss": 2.6422, + "theoretical_loss": 3.4851588508792677, + "tokens_seen": 1659686912 + }, + { + "epoch": 20.01, + "learning_rate": 0.000251073219658977, + "loss": 2.6728, + "theoretical_loss": 3.4851469516675158, + "tokens_seen": 1659752448 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002510631895687061, + "loss": 2.6903, + "theoretical_loss": 3.485135053057151, + "tokens_seen": 1659817984 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025105315947843534, + "loss": 2.7164, + "theoretical_loss": 3.485123155048119, + "tokens_seen": 1659883520 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025104312938816446, + "loss": 2.7956, + "theoretical_loss": 3.485111257640365, + "tokens_seen": 1659949056 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002510330992978937, + "loss": 2.6862, + "theoretical_loss": 3.4850993608338356, + "tokens_seen": 1660014592 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002510230692076229, + "loss": 2.6656, + "theoretical_loss": 3.485087464628476, + "tokens_seen": 1660080128 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025101303911735206, + "loss": 2.7342, + "theoretical_loss": 3.4850755690242328, + "tokens_seen": 1660145664 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025100300902708124, + "loss": 2.7034, + "theoretical_loss": 3.4850636740210517, + "tokens_seen": 1660211200 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002509929789368104, + "loss": 2.7292, + "theoretical_loss": 3.485051779618878, + "tokens_seen": 1660276736 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002509829488465396, + "loss": 2.7737, + "theoretical_loss": 3.485039885817658, + "tokens_seen": 1660342272 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025097291875626884, + "loss": 2.6821, + "theoretical_loss": 3.4850279926173378, + "tokens_seen": 1660407808 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025096288866599797, + "loss": 2.6971, + "theoretical_loss": 3.485016100017863, + "tokens_seen": 1660473344 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002509528585757272, + "loss": 2.7352, + "theoretical_loss": 3.4850042080191805, + "tokens_seen": 1660538880 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025094282848545633, + "loss": 2.5662, + "theoretical_loss": 3.484992316621235, + "tokens_seen": 1660604416 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025093279839518556, + "loss": 2.704, + "theoretical_loss": 3.4849804258239727, + "tokens_seen": 1660669952 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025092276830491475, + "loss": 2.677, + "theoretical_loss": 3.4849685356273405, + "tokens_seen": 1660735488 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002509127382146439, + "loss": 2.6897, + "theoretical_loss": 3.484956646031283, + "tokens_seen": 1660801024 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002509027081243731, + "loss": 2.6895, + "theoretical_loss": 3.4849447570357475, + "tokens_seen": 1660866560 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025089267803410234, + "loss": 2.6163, + "theoretical_loss": 3.4849328686406786, + "tokens_seen": 1660932096 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025088264794383147, + "loss": 2.6702, + "theoretical_loss": 3.484920980846023, + "tokens_seen": 1660997632 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002508726178535607, + "loss": 2.7054, + "theoretical_loss": 3.4849090936517273, + "tokens_seen": 1661063168 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025086258776328983, + "loss": 2.7558, + "theoretical_loss": 3.4848972070577364, + "tokens_seen": 1661128704 + }, + { + "epoch": 20.01, + "objective/train/docs_used": 3926322, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7508740425109863, + "objective/train/theoretical_loss": 3.4848853210639965, + "objective/train/tokens_used": 1681654240, + "theoretical_loss": 3.4848853210639965, + "tokens_seen": 1661194240 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025085255767301907, + "loss": 2.7145, + "theoretical_loss": 3.4848853210639965, + "tokens_seen": 1661194240 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025084252758274825, + "loss": 2.7299, + "theoretical_loss": 3.4848734356704543, + "tokens_seen": 1661259776 + }, + { + "epoch": 20.01, + "learning_rate": 0.00025083249749247743, + "loss": 2.6873, + "theoretical_loss": 3.484861550877055, + "tokens_seen": 1661325312 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002508224674022066, + "loss": 2.6495, + "theoretical_loss": 3.484849666683745, + "tokens_seen": 1661390848 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002508124373119358, + "loss": 2.669, + "theoretical_loss": 3.4848377830904713, + "tokens_seen": 1661456384 + } + ], + "max_steps": 50354, + "num_train_epochs": 9223372036854775807, + "total_flos": 8.47901005185024e+17, + "trial_name": null, + "trial_params": null +}