{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0588137976673009, "eval_steps": 1600, "global_step": 16000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.6154260635375977, "learning_rate": 6.250000000000001e-08, "loss": 0.6581, "step": 10 }, { "epoch": 0.0, "grad_norm": 1.434083104133606, "learning_rate": 1.2500000000000002e-07, "loss": 0.65, "step": 20 }, { "epoch": 0.0, "grad_norm": 1.770402193069458, "learning_rate": 1.875e-07, "loss": 0.6538, "step": 30 }, { "epoch": 0.0, "grad_norm": 1.5259037017822266, "learning_rate": 2.5000000000000004e-07, "loss": 0.6661, "step": 40 }, { "epoch": 0.0, "grad_norm": 1.3413686752319336, "learning_rate": 3.125e-07, "loss": 0.6629, "step": 50 }, { "epoch": 0.0, "grad_norm": 1.8524203300476074, "learning_rate": 3.75e-07, "loss": 0.652, "step": 60 }, { "epoch": 0.0, "grad_norm": 1.6129018068313599, "learning_rate": 4.375e-07, "loss": 0.6503, "step": 70 }, { "epoch": 0.01, "grad_norm": 1.3771299123764038, "learning_rate": 5.000000000000001e-07, "loss": 0.6568, "step": 80 }, { "epoch": 0.01, "grad_norm": 1.4569002389907837, "learning_rate": 5.625e-07, "loss": 0.6592, "step": 90 }, { "epoch": 0.01, "grad_norm": 1.3211089372634888, "learning_rate": 6.25e-07, "loss": 0.6609, "step": 100 }, { "epoch": 0.01, "grad_norm": 1.3682845830917358, "learning_rate": 6.875000000000001e-07, "loss": 0.6655, "step": 110 }, { "epoch": 0.01, "grad_norm": 2.0182406902313232, "learning_rate": 7.5e-07, "loss": 0.6581, "step": 120 }, { "epoch": 0.01, "grad_norm": 1.5314549207687378, "learning_rate": 8.125000000000001e-07, "loss": 0.6554, "step": 130 }, { "epoch": 0.01, "grad_norm": 1.4846184253692627, "learning_rate": 8.75e-07, "loss": 0.6555, "step": 140 }, { "epoch": 0.01, "grad_norm": 1.568630576133728, "learning_rate": 9.375000000000001e-07, "loss": 0.6567, "step": 150 }, { "epoch": 0.01, "grad_norm": 1.1758819818496704, "learning_rate": 1.0000000000000002e-06, "loss": 0.6448, "step": 160 }, { "epoch": 0.01, "grad_norm": 1.6889770030975342, "learning_rate": 1.0625e-06, "loss": 0.6352, "step": 170 }, { "epoch": 0.01, "grad_norm": 1.4104373455047607, "learning_rate": 1.125e-06, "loss": 0.64, "step": 180 }, { "epoch": 0.01, "grad_norm": 1.5935137271881104, "learning_rate": 1.1875e-06, "loss": 0.6647, "step": 190 }, { "epoch": 0.01, "grad_norm": 1.5472041368484497, "learning_rate": 1.25e-06, "loss": 0.637, "step": 200 }, { "epoch": 0.01, "grad_norm": 1.2572060823440552, "learning_rate": 1.3125000000000001e-06, "loss": 0.6642, "step": 210 }, { "epoch": 0.01, "grad_norm": 1.3120354413986206, "learning_rate": 1.3750000000000002e-06, "loss": 0.6713, "step": 220 }, { "epoch": 0.02, "grad_norm": 1.480773687362671, "learning_rate": 1.4375e-06, "loss": 0.6765, "step": 230 }, { "epoch": 0.02, "grad_norm": 1.6741840839385986, "learning_rate": 1.5e-06, "loss": 0.662, "step": 240 }, { "epoch": 0.02, "grad_norm": 1.2385046482086182, "learning_rate": 1.5625e-06, "loss": 0.6563, "step": 250 }, { "epoch": 0.02, "grad_norm": 1.4410344362258911, "learning_rate": 1.6250000000000001e-06, "loss": 0.6524, "step": 260 }, { "epoch": 0.02, "grad_norm": 1.2795886993408203, "learning_rate": 1.6875000000000001e-06, "loss": 0.655, "step": 270 }, { "epoch": 0.02, "grad_norm": 1.6680865287780762, "learning_rate": 1.75e-06, "loss": 0.6577, "step": 280 }, { "epoch": 0.02, "grad_norm": 1.456658959388733, "learning_rate": 1.8125e-06, "loss": 0.6687, "step": 290 }, { "epoch": 0.02, "grad_norm": 1.4947397708892822, "learning_rate": 1.8750000000000003e-06, "loss": 0.6685, "step": 300 }, { "epoch": 0.02, "grad_norm": 1.6426098346710205, "learning_rate": 1.9375e-06, "loss": 0.653, "step": 310 }, { "epoch": 0.02, "grad_norm": 1.3753637075424194, "learning_rate": 2.0000000000000003e-06, "loss": 0.6543, "step": 320 }, { "epoch": 0.02, "grad_norm": 1.4655407667160034, "learning_rate": 2.0625e-06, "loss": 0.654, "step": 330 }, { "epoch": 0.02, "grad_norm": 1.6397826671600342, "learning_rate": 2.125e-06, "loss": 0.6604, "step": 340 }, { "epoch": 0.02, "grad_norm": 1.4749575853347778, "learning_rate": 2.1875000000000002e-06, "loss": 0.6587, "step": 350 }, { "epoch": 0.02, "grad_norm": 1.650220274925232, "learning_rate": 2.25e-06, "loss": 0.6578, "step": 360 }, { "epoch": 0.02, "grad_norm": 1.290277123451233, "learning_rate": 2.3125000000000003e-06, "loss": 0.6607, "step": 370 }, { "epoch": 0.03, "grad_norm": 1.5285893678665161, "learning_rate": 2.375e-06, "loss": 0.6594, "step": 380 }, { "epoch": 0.03, "grad_norm": 1.8043378591537476, "learning_rate": 2.4375e-06, "loss": 0.6539, "step": 390 }, { "epoch": 0.03, "grad_norm": 1.5191413164138794, "learning_rate": 2.5e-06, "loss": 0.6716, "step": 400 }, { "epoch": 0.03, "grad_norm": 1.7285609245300293, "learning_rate": 2.5625e-06, "loss": 0.6462, "step": 410 }, { "epoch": 0.03, "grad_norm": 1.2920969724655151, "learning_rate": 2.6250000000000003e-06, "loss": 0.6601, "step": 420 }, { "epoch": 0.03, "grad_norm": 1.4754414558410645, "learning_rate": 2.6875e-06, "loss": 0.6574, "step": 430 }, { "epoch": 0.03, "grad_norm": 1.6130346059799194, "learning_rate": 2.7500000000000004e-06, "loss": 0.6625, "step": 440 }, { "epoch": 0.03, "grad_norm": 1.4590799808502197, "learning_rate": 2.8125e-06, "loss": 0.6793, "step": 450 }, { "epoch": 0.03, "grad_norm": 1.7691385746002197, "learning_rate": 2.875e-06, "loss": 0.6616, "step": 460 }, { "epoch": 0.03, "grad_norm": 1.7910367250442505, "learning_rate": 2.9375000000000003e-06, "loss": 0.6534, "step": 470 }, { "epoch": 0.03, "grad_norm": 2.179307460784912, "learning_rate": 3e-06, "loss": 0.6698, "step": 480 }, { "epoch": 0.03, "grad_norm": 2.0902700424194336, "learning_rate": 3.0625000000000003e-06, "loss": 0.6707, "step": 490 }, { "epoch": 0.03, "grad_norm": 1.3620158433914185, "learning_rate": 3.125e-06, "loss": 0.6648, "step": 500 }, { "epoch": 0.03, "grad_norm": 1.5206565856933594, "learning_rate": 3.1875e-06, "loss": 0.6733, "step": 510 }, { "epoch": 0.03, "grad_norm": 1.7129814624786377, "learning_rate": 3.2500000000000002e-06, "loss": 0.6648, "step": 520 }, { "epoch": 0.04, "grad_norm": 1.7053027153015137, "learning_rate": 3.3125e-06, "loss": 0.6711, "step": 530 }, { "epoch": 0.04, "grad_norm": 1.8278532028198242, "learning_rate": 3.3750000000000003e-06, "loss": 0.6582, "step": 540 }, { "epoch": 0.04, "grad_norm": 1.8324699401855469, "learning_rate": 3.4375e-06, "loss": 0.6746, "step": 550 }, { "epoch": 0.04, "grad_norm": 1.597606897354126, "learning_rate": 3.5e-06, "loss": 0.6647, "step": 560 }, { "epoch": 0.04, "grad_norm": 1.7364826202392578, "learning_rate": 3.5625e-06, "loss": 0.6577, "step": 570 }, { "epoch": 0.04, "grad_norm": 2.0563762187957764, "learning_rate": 3.625e-06, "loss": 0.6711, "step": 580 }, { "epoch": 0.04, "grad_norm": 1.8743737936019897, "learning_rate": 3.6875000000000007e-06, "loss": 0.6623, "step": 590 }, { "epoch": 0.04, "grad_norm": 1.7393046617507935, "learning_rate": 3.7500000000000005e-06, "loss": 0.6453, "step": 600 }, { "epoch": 0.04, "grad_norm": 2.464277744293213, "learning_rate": 3.8125e-06, "loss": 0.6528, "step": 610 }, { "epoch": 0.04, "grad_norm": 1.5163260698318481, "learning_rate": 3.875e-06, "loss": 0.6682, "step": 620 }, { "epoch": 0.04, "grad_norm": 1.5761061906814575, "learning_rate": 3.9375e-06, "loss": 0.6647, "step": 630 }, { "epoch": 0.04, "grad_norm": 1.468314528465271, "learning_rate": 4.000000000000001e-06, "loss": 0.6572, "step": 640 }, { "epoch": 0.04, "grad_norm": 1.7023948431015015, "learning_rate": 4.0625000000000005e-06, "loss": 0.6527, "step": 650 }, { "epoch": 0.04, "grad_norm": 1.502794861793518, "learning_rate": 4.125e-06, "loss": 0.6444, "step": 660 }, { "epoch": 0.04, "grad_norm": 1.8254650831222534, "learning_rate": 4.1875e-06, "loss": 0.6524, "step": 670 }, { "epoch": 0.04, "grad_norm": 1.577621340751648, "learning_rate": 4.25e-06, "loss": 0.644, "step": 680 }, { "epoch": 0.05, "grad_norm": 2.2948737144470215, "learning_rate": 4.312500000000001e-06, "loss": 0.6617, "step": 690 }, { "epoch": 0.05, "grad_norm": 1.8099135160446167, "learning_rate": 4.3750000000000005e-06, "loss": 0.6569, "step": 700 }, { "epoch": 0.05, "grad_norm": 1.51678466796875, "learning_rate": 4.4375e-06, "loss": 0.6637, "step": 710 }, { "epoch": 0.05, "grad_norm": 1.8125680685043335, "learning_rate": 4.5e-06, "loss": 0.6525, "step": 720 }, { "epoch": 0.05, "grad_norm": 2.2971110343933105, "learning_rate": 4.5625e-06, "loss": 0.6658, "step": 730 }, { "epoch": 0.05, "grad_norm": 1.7269909381866455, "learning_rate": 4.625000000000001e-06, "loss": 0.6461, "step": 740 }, { "epoch": 0.05, "grad_norm": 1.6699669361114502, "learning_rate": 4.6875000000000004e-06, "loss": 0.6569, "step": 750 }, { "epoch": 0.05, "grad_norm": 1.5642304420471191, "learning_rate": 4.75e-06, "loss": 0.6394, "step": 760 }, { "epoch": 0.05, "grad_norm": 1.1117033958435059, "learning_rate": 4.8125e-06, "loss": 0.6547, "step": 770 }, { "epoch": 0.05, "grad_norm": 1.453066349029541, "learning_rate": 4.875e-06, "loss": 0.6565, "step": 780 }, { "epoch": 0.05, "grad_norm": 1.6391645669937134, "learning_rate": 4.937500000000001e-06, "loss": 0.6586, "step": 790 }, { "epoch": 0.05, "grad_norm": 1.4576597213745117, "learning_rate": 5e-06, "loss": 0.6569, "step": 800 }, { "epoch": 0.05, "grad_norm": 1.4210525751113892, "learning_rate": 5.0625e-06, "loss": 0.6439, "step": 810 }, { "epoch": 0.05, "grad_norm": 1.8609915971755981, "learning_rate": 5.125e-06, "loss": 0.6543, "step": 820 }, { "epoch": 0.05, "grad_norm": 1.8662998676300049, "learning_rate": 5.187500000000001e-06, "loss": 0.6596, "step": 830 }, { "epoch": 0.06, "grad_norm": 1.3369812965393066, "learning_rate": 5.2500000000000006e-06, "loss": 0.6538, "step": 840 }, { "epoch": 0.06, "grad_norm": 1.643251657485962, "learning_rate": 5.3125e-06, "loss": 0.6422, "step": 850 }, { "epoch": 0.06, "grad_norm": 1.5795202255249023, "learning_rate": 5.375e-06, "loss": 0.6567, "step": 860 }, { "epoch": 0.06, "grad_norm": 1.4530309438705444, "learning_rate": 5.4375e-06, "loss": 0.6613, "step": 870 }, { "epoch": 0.06, "grad_norm": 1.514719843864441, "learning_rate": 5.500000000000001e-06, "loss": 0.6644, "step": 880 }, { "epoch": 0.06, "grad_norm": 1.7065812349319458, "learning_rate": 5.5625000000000005e-06, "loss": 0.6512, "step": 890 }, { "epoch": 0.06, "grad_norm": 2.272714376449585, "learning_rate": 5.625e-06, "loss": 0.6693, "step": 900 }, { "epoch": 0.06, "grad_norm": 1.5543745756149292, "learning_rate": 5.6875e-06, "loss": 0.6469, "step": 910 }, { "epoch": 0.06, "grad_norm": 1.5970263481140137, "learning_rate": 5.75e-06, "loss": 0.6492, "step": 920 }, { "epoch": 0.06, "grad_norm": 1.426156759262085, "learning_rate": 5.812500000000001e-06, "loss": 0.6232, "step": 930 }, { "epoch": 0.06, "grad_norm": 1.5355912446975708, "learning_rate": 5.8750000000000005e-06, "loss": 0.6603, "step": 940 }, { "epoch": 0.06, "grad_norm": 1.58833909034729, "learning_rate": 5.9375e-06, "loss": 0.6448, "step": 950 }, { "epoch": 0.06, "grad_norm": 1.565136194229126, "learning_rate": 6e-06, "loss": 0.6434, "step": 960 }, { "epoch": 0.06, "grad_norm": 1.7123429775238037, "learning_rate": 6.0625e-06, "loss": 0.6724, "step": 970 }, { "epoch": 0.06, "grad_norm": 1.8091198205947876, "learning_rate": 6.125000000000001e-06, "loss": 0.659, "step": 980 }, { "epoch": 0.07, "grad_norm": 1.606202244758606, "learning_rate": 6.1875000000000005e-06, "loss": 0.6351, "step": 990 }, { "epoch": 0.07, "grad_norm": 1.8575665950775146, "learning_rate": 6.25e-06, "loss": 0.6766, "step": 1000 }, { "epoch": 0.07, "grad_norm": 1.4026097059249878, "learning_rate": 6.3125e-06, "loss": 0.6549, "step": 1010 }, { "epoch": 0.07, "grad_norm": 1.772989273071289, "learning_rate": 6.375e-06, "loss": 0.6579, "step": 1020 }, { "epoch": 0.07, "grad_norm": 1.7279027700424194, "learning_rate": 6.437500000000001e-06, "loss": 0.6604, "step": 1030 }, { "epoch": 0.07, "grad_norm": 2.0833723545074463, "learning_rate": 6.5000000000000004e-06, "loss": 0.6434, "step": 1040 }, { "epoch": 0.07, "grad_norm": 1.8391436338424683, "learning_rate": 6.5625e-06, "loss": 0.6495, "step": 1050 }, { "epoch": 0.07, "grad_norm": 2.104515790939331, "learning_rate": 6.625e-06, "loss": 0.6573, "step": 1060 }, { "epoch": 0.07, "grad_norm": 2.012073516845703, "learning_rate": 6.6875e-06, "loss": 0.6601, "step": 1070 }, { "epoch": 0.07, "grad_norm": 1.5166912078857422, "learning_rate": 6.750000000000001e-06, "loss": 0.687, "step": 1080 }, { "epoch": 0.07, "grad_norm": 2.0484936237335205, "learning_rate": 6.8125e-06, "loss": 0.6362, "step": 1090 }, { "epoch": 0.07, "grad_norm": 1.7620980739593506, "learning_rate": 6.875e-06, "loss": 0.6295, "step": 1100 }, { "epoch": 0.07, "grad_norm": 1.7010966539382935, "learning_rate": 6.9375e-06, "loss": 0.662, "step": 1110 }, { "epoch": 0.07, "grad_norm": 1.4270750284194946, "learning_rate": 7e-06, "loss": 0.6696, "step": 1120 }, { "epoch": 0.07, "grad_norm": 1.3873461484909058, "learning_rate": 7.062500000000001e-06, "loss": 0.6568, "step": 1130 }, { "epoch": 0.08, "grad_norm": 1.8650726079940796, "learning_rate": 7.125e-06, "loss": 0.6574, "step": 1140 }, { "epoch": 0.08, "grad_norm": 1.807304859161377, "learning_rate": 7.1875e-06, "loss": 0.6714, "step": 1150 }, { "epoch": 0.08, "grad_norm": 1.5310503244400024, "learning_rate": 7.25e-06, "loss": 0.6476, "step": 1160 }, { "epoch": 0.08, "grad_norm": 1.4306132793426514, "learning_rate": 7.3125e-06, "loss": 0.6508, "step": 1170 }, { "epoch": 0.08, "grad_norm": 1.687558650970459, "learning_rate": 7.375000000000001e-06, "loss": 0.6534, "step": 1180 }, { "epoch": 0.08, "grad_norm": 1.5304369926452637, "learning_rate": 7.437500000000001e-06, "loss": 0.6651, "step": 1190 }, { "epoch": 0.08, "grad_norm": 1.6289629936218262, "learning_rate": 7.500000000000001e-06, "loss": 0.6712, "step": 1200 }, { "epoch": 0.08, "grad_norm": 1.8669979572296143, "learning_rate": 7.5625e-06, "loss": 0.6637, "step": 1210 }, { "epoch": 0.08, "grad_norm": 1.877084732055664, "learning_rate": 7.625e-06, "loss": 0.6521, "step": 1220 }, { "epoch": 0.08, "grad_norm": 1.906940221786499, "learning_rate": 7.6875e-06, "loss": 0.6381, "step": 1230 }, { "epoch": 0.08, "grad_norm": 1.4054601192474365, "learning_rate": 7.75e-06, "loss": 0.6528, "step": 1240 }, { "epoch": 0.08, "grad_norm": 2.1831490993499756, "learning_rate": 7.8125e-06, "loss": 0.6424, "step": 1250 }, { "epoch": 0.08, "grad_norm": 1.7532954216003418, "learning_rate": 7.875e-06, "loss": 0.6591, "step": 1260 }, { "epoch": 0.08, "grad_norm": 1.4256421327590942, "learning_rate": 7.9375e-06, "loss": 0.644, "step": 1270 }, { "epoch": 0.08, "grad_norm": 1.645342230796814, "learning_rate": 8.000000000000001e-06, "loss": 0.6786, "step": 1280 }, { "epoch": 0.09, "grad_norm": 1.749700665473938, "learning_rate": 8.062500000000001e-06, "loss": 0.6583, "step": 1290 }, { "epoch": 0.09, "grad_norm": 1.5855311155319214, "learning_rate": 8.125000000000001e-06, "loss": 0.6388, "step": 1300 }, { "epoch": 0.09, "grad_norm": 1.7551072835922241, "learning_rate": 8.1875e-06, "loss": 0.6638, "step": 1310 }, { "epoch": 0.09, "grad_norm": 1.656398892402649, "learning_rate": 8.25e-06, "loss": 0.6486, "step": 1320 }, { "epoch": 0.09, "grad_norm": 1.3058027029037476, "learning_rate": 8.3125e-06, "loss": 0.6678, "step": 1330 }, { "epoch": 0.09, "grad_norm": 1.9038392305374146, "learning_rate": 8.375e-06, "loss": 0.6576, "step": 1340 }, { "epoch": 0.09, "grad_norm": 1.7255905866622925, "learning_rate": 8.4375e-06, "loss": 0.6432, "step": 1350 }, { "epoch": 0.09, "grad_norm": 2.0054244995117188, "learning_rate": 8.5e-06, "loss": 0.6642, "step": 1360 }, { "epoch": 0.09, "grad_norm": 1.624997615814209, "learning_rate": 8.5625e-06, "loss": 0.6659, "step": 1370 }, { "epoch": 0.09, "grad_norm": 1.5902334451675415, "learning_rate": 8.625000000000001e-06, "loss": 0.6707, "step": 1380 }, { "epoch": 0.09, "grad_norm": 2.2210817337036133, "learning_rate": 8.687500000000001e-06, "loss": 0.6713, "step": 1390 }, { "epoch": 0.09, "grad_norm": 1.6660022735595703, "learning_rate": 8.750000000000001e-06, "loss": 0.6637, "step": 1400 }, { "epoch": 0.09, "grad_norm": 1.5590497255325317, "learning_rate": 8.8125e-06, "loss": 0.6643, "step": 1410 }, { "epoch": 0.09, "grad_norm": 1.6343599557876587, "learning_rate": 8.875e-06, "loss": 0.6496, "step": 1420 }, { "epoch": 0.09, "grad_norm": 1.755144476890564, "learning_rate": 8.9375e-06, "loss": 0.654, "step": 1430 }, { "epoch": 0.1, "grad_norm": 2.1182198524475098, "learning_rate": 9e-06, "loss": 0.6567, "step": 1440 }, { "epoch": 0.1, "grad_norm": 1.6853889226913452, "learning_rate": 9.0625e-06, "loss": 0.6651, "step": 1450 }, { "epoch": 0.1, "grad_norm": 1.7039555311203003, "learning_rate": 9.125e-06, "loss": 0.6376, "step": 1460 }, { "epoch": 0.1, "grad_norm": 1.2666831016540527, "learning_rate": 9.1875e-06, "loss": 0.638, "step": 1470 }, { "epoch": 0.1, "grad_norm": 1.2994011640548706, "learning_rate": 9.250000000000001e-06, "loss": 0.665, "step": 1480 }, { "epoch": 0.1, "grad_norm": 1.4676392078399658, "learning_rate": 9.312500000000001e-06, "loss": 0.6592, "step": 1490 }, { "epoch": 0.1, "grad_norm": 2.501640558242798, "learning_rate": 9.375000000000001e-06, "loss": 0.6532, "step": 1500 }, { "epoch": 0.1, "grad_norm": 1.9528775215148926, "learning_rate": 9.4375e-06, "loss": 0.6697, "step": 1510 }, { "epoch": 0.1, "grad_norm": 2.0028085708618164, "learning_rate": 9.5e-06, "loss": 0.6705, "step": 1520 }, { "epoch": 0.1, "grad_norm": 1.638511061668396, "learning_rate": 9.562500000000002e-06, "loss": 0.6712, "step": 1530 }, { "epoch": 0.1, "grad_norm": 1.7564353942871094, "learning_rate": 9.625e-06, "loss": 0.6543, "step": 1540 }, { "epoch": 0.1, "grad_norm": 1.4182738065719604, "learning_rate": 9.6875e-06, "loss": 0.6614, "step": 1550 }, { "epoch": 0.1, "grad_norm": 1.9909614324569702, "learning_rate": 9.75e-06, "loss": 0.6676, "step": 1560 }, { "epoch": 0.1, "grad_norm": 1.5370928049087524, "learning_rate": 9.8125e-06, "loss": 0.6585, "step": 1570 }, { "epoch": 0.1, "grad_norm": 1.7557249069213867, "learning_rate": 9.875000000000001e-06, "loss": 0.6462, "step": 1580 }, { "epoch": 0.11, "grad_norm": 1.671527624130249, "learning_rate": 9.937500000000001e-06, "loss": 0.6862, "step": 1590 }, { "epoch": 0.11, "grad_norm": 1.6722038984298706, "learning_rate": 1e-05, "loss": 0.6518, "step": 1600 }, { "epoch": 0.11, "eval_loss": 0.7220360636711121, "eval_runtime": 134.694, "eval_samples_per_second": 81.667, "eval_steps_per_second": 10.208, "step": 1600 }, { "epoch": 0.11, "grad_norm": 1.6332907676696777, "learning_rate": 9.99998810088676e-06, "loss": 0.6458, "step": 1610 }, { "epoch": 0.11, "grad_norm": 1.5682039260864258, "learning_rate": 9.999952403603674e-06, "loss": 0.6709, "step": 1620 }, { "epoch": 0.11, "grad_norm": 1.3811382055282593, "learning_rate": 9.999892908320647e-06, "loss": 0.6468, "step": 1630 }, { "epoch": 0.11, "grad_norm": 1.5780935287475586, "learning_rate": 9.999809615320857e-06, "loss": 0.6345, "step": 1640 }, { "epoch": 0.11, "grad_norm": 1.7821502685546875, "learning_rate": 9.99970252500075e-06, "loss": 0.6425, "step": 1650 }, { "epoch": 0.11, "grad_norm": 1.6573759317398071, "learning_rate": 9.999571637870035e-06, "loss": 0.6448, "step": 1660 }, { "epoch": 0.11, "grad_norm": 1.8963578939437866, "learning_rate": 9.999416954551693e-06, "loss": 0.6699, "step": 1670 }, { "epoch": 0.11, "grad_norm": 2.0272181034088135, "learning_rate": 9.999238475781957e-06, "loss": 0.6593, "step": 1680 }, { "epoch": 0.11, "grad_norm": 1.6698834896087646, "learning_rate": 9.999036202410324e-06, "loss": 0.6589, "step": 1690 }, { "epoch": 0.11, "grad_norm": 1.7105271816253662, "learning_rate": 9.998810135399545e-06, "loss": 0.645, "step": 1700 }, { "epoch": 0.11, "grad_norm": 1.5127520561218262, "learning_rate": 9.99856027582562e-06, "loss": 0.6435, "step": 1710 }, { "epoch": 0.11, "grad_norm": 1.930246114730835, "learning_rate": 9.998286624877786e-06, "loss": 0.658, "step": 1720 }, { "epoch": 0.11, "grad_norm": 1.6141360998153687, "learning_rate": 9.997989183858531e-06, "loss": 0.6503, "step": 1730 }, { "epoch": 0.12, "grad_norm": 1.8381562232971191, "learning_rate": 9.997667954183566e-06, "loss": 0.6619, "step": 1740 }, { "epoch": 0.12, "grad_norm": 1.8966178894042969, "learning_rate": 9.997322937381829e-06, "loss": 0.644, "step": 1750 }, { "epoch": 0.12, "grad_norm": 1.9953837394714355, "learning_rate": 9.99695413509548e-06, "loss": 0.6716, "step": 1760 }, { "epoch": 0.12, "grad_norm": 1.9095193147659302, "learning_rate": 9.996561549079886e-06, "loss": 0.6741, "step": 1770 }, { "epoch": 0.12, "grad_norm": 1.4451298713684082, "learning_rate": 9.996145181203616e-06, "loss": 0.6781, "step": 1780 }, { "epoch": 0.12, "grad_norm": 1.8144304752349854, "learning_rate": 9.995705033448435e-06, "loss": 0.6751, "step": 1790 }, { "epoch": 0.12, "grad_norm": 1.5360352993011475, "learning_rate": 9.99524110790929e-06, "loss": 0.6658, "step": 1800 }, { "epoch": 0.12, "grad_norm": 1.858272910118103, "learning_rate": 9.994753406794303e-06, "loss": 0.6563, "step": 1810 }, { "epoch": 0.12, "grad_norm": 1.853493571281433, "learning_rate": 9.994241932424755e-06, "loss": 0.6695, "step": 1820 }, { "epoch": 0.12, "grad_norm": 1.5680876970291138, "learning_rate": 9.993706687235085e-06, "loss": 0.6632, "step": 1830 }, { "epoch": 0.12, "grad_norm": 1.5058163404464722, "learning_rate": 9.993147673772869e-06, "loss": 0.6767, "step": 1840 }, { "epoch": 0.12, "grad_norm": 1.5722607374191284, "learning_rate": 9.992564894698816e-06, "loss": 0.6661, "step": 1850 }, { "epoch": 0.12, "grad_norm": 1.6947115659713745, "learning_rate": 9.991958352786744e-06, "loss": 0.6316, "step": 1860 }, { "epoch": 0.12, "grad_norm": 1.8827723264694214, "learning_rate": 9.99132805092358e-06, "loss": 0.6517, "step": 1870 }, { "epoch": 0.12, "grad_norm": 1.6634174585342407, "learning_rate": 9.990673992109335e-06, "loss": 0.6425, "step": 1880 }, { "epoch": 0.13, "grad_norm": 2.319075107574463, "learning_rate": 9.9899961794571e-06, "loss": 0.6636, "step": 1890 }, { "epoch": 0.13, "grad_norm": 1.733626365661621, "learning_rate": 9.989294616193018e-06, "loss": 0.6459, "step": 1900 }, { "epoch": 0.13, "grad_norm": 1.7393757104873657, "learning_rate": 9.988569305656286e-06, "loss": 0.6623, "step": 1910 }, { "epoch": 0.13, "grad_norm": 1.7465229034423828, "learning_rate": 9.987820251299121e-06, "loss": 0.6554, "step": 1920 }, { "epoch": 0.13, "grad_norm": 1.940171718597412, "learning_rate": 9.98704745668676e-06, "loss": 0.641, "step": 1930 }, { "epoch": 0.13, "grad_norm": 2.001396656036377, "learning_rate": 9.986250925497429e-06, "loss": 0.6558, "step": 1940 }, { "epoch": 0.13, "grad_norm": 1.8247671127319336, "learning_rate": 9.985430661522333e-06, "loss": 0.6703, "step": 1950 }, { "epoch": 0.13, "grad_norm": 1.653157114982605, "learning_rate": 9.984586668665641e-06, "loss": 0.6426, "step": 1960 }, { "epoch": 0.13, "grad_norm": 1.9459385871887207, "learning_rate": 9.983718950944457e-06, "loss": 0.656, "step": 1970 }, { "epoch": 0.13, "grad_norm": 1.8265575170516968, "learning_rate": 9.982827512488809e-06, "loss": 0.6623, "step": 1980 }, { "epoch": 0.13, "grad_norm": 1.6862218379974365, "learning_rate": 9.981912357541628e-06, "loss": 0.6457, "step": 1990 }, { "epoch": 0.13, "grad_norm": 1.6471173763275146, "learning_rate": 9.980973490458728e-06, "loss": 0.6565, "step": 2000 }, { "epoch": 0.13, "grad_norm": 1.7987889051437378, "learning_rate": 9.980010915708785e-06, "loss": 0.6526, "step": 2010 }, { "epoch": 0.13, "grad_norm": 2.383647918701172, "learning_rate": 9.979024637873309e-06, "loss": 0.6532, "step": 2020 }, { "epoch": 0.13, "grad_norm": 2.297119379043579, "learning_rate": 9.978014661646637e-06, "loss": 0.6488, "step": 2030 }, { "epoch": 0.13, "grad_norm": 1.8215795755386353, "learning_rate": 9.976980991835896e-06, "loss": 0.6537, "step": 2040 }, { "epoch": 0.14, "grad_norm": 1.737383484840393, "learning_rate": 9.975923633360985e-06, "loss": 0.6375, "step": 2050 }, { "epoch": 0.14, "grad_norm": 1.5321128368377686, "learning_rate": 9.974842591254559e-06, "loss": 0.6346, "step": 2060 }, { "epoch": 0.14, "grad_norm": 1.6683794260025024, "learning_rate": 9.973737870661995e-06, "loss": 0.6484, "step": 2070 }, { "epoch": 0.14, "grad_norm": 1.3653274774551392, "learning_rate": 9.972609476841368e-06, "loss": 0.6352, "step": 2080 }, { "epoch": 0.14, "grad_norm": 1.6334515810012817, "learning_rate": 9.971457415163435e-06, "loss": 0.6514, "step": 2090 }, { "epoch": 0.14, "grad_norm": 1.4465519189834595, "learning_rate": 9.970281691111598e-06, "loss": 0.6532, "step": 2100 }, { "epoch": 0.14, "grad_norm": 1.7188860177993774, "learning_rate": 9.96908231028189e-06, "loss": 0.6427, "step": 2110 }, { "epoch": 0.14, "grad_norm": 2.0040030479431152, "learning_rate": 9.967859278382939e-06, "loss": 0.6457, "step": 2120 }, { "epoch": 0.14, "grad_norm": 2.0880534648895264, "learning_rate": 9.96661260123594e-06, "loss": 0.6407, "step": 2130 }, { "epoch": 0.14, "grad_norm": 1.6864984035491943, "learning_rate": 9.965342284774633e-06, "loss": 0.6569, "step": 2140 }, { "epoch": 0.14, "grad_norm": 1.4719489812850952, "learning_rate": 9.964048335045276e-06, "loss": 0.6439, "step": 2150 }, { "epoch": 0.14, "grad_norm": 1.5283831357955933, "learning_rate": 9.962730758206612e-06, "loss": 0.6606, "step": 2160 }, { "epoch": 0.14, "grad_norm": 1.6295084953308105, "learning_rate": 9.961389560529835e-06, "loss": 0.6726, "step": 2170 }, { "epoch": 0.14, "grad_norm": 1.9193845987319946, "learning_rate": 9.960024748398576e-06, "loss": 0.6464, "step": 2180 }, { "epoch": 0.14, "grad_norm": 2.2357025146484375, "learning_rate": 9.958636328308852e-06, "loss": 0.6371, "step": 2190 }, { "epoch": 0.15, "grad_norm": 1.66888427734375, "learning_rate": 9.957224306869053e-06, "loss": 0.6607, "step": 2200 }, { "epoch": 0.15, "grad_norm": 1.6502246856689453, "learning_rate": 9.9557886907999e-06, "loss": 0.6436, "step": 2210 }, { "epoch": 0.15, "grad_norm": 1.6644810438156128, "learning_rate": 9.954329486934411e-06, "loss": 0.6552, "step": 2220 }, { "epoch": 0.15, "grad_norm": 1.6509498357772827, "learning_rate": 9.952846702217886e-06, "loss": 0.6501, "step": 2230 }, { "epoch": 0.15, "grad_norm": 1.7857375144958496, "learning_rate": 9.951340343707852e-06, "loss": 0.6603, "step": 2240 }, { "epoch": 0.15, "grad_norm": 1.4443598985671997, "learning_rate": 9.94981041857404e-06, "loss": 0.6351, "step": 2250 }, { "epoch": 0.15, "grad_norm": 1.636020302772522, "learning_rate": 9.948256934098353e-06, "loss": 0.6548, "step": 2260 }, { "epoch": 0.15, "grad_norm": 1.6805977821350098, "learning_rate": 9.946679897674823e-06, "loss": 0.6643, "step": 2270 }, { "epoch": 0.15, "grad_norm": 1.5527812242507935, "learning_rate": 9.945079316809585e-06, "loss": 0.6417, "step": 2280 }, { "epoch": 0.15, "grad_norm": 1.5074429512023926, "learning_rate": 9.943455199120836e-06, "loss": 0.662, "step": 2290 }, { "epoch": 0.15, "grad_norm": 1.5941535234451294, "learning_rate": 9.941807552338805e-06, "loss": 0.6613, "step": 2300 }, { "epoch": 0.15, "grad_norm": 1.7628566026687622, "learning_rate": 9.940136384305699e-06, "loss": 0.6624, "step": 2310 }, { "epoch": 0.15, "grad_norm": 2.036191940307617, "learning_rate": 9.938441702975689e-06, "loss": 0.6615, "step": 2320 }, { "epoch": 0.15, "grad_norm": 1.7616183757781982, "learning_rate": 9.936723516414857e-06, "loss": 0.6636, "step": 2330 }, { "epoch": 0.15, "grad_norm": 1.732585072517395, "learning_rate": 9.934981832801161e-06, "loss": 0.6687, "step": 2340 }, { "epoch": 0.16, "grad_norm": 2.074768543243408, "learning_rate": 9.933216660424396e-06, "loss": 0.6649, "step": 2350 }, { "epoch": 0.16, "grad_norm": 1.9834880828857422, "learning_rate": 9.931428007686158e-06, "loss": 0.645, "step": 2360 }, { "epoch": 0.16, "grad_norm": 1.7717509269714355, "learning_rate": 9.9296158830998e-06, "loss": 0.6571, "step": 2370 }, { "epoch": 0.16, "grad_norm": 1.8364372253417969, "learning_rate": 9.92778029529039e-06, "loss": 0.6428, "step": 2380 }, { "epoch": 0.16, "grad_norm": 1.92979896068573, "learning_rate": 9.925921252994677e-06, "loss": 0.6629, "step": 2390 }, { "epoch": 0.16, "grad_norm": 2.358120918273926, "learning_rate": 9.924038765061042e-06, "loss": 0.6375, "step": 2400 }, { "epoch": 0.16, "grad_norm": 1.8555611371994019, "learning_rate": 9.922132840449459e-06, "loss": 0.6455, "step": 2410 }, { "epoch": 0.16, "grad_norm": 1.9320988655090332, "learning_rate": 9.920203488231455e-06, "loss": 0.6426, "step": 2420 }, { "epoch": 0.16, "grad_norm": 2.835221767425537, "learning_rate": 9.918250717590061e-06, "loss": 0.6584, "step": 2430 }, { "epoch": 0.16, "grad_norm": 1.8801295757293701, "learning_rate": 9.916274537819774e-06, "loss": 0.6462, "step": 2440 }, { "epoch": 0.16, "grad_norm": 1.548391580581665, "learning_rate": 9.914274958326507e-06, "loss": 0.6579, "step": 2450 }, { "epoch": 0.16, "grad_norm": 1.7065672874450684, "learning_rate": 9.91225198862755e-06, "loss": 0.6617, "step": 2460 }, { "epoch": 0.16, "grad_norm": 1.8889693021774292, "learning_rate": 9.91020563835152e-06, "loss": 0.6748, "step": 2470 }, { "epoch": 0.16, "grad_norm": 1.5888632535934448, "learning_rate": 9.908135917238321e-06, "loss": 0.6484, "step": 2480 }, { "epoch": 0.16, "grad_norm": 1.8980170488357544, "learning_rate": 9.90604283513909e-06, "loss": 0.6489, "step": 2490 }, { "epoch": 0.17, "grad_norm": 1.585455298423767, "learning_rate": 9.903926402016153e-06, "loss": 0.667, "step": 2500 }, { "epoch": 0.17, "grad_norm": 1.7540241479873657, "learning_rate": 9.901786627942984e-06, "loss": 0.6521, "step": 2510 }, { "epoch": 0.17, "grad_norm": 1.97389554977417, "learning_rate": 9.899623523104149e-06, "loss": 0.6614, "step": 2520 }, { "epoch": 0.17, "grad_norm": 1.4442778825759888, "learning_rate": 9.897437097795257e-06, "loss": 0.6573, "step": 2530 }, { "epoch": 0.17, "grad_norm": 1.446113109588623, "learning_rate": 9.89522736242292e-06, "loss": 0.6522, "step": 2540 }, { "epoch": 0.17, "grad_norm": 1.8758691549301147, "learning_rate": 9.892994327504693e-06, "loss": 0.6511, "step": 2550 }, { "epoch": 0.17, "grad_norm": 1.9266222715377808, "learning_rate": 9.890738003669029e-06, "loss": 0.646, "step": 2560 }, { "epoch": 0.17, "grad_norm": 2.077249050140381, "learning_rate": 9.888458401655231e-06, "loss": 0.6555, "step": 2570 }, { "epoch": 0.17, "grad_norm": 1.661763310432434, "learning_rate": 9.886155532313396e-06, "loss": 0.6391, "step": 2580 }, { "epoch": 0.17, "grad_norm": 1.98686683177948, "learning_rate": 9.883829406604363e-06, "loss": 0.6679, "step": 2590 }, { "epoch": 0.17, "grad_norm": 1.9729160070419312, "learning_rate": 9.881480035599667e-06, "loss": 0.6719, "step": 2600 }, { "epoch": 0.17, "grad_norm": 1.4754536151885986, "learning_rate": 9.879107430481482e-06, "loss": 0.6443, "step": 2610 }, { "epoch": 0.17, "grad_norm": 2.3644585609436035, "learning_rate": 9.876711602542564e-06, "loss": 0.655, "step": 2620 }, { "epoch": 0.17, "grad_norm": 1.5546903610229492, "learning_rate": 9.874292563186206e-06, "loss": 0.6488, "step": 2630 }, { "epoch": 0.17, "grad_norm": 1.5309034585952759, "learning_rate": 9.871850323926178e-06, "loss": 0.6597, "step": 2640 }, { "epoch": 0.18, "grad_norm": 1.701562762260437, "learning_rate": 9.869384896386669e-06, "loss": 0.6409, "step": 2650 }, { "epoch": 0.18, "grad_norm": 1.8511505126953125, "learning_rate": 9.866896292302243e-06, "loss": 0.6562, "step": 2660 }, { "epoch": 0.18, "grad_norm": 1.7161003351211548, "learning_rate": 9.86438452351777e-06, "loss": 0.6519, "step": 2670 }, { "epoch": 0.18, "grad_norm": 1.749366283416748, "learning_rate": 9.861849601988384e-06, "loss": 0.6697, "step": 2680 }, { "epoch": 0.18, "grad_norm": 1.707053780555725, "learning_rate": 9.859291539779407e-06, "loss": 0.664, "step": 2690 }, { "epoch": 0.18, "grad_norm": 1.8223627805709839, "learning_rate": 9.856710349066307e-06, "loss": 0.6626, "step": 2700 }, { "epoch": 0.18, "grad_norm": 1.9889775514602661, "learning_rate": 9.854106042134642e-06, "loss": 0.6317, "step": 2710 }, { "epoch": 0.18, "grad_norm": 1.9672682285308838, "learning_rate": 9.851478631379982e-06, "loss": 0.6414, "step": 2720 }, { "epoch": 0.18, "grad_norm": 1.818117618560791, "learning_rate": 9.848828129307876e-06, "loss": 0.6687, "step": 2730 }, { "epoch": 0.18, "grad_norm": 2.1144392490386963, "learning_rate": 9.846154548533773e-06, "loss": 0.6473, "step": 2740 }, { "epoch": 0.18, "grad_norm": 1.5458323955535889, "learning_rate": 9.843457901782967e-06, "loss": 0.6568, "step": 2750 }, { "epoch": 0.18, "grad_norm": 1.7970044612884521, "learning_rate": 9.84073820189054e-06, "loss": 0.6609, "step": 2760 }, { "epoch": 0.18, "grad_norm": 1.4563696384429932, "learning_rate": 9.8379954618013e-06, "loss": 0.6626, "step": 2770 }, { "epoch": 0.18, "grad_norm": 1.5996843576431274, "learning_rate": 9.835229694569717e-06, "loss": 0.6556, "step": 2780 }, { "epoch": 0.18, "grad_norm": 1.6190502643585205, "learning_rate": 9.83244091335986e-06, "loss": 0.6492, "step": 2790 }, { "epoch": 0.19, "grad_norm": 1.9234681129455566, "learning_rate": 9.829629131445342e-06, "loss": 0.6645, "step": 2800 }, { "epoch": 0.19, "grad_norm": 1.7610886096954346, "learning_rate": 9.826794362209246e-06, "loss": 0.6548, "step": 2810 }, { "epoch": 0.19, "grad_norm": 1.6596527099609375, "learning_rate": 9.823936619144065e-06, "loss": 0.6493, "step": 2820 }, { "epoch": 0.19, "grad_norm": 1.7359377145767212, "learning_rate": 9.821055915851647e-06, "loss": 0.658, "step": 2830 }, { "epoch": 0.19, "grad_norm": 1.7483757734298706, "learning_rate": 9.818152266043115e-06, "loss": 0.64, "step": 2840 }, { "epoch": 0.19, "grad_norm": 1.4559893608093262, "learning_rate": 9.815225683538814e-06, "loss": 0.6546, "step": 2850 }, { "epoch": 0.19, "grad_norm": 1.773544192314148, "learning_rate": 9.812276182268236e-06, "loss": 0.6565, "step": 2860 }, { "epoch": 0.19, "grad_norm": 1.798349142074585, "learning_rate": 9.809303776269964e-06, "loss": 0.6534, "step": 2870 }, { "epoch": 0.19, "grad_norm": 1.7769262790679932, "learning_rate": 9.806308479691595e-06, "loss": 0.6556, "step": 2880 }, { "epoch": 0.19, "grad_norm": 2.2573776245117188, "learning_rate": 9.803290306789676e-06, "loss": 0.6344, "step": 2890 }, { "epoch": 0.19, "grad_norm": 1.619582176208496, "learning_rate": 9.800249271929645e-06, "loss": 0.6554, "step": 2900 }, { "epoch": 0.19, "grad_norm": 1.765291690826416, "learning_rate": 9.797185389585742e-06, "loss": 0.6565, "step": 2910 }, { "epoch": 0.19, "grad_norm": 1.5608848333358765, "learning_rate": 9.794098674340966e-06, "loss": 0.6485, "step": 2920 }, { "epoch": 0.19, "grad_norm": 1.6000419855117798, "learning_rate": 9.790989140886983e-06, "loss": 0.6413, "step": 2930 }, { "epoch": 0.19, "grad_norm": 1.367274522781372, "learning_rate": 9.787856804024073e-06, "loss": 0.6414, "step": 2940 }, { "epoch": 0.2, "grad_norm": 1.7870447635650635, "learning_rate": 9.784701678661045e-06, "loss": 0.6904, "step": 2950 }, { "epoch": 0.2, "grad_norm": 2.1525564193725586, "learning_rate": 9.781523779815178e-06, "loss": 0.658, "step": 2960 }, { "epoch": 0.2, "grad_norm": 1.6607723236083984, "learning_rate": 9.778323122612143e-06, "loss": 0.6476, "step": 2970 }, { "epoch": 0.2, "grad_norm": 1.6392195224761963, "learning_rate": 9.775099722285934e-06, "loss": 0.6591, "step": 2980 }, { "epoch": 0.2, "grad_norm": 1.522789478302002, "learning_rate": 9.771853594178791e-06, "loss": 0.6598, "step": 2990 }, { "epoch": 0.2, "grad_norm": 1.791340708732605, "learning_rate": 9.768584753741134e-06, "loss": 0.6698, "step": 3000 }, { "epoch": 0.2, "grad_norm": 1.3436764478683472, "learning_rate": 9.765293216531486e-06, "loss": 0.6819, "step": 3010 }, { "epoch": 0.2, "grad_norm": 1.5194346904754639, "learning_rate": 9.761978998216392e-06, "loss": 0.671, "step": 3020 }, { "epoch": 0.2, "grad_norm": 1.87282133102417, "learning_rate": 9.758642114570359e-06, "loss": 0.6684, "step": 3030 }, { "epoch": 0.2, "grad_norm": 2.3428215980529785, "learning_rate": 9.755282581475769e-06, "loss": 0.6635, "step": 3040 }, { "epoch": 0.2, "grad_norm": 1.9552640914916992, "learning_rate": 9.751900414922807e-06, "loss": 0.6384, "step": 3050 }, { "epoch": 0.2, "grad_norm": 2.020320415496826, "learning_rate": 9.748495631009386e-06, "loss": 0.6606, "step": 3060 }, { "epoch": 0.2, "grad_norm": 1.3121072053909302, "learning_rate": 9.745068245941071e-06, "loss": 0.671, "step": 3070 }, { "epoch": 0.2, "grad_norm": 2.007598400115967, "learning_rate": 9.741618276030998e-06, "loss": 0.6521, "step": 3080 }, { "epoch": 0.2, "grad_norm": 1.9580893516540527, "learning_rate": 9.7381457376998e-06, "loss": 0.6292, "step": 3090 }, { "epoch": 0.21, "grad_norm": 1.6906355619430542, "learning_rate": 9.73465064747553e-06, "loss": 0.6485, "step": 3100 }, { "epoch": 0.21, "grad_norm": 1.9698902368545532, "learning_rate": 9.731133021993574e-06, "loss": 0.6404, "step": 3110 }, { "epoch": 0.21, "grad_norm": 1.9455699920654297, "learning_rate": 9.727592877996585e-06, "loss": 0.6355, "step": 3120 }, { "epoch": 0.21, "grad_norm": 1.5921604633331299, "learning_rate": 9.72403023233439e-06, "loss": 0.656, "step": 3130 }, { "epoch": 0.21, "grad_norm": 1.6062285900115967, "learning_rate": 9.720445101963923e-06, "loss": 0.6658, "step": 3140 }, { "epoch": 0.21, "grad_norm": 1.8297983407974243, "learning_rate": 9.716837503949128e-06, "loss": 0.6573, "step": 3150 }, { "epoch": 0.21, "grad_norm": 1.9983500242233276, "learning_rate": 9.713207455460893e-06, "loss": 0.6729, "step": 3160 }, { "epoch": 0.21, "grad_norm": 1.587420105934143, "learning_rate": 9.709554973776962e-06, "loss": 0.669, "step": 3170 }, { "epoch": 0.21, "grad_norm": 1.468428611755371, "learning_rate": 9.705880076281854e-06, "loss": 0.6473, "step": 3180 }, { "epoch": 0.21, "grad_norm": 1.710042119026184, "learning_rate": 9.702182780466775e-06, "loss": 0.684, "step": 3190 }, { "epoch": 0.21, "grad_norm": 1.8687783479690552, "learning_rate": 9.698463103929542e-06, "loss": 0.6583, "step": 3200 }, { "epoch": 0.21, "eval_loss": 0.7434597015380859, "eval_runtime": 134.6713, "eval_samples_per_second": 81.68, "eval_steps_per_second": 10.21, "step": 3200 }, { "epoch": 0.21, "grad_norm": 1.8008785247802734, "learning_rate": 9.694721064374497e-06, "loss": 0.6572, "step": 3210 }, { "epoch": 0.21, "grad_norm": 1.7104676961898804, "learning_rate": 9.690956679612422e-06, "loss": 0.6357, "step": 3220 }, { "epoch": 0.21, "grad_norm": 1.9379181861877441, "learning_rate": 9.68716996756045e-06, "loss": 0.6741, "step": 3230 }, { "epoch": 0.21, "grad_norm": 1.993609070777893, "learning_rate": 9.683360946241988e-06, "loss": 0.658, "step": 3240 }, { "epoch": 0.22, "grad_norm": 1.755682349205017, "learning_rate": 9.67952963378663e-06, "loss": 0.6803, "step": 3250 }, { "epoch": 0.22, "grad_norm": 1.6151280403137207, "learning_rate": 9.67567604843006e-06, "loss": 0.6693, "step": 3260 }, { "epoch": 0.22, "grad_norm": 1.4773739576339722, "learning_rate": 9.671800208513978e-06, "loss": 0.636, "step": 3270 }, { "epoch": 0.22, "grad_norm": 1.6857538223266602, "learning_rate": 9.667902132486009e-06, "loss": 0.6762, "step": 3280 }, { "epoch": 0.22, "grad_norm": 1.871492624282837, "learning_rate": 9.663981838899612e-06, "loss": 0.6595, "step": 3290 }, { "epoch": 0.22, "grad_norm": 1.8173590898513794, "learning_rate": 9.660039346413994e-06, "loss": 0.6605, "step": 3300 }, { "epoch": 0.22, "grad_norm": 2.4486677646636963, "learning_rate": 9.656074673794018e-06, "loss": 0.6604, "step": 3310 }, { "epoch": 0.22, "grad_norm": 1.5115715265274048, "learning_rate": 9.652087839910123e-06, "loss": 0.6732, "step": 3320 }, { "epoch": 0.22, "grad_norm": 1.5585874319076538, "learning_rate": 9.648078863738224e-06, "loss": 0.6651, "step": 3330 }, { "epoch": 0.22, "grad_norm": 2.0782039165496826, "learning_rate": 9.644047764359623e-06, "loss": 0.6467, "step": 3340 }, { "epoch": 0.22, "grad_norm": 1.5483053922653198, "learning_rate": 9.639994560960923e-06, "loss": 0.6625, "step": 3350 }, { "epoch": 0.22, "grad_norm": 1.6379517316818237, "learning_rate": 9.635919272833938e-06, "loss": 0.668, "step": 3360 }, { "epoch": 0.22, "grad_norm": 1.7890597581863403, "learning_rate": 9.63182191937559e-06, "loss": 0.6528, "step": 3370 }, { "epoch": 0.22, "grad_norm": 1.5837633609771729, "learning_rate": 9.627702520087833e-06, "loss": 0.656, "step": 3380 }, { "epoch": 0.22, "grad_norm": 1.770031213760376, "learning_rate": 9.623561094577541e-06, "loss": 0.6479, "step": 3390 }, { "epoch": 0.22, "grad_norm": 1.7771351337432861, "learning_rate": 9.619397662556434e-06, "loss": 0.6554, "step": 3400 }, { "epoch": 0.23, "grad_norm": 1.8997645378112793, "learning_rate": 9.615212243840972e-06, "loss": 0.6503, "step": 3410 }, { "epoch": 0.23, "grad_norm": 1.9709876775741577, "learning_rate": 9.61100485835226e-06, "loss": 0.6737, "step": 3420 }, { "epoch": 0.23, "grad_norm": 1.9905493259429932, "learning_rate": 9.606775526115963e-06, "loss": 0.6574, "step": 3430 }, { "epoch": 0.23, "grad_norm": 1.7140249013900757, "learning_rate": 9.602524267262202e-06, "loss": 0.6502, "step": 3440 }, { "epoch": 0.23, "grad_norm": 1.849632978439331, "learning_rate": 9.598251102025463e-06, "loss": 0.6641, "step": 3450 }, { "epoch": 0.23, "grad_norm": 1.5897934436798096, "learning_rate": 9.593956050744493e-06, "loss": 0.6373, "step": 3460 }, { "epoch": 0.23, "grad_norm": 1.7657532691955566, "learning_rate": 9.589639133862214e-06, "loss": 0.6568, "step": 3470 }, { "epoch": 0.23, "grad_norm": 1.7160342931747437, "learning_rate": 9.58530037192562e-06, "loss": 0.6671, "step": 3480 }, { "epoch": 0.23, "grad_norm": 1.909600019454956, "learning_rate": 9.58093978558568e-06, "loss": 0.6514, "step": 3490 }, { "epoch": 0.23, "grad_norm": 1.8497976064682007, "learning_rate": 9.576557395597237e-06, "loss": 0.6471, "step": 3500 }, { "epoch": 0.23, "grad_norm": 1.4226924180984497, "learning_rate": 9.572153222818911e-06, "loss": 0.6686, "step": 3510 }, { "epoch": 0.23, "grad_norm": 1.7128634452819824, "learning_rate": 9.567727288213005e-06, "loss": 0.6838, "step": 3520 }, { "epoch": 0.23, "grad_norm": 1.6018590927124023, "learning_rate": 9.563279612845398e-06, "loss": 0.6336, "step": 3530 }, { "epoch": 0.23, "grad_norm": 1.5354275703430176, "learning_rate": 9.558810217885444e-06, "loss": 0.6544, "step": 3540 }, { "epoch": 0.23, "grad_norm": 1.7476013898849487, "learning_rate": 9.55431912460588e-06, "loss": 0.6717, "step": 3550 }, { "epoch": 0.24, "grad_norm": 1.68599534034729, "learning_rate": 9.549806354382716e-06, "loss": 0.6623, "step": 3560 }, { "epoch": 0.24, "grad_norm": 1.7562646865844727, "learning_rate": 9.54527192869514e-06, "loss": 0.6655, "step": 3570 }, { "epoch": 0.24, "grad_norm": 2.3129031658172607, "learning_rate": 9.540715869125407e-06, "loss": 0.6798, "step": 3580 }, { "epoch": 0.24, "grad_norm": 1.9537733793258667, "learning_rate": 9.536138197358747e-06, "loss": 0.638, "step": 3590 }, { "epoch": 0.24, "grad_norm": 1.7580291032791138, "learning_rate": 9.531538935183252e-06, "loss": 0.6663, "step": 3600 }, { "epoch": 0.24, "grad_norm": 1.713660717010498, "learning_rate": 9.526918104489777e-06, "loss": 0.653, "step": 3610 }, { "epoch": 0.24, "grad_norm": 1.7625364065170288, "learning_rate": 9.522275727271842e-06, "loss": 0.6597, "step": 3620 }, { "epoch": 0.24, "grad_norm": 1.5197455883026123, "learning_rate": 9.51761182562551e-06, "loss": 0.6719, "step": 3630 }, { "epoch": 0.24, "grad_norm": 1.7578450441360474, "learning_rate": 9.512926421749305e-06, "loss": 0.649, "step": 3640 }, { "epoch": 0.24, "grad_norm": 1.4946767091751099, "learning_rate": 9.50821953794408e-06, "loss": 0.6493, "step": 3650 }, { "epoch": 0.24, "grad_norm": 2.112987518310547, "learning_rate": 9.503491196612939e-06, "loss": 0.6544, "step": 3660 }, { "epoch": 0.24, "grad_norm": 1.4539880752563477, "learning_rate": 9.498741420261109e-06, "loss": 0.6452, "step": 3670 }, { "epoch": 0.24, "grad_norm": 1.7577303647994995, "learning_rate": 9.493970231495836e-06, "loss": 0.6705, "step": 3680 }, { "epoch": 0.24, "grad_norm": 1.4653581380844116, "learning_rate": 9.48917765302629e-06, "loss": 0.6759, "step": 3690 }, { "epoch": 0.24, "grad_norm": 1.4176025390625, "learning_rate": 9.484363707663443e-06, "loss": 0.6525, "step": 3700 }, { "epoch": 0.25, "grad_norm": 1.6279628276824951, "learning_rate": 9.479528418319968e-06, "loss": 0.6493, "step": 3710 }, { "epoch": 0.25, "grad_norm": 1.9044269323349, "learning_rate": 9.474671808010126e-06, "loss": 0.6398, "step": 3720 }, { "epoch": 0.25, "grad_norm": 1.878974199295044, "learning_rate": 9.469793899849663e-06, "loss": 0.6533, "step": 3730 }, { "epoch": 0.25, "grad_norm": 1.6365597248077393, "learning_rate": 9.464894717055686e-06, "loss": 0.6601, "step": 3740 }, { "epoch": 0.25, "grad_norm": 1.9638291597366333, "learning_rate": 9.459974282946572e-06, "loss": 0.6496, "step": 3750 }, { "epoch": 0.25, "grad_norm": 1.6757596731185913, "learning_rate": 9.45503262094184e-06, "loss": 0.6566, "step": 3760 }, { "epoch": 0.25, "grad_norm": 2.156317949295044, "learning_rate": 9.45006975456205e-06, "loss": 0.6722, "step": 3770 }, { "epoch": 0.25, "grad_norm": 1.5976033210754395, "learning_rate": 9.445085707428683e-06, "loss": 0.6596, "step": 3780 }, { "epoch": 0.25, "grad_norm": 1.6568489074707031, "learning_rate": 9.440080503264038e-06, "loss": 0.6412, "step": 3790 }, { "epoch": 0.25, "grad_norm": 2.008394479751587, "learning_rate": 9.43505416589111e-06, "loss": 0.6643, "step": 3800 }, { "epoch": 0.25, "grad_norm": 1.5924795866012573, "learning_rate": 9.430006719233483e-06, "loss": 0.6374, "step": 3810 }, { "epoch": 0.25, "grad_norm": 1.8586095571517944, "learning_rate": 9.42493818731521e-06, "loss": 0.6545, "step": 3820 }, { "epoch": 0.25, "grad_norm": 1.717989444732666, "learning_rate": 9.419848594260708e-06, "loss": 0.6411, "step": 3830 }, { "epoch": 0.25, "grad_norm": 1.6271024942398071, "learning_rate": 9.414737964294636e-06, "loss": 0.6737, "step": 3840 }, { "epoch": 0.25, "grad_norm": 1.7664674520492554, "learning_rate": 9.409606321741776e-06, "loss": 0.6635, "step": 3850 }, { "epoch": 0.26, "grad_norm": 1.7622829675674438, "learning_rate": 9.40445369102693e-06, "loss": 0.673, "step": 3860 }, { "epoch": 0.26, "grad_norm": 1.5368982553482056, "learning_rate": 9.399280096674788e-06, "loss": 0.6383, "step": 3870 }, { "epoch": 0.26, "grad_norm": 1.9596257209777832, "learning_rate": 9.394085563309827e-06, "loss": 0.6818, "step": 3880 }, { "epoch": 0.26, "grad_norm": 1.8420953750610352, "learning_rate": 9.388870115656185e-06, "loss": 0.6641, "step": 3890 }, { "epoch": 0.26, "grad_norm": 1.407446026802063, "learning_rate": 9.38363377853754e-06, "loss": 0.6591, "step": 3900 }, { "epoch": 0.26, "grad_norm": 1.393359661102295, "learning_rate": 9.378376576876999e-06, "loss": 0.6496, "step": 3910 }, { "epoch": 0.26, "grad_norm": 1.487106442451477, "learning_rate": 9.37309853569698e-06, "loss": 0.6501, "step": 3920 }, { "epoch": 0.26, "grad_norm": 1.5764119625091553, "learning_rate": 9.367799680119085e-06, "loss": 0.6609, "step": 3930 }, { "epoch": 0.26, "grad_norm": 1.7985124588012695, "learning_rate": 9.362480035363987e-06, "loss": 0.6597, "step": 3940 }, { "epoch": 0.26, "grad_norm": 1.5843743085861206, "learning_rate": 9.357139626751308e-06, "loss": 0.662, "step": 3950 }, { "epoch": 0.26, "grad_norm": 2.2033445835113525, "learning_rate": 9.351778479699499e-06, "loss": 0.6505, "step": 3960 }, { "epoch": 0.26, "grad_norm": 2.089184284210205, "learning_rate": 9.34639661972572e-06, "loss": 0.6703, "step": 3970 }, { "epoch": 0.26, "grad_norm": 1.5206127166748047, "learning_rate": 9.340994072445713e-06, "loss": 0.6676, "step": 3980 }, { "epoch": 0.26, "grad_norm": 1.3185709714889526, "learning_rate": 9.335570863573687e-06, "loss": 0.6517, "step": 3990 }, { "epoch": 0.26, "grad_norm": 1.6669631004333496, "learning_rate": 9.330127018922195e-06, "loss": 0.6785, "step": 4000 }, { "epoch": 0.27, "grad_norm": 1.6120712757110596, "learning_rate": 9.324662564402004e-06, "loss": 0.667, "step": 4010 }, { "epoch": 0.27, "grad_norm": 2.04732608795166, "learning_rate": 9.31917752602198e-06, "loss": 0.6623, "step": 4020 }, { "epoch": 0.27, "grad_norm": 1.6198598146438599, "learning_rate": 9.31367192988896e-06, "loss": 0.6431, "step": 4030 }, { "epoch": 0.27, "grad_norm": 2.13211727142334, "learning_rate": 9.30814580220763e-06, "loss": 0.6589, "step": 4040 }, { "epoch": 0.27, "grad_norm": 1.837581753730774, "learning_rate": 9.302599169280395e-06, "loss": 0.6542, "step": 4050 }, { "epoch": 0.27, "grad_norm": 1.8382261991500854, "learning_rate": 9.297032057507264e-06, "loss": 0.6527, "step": 4060 }, { "epoch": 0.27, "grad_norm": 1.9884930849075317, "learning_rate": 9.291444493385712e-06, "loss": 0.6681, "step": 4070 }, { "epoch": 0.27, "grad_norm": 1.6666454076766968, "learning_rate": 9.285836503510562e-06, "loss": 0.6809, "step": 4080 }, { "epoch": 0.27, "grad_norm": 2.0016543865203857, "learning_rate": 9.280208114573859e-06, "loss": 0.6808, "step": 4090 }, { "epoch": 0.27, "grad_norm": 1.6018117666244507, "learning_rate": 9.274559353364734e-06, "loss": 0.6751, "step": 4100 }, { "epoch": 0.27, "grad_norm": 2.8057656288146973, "learning_rate": 9.268890246769288e-06, "loss": 0.6618, "step": 4110 }, { "epoch": 0.27, "grad_norm": 1.394476294517517, "learning_rate": 9.263200821770462e-06, "loss": 0.6596, "step": 4120 }, { "epoch": 0.27, "grad_norm": 1.8551517724990845, "learning_rate": 9.257491105447895e-06, "loss": 0.6557, "step": 4130 }, { "epoch": 0.27, "grad_norm": 1.7693849802017212, "learning_rate": 9.251761124977816e-06, "loss": 0.6389, "step": 4140 }, { "epoch": 0.27, "grad_norm": 2.032569169998169, "learning_rate": 9.246010907632894e-06, "loss": 0.653, "step": 4150 }, { "epoch": 0.28, "grad_norm": 1.4892268180847168, "learning_rate": 9.24024048078213e-06, "loss": 0.657, "step": 4160 }, { "epoch": 0.28, "grad_norm": 1.8956682682037354, "learning_rate": 9.234449871890708e-06, "loss": 0.6585, "step": 4170 }, { "epoch": 0.28, "grad_norm": 1.9287517070770264, "learning_rate": 9.228639108519867e-06, "loss": 0.6556, "step": 4180 }, { "epoch": 0.28, "grad_norm": 1.6487663984298706, "learning_rate": 9.222808218326784e-06, "loss": 0.6572, "step": 4190 }, { "epoch": 0.28, "grad_norm": 1.7571004629135132, "learning_rate": 9.21695722906443e-06, "loss": 0.6681, "step": 4200 }, { "epoch": 0.28, "grad_norm": 2.26200532913208, "learning_rate": 9.211086168581433e-06, "loss": 0.6369, "step": 4210 }, { "epoch": 0.28, "grad_norm": 2.210916042327881, "learning_rate": 9.205195064821964e-06, "loss": 0.6748, "step": 4220 }, { "epoch": 0.28, "grad_norm": 1.523188829421997, "learning_rate": 9.199283945825582e-06, "loss": 0.6595, "step": 4230 }, { "epoch": 0.28, "grad_norm": 2.346522092819214, "learning_rate": 9.193352839727122e-06, "loss": 0.6908, "step": 4240 }, { "epoch": 0.28, "grad_norm": 1.611079454421997, "learning_rate": 9.18740177475654e-06, "loss": 0.6476, "step": 4250 }, { "epoch": 0.28, "grad_norm": 1.8673412799835205, "learning_rate": 9.181430779238799e-06, "loss": 0.6544, "step": 4260 }, { "epoch": 0.28, "grad_norm": 1.6414918899536133, "learning_rate": 9.175439881593716e-06, "loss": 0.6579, "step": 4270 }, { "epoch": 0.28, "grad_norm": 2.150317668914795, "learning_rate": 9.169429110335842e-06, "loss": 0.6687, "step": 4280 }, { "epoch": 0.28, "grad_norm": 1.9575879573822021, "learning_rate": 9.163398494074314e-06, "loss": 0.6423, "step": 4290 }, { "epoch": 0.28, "grad_norm": 1.7684667110443115, "learning_rate": 9.157348061512728e-06, "loss": 0.6457, "step": 4300 }, { "epoch": 0.29, "grad_norm": 1.60320246219635, "learning_rate": 9.151277841448993e-06, "loss": 0.632, "step": 4310 }, { "epoch": 0.29, "grad_norm": 1.604935884475708, "learning_rate": 9.145187862775208e-06, "loss": 0.6633, "step": 4320 }, { "epoch": 0.29, "grad_norm": 1.5652973651885986, "learning_rate": 9.139078154477512e-06, "loss": 0.6496, "step": 4330 }, { "epoch": 0.29, "grad_norm": 1.8579405546188354, "learning_rate": 9.132948745635943e-06, "loss": 0.652, "step": 4340 }, { "epoch": 0.29, "grad_norm": 2.2190017700195312, "learning_rate": 9.126799665424319e-06, "loss": 0.6789, "step": 4350 }, { "epoch": 0.29, "grad_norm": 1.978559970855713, "learning_rate": 9.120630943110078e-06, "loss": 0.6849, "step": 4360 }, { "epoch": 0.29, "grad_norm": 1.7971248626708984, "learning_rate": 9.114442608054153e-06, "loss": 0.6637, "step": 4370 }, { "epoch": 0.29, "grad_norm": 2.0373167991638184, "learning_rate": 9.10823468971082e-06, "loss": 0.6718, "step": 4380 }, { "epoch": 0.29, "grad_norm": 1.748665690422058, "learning_rate": 9.102007217627568e-06, "loss": 0.6666, "step": 4390 }, { "epoch": 0.29, "grad_norm": 2.1095681190490723, "learning_rate": 9.09576022144496e-06, "loss": 0.6676, "step": 4400 }, { "epoch": 0.29, "grad_norm": 1.4815317392349243, "learning_rate": 9.089493730896478e-06, "loss": 0.6414, "step": 4410 }, { "epoch": 0.29, "grad_norm": 10.00068187713623, "learning_rate": 9.083207775808395e-06, "loss": 0.6533, "step": 4420 }, { "epoch": 0.29, "grad_norm": 1.62544584274292, "learning_rate": 9.076902386099628e-06, "loss": 0.6549, "step": 4430 }, { "epoch": 0.29, "grad_norm": 2.158773183822632, "learning_rate": 9.070577591781598e-06, "loss": 0.6627, "step": 4440 }, { "epoch": 0.29, "grad_norm": 2.012834072113037, "learning_rate": 9.064233422958078e-06, "loss": 0.6542, "step": 4450 }, { "epoch": 0.3, "grad_norm": 1.6546082496643066, "learning_rate": 9.057869909825062e-06, "loss": 0.6541, "step": 4460 }, { "epoch": 0.3, "grad_norm": 1.8351281881332397, "learning_rate": 9.051487082670618e-06, "loss": 0.6714, "step": 4470 }, { "epoch": 0.3, "grad_norm": 2.1694746017456055, "learning_rate": 9.045084971874738e-06, "loss": 0.6666, "step": 4480 }, { "epoch": 0.3, "grad_norm": 2.0474767684936523, "learning_rate": 9.038663607909198e-06, "loss": 0.6649, "step": 4490 }, { "epoch": 0.3, "grad_norm": 1.6256054639816284, "learning_rate": 9.032223021337415e-06, "loss": 0.6588, "step": 4500 }, { "epoch": 0.3, "grad_norm": 1.5998213291168213, "learning_rate": 9.025763242814291e-06, "loss": 0.6857, "step": 4510 }, { "epoch": 0.3, "grad_norm": 1.970272421836853, "learning_rate": 9.019284303086086e-06, "loss": 0.6657, "step": 4520 }, { "epoch": 0.3, "grad_norm": 1.555245041847229, "learning_rate": 9.012786232990256e-06, "loss": 0.651, "step": 4530 }, { "epoch": 0.3, "grad_norm": 2.0598926544189453, "learning_rate": 9.006269063455305e-06, "loss": 0.6689, "step": 4540 }, { "epoch": 0.3, "grad_norm": 1.8089969158172607, "learning_rate": 8.999732825500649e-06, "loss": 0.6528, "step": 4550 }, { "epoch": 0.3, "grad_norm": 2.0560412406921387, "learning_rate": 8.993177550236464e-06, "loss": 0.6599, "step": 4560 }, { "epoch": 0.3, "grad_norm": 1.9101848602294922, "learning_rate": 8.986603268863536e-06, "loss": 0.6429, "step": 4570 }, { "epoch": 0.3, "grad_norm": 1.8303155899047852, "learning_rate": 8.98001001267311e-06, "loss": 0.6315, "step": 4580 }, { "epoch": 0.3, "grad_norm": 1.8667141199111938, "learning_rate": 8.97339781304675e-06, "loss": 0.6405, "step": 4590 }, { "epoch": 0.3, "grad_norm": 1.8446400165557861, "learning_rate": 8.966766701456177e-06, "loss": 0.6378, "step": 4600 }, { "epoch": 0.31, "grad_norm": 2.6912033557891846, "learning_rate": 8.960116709463131e-06, "loss": 0.6727, "step": 4610 }, { "epoch": 0.31, "grad_norm": 1.9695560932159424, "learning_rate": 8.953447868719218e-06, "loss": 0.6554, "step": 4620 }, { "epoch": 0.31, "grad_norm": 1.7779700756072998, "learning_rate": 8.94676021096575e-06, "loss": 0.674, "step": 4630 }, { "epoch": 0.31, "grad_norm": 1.5401631593704224, "learning_rate": 8.94005376803361e-06, "loss": 0.6557, "step": 4640 }, { "epoch": 0.31, "grad_norm": 1.6267428398132324, "learning_rate": 8.933328571843086e-06, "loss": 0.65, "step": 4650 }, { "epoch": 0.31, "grad_norm": 2.1260504722595215, "learning_rate": 8.926584654403725e-06, "loss": 0.6648, "step": 4660 }, { "epoch": 0.31, "grad_norm": 1.7050879001617432, "learning_rate": 8.919822047814184e-06, "loss": 0.6471, "step": 4670 }, { "epoch": 0.31, "grad_norm": 1.5733059644699097, "learning_rate": 8.91304078426207e-06, "loss": 0.6494, "step": 4680 }, { "epoch": 0.31, "grad_norm": 1.788745403289795, "learning_rate": 8.906240896023794e-06, "loss": 0.6619, "step": 4690 }, { "epoch": 0.31, "grad_norm": 1.6663099527359009, "learning_rate": 8.899422415464409e-06, "loss": 0.656, "step": 4700 }, { "epoch": 0.31, "grad_norm": 1.934832215309143, "learning_rate": 8.892585375037469e-06, "loss": 0.6489, "step": 4710 }, { "epoch": 0.31, "grad_norm": 1.9886648654937744, "learning_rate": 8.885729807284855e-06, "loss": 0.6524, "step": 4720 }, { "epoch": 0.31, "grad_norm": 1.718221664428711, "learning_rate": 8.878855744836643e-06, "loss": 0.657, "step": 4730 }, { "epoch": 0.31, "grad_norm": 2.0010931491851807, "learning_rate": 8.871963220410929e-06, "loss": 0.6532, "step": 4740 }, { "epoch": 0.31, "grad_norm": 1.7940354347229004, "learning_rate": 8.865052266813686e-06, "loss": 0.6639, "step": 4750 }, { "epoch": 0.31, "grad_norm": 1.7344415187835693, "learning_rate": 8.858122916938601e-06, "loss": 0.6588, "step": 4760 }, { "epoch": 0.32, "grad_norm": 1.5673871040344238, "learning_rate": 8.851175203766922e-06, "loss": 0.6526, "step": 4770 }, { "epoch": 0.32, "grad_norm": 1.609823226928711, "learning_rate": 8.844209160367298e-06, "loss": 0.6836, "step": 4780 }, { "epoch": 0.32, "grad_norm": 1.8896527290344238, "learning_rate": 8.837224819895627e-06, "loss": 0.6739, "step": 4790 }, { "epoch": 0.32, "grad_norm": 2.3576793670654297, "learning_rate": 8.83022221559489e-06, "loss": 0.6755, "step": 4800 }, { "epoch": 0.32, "eval_loss": 0.7543163895606995, "eval_runtime": 134.2076, "eval_samples_per_second": 81.963, "eval_steps_per_second": 10.245, "step": 4800 }, { "epoch": 0.32, "grad_norm": 1.6949530839920044, "learning_rate": 8.823201380795003e-06, "loss": 0.6621, "step": 4810 }, { "epoch": 0.32, "grad_norm": 1.8176947832107544, "learning_rate": 8.816162348912644e-06, "loss": 0.6603, "step": 4820 }, { "epoch": 0.32, "grad_norm": 2.184191942214966, "learning_rate": 8.809105153451113e-06, "loss": 0.6721, "step": 4830 }, { "epoch": 0.32, "grad_norm": 1.877064824104309, "learning_rate": 8.802029828000157e-06, "loss": 0.6696, "step": 4840 }, { "epoch": 0.32, "grad_norm": 1.3812588453292847, "learning_rate": 8.79493640623581e-06, "loss": 0.6698, "step": 4850 }, { "epoch": 0.32, "grad_norm": 1.5951484441757202, "learning_rate": 8.78782492192025e-06, "loss": 0.6656, "step": 4860 }, { "epoch": 0.32, "grad_norm": 1.7040786743164062, "learning_rate": 8.780695408901613e-06, "loss": 0.6648, "step": 4870 }, { "epoch": 0.32, "grad_norm": 2.1103320121765137, "learning_rate": 8.773547901113862e-06, "loss": 0.6809, "step": 4880 }, { "epoch": 0.32, "grad_norm": 1.5602185726165771, "learning_rate": 8.766382432576589e-06, "loss": 0.6597, "step": 4890 }, { "epoch": 0.32, "grad_norm": 1.905117392539978, "learning_rate": 8.759199037394888e-06, "loss": 0.6619, "step": 4900 }, { "epoch": 0.32, "grad_norm": 1.5752969980239868, "learning_rate": 8.75199774975917e-06, "loss": 0.6549, "step": 4910 }, { "epoch": 0.33, "grad_norm": 1.9230555295944214, "learning_rate": 8.744778603945013e-06, "loss": 0.6364, "step": 4920 }, { "epoch": 0.33, "grad_norm": 1.3653078079223633, "learning_rate": 8.737541634312985e-06, "loss": 0.6222, "step": 4930 }, { "epoch": 0.33, "grad_norm": 1.6335278749465942, "learning_rate": 8.730286875308498e-06, "loss": 0.665, "step": 4940 }, { "epoch": 0.33, "grad_norm": 1.7848576307296753, "learning_rate": 8.723014361461633e-06, "loss": 0.6575, "step": 4950 }, { "epoch": 0.33, "grad_norm": 1.5796133279800415, "learning_rate": 8.715724127386971e-06, "loss": 0.6556, "step": 4960 }, { "epoch": 0.33, "grad_norm": 1.7556233406066895, "learning_rate": 8.708416207783447e-06, "loss": 0.6569, "step": 4970 }, { "epoch": 0.33, "grad_norm": 2.42543888092041, "learning_rate": 8.701090637434161e-06, "loss": 0.6654, "step": 4980 }, { "epoch": 0.33, "grad_norm": 1.9830849170684814, "learning_rate": 8.693747451206231e-06, "loss": 0.6418, "step": 4990 }, { "epoch": 0.33, "grad_norm": 1.6270414590835571, "learning_rate": 8.68638668405062e-06, "loss": 0.6666, "step": 5000 }, { "epoch": 0.33, "grad_norm": 1.8726800680160522, "learning_rate": 8.679008371001969e-06, "loss": 0.6598, "step": 5010 }, { "epoch": 0.33, "grad_norm": 1.2677507400512695, "learning_rate": 8.671612547178428e-06, "loss": 0.6625, "step": 5020 }, { "epoch": 0.33, "grad_norm": 1.7537394762039185, "learning_rate": 8.664199247781497e-06, "loss": 0.6797, "step": 5030 }, { "epoch": 0.33, "grad_norm": 1.82877779006958, "learning_rate": 8.656768508095853e-06, "loss": 0.6734, "step": 5040 }, { "epoch": 0.33, "grad_norm": 1.8775991201400757, "learning_rate": 8.649320363489178e-06, "loss": 0.6722, "step": 5050 }, { "epoch": 0.33, "grad_norm": 1.7584172487258911, "learning_rate": 8.641854849412002e-06, "loss": 0.6744, "step": 5060 }, { "epoch": 0.34, "grad_norm": 1.6870256662368774, "learning_rate": 8.634372001397521e-06, "loss": 0.6596, "step": 5070 }, { "epoch": 0.34, "grad_norm": 2.059516191482544, "learning_rate": 8.626871855061438e-06, "loss": 0.6584, "step": 5080 }, { "epoch": 0.34, "grad_norm": 1.5890916585922241, "learning_rate": 8.61935444610179e-06, "loss": 0.6461, "step": 5090 }, { "epoch": 0.34, "grad_norm": 1.5515724420547485, "learning_rate": 8.611819810298778e-06, "loss": 0.6651, "step": 5100 }, { "epoch": 0.34, "grad_norm": 1.4988518953323364, "learning_rate": 8.604267983514595e-06, "loss": 0.6616, "step": 5110 }, { "epoch": 0.34, "grad_norm": 1.6072263717651367, "learning_rate": 8.596699001693257e-06, "loss": 0.6694, "step": 5120 }, { "epoch": 0.34, "grad_norm": 2.2334463596343994, "learning_rate": 8.589112900860432e-06, "loss": 0.6647, "step": 5130 }, { "epoch": 0.34, "grad_norm": 1.476638674736023, "learning_rate": 8.581509717123272e-06, "loss": 0.6542, "step": 5140 }, { "epoch": 0.34, "grad_norm": 1.5825482606887817, "learning_rate": 8.573889486670233e-06, "loss": 0.6628, "step": 5150 }, { "epoch": 0.34, "grad_norm": 1.387061357498169, "learning_rate": 8.56625224577091e-06, "loss": 0.646, "step": 5160 }, { "epoch": 0.34, "grad_norm": 2.2527670860290527, "learning_rate": 8.558598030775857e-06, "loss": 0.6675, "step": 5170 }, { "epoch": 0.34, "grad_norm": 1.8437511920928955, "learning_rate": 8.550926878116428e-06, "loss": 0.646, "step": 5180 }, { "epoch": 0.34, "grad_norm": 1.7316181659698486, "learning_rate": 8.543238824304585e-06, "loss": 0.6828, "step": 5190 }, { "epoch": 0.34, "grad_norm": 1.5785106420516968, "learning_rate": 8.535533905932739e-06, "loss": 0.6675, "step": 5200 }, { "epoch": 0.34, "grad_norm": 2.5922186374664307, "learning_rate": 8.527812159673567e-06, "loss": 0.675, "step": 5210 }, { "epoch": 0.35, "grad_norm": 1.8569538593292236, "learning_rate": 8.520073622279844e-06, "loss": 0.6397, "step": 5220 }, { "epoch": 0.35, "grad_norm": 1.9984098672866821, "learning_rate": 8.51231833058426e-06, "loss": 0.6806, "step": 5230 }, { "epoch": 0.35, "grad_norm": 1.8538262844085693, "learning_rate": 8.504546321499255e-06, "loss": 0.654, "step": 5240 }, { "epoch": 0.35, "grad_norm": 1.672841191291809, "learning_rate": 8.496757632016836e-06, "loss": 0.6429, "step": 5250 }, { "epoch": 0.35, "grad_norm": 1.5475361347198486, "learning_rate": 8.488952299208402e-06, "loss": 0.6574, "step": 5260 }, { "epoch": 0.35, "grad_norm": 1.6212105751037598, "learning_rate": 8.481130360224567e-06, "loss": 0.6504, "step": 5270 }, { "epoch": 0.35, "grad_norm": 1.7608546018600464, "learning_rate": 8.473291852294986e-06, "loss": 0.6709, "step": 5280 }, { "epoch": 0.35, "grad_norm": 1.720782995223999, "learning_rate": 8.465436812728181e-06, "loss": 0.6558, "step": 5290 }, { "epoch": 0.35, "grad_norm": 1.6840670108795166, "learning_rate": 8.457565278911349e-06, "loss": 0.6542, "step": 5300 }, { "epoch": 0.35, "grad_norm": 1.7344157695770264, "learning_rate": 8.449677288310198e-06, "loss": 0.6694, "step": 5310 }, { "epoch": 0.35, "grad_norm": 1.6819674968719482, "learning_rate": 8.44177287846877e-06, "loss": 0.6453, "step": 5320 }, { "epoch": 0.35, "grad_norm": 1.9622447490692139, "learning_rate": 8.433852087009251e-06, "loss": 0.6861, "step": 5330 }, { "epoch": 0.35, "grad_norm": 2.322389602661133, "learning_rate": 8.425914951631796e-06, "loss": 0.6838, "step": 5340 }, { "epoch": 0.35, "grad_norm": 1.4469890594482422, "learning_rate": 8.417961510114357e-06, "loss": 0.6752, "step": 5350 }, { "epoch": 0.35, "grad_norm": 2.1829028129577637, "learning_rate": 8.409991800312493e-06, "loss": 0.6567, "step": 5360 }, { "epoch": 0.36, "grad_norm": 2.2404589653015137, "learning_rate": 8.402005860159197e-06, "loss": 0.6509, "step": 5370 }, { "epoch": 0.36, "grad_norm": 1.9056888818740845, "learning_rate": 8.39400372766471e-06, "loss": 0.6588, "step": 5380 }, { "epoch": 0.36, "grad_norm": 3.867516040802002, "learning_rate": 8.385985440916344e-06, "loss": 0.6476, "step": 5390 }, { "epoch": 0.36, "grad_norm": 1.9217302799224854, "learning_rate": 8.377951038078303e-06, "loss": 0.6568, "step": 5400 }, { "epoch": 0.36, "grad_norm": 1.806114912033081, "learning_rate": 8.36990055739149e-06, "loss": 0.6729, "step": 5410 }, { "epoch": 0.36, "grad_norm": 1.9048370122909546, "learning_rate": 8.36183403717334e-06, "loss": 0.6783, "step": 5420 }, { "epoch": 0.36, "grad_norm": 1.9079678058624268, "learning_rate": 8.353751515817629e-06, "loss": 0.6659, "step": 5430 }, { "epoch": 0.36, "grad_norm": 2.0406830310821533, "learning_rate": 8.345653031794292e-06, "loss": 0.6696, "step": 5440 }, { "epoch": 0.36, "grad_norm": 1.7874845266342163, "learning_rate": 8.337538623649237e-06, "loss": 0.6506, "step": 5450 }, { "epoch": 0.36, "grad_norm": 1.866437554359436, "learning_rate": 8.329408330004172e-06, "loss": 0.6631, "step": 5460 }, { "epoch": 0.36, "grad_norm": 2.0198395252227783, "learning_rate": 8.32126218955641e-06, "loss": 0.656, "step": 5470 }, { "epoch": 0.36, "grad_norm": 1.6846990585327148, "learning_rate": 8.313100241078689e-06, "loss": 0.6639, "step": 5480 }, { "epoch": 0.36, "grad_norm": 1.445802927017212, "learning_rate": 8.304922523418988e-06, "loss": 0.6772, "step": 5490 }, { "epoch": 0.36, "grad_norm": 2.063717842102051, "learning_rate": 8.296729075500345e-06, "loss": 0.6666, "step": 5500 }, { "epoch": 0.36, "grad_norm": 2.563687324523926, "learning_rate": 8.288519936320664e-06, "loss": 0.6528, "step": 5510 }, { "epoch": 0.37, "grad_norm": 1.9258893728256226, "learning_rate": 8.280295144952537e-06, "loss": 0.6656, "step": 5520 }, { "epoch": 0.37, "grad_norm": 2.109924077987671, "learning_rate": 8.272054740543053e-06, "loss": 0.6509, "step": 5530 }, { "epoch": 0.37, "grad_norm": 1.950067400932312, "learning_rate": 8.263798762313613e-06, "loss": 0.6524, "step": 5540 }, { "epoch": 0.37, "grad_norm": 2.1017370223999023, "learning_rate": 8.255527249559747e-06, "loss": 0.662, "step": 5550 }, { "epoch": 0.37, "grad_norm": 1.7278681993484497, "learning_rate": 8.247240241650918e-06, "loss": 0.6602, "step": 5560 }, { "epoch": 0.37, "grad_norm": 3.6589279174804688, "learning_rate": 8.23893777803035e-06, "loss": 0.6454, "step": 5570 }, { "epoch": 0.37, "grad_norm": 1.8462167978286743, "learning_rate": 8.23061989821482e-06, "loss": 0.6468, "step": 5580 }, { "epoch": 0.37, "grad_norm": 1.9224498271942139, "learning_rate": 8.222286641794488e-06, "loss": 0.6608, "step": 5590 }, { "epoch": 0.37, "grad_norm": 1.796586513519287, "learning_rate": 8.213938048432697e-06, "loss": 0.6582, "step": 5600 }, { "epoch": 0.37, "grad_norm": 1.8218111991882324, "learning_rate": 8.205574157865791e-06, "loss": 0.665, "step": 5610 }, { "epoch": 0.37, "grad_norm": 1.6219843626022339, "learning_rate": 8.197195009902924e-06, "loss": 0.643, "step": 5620 }, { "epoch": 0.37, "grad_norm": 2.4031009674072266, "learning_rate": 8.188800644425867e-06, "loss": 0.6543, "step": 5630 }, { "epoch": 0.37, "grad_norm": 1.7068158388137817, "learning_rate": 8.18039110138882e-06, "loss": 0.6671, "step": 5640 }, { "epoch": 0.37, "grad_norm": 1.9731248617172241, "learning_rate": 8.171966420818227e-06, "loss": 0.6717, "step": 5650 }, { "epoch": 0.37, "grad_norm": 1.647336721420288, "learning_rate": 8.163526642812582e-06, "loss": 0.6717, "step": 5660 }, { "epoch": 0.38, "grad_norm": 1.3182628154754639, "learning_rate": 8.15507180754223e-06, "loss": 0.6597, "step": 5670 }, { "epoch": 0.38, "grad_norm": 1.8658872842788696, "learning_rate": 8.146601955249187e-06, "loss": 0.6751, "step": 5680 }, { "epoch": 0.38, "grad_norm": 2.511075019836426, "learning_rate": 8.138117126246951e-06, "loss": 0.6608, "step": 5690 }, { "epoch": 0.38, "grad_norm": 1.5444680452346802, "learning_rate": 8.129617360920297e-06, "loss": 0.6621, "step": 5700 }, { "epoch": 0.38, "grad_norm": 1.8167874813079834, "learning_rate": 8.12110269972509e-06, "loss": 0.6596, "step": 5710 }, { "epoch": 0.38, "grad_norm": 2.235473155975342, "learning_rate": 8.112573183188099e-06, "loss": 0.6696, "step": 5720 }, { "epoch": 0.38, "grad_norm": 1.5982747077941895, "learning_rate": 8.104028851906797e-06, "loss": 0.6697, "step": 5730 }, { "epoch": 0.38, "grad_norm": 2.0763542652130127, "learning_rate": 8.095469746549172e-06, "loss": 0.6757, "step": 5740 }, { "epoch": 0.38, "grad_norm": 1.66940176486969, "learning_rate": 8.086895907853526e-06, "loss": 0.676, "step": 5750 }, { "epoch": 0.38, "grad_norm": 1.759155035018921, "learning_rate": 8.078307376628292e-06, "loss": 0.6539, "step": 5760 }, { "epoch": 0.38, "grad_norm": 2.2886273860931396, "learning_rate": 8.069704193751834e-06, "loss": 0.6575, "step": 5770 }, { "epoch": 0.38, "grad_norm": 1.7834458351135254, "learning_rate": 8.061086400172247e-06, "loss": 0.6827, "step": 5780 }, { "epoch": 0.38, "grad_norm": 1.9700424671173096, "learning_rate": 8.052454036907174e-06, "loss": 0.6629, "step": 5790 }, { "epoch": 0.38, "grad_norm": 1.9198191165924072, "learning_rate": 8.043807145043604e-06, "loss": 0.6695, "step": 5800 }, { "epoch": 0.38, "grad_norm": 1.5736877918243408, "learning_rate": 8.035145765737671e-06, "loss": 0.6676, "step": 5810 }, { "epoch": 0.39, "grad_norm": 1.728734016418457, "learning_rate": 8.026469940214471e-06, "loss": 0.6684, "step": 5820 }, { "epoch": 0.39, "grad_norm": 1.5845164060592651, "learning_rate": 8.017779709767857e-06, "loss": 0.6597, "step": 5830 }, { "epoch": 0.39, "grad_norm": 2.1340720653533936, "learning_rate": 8.009075115760243e-06, "loss": 0.6703, "step": 5840 }, { "epoch": 0.39, "grad_norm": 1.7079795598983765, "learning_rate": 8.000356199622406e-06, "loss": 0.6652, "step": 5850 }, { "epoch": 0.39, "grad_norm": 1.5601955652236938, "learning_rate": 7.991623002853296e-06, "loss": 0.6583, "step": 5860 }, { "epoch": 0.39, "grad_norm": 2.0047013759613037, "learning_rate": 7.982875567019833e-06, "loss": 0.6733, "step": 5870 }, { "epoch": 0.39, "grad_norm": 1.8969409465789795, "learning_rate": 7.974113933756708e-06, "loss": 0.678, "step": 5880 }, { "epoch": 0.39, "grad_norm": 1.5502653121948242, "learning_rate": 7.965338144766186e-06, "loss": 0.6474, "step": 5890 }, { "epoch": 0.39, "grad_norm": 1.8099123239517212, "learning_rate": 7.956548241817914e-06, "loss": 0.6564, "step": 5900 }, { "epoch": 0.39, "grad_norm": 1.6473009586334229, "learning_rate": 7.947744266748707e-06, "loss": 0.6785, "step": 5910 }, { "epoch": 0.39, "grad_norm": 1.9546464681625366, "learning_rate": 7.938926261462366e-06, "loss": 0.6852, "step": 5920 }, { "epoch": 0.39, "grad_norm": 2.04304838180542, "learning_rate": 7.93009426792947e-06, "loss": 0.6521, "step": 5930 }, { "epoch": 0.39, "grad_norm": 1.7775088548660278, "learning_rate": 7.921248328187174e-06, "loss": 0.6755, "step": 5940 }, { "epoch": 0.39, "grad_norm": 1.7072761058807373, "learning_rate": 7.912388484339012e-06, "loss": 0.6681, "step": 5950 }, { "epoch": 0.39, "grad_norm": 2.046266555786133, "learning_rate": 7.903514778554699e-06, "loss": 0.6622, "step": 5960 }, { "epoch": 0.4, "grad_norm": 1.9110488891601562, "learning_rate": 7.89462725306993e-06, "loss": 0.6676, "step": 5970 }, { "epoch": 0.4, "grad_norm": 1.9375412464141846, "learning_rate": 7.88572595018617e-06, "loss": 0.6635, "step": 5980 }, { "epoch": 0.4, "grad_norm": 2.0740785598754883, "learning_rate": 7.876810912270462e-06, "loss": 0.6592, "step": 5990 }, { "epoch": 0.4, "grad_norm": 1.6654475927352905, "learning_rate": 7.86788218175523e-06, "loss": 0.6598, "step": 6000 }, { "epoch": 0.4, "grad_norm": 1.9562811851501465, "learning_rate": 7.858939801138061e-06, "loss": 0.6831, "step": 6010 }, { "epoch": 0.4, "grad_norm": 1.9873039722442627, "learning_rate": 7.849983812981516e-06, "loss": 0.6548, "step": 6020 }, { "epoch": 0.4, "grad_norm": 1.7364007234573364, "learning_rate": 7.84101425991292e-06, "loss": 0.6452, "step": 6030 }, { "epoch": 0.4, "grad_norm": 2.040637731552124, "learning_rate": 7.832031184624165e-06, "loss": 0.6533, "step": 6040 }, { "epoch": 0.4, "grad_norm": 1.9943041801452637, "learning_rate": 7.823034629871503e-06, "loss": 0.6601, "step": 6050 }, { "epoch": 0.4, "grad_norm": 1.8325495719909668, "learning_rate": 7.814024638475344e-06, "loss": 0.6666, "step": 6060 }, { "epoch": 0.4, "grad_norm": 2.264528274536133, "learning_rate": 7.80500125332005e-06, "loss": 0.645, "step": 6070 }, { "epoch": 0.4, "grad_norm": 1.8350125551223755, "learning_rate": 7.795964517353734e-06, "loss": 0.681, "step": 6080 }, { "epoch": 0.4, "grad_norm": 5.209187030792236, "learning_rate": 7.786914473588057e-06, "loss": 0.6385, "step": 6090 }, { "epoch": 0.4, "grad_norm": 2.010577917098999, "learning_rate": 7.777851165098012e-06, "loss": 0.6375, "step": 6100 }, { "epoch": 0.4, "grad_norm": 2.004896402359009, "learning_rate": 7.768774635021737e-06, "loss": 0.6817, "step": 6110 }, { "epoch": 0.4, "grad_norm": 1.7268619537353516, "learning_rate": 7.759684926560292e-06, "loss": 0.665, "step": 6120 }, { "epoch": 0.41, "grad_norm": 2.00075101852417, "learning_rate": 7.750582082977468e-06, "loss": 0.6602, "step": 6130 }, { "epoch": 0.41, "grad_norm": 1.9209833145141602, "learning_rate": 7.74146614759957e-06, "loss": 0.6795, "step": 6140 }, { "epoch": 0.41, "grad_norm": 1.7252877950668335, "learning_rate": 7.732337163815218e-06, "loss": 0.6608, "step": 6150 }, { "epoch": 0.41, "grad_norm": 1.6505799293518066, "learning_rate": 7.723195175075136e-06, "loss": 0.6519, "step": 6160 }, { "epoch": 0.41, "grad_norm": 2.175870656967163, "learning_rate": 7.714040224891949e-06, "loss": 0.6877, "step": 6170 }, { "epoch": 0.41, "grad_norm": 1.762647271156311, "learning_rate": 7.704872356839971e-06, "loss": 0.6582, "step": 6180 }, { "epoch": 0.41, "grad_norm": 1.860929250717163, "learning_rate": 7.695691614555002e-06, "loss": 0.6719, "step": 6190 }, { "epoch": 0.41, "grad_norm": 6.883901596069336, "learning_rate": 7.686498041734121e-06, "loss": 0.6604, "step": 6200 }, { "epoch": 0.41, "grad_norm": 2.181346893310547, "learning_rate": 7.67729168213547e-06, "loss": 0.672, "step": 6210 }, { "epoch": 0.41, "grad_norm": 2.26829195022583, "learning_rate": 7.66807257957806e-06, "loss": 0.671, "step": 6220 }, { "epoch": 0.41, "grad_norm": 1.7630722522735596, "learning_rate": 7.658840777941543e-06, "loss": 0.6623, "step": 6230 }, { "epoch": 0.41, "grad_norm": 1.9417176246643066, "learning_rate": 7.649596321166024e-06, "loss": 0.6476, "step": 6240 }, { "epoch": 0.41, "grad_norm": 1.660746693611145, "learning_rate": 7.64033925325184e-06, "loss": 0.6778, "step": 6250 }, { "epoch": 0.41, "grad_norm": 1.5244559049606323, "learning_rate": 7.631069618259347e-06, "loss": 0.6471, "step": 6260 }, { "epoch": 0.41, "grad_norm": 1.8127204179763794, "learning_rate": 7.621787460308723e-06, "loss": 0.6633, "step": 6270 }, { "epoch": 0.42, "grad_norm": 1.7429029941558838, "learning_rate": 7.612492823579744e-06, "loss": 0.674, "step": 6280 }, { "epoch": 0.42, "grad_norm": 1.903221845626831, "learning_rate": 7.603185752311587e-06, "loss": 0.6593, "step": 6290 }, { "epoch": 0.42, "grad_norm": 1.6729519367218018, "learning_rate": 7.593866290802608e-06, "loss": 0.664, "step": 6300 }, { "epoch": 0.42, "grad_norm": 1.7710927724838257, "learning_rate": 7.584534483410137e-06, "loss": 0.6639, "step": 6310 }, { "epoch": 0.42, "grad_norm": 1.5772103071212769, "learning_rate": 7.575190374550272e-06, "loss": 0.6746, "step": 6320 }, { "epoch": 0.42, "grad_norm": 1.7170156240463257, "learning_rate": 7.565834008697652e-06, "loss": 0.6721, "step": 6330 }, { "epoch": 0.42, "grad_norm": 1.7289538383483887, "learning_rate": 7.55646543038526e-06, "loss": 0.6507, "step": 6340 }, { "epoch": 0.42, "grad_norm": 2.6091270446777344, "learning_rate": 7.54708468420421e-06, "loss": 0.6584, "step": 6350 }, { "epoch": 0.42, "grad_norm": 2.401001214981079, "learning_rate": 7.537691814803522e-06, "loss": 0.6503, "step": 6360 }, { "epoch": 0.42, "grad_norm": 1.2781227827072144, "learning_rate": 7.528286866889924e-06, "loss": 0.6545, "step": 6370 }, { "epoch": 0.42, "grad_norm": 2.448362112045288, "learning_rate": 7.518869885227632e-06, "loss": 0.6722, "step": 6380 }, { "epoch": 0.42, "grad_norm": 1.9541414976119995, "learning_rate": 7.50944091463814e-06, "loss": 0.6553, "step": 6390 }, { "epoch": 0.42, "grad_norm": 1.3395729064941406, "learning_rate": 7.500000000000001e-06, "loss": 0.6601, "step": 6400 }, { "epoch": 0.42, "eval_loss": 0.7648976445198059, "eval_runtime": 134.0186, "eval_samples_per_second": 82.078, "eval_steps_per_second": 10.26, "step": 6400 }, { "epoch": 0.42, "grad_norm": 1.6933283805847168, "learning_rate": 7.4905471862486215e-06, "loss": 0.6627, "step": 6410 }, { "epoch": 0.42, "grad_norm": 1.5258747339248657, "learning_rate": 7.4810825183760425e-06, "loss": 0.6685, "step": 6420 }, { "epoch": 0.43, "grad_norm": 2.085606336593628, "learning_rate": 7.471606041430724e-06, "loss": 0.6646, "step": 6430 }, { "epoch": 0.43, "grad_norm": 2.31076979637146, "learning_rate": 7.462117800517337e-06, "loss": 0.6467, "step": 6440 }, { "epoch": 0.43, "grad_norm": 1.5522764921188354, "learning_rate": 7.4526178407965396e-06, "loss": 0.6559, "step": 6450 }, { "epoch": 0.43, "grad_norm": 1.5559918880462646, "learning_rate": 7.443106207484776e-06, "loss": 0.6486, "step": 6460 }, { "epoch": 0.43, "grad_norm": 1.428256630897522, "learning_rate": 7.433582945854041e-06, "loss": 0.6696, "step": 6470 }, { "epoch": 0.43, "grad_norm": 1.6516914367675781, "learning_rate": 7.424048101231687e-06, "loss": 0.6571, "step": 6480 }, { "epoch": 0.43, "grad_norm": 1.6921088695526123, "learning_rate": 7.414501719000187e-06, "loss": 0.6595, "step": 6490 }, { "epoch": 0.43, "grad_norm": 1.556532621383667, "learning_rate": 7.404943844596939e-06, "loss": 0.657, "step": 6500 }, { "epoch": 0.43, "grad_norm": 2.2812130451202393, "learning_rate": 7.3953745235140325e-06, "loss": 0.677, "step": 6510 }, { "epoch": 0.43, "grad_norm": 1.9262441396713257, "learning_rate": 7.3857938012980425e-06, "loss": 0.6451, "step": 6520 }, { "epoch": 0.43, "grad_norm": 1.781899094581604, "learning_rate": 7.3762017235498084e-06, "loss": 0.6774, "step": 6530 }, { "epoch": 0.43, "grad_norm": 1.732818841934204, "learning_rate": 7.3665983359242175e-06, "loss": 0.6724, "step": 6540 }, { "epoch": 0.43, "grad_norm": 1.8356454372406006, "learning_rate": 7.3569836841299905e-06, "loss": 0.682, "step": 6550 }, { "epoch": 0.43, "grad_norm": 1.787606954574585, "learning_rate": 7.347357813929455e-06, "loss": 0.6719, "step": 6560 }, { "epoch": 0.43, "grad_norm": 1.9841190576553345, "learning_rate": 7.337720771138343e-06, "loss": 0.6547, "step": 6570 }, { "epoch": 0.44, "grad_norm": 1.8264681100845337, "learning_rate": 7.328072601625558e-06, "loss": 0.6545, "step": 6580 }, { "epoch": 0.44, "grad_norm": 1.4462283849716187, "learning_rate": 7.318413351312965e-06, "loss": 0.6714, "step": 6590 }, { "epoch": 0.44, "grad_norm": 1.7890430688858032, "learning_rate": 7.308743066175172e-06, "loss": 0.6563, "step": 6600 }, { "epoch": 0.44, "grad_norm": 2.616171360015869, "learning_rate": 7.2990617922393e-06, "loss": 0.657, "step": 6610 }, { "epoch": 0.44, "grad_norm": 1.8816206455230713, "learning_rate": 7.289369575584783e-06, "loss": 0.6662, "step": 6620 }, { "epoch": 0.44, "grad_norm": 1.7342751026153564, "learning_rate": 7.279666462343138e-06, "loss": 0.674, "step": 6630 }, { "epoch": 0.44, "grad_norm": 2.6490211486816406, "learning_rate": 7.269952498697734e-06, "loss": 0.6869, "step": 6640 }, { "epoch": 0.44, "grad_norm": 1.4362220764160156, "learning_rate": 7.2602277308836e-06, "loss": 0.6465, "step": 6650 }, { "epoch": 0.44, "grad_norm": 1.6366841793060303, "learning_rate": 7.250492205187176e-06, "loss": 0.6526, "step": 6660 }, { "epoch": 0.44, "grad_norm": 1.906667709350586, "learning_rate": 7.240745967946113e-06, "loss": 0.6552, "step": 6670 }, { "epoch": 0.44, "grad_norm": 1.9176589250564575, "learning_rate": 7.2309890655490446e-06, "loss": 0.6732, "step": 6680 }, { "epoch": 0.44, "grad_norm": 1.6638537645339966, "learning_rate": 7.221221544435364e-06, "loss": 0.6552, "step": 6690 }, { "epoch": 0.44, "grad_norm": 1.9146836996078491, "learning_rate": 7.211443451095007e-06, "loss": 0.6516, "step": 6700 }, { "epoch": 0.44, "grad_norm": 1.5971161127090454, "learning_rate": 7.20165483206823e-06, "loss": 0.6429, "step": 6710 }, { "epoch": 0.44, "grad_norm": 1.771937370300293, "learning_rate": 7.191855733945388e-06, "loss": 0.6407, "step": 6720 }, { "epoch": 0.45, "grad_norm": 1.5045241117477417, "learning_rate": 7.18204620336671e-06, "loss": 0.6663, "step": 6730 }, { "epoch": 0.45, "grad_norm": 1.8442574739456177, "learning_rate": 7.172226287022086e-06, "loss": 0.6637, "step": 6740 }, { "epoch": 0.45, "grad_norm": 1.5092518329620361, "learning_rate": 7.162396031650831e-06, "loss": 0.6665, "step": 6750 }, { "epoch": 0.45, "grad_norm": 1.6536009311676025, "learning_rate": 7.1525554840414765e-06, "loss": 0.6544, "step": 6760 }, { "epoch": 0.45, "grad_norm": 2.160461902618408, "learning_rate": 7.142704691031537e-06, "loss": 0.6886, "step": 6770 }, { "epoch": 0.45, "grad_norm": 1.5910495519638062, "learning_rate": 7.132843699507292e-06, "loss": 0.6634, "step": 6780 }, { "epoch": 0.45, "grad_norm": 1.7818145751953125, "learning_rate": 7.1229725564035665e-06, "loss": 0.6761, "step": 6790 }, { "epoch": 0.45, "grad_norm": 1.350142478942871, "learning_rate": 7.113091308703498e-06, "loss": 0.6484, "step": 6800 }, { "epoch": 0.45, "grad_norm": 1.6214317083358765, "learning_rate": 7.103200003438322e-06, "loss": 0.6699, "step": 6810 }, { "epoch": 0.45, "grad_norm": 1.9810750484466553, "learning_rate": 7.093298687687141e-06, "loss": 0.6579, "step": 6820 }, { "epoch": 0.45, "grad_norm": 1.8099583387374878, "learning_rate": 7.08338740857671e-06, "loss": 0.6747, "step": 6830 }, { "epoch": 0.45, "grad_norm": 1.903903841972351, "learning_rate": 7.073466213281196e-06, "loss": 0.6625, "step": 6840 }, { "epoch": 0.45, "grad_norm": 1.7253398895263672, "learning_rate": 7.063535149021974e-06, "loss": 0.6655, "step": 6850 }, { "epoch": 0.45, "grad_norm": 1.8018195629119873, "learning_rate": 7.053594263067387e-06, "loss": 0.6587, "step": 6860 }, { "epoch": 0.45, "grad_norm": 1.7250744104385376, "learning_rate": 7.043643602732525e-06, "loss": 0.6414, "step": 6870 }, { "epoch": 0.46, "grad_norm": 1.4878661632537842, "learning_rate": 7.033683215379002e-06, "loss": 0.6509, "step": 6880 }, { "epoch": 0.46, "grad_norm": 1.621942400932312, "learning_rate": 7.023713148414728e-06, "loss": 0.6672, "step": 6890 }, { "epoch": 0.46, "grad_norm": 1.8879963159561157, "learning_rate": 7.0137334492936875e-06, "loss": 0.6588, "step": 6900 }, { "epoch": 0.46, "grad_norm": 1.4919434785842896, "learning_rate": 7.0037441655157045e-06, "loss": 0.6529, "step": 6910 }, { "epoch": 0.46, "grad_norm": 1.991425633430481, "learning_rate": 6.993745344626232e-06, "loss": 0.6455, "step": 6920 }, { "epoch": 0.46, "grad_norm": 3.1762428283691406, "learning_rate": 6.983737034216106e-06, "loss": 0.6531, "step": 6930 }, { "epoch": 0.46, "grad_norm": 1.9187873601913452, "learning_rate": 6.973719281921336e-06, "loss": 0.6828, "step": 6940 }, { "epoch": 0.46, "grad_norm": 1.8147242069244385, "learning_rate": 6.963692135422872e-06, "loss": 0.6663, "step": 6950 }, { "epoch": 0.46, "grad_norm": 1.9920296669006348, "learning_rate": 6.953655642446368e-06, "loss": 0.6571, "step": 6960 }, { "epoch": 0.46, "grad_norm": 2.697572946548462, "learning_rate": 6.943609850761979e-06, "loss": 0.6564, "step": 6970 }, { "epoch": 0.46, "grad_norm": 2.1568384170532227, "learning_rate": 6.933554808184104e-06, "loss": 0.6625, "step": 6980 }, { "epoch": 0.46, "grad_norm": 1.5813344717025757, "learning_rate": 6.9234905625711816e-06, "loss": 0.6872, "step": 6990 }, { "epoch": 0.46, "grad_norm": 1.5985502004623413, "learning_rate": 6.913417161825449e-06, "loss": 0.6762, "step": 7000 }, { "epoch": 0.46, "grad_norm": 2.154099941253662, "learning_rate": 6.9033346538927235e-06, "loss": 0.6392, "step": 7010 }, { "epoch": 0.46, "grad_norm": 2.5765979290008545, "learning_rate": 6.8932430867621655e-06, "loss": 0.6707, "step": 7020 }, { "epoch": 0.47, "grad_norm": 1.7588094472885132, "learning_rate": 6.883142508466054e-06, "loss": 0.6718, "step": 7030 }, { "epoch": 0.47, "grad_norm": 1.5742931365966797, "learning_rate": 6.873032967079562e-06, "loss": 0.6586, "step": 7040 }, { "epoch": 0.47, "grad_norm": 1.7835898399353027, "learning_rate": 6.862914510720515e-06, "loss": 0.6703, "step": 7050 }, { "epoch": 0.47, "grad_norm": 1.8514904975891113, "learning_rate": 6.852787187549182e-06, "loss": 0.6803, "step": 7060 }, { "epoch": 0.47, "grad_norm": 1.6340848207473755, "learning_rate": 6.842651045768026e-06, "loss": 0.6881, "step": 7070 }, { "epoch": 0.47, "grad_norm": 1.552843451499939, "learning_rate": 6.832506133621487e-06, "loss": 0.6789, "step": 7080 }, { "epoch": 0.47, "grad_norm": 1.7644469738006592, "learning_rate": 6.822352499395751e-06, "loss": 0.6853, "step": 7090 }, { "epoch": 0.47, "grad_norm": 1.6095765829086304, "learning_rate": 6.812190191418508e-06, "loss": 0.6491, "step": 7100 }, { "epoch": 0.47, "grad_norm": 1.5338112115859985, "learning_rate": 6.80201925805875e-06, "loss": 0.6573, "step": 7110 }, { "epoch": 0.47, "grad_norm": 1.9101743698120117, "learning_rate": 6.7918397477265e-06, "loss": 0.6727, "step": 7120 }, { "epoch": 0.47, "grad_norm": 2.3680825233459473, "learning_rate": 6.781651708872629e-06, "loss": 0.6728, "step": 7130 }, { "epoch": 0.47, "grad_norm": 1.8265564441680908, "learning_rate": 6.771455189988579e-06, "loss": 0.6851, "step": 7140 }, { "epoch": 0.47, "grad_norm": 1.4049782752990723, "learning_rate": 6.7612502396061685e-06, "loss": 0.67, "step": 7150 }, { "epoch": 0.47, "grad_norm": 2.181297779083252, "learning_rate": 6.751036906297338e-06, "loss": 0.6373, "step": 7160 }, { "epoch": 0.47, "grad_norm": 2.1673145294189453, "learning_rate": 6.740815238673932e-06, "loss": 0.6639, "step": 7170 }, { "epoch": 0.48, "grad_norm": 1.8220609426498413, "learning_rate": 6.730585285387465e-06, "loss": 0.6712, "step": 7180 }, { "epoch": 0.48, "grad_norm": 1.3301570415496826, "learning_rate": 6.720347095128884e-06, "loss": 0.6792, "step": 7190 }, { "epoch": 0.48, "grad_norm": 2.4812264442443848, "learning_rate": 6.710100716628345e-06, "loss": 0.6863, "step": 7200 }, { "epoch": 0.48, "grad_norm": 1.278617024421692, "learning_rate": 6.6998461986549715e-06, "loss": 0.677, "step": 7210 }, { "epoch": 0.48, "grad_norm": 1.8752747774124146, "learning_rate": 6.689583590016636e-06, "loss": 0.666, "step": 7220 }, { "epoch": 0.48, "grad_norm": 1.41091787815094, "learning_rate": 6.679312939559712e-06, "loss": 0.6868, "step": 7230 }, { "epoch": 0.48, "grad_norm": 1.9605522155761719, "learning_rate": 6.669034296168855e-06, "loss": 0.6784, "step": 7240 }, { "epoch": 0.48, "grad_norm": 1.8834712505340576, "learning_rate": 6.6587477087667615e-06, "loss": 0.6703, "step": 7250 }, { "epoch": 0.48, "grad_norm": 1.7570191621780396, "learning_rate": 6.648453226313937e-06, "loss": 0.6847, "step": 7260 }, { "epoch": 0.48, "grad_norm": 2.3550283908843994, "learning_rate": 6.638150897808469e-06, "loss": 0.6582, "step": 7270 }, { "epoch": 0.48, "grad_norm": 1.2605594396591187, "learning_rate": 6.627840772285784e-06, "loss": 0.6684, "step": 7280 }, { "epoch": 0.48, "grad_norm": 1.6544970273971558, "learning_rate": 6.617522898818426e-06, "loss": 0.6698, "step": 7290 }, { "epoch": 0.48, "grad_norm": 1.7877496480941772, "learning_rate": 6.607197326515808e-06, "loss": 0.6608, "step": 7300 }, { "epoch": 0.48, "grad_norm": 1.8587825298309326, "learning_rate": 6.596864104523996e-06, "loss": 0.6637, "step": 7310 }, { "epoch": 0.48, "grad_norm": 1.8580244779586792, "learning_rate": 6.586523282025462e-06, "loss": 0.6681, "step": 7320 }, { "epoch": 0.49, "grad_norm": 1.5482275485992432, "learning_rate": 6.57617490823885e-06, "loss": 0.6583, "step": 7330 }, { "epoch": 0.49, "grad_norm": 1.6002042293548584, "learning_rate": 6.565819032418748e-06, "loss": 0.682, "step": 7340 }, { "epoch": 0.49, "grad_norm": 1.9164557456970215, "learning_rate": 6.555455703855454e-06, "loss": 0.667, "step": 7350 }, { "epoch": 0.49, "grad_norm": 1.4982038736343384, "learning_rate": 6.545084971874738e-06, "loss": 0.6675, "step": 7360 }, { "epoch": 0.49, "grad_norm": 1.3651514053344727, "learning_rate": 6.534706885837601e-06, "loss": 0.659, "step": 7370 }, { "epoch": 0.49, "grad_norm": 1.6981933116912842, "learning_rate": 6.5243214951400545e-06, "loss": 0.6691, "step": 7380 }, { "epoch": 0.49, "grad_norm": 1.5667850971221924, "learning_rate": 6.513928849212874e-06, "loss": 0.6888, "step": 7390 }, { "epoch": 0.49, "grad_norm": 2.9156270027160645, "learning_rate": 6.503528997521365e-06, "loss": 0.665, "step": 7400 }, { "epoch": 0.49, "grad_norm": 1.765031099319458, "learning_rate": 6.4931219895651384e-06, "loss": 0.6694, "step": 7410 }, { "epoch": 0.49, "grad_norm": 1.2956502437591553, "learning_rate": 6.482707874877855e-06, "loss": 0.6606, "step": 7420 }, { "epoch": 0.49, "grad_norm": 1.3494573831558228, "learning_rate": 6.472286703027011e-06, "loss": 0.6469, "step": 7430 }, { "epoch": 0.49, "grad_norm": 1.6514666080474854, "learning_rate": 6.461858523613684e-06, "loss": 0.6688, "step": 7440 }, { "epoch": 0.49, "grad_norm": 1.7511714696884155, "learning_rate": 6.451423386272312e-06, "loss": 0.655, "step": 7450 }, { "epoch": 0.49, "grad_norm": 1.893713355064392, "learning_rate": 6.440981340670447e-06, "loss": 0.6512, "step": 7460 }, { "epoch": 0.49, "grad_norm": 1.3743759393692017, "learning_rate": 6.430532436508522e-06, "loss": 0.6542, "step": 7470 }, { "epoch": 0.49, "grad_norm": 1.3243224620819092, "learning_rate": 6.420076723519615e-06, "loss": 0.6565, "step": 7480 }, { "epoch": 0.5, "grad_norm": 2.08793568611145, "learning_rate": 6.4096142514692085e-06, "loss": 0.6679, "step": 7490 }, { "epoch": 0.5, "grad_norm": 1.69426691532135, "learning_rate": 6.399145070154962e-06, "loss": 0.6747, "step": 7500 }, { "epoch": 0.5, "grad_norm": 1.9869012832641602, "learning_rate": 6.388669229406462e-06, "loss": 0.6755, "step": 7510 }, { "epoch": 0.5, "grad_norm": 1.9419549703598022, "learning_rate": 6.378186779084996e-06, "loss": 0.6928, "step": 7520 }, { "epoch": 0.5, "grad_norm": 1.703644871711731, "learning_rate": 6.367697769083312e-06, "loss": 0.6848, "step": 7530 }, { "epoch": 0.5, "grad_norm": 1.7564888000488281, "learning_rate": 6.3572022493253715e-06, "loss": 0.6626, "step": 7540 }, { "epoch": 0.5, "grad_norm": 1.867537260055542, "learning_rate": 6.346700269766132e-06, "loss": 0.6472, "step": 7550 }, { "epoch": 0.5, "grad_norm": 1.6203207969665527, "learning_rate": 6.336191880391285e-06, "loss": 0.6706, "step": 7560 }, { "epoch": 0.5, "grad_norm": 1.374058485031128, "learning_rate": 6.325677131217041e-06, "loss": 0.6603, "step": 7570 }, { "epoch": 0.5, "grad_norm": 1.7284061908721924, "learning_rate": 6.315156072289874e-06, "loss": 0.6659, "step": 7580 }, { "epoch": 0.5, "grad_norm": 1.3828376531600952, "learning_rate": 6.304628753686295e-06, "loss": 0.666, "step": 7590 }, { "epoch": 0.5, "grad_norm": 1.9280568361282349, "learning_rate": 6.294095225512604e-06, "loss": 0.6584, "step": 7600 }, { "epoch": 0.5, "grad_norm": 1.9138083457946777, "learning_rate": 6.283555537904659e-06, "loss": 0.6575, "step": 7610 }, { "epoch": 0.5, "grad_norm": 1.4988782405853271, "learning_rate": 6.273009741027638e-06, "loss": 0.676, "step": 7620 }, { "epoch": 0.5, "grad_norm": 1.5602288246154785, "learning_rate": 6.26245788507579e-06, "loss": 0.6582, "step": 7630 }, { "epoch": 0.51, "grad_norm": 1.3849523067474365, "learning_rate": 6.251900020272208e-06, "loss": 0.6835, "step": 7640 }, { "epoch": 0.51, "grad_norm": 1.7585147619247437, "learning_rate": 6.241336196868582e-06, "loss": 0.673, "step": 7650 }, { "epoch": 0.51, "grad_norm": 1.720861792564392, "learning_rate": 6.230766465144966e-06, "loss": 0.6857, "step": 7660 }, { "epoch": 0.51, "grad_norm": 1.6793100833892822, "learning_rate": 6.220190875409533e-06, "loss": 0.6667, "step": 7670 }, { "epoch": 0.51, "grad_norm": 2.7875709533691406, "learning_rate": 6.209609477998339e-06, "loss": 0.6624, "step": 7680 }, { "epoch": 0.51, "grad_norm": 1.640952706336975, "learning_rate": 6.199022323275083e-06, "loss": 0.6723, "step": 7690 }, { "epoch": 0.51, "grad_norm": 1.5932248830795288, "learning_rate": 6.188429461630866e-06, "loss": 0.6525, "step": 7700 }, { "epoch": 0.51, "grad_norm": 2.364243984222412, "learning_rate": 6.177830943483952e-06, "loss": 0.6771, "step": 7710 }, { "epoch": 0.51, "grad_norm": 1.9060790538787842, "learning_rate": 6.1672268192795285e-06, "loss": 0.6932, "step": 7720 }, { "epoch": 0.51, "grad_norm": 1.844760537147522, "learning_rate": 6.156617139489465e-06, "loss": 0.6684, "step": 7730 }, { "epoch": 0.51, "grad_norm": 1.5574620962142944, "learning_rate": 6.146001954612072e-06, "loss": 0.6814, "step": 7740 }, { "epoch": 0.51, "grad_norm": 1.8839787244796753, "learning_rate": 6.135381315171867e-06, "loss": 0.6783, "step": 7750 }, { "epoch": 0.51, "grad_norm": 1.6583342552185059, "learning_rate": 6.124755271719326e-06, "loss": 0.6655, "step": 7760 }, { "epoch": 0.51, "grad_norm": 1.7036808729171753, "learning_rate": 6.114123874830647e-06, "loss": 0.6814, "step": 7770 }, { "epoch": 0.51, "grad_norm": 1.3549695014953613, "learning_rate": 6.103487175107508e-06, "loss": 0.6738, "step": 7780 }, { "epoch": 0.52, "grad_norm": 1.3941162824630737, "learning_rate": 6.092845223176823e-06, "loss": 0.6722, "step": 7790 }, { "epoch": 0.52, "grad_norm": 1.9601216316223145, "learning_rate": 6.0821980696905145e-06, "loss": 0.6645, "step": 7800 }, { "epoch": 0.52, "grad_norm": 1.757865309715271, "learning_rate": 6.071545765325254e-06, "loss": 0.6782, "step": 7810 }, { "epoch": 0.52, "grad_norm": 1.4168590307235718, "learning_rate": 6.060888360782232e-06, "loss": 0.6648, "step": 7820 }, { "epoch": 0.52, "grad_norm": 1.7266621589660645, "learning_rate": 6.050225906786913e-06, "loss": 0.6729, "step": 7830 }, { "epoch": 0.52, "grad_norm": 1.7137846946716309, "learning_rate": 6.039558454088796e-06, "loss": 0.6863, "step": 7840 }, { "epoch": 0.52, "grad_norm": 1.9252254962921143, "learning_rate": 6.028886053461175e-06, "loss": 0.6348, "step": 7850 }, { "epoch": 0.52, "grad_norm": 1.7567081451416016, "learning_rate": 6.0182087557008875e-06, "loss": 0.6672, "step": 7860 }, { "epoch": 0.52, "grad_norm": 1.5034328699111938, "learning_rate": 6.0075266116280865e-06, "loss": 0.6714, "step": 7870 }, { "epoch": 0.52, "grad_norm": 1.4937865734100342, "learning_rate": 5.996839672085986e-06, "loss": 0.6555, "step": 7880 }, { "epoch": 0.52, "grad_norm": 1.4067682027816772, "learning_rate": 5.986147987940632e-06, "loss": 0.6473, "step": 7890 }, { "epoch": 0.52, "grad_norm": 1.5189076662063599, "learning_rate": 5.975451610080643e-06, "loss": 0.6583, "step": 7900 }, { "epoch": 0.52, "grad_norm": 1.8422696590423584, "learning_rate": 5.964750589416985e-06, "loss": 0.6659, "step": 7910 }, { "epoch": 0.52, "grad_norm": 1.4895750284194946, "learning_rate": 5.954044976882725e-06, "loss": 0.6653, "step": 7920 }, { "epoch": 0.52, "grad_norm": 1.71246337890625, "learning_rate": 5.943334823432777e-06, "loss": 0.6742, "step": 7930 }, { "epoch": 0.53, "grad_norm": 2.6499202251434326, "learning_rate": 5.932620180043674e-06, "loss": 0.6748, "step": 7940 }, { "epoch": 0.53, "grad_norm": 1.4720969200134277, "learning_rate": 5.921901097713317e-06, "loss": 0.6714, "step": 7950 }, { "epoch": 0.53, "grad_norm": 1.7344847917556763, "learning_rate": 5.911177627460739e-06, "loss": 0.6691, "step": 7960 }, { "epoch": 0.53, "grad_norm": 1.5558526515960693, "learning_rate": 5.9004498203258495e-06, "loss": 0.7011, "step": 7970 }, { "epoch": 0.53, "grad_norm": 1.4514857530593872, "learning_rate": 5.88971772736921e-06, "loss": 0.6828, "step": 7980 }, { "epoch": 0.53, "grad_norm": 1.6652244329452515, "learning_rate": 5.878981399671774e-06, "loss": 0.6727, "step": 7990 }, { "epoch": 0.53, "grad_norm": 1.811303734779358, "learning_rate": 5.8682408883346535e-06, "loss": 0.6786, "step": 8000 }, { "epoch": 0.53, "eval_loss": 0.7684772610664368, "eval_runtime": 134.4971, "eval_samples_per_second": 81.786, "eval_steps_per_second": 10.223, "step": 8000 }, { "epoch": 0.53, "grad_norm": 1.600232481956482, "learning_rate": 5.8574962444788715e-06, "loss": 0.6763, "step": 8010 }, { "epoch": 0.53, "grad_norm": 1.4658622741699219, "learning_rate": 5.846747519245123e-06, "loss": 0.6784, "step": 8020 }, { "epoch": 0.53, "grad_norm": 1.8189136981964111, "learning_rate": 5.835994763793529e-06, "loss": 0.6676, "step": 8030 }, { "epoch": 0.53, "grad_norm": 1.8251430988311768, "learning_rate": 5.825238029303388e-06, "loss": 0.6829, "step": 8040 }, { "epoch": 0.53, "grad_norm": 1.769020676612854, "learning_rate": 5.814477366972945e-06, "loss": 0.6711, "step": 8050 }, { "epoch": 0.53, "grad_norm": 1.7342435121536255, "learning_rate": 5.8037128280191315e-06, "loss": 0.6458, "step": 8060 }, { "epoch": 0.53, "grad_norm": 2.0114922523498535, "learning_rate": 5.792944463677336e-06, "loss": 0.6412, "step": 8070 }, { "epoch": 0.53, "grad_norm": 1.960257649421692, "learning_rate": 5.782172325201155e-06, "loss": 0.6765, "step": 8080 }, { "epoch": 0.54, "grad_norm": 1.7859301567077637, "learning_rate": 5.771396463862145e-06, "loss": 0.6751, "step": 8090 }, { "epoch": 0.54, "grad_norm": 1.7072672843933105, "learning_rate": 5.760616930949584e-06, "loss": 0.6676, "step": 8100 }, { "epoch": 0.54, "grad_norm": 1.50044584274292, "learning_rate": 5.749833777770225e-06, "loss": 0.6729, "step": 8110 }, { "epoch": 0.54, "grad_norm": 1.5489919185638428, "learning_rate": 5.7390470556480545e-06, "loss": 0.6773, "step": 8120 }, { "epoch": 0.54, "grad_norm": 1.5351699590682983, "learning_rate": 5.7282568159240405e-06, "loss": 0.6531, "step": 8130 }, { "epoch": 0.54, "grad_norm": 1.8078864812850952, "learning_rate": 5.717463109955896e-06, "loss": 0.6546, "step": 8140 }, { "epoch": 0.54, "grad_norm": 1.7729016542434692, "learning_rate": 5.7066659891178385e-06, "loss": 0.6725, "step": 8150 }, { "epoch": 0.54, "grad_norm": 1.5250924825668335, "learning_rate": 5.695865504800328e-06, "loss": 0.6748, "step": 8160 }, { "epoch": 0.54, "grad_norm": 1.7680206298828125, "learning_rate": 5.6850617084098416e-06, "loss": 0.6629, "step": 8170 }, { "epoch": 0.54, "grad_norm": 1.5990138053894043, "learning_rate": 5.674254651368616e-06, "loss": 0.6725, "step": 8180 }, { "epoch": 0.54, "grad_norm": 1.87677001953125, "learning_rate": 5.6634443851144115e-06, "loss": 0.6628, "step": 8190 }, { "epoch": 0.54, "grad_norm": 6.757842540740967, "learning_rate": 5.65263096110026e-06, "loss": 0.6609, "step": 8200 }, { "epoch": 0.54, "grad_norm": 1.3478131294250488, "learning_rate": 5.641814430794222e-06, "loss": 0.6476, "step": 8210 }, { "epoch": 0.54, "grad_norm": 1.5324090719223022, "learning_rate": 5.63099484567915e-06, "loss": 0.648, "step": 8220 }, { "epoch": 0.54, "grad_norm": 1.4732797145843506, "learning_rate": 5.620172257252427e-06, "loss": 0.6745, "step": 8230 }, { "epoch": 0.55, "grad_norm": 1.4768403768539429, "learning_rate": 5.609346717025738e-06, "loss": 0.6811, "step": 8240 }, { "epoch": 0.55, "grad_norm": 1.409543752670288, "learning_rate": 5.598518276524813e-06, "loss": 0.6795, "step": 8250 }, { "epoch": 0.55, "grad_norm": 1.5199010372161865, "learning_rate": 5.587686987289189e-06, "loss": 0.6879, "step": 8260 }, { "epoch": 0.55, "grad_norm": 1.5905330181121826, "learning_rate": 5.57685290087196e-06, "loss": 0.6585, "step": 8270 }, { "epoch": 0.55, "grad_norm": 1.3945244550704956, "learning_rate": 5.566016068839535e-06, "loss": 0.6677, "step": 8280 }, { "epoch": 0.55, "grad_norm": 1.805010437965393, "learning_rate": 5.555176542771389e-06, "loss": 0.682, "step": 8290 }, { "epoch": 0.55, "grad_norm": 1.46513032913208, "learning_rate": 5.544334374259823e-06, "loss": 0.654, "step": 8300 }, { "epoch": 0.55, "grad_norm": 1.642730474472046, "learning_rate": 5.533489614909714e-06, "loss": 0.6548, "step": 8310 }, { "epoch": 0.55, "grad_norm": 1.5549567937850952, "learning_rate": 5.522642316338268e-06, "loss": 0.66, "step": 8320 }, { "epoch": 0.55, "grad_norm": 1.639410376548767, "learning_rate": 5.51179253017478e-06, "loss": 0.6582, "step": 8330 }, { "epoch": 0.55, "grad_norm": 1.937220811843872, "learning_rate": 5.500940308060382e-06, "loss": 0.6769, "step": 8340 }, { "epoch": 0.55, "grad_norm": 1.5718787908554077, "learning_rate": 5.490085701647805e-06, "loss": 0.6638, "step": 8350 }, { "epoch": 0.55, "grad_norm": 1.5767078399658203, "learning_rate": 5.4792287626011206e-06, "loss": 0.6472, "step": 8360 }, { "epoch": 0.55, "grad_norm": 1.781279444694519, "learning_rate": 5.468369542595512e-06, "loss": 0.6778, "step": 8370 }, { "epoch": 0.55, "grad_norm": 1.7731683254241943, "learning_rate": 5.457508093317013e-06, "loss": 0.6707, "step": 8380 }, { "epoch": 0.56, "grad_norm": 1.4716190099716187, "learning_rate": 5.446644466462269e-06, "loss": 0.6691, "step": 8390 }, { "epoch": 0.56, "grad_norm": 1.4966431856155396, "learning_rate": 5.435778713738292e-06, "loss": 0.6587, "step": 8400 }, { "epoch": 0.56, "grad_norm": 2.055002212524414, "learning_rate": 5.4249108868622095e-06, "loss": 0.6513, "step": 8410 }, { "epoch": 0.56, "grad_norm": 1.5098994970321655, "learning_rate": 5.414041037561022e-06, "loss": 0.6579, "step": 8420 }, { "epoch": 0.56, "grad_norm": 1.787880301475525, "learning_rate": 5.403169217571359e-06, "loss": 0.6577, "step": 8430 }, { "epoch": 0.56, "grad_norm": 1.8359959125518799, "learning_rate": 5.392295478639226e-06, "loss": 0.6628, "step": 8440 }, { "epoch": 0.56, "grad_norm": 1.8961970806121826, "learning_rate": 5.381419872519763e-06, "loss": 0.6444, "step": 8450 }, { "epoch": 0.56, "grad_norm": 1.7812308073043823, "learning_rate": 5.3705424509769976e-06, "loss": 0.643, "step": 8460 }, { "epoch": 0.56, "grad_norm": 1.8749055862426758, "learning_rate": 5.3596632657835975e-06, "loss": 0.694, "step": 8470 }, { "epoch": 0.56, "grad_norm": 1.7846895456314087, "learning_rate": 5.348782368720627e-06, "loss": 0.6745, "step": 8480 }, { "epoch": 0.56, "grad_norm": 2.077484607696533, "learning_rate": 5.337899811577297e-06, "loss": 0.6886, "step": 8490 }, { "epoch": 0.56, "grad_norm": 1.7645025253295898, "learning_rate": 5.327015646150716e-06, "loss": 0.6339, "step": 8500 }, { "epoch": 0.56, "grad_norm": 1.7799251079559326, "learning_rate": 5.3161299242456535e-06, "loss": 0.6564, "step": 8510 }, { "epoch": 0.56, "grad_norm": 1.8336862325668335, "learning_rate": 5.3052426976742855e-06, "loss": 0.6761, "step": 8520 }, { "epoch": 0.56, "grad_norm": 1.5300918817520142, "learning_rate": 5.294354018255945e-06, "loss": 0.6634, "step": 8530 }, { "epoch": 0.57, "grad_norm": 1.747183084487915, "learning_rate": 5.283463937816888e-06, "loss": 0.6465, "step": 8540 }, { "epoch": 0.57, "grad_norm": 1.5539292097091675, "learning_rate": 5.272572508190033e-06, "loss": 0.6844, "step": 8550 }, { "epoch": 0.57, "grad_norm": 2.0540542602539062, "learning_rate": 5.2616797812147205e-06, "loss": 0.6938, "step": 8560 }, { "epoch": 0.57, "grad_norm": 1.6536123752593994, "learning_rate": 5.250785808736467e-06, "loss": 0.6707, "step": 8570 }, { "epoch": 0.57, "grad_norm": 1.6305638551712036, "learning_rate": 5.23989064260672e-06, "loss": 0.6685, "step": 8580 }, { "epoch": 0.57, "grad_norm": 1.6948388814926147, "learning_rate": 5.228994334682605e-06, "loss": 0.6685, "step": 8590 }, { "epoch": 0.57, "grad_norm": 1.391671895980835, "learning_rate": 5.218096936826681e-06, "loss": 0.6609, "step": 8600 }, { "epoch": 0.57, "grad_norm": 1.8620556592941284, "learning_rate": 5.207198500906698e-06, "loss": 0.6701, "step": 8610 }, { "epoch": 0.57, "grad_norm": 1.3646891117095947, "learning_rate": 5.1962990787953436e-06, "loss": 0.6421, "step": 8620 }, { "epoch": 0.57, "grad_norm": 2.0592386722564697, "learning_rate": 5.185398722370004e-06, "loss": 0.6551, "step": 8630 }, { "epoch": 0.57, "grad_norm": 1.6097017526626587, "learning_rate": 5.174497483512506e-06, "loss": 0.6714, "step": 8640 }, { "epoch": 0.57, "grad_norm": 1.8083919286727905, "learning_rate": 5.1635954141088815e-06, "loss": 0.6529, "step": 8650 }, { "epoch": 0.57, "grad_norm": 2.0661661624908447, "learning_rate": 5.1526925660491145e-06, "loss": 0.6454, "step": 8660 }, { "epoch": 0.57, "grad_norm": 1.97969388961792, "learning_rate": 5.141788991226892e-06, "loss": 0.6633, "step": 8670 }, { "epoch": 0.57, "grad_norm": 1.696077823638916, "learning_rate": 5.130884741539367e-06, "loss": 0.6604, "step": 8680 }, { "epoch": 0.58, "grad_norm": 1.7397289276123047, "learning_rate": 5.1199798688868955e-06, "loss": 0.6747, "step": 8690 }, { "epoch": 0.58, "grad_norm": 1.664778232574463, "learning_rate": 5.109074425172806e-06, "loss": 0.6572, "step": 8700 }, { "epoch": 0.58, "grad_norm": 1.5999622344970703, "learning_rate": 5.098168462303141e-06, "loss": 0.6849, "step": 8710 }, { "epoch": 0.58, "grad_norm": 1.667110562324524, "learning_rate": 5.087262032186418e-06, "loss": 0.6791, "step": 8720 }, { "epoch": 0.58, "grad_norm": 1.889437198638916, "learning_rate": 5.076355186733373e-06, "loss": 0.6677, "step": 8730 }, { "epoch": 0.58, "grad_norm": 1.882400631904602, "learning_rate": 5.065447977856723e-06, "loss": 0.638, "step": 8740 }, { "epoch": 0.58, "grad_norm": 1.7043653726577759, "learning_rate": 5.054540457470912e-06, "loss": 0.6703, "step": 8750 }, { "epoch": 0.58, "grad_norm": 1.6736963987350464, "learning_rate": 5.04363267749187e-06, "loss": 0.656, "step": 8760 }, { "epoch": 0.58, "grad_norm": 2.609456777572632, "learning_rate": 5.0327246898367595e-06, "loss": 0.6665, "step": 8770 }, { "epoch": 0.58, "grad_norm": 1.9416409730911255, "learning_rate": 5.021816546423734e-06, "loss": 0.6706, "step": 8780 }, { "epoch": 0.58, "grad_norm": 1.6183526515960693, "learning_rate": 5.010908299171685e-06, "loss": 0.6759, "step": 8790 }, { "epoch": 0.58, "grad_norm": 1.77249014377594, "learning_rate": 5e-06, "loss": 0.6524, "step": 8800 }, { "epoch": 0.58, "grad_norm": 1.4985134601593018, "learning_rate": 4.989091700828316e-06, "loss": 0.6703, "step": 8810 }, { "epoch": 0.58, "grad_norm": 2.138824462890625, "learning_rate": 4.978183453576268e-06, "loss": 0.6905, "step": 8820 }, { "epoch": 0.58, "grad_norm": 1.2800077199935913, "learning_rate": 4.967275310163241e-06, "loss": 0.6383, "step": 8830 }, { "epoch": 0.58, "grad_norm": 1.545478105545044, "learning_rate": 4.956367322508131e-06, "loss": 0.6501, "step": 8840 }, { "epoch": 0.59, "grad_norm": 1.9019412994384766, "learning_rate": 4.945459542529089e-06, "loss": 0.6876, "step": 8850 }, { "epoch": 0.59, "grad_norm": 1.6927233934402466, "learning_rate": 4.934552022143279e-06, "loss": 0.6647, "step": 8860 }, { "epoch": 0.59, "grad_norm": 1.4633018970489502, "learning_rate": 4.923644813266629e-06, "loss": 0.6555, "step": 8870 }, { "epoch": 0.59, "grad_norm": 1.59763765335083, "learning_rate": 4.9127379678135825e-06, "loss": 0.6411, "step": 8880 }, { "epoch": 0.59, "grad_norm": 1.7652654647827148, "learning_rate": 4.90183153769686e-06, "loss": 0.6775, "step": 8890 }, { "epoch": 0.59, "grad_norm": 3.181339740753174, "learning_rate": 4.890925574827195e-06, "loss": 0.659, "step": 8900 }, { "epoch": 0.59, "grad_norm": 1.6873351335525513, "learning_rate": 4.880020131113107e-06, "loss": 0.6394, "step": 8910 }, { "epoch": 0.59, "grad_norm": 1.513166904449463, "learning_rate": 4.869115258460636e-06, "loss": 0.6576, "step": 8920 }, { "epoch": 0.59, "grad_norm": 1.5530864000320435, "learning_rate": 4.8582110087731095e-06, "loss": 0.6396, "step": 8930 }, { "epoch": 0.59, "grad_norm": 2.308940887451172, "learning_rate": 4.847307433950888e-06, "loss": 0.6985, "step": 8940 }, { "epoch": 0.59, "grad_norm": 1.679738998413086, "learning_rate": 4.83640458589112e-06, "loss": 0.6722, "step": 8950 }, { "epoch": 0.59, "grad_norm": 1.9519011974334717, "learning_rate": 4.825502516487497e-06, "loss": 0.676, "step": 8960 }, { "epoch": 0.59, "grad_norm": 1.7706700563430786, "learning_rate": 4.8146012776299984e-06, "loss": 0.6791, "step": 8970 }, { "epoch": 0.59, "grad_norm": 1.718878984451294, "learning_rate": 4.803700921204659e-06, "loss": 0.66, "step": 8980 }, { "epoch": 0.59, "grad_norm": 1.953933835029602, "learning_rate": 4.792801499093305e-06, "loss": 0.6872, "step": 8990 }, { "epoch": 0.6, "grad_norm": 1.6954289674758911, "learning_rate": 4.781903063173321e-06, "loss": 0.6759, "step": 9000 }, { "epoch": 0.6, "grad_norm": 1.7253254652023315, "learning_rate": 4.771005665317398e-06, "loss": 0.6621, "step": 9010 }, { "epoch": 0.6, "grad_norm": 1.4226269721984863, "learning_rate": 4.760109357393282e-06, "loss": 0.643, "step": 9020 }, { "epoch": 0.6, "grad_norm": 1.6937118768692017, "learning_rate": 4.749214191263533e-06, "loss": 0.6726, "step": 9030 }, { "epoch": 0.6, "grad_norm": 1.4342418909072876, "learning_rate": 4.738320218785281e-06, "loss": 0.681, "step": 9040 }, { "epoch": 0.6, "grad_norm": 1.6379367113113403, "learning_rate": 4.727427491809968e-06, "loss": 0.6942, "step": 9050 }, { "epoch": 0.6, "grad_norm": 1.7488796710968018, "learning_rate": 4.716536062183112e-06, "loss": 0.6669, "step": 9060 }, { "epoch": 0.6, "grad_norm": 1.671850323677063, "learning_rate": 4.705645981744055e-06, "loss": 0.6713, "step": 9070 }, { "epoch": 0.6, "grad_norm": 1.6623774766921997, "learning_rate": 4.694757302325715e-06, "loss": 0.668, "step": 9080 }, { "epoch": 0.6, "grad_norm": 2.3808863162994385, "learning_rate": 4.683870075754347e-06, "loss": 0.688, "step": 9090 }, { "epoch": 0.6, "grad_norm": 1.9445726871490479, "learning_rate": 4.672984353849285e-06, "loss": 0.669, "step": 9100 }, { "epoch": 0.6, "grad_norm": 1.3454090356826782, "learning_rate": 4.662100188422705e-06, "loss": 0.6616, "step": 9110 }, { "epoch": 0.6, "grad_norm": 1.649337649345398, "learning_rate": 4.651217631279374e-06, "loss": 0.6416, "step": 9120 }, { "epoch": 0.6, "grad_norm": 1.760851502418518, "learning_rate": 4.640336734216403e-06, "loss": 0.678, "step": 9130 }, { "epoch": 0.6, "grad_norm": 1.77865469455719, "learning_rate": 4.629457549023004e-06, "loss": 0.6744, "step": 9140 }, { "epoch": 0.61, "grad_norm": 1.9781075716018677, "learning_rate": 4.618580127480239e-06, "loss": 0.6548, "step": 9150 }, { "epoch": 0.61, "grad_norm": 1.4706860780715942, "learning_rate": 4.6077045213607765e-06, "loss": 0.6828, "step": 9160 }, { "epoch": 0.61, "grad_norm": 1.6444385051727295, "learning_rate": 4.596830782428642e-06, "loss": 0.6629, "step": 9170 }, { "epoch": 0.61, "grad_norm": 1.650578498840332, "learning_rate": 4.58595896243898e-06, "loss": 0.6529, "step": 9180 }, { "epoch": 0.61, "grad_norm": 1.7814977169036865, "learning_rate": 4.575089113137792e-06, "loss": 0.6633, "step": 9190 }, { "epoch": 0.61, "grad_norm": 1.5504720211029053, "learning_rate": 4.564221286261709e-06, "loss": 0.6463, "step": 9200 }, { "epoch": 0.61, "grad_norm": 1.6057378053665161, "learning_rate": 4.553355533537732e-06, "loss": 0.669, "step": 9210 }, { "epoch": 0.61, "grad_norm": 1.5203750133514404, "learning_rate": 4.542491906682988e-06, "loss": 0.6646, "step": 9220 }, { "epoch": 0.61, "grad_norm": 2.8241419792175293, "learning_rate": 4.53163045740449e-06, "loss": 0.6652, "step": 9230 }, { "epoch": 0.61, "grad_norm": 1.680282711982727, "learning_rate": 4.52077123739888e-06, "loss": 0.6757, "step": 9240 }, { "epoch": 0.61, "grad_norm": 1.5383038520812988, "learning_rate": 4.509914298352197e-06, "loss": 0.6775, "step": 9250 }, { "epoch": 0.61, "grad_norm": 1.3444230556488037, "learning_rate": 4.49905969193962e-06, "loss": 0.6407, "step": 9260 }, { "epoch": 0.61, "grad_norm": 1.9666281938552856, "learning_rate": 4.488207469825221e-06, "loss": 0.6731, "step": 9270 }, { "epoch": 0.61, "grad_norm": 1.1215275526046753, "learning_rate": 4.477357683661734e-06, "loss": 0.6692, "step": 9280 }, { "epoch": 0.61, "grad_norm": 1.7040753364562988, "learning_rate": 4.466510385090287e-06, "loss": 0.6571, "step": 9290 }, { "epoch": 0.62, "grad_norm": 1.5992120504379272, "learning_rate": 4.4556656257401786e-06, "loss": 0.6658, "step": 9300 }, { "epoch": 0.62, "grad_norm": 1.7472400665283203, "learning_rate": 4.4448234572286126e-06, "loss": 0.6786, "step": 9310 }, { "epoch": 0.62, "grad_norm": 1.5891025066375732, "learning_rate": 4.4339839311604675e-06, "loss": 0.6595, "step": 9320 }, { "epoch": 0.62, "grad_norm": 1.8652875423431396, "learning_rate": 4.4231470991280425e-06, "loss": 0.6568, "step": 9330 }, { "epoch": 0.62, "grad_norm": 1.7360942363739014, "learning_rate": 4.4123130127108125e-06, "loss": 0.6626, "step": 9340 }, { "epoch": 0.62, "grad_norm": 1.7511422634124756, "learning_rate": 4.401481723475189e-06, "loss": 0.6569, "step": 9350 }, { "epoch": 0.62, "grad_norm": 1.7602146863937378, "learning_rate": 4.390653282974264e-06, "loss": 0.6431, "step": 9360 }, { "epoch": 0.62, "grad_norm": 2.336517095565796, "learning_rate": 4.379827742747575e-06, "loss": 0.6788, "step": 9370 }, { "epoch": 0.62, "grad_norm": 2.613835096359253, "learning_rate": 4.3690051543208526e-06, "loss": 0.6776, "step": 9380 }, { "epoch": 0.62, "grad_norm": 2.495633840560913, "learning_rate": 4.358185569205779e-06, "loss": 0.6867, "step": 9390 }, { "epoch": 0.62, "grad_norm": 1.8492158651351929, "learning_rate": 4.347369038899744e-06, "loss": 0.6553, "step": 9400 }, { "epoch": 0.62, "grad_norm": 1.860273838043213, "learning_rate": 4.336555614885591e-06, "loss": 0.6648, "step": 9410 }, { "epoch": 0.62, "grad_norm": 1.934490442276001, "learning_rate": 4.3257453486313865e-06, "loss": 0.641, "step": 9420 }, { "epoch": 0.62, "grad_norm": 2.0558342933654785, "learning_rate": 4.314938291590161e-06, "loss": 0.6659, "step": 9430 }, { "epoch": 0.62, "grad_norm": 1.751317024230957, "learning_rate": 4.304134495199675e-06, "loss": 0.6718, "step": 9440 }, { "epoch": 0.63, "grad_norm": 1.4357839822769165, "learning_rate": 4.293334010882164e-06, "loss": 0.674, "step": 9450 }, { "epoch": 0.63, "grad_norm": 1.4586100578308105, "learning_rate": 4.282536890044105e-06, "loss": 0.6694, "step": 9460 }, { "epoch": 0.63, "grad_norm": 1.403393030166626, "learning_rate": 4.271743184075963e-06, "loss": 0.6625, "step": 9470 }, { "epoch": 0.63, "grad_norm": 1.9740585088729858, "learning_rate": 4.260952944351947e-06, "loss": 0.65, "step": 9480 }, { "epoch": 0.63, "grad_norm": 1.788071632385254, "learning_rate": 4.250166222229775e-06, "loss": 0.661, "step": 9490 }, { "epoch": 0.63, "grad_norm": 1.5685087442398071, "learning_rate": 4.239383069050417e-06, "loss": 0.6735, "step": 9500 }, { "epoch": 0.63, "grad_norm": 1.4203977584838867, "learning_rate": 4.228603536137856e-06, "loss": 0.6416, "step": 9510 }, { "epoch": 0.63, "grad_norm": 1.93641197681427, "learning_rate": 4.217827674798845e-06, "loss": 0.6681, "step": 9520 }, { "epoch": 0.63, "grad_norm": 1.8250054121017456, "learning_rate": 4.207055536322665e-06, "loss": 0.6954, "step": 9530 }, { "epoch": 0.63, "grad_norm": 1.5296562910079956, "learning_rate": 4.196287171980869e-06, "loss": 0.6809, "step": 9540 }, { "epoch": 0.63, "grad_norm": 1.7069146633148193, "learning_rate": 4.185522633027057e-06, "loss": 0.648, "step": 9550 }, { "epoch": 0.63, "grad_norm": 2.05324125289917, "learning_rate": 4.174761970696612e-06, "loss": 0.6818, "step": 9560 }, { "epoch": 0.63, "grad_norm": 1.6765975952148438, "learning_rate": 4.164005236206471e-06, "loss": 0.674, "step": 9570 }, { "epoch": 0.63, "grad_norm": 2.0910682678222656, "learning_rate": 4.1532524807548776e-06, "loss": 0.6609, "step": 9580 }, { "epoch": 0.63, "grad_norm": 1.6869258880615234, "learning_rate": 4.142503755521129e-06, "loss": 0.6618, "step": 9590 }, { "epoch": 0.64, "grad_norm": 1.5661019086837769, "learning_rate": 4.131759111665349e-06, "loss": 0.6749, "step": 9600 }, { "epoch": 0.64, "eval_loss": 0.7704323530197144, "eval_runtime": 134.2109, "eval_samples_per_second": 81.961, "eval_steps_per_second": 10.245, "step": 9600 }, { "epoch": 0.64, "grad_norm": 1.7838884592056274, "learning_rate": 4.1210186003282275e-06, "loss": 0.6592, "step": 9610 }, { "epoch": 0.64, "grad_norm": 1.9045015573501587, "learning_rate": 4.1102822726307925e-06, "loss": 0.66, "step": 9620 }, { "epoch": 0.64, "grad_norm": 1.6812734603881836, "learning_rate": 4.099550179674151e-06, "loss": 0.6734, "step": 9630 }, { "epoch": 0.64, "grad_norm": 2.4321088790893555, "learning_rate": 4.088822372539263e-06, "loss": 0.6621, "step": 9640 }, { "epoch": 0.64, "grad_norm": 1.5010099411010742, "learning_rate": 4.078098902286684e-06, "loss": 0.674, "step": 9650 }, { "epoch": 0.64, "grad_norm": 1.6574801206588745, "learning_rate": 4.067379819956327e-06, "loss": 0.6388, "step": 9660 }, { "epoch": 0.64, "grad_norm": 1.5412184000015259, "learning_rate": 4.056665176567225e-06, "loss": 0.6437, "step": 9670 }, { "epoch": 0.64, "grad_norm": 1.5895485877990723, "learning_rate": 4.045955023117276e-06, "loss": 0.6687, "step": 9680 }, { "epoch": 0.64, "grad_norm": 1.414804458618164, "learning_rate": 4.0352494105830155e-06, "loss": 0.6747, "step": 9690 }, { "epoch": 0.64, "grad_norm": 1.5721184015274048, "learning_rate": 4.02454838991936e-06, "loss": 0.6679, "step": 9700 }, { "epoch": 0.64, "grad_norm": 1.5480763912200928, "learning_rate": 4.013852012059371e-06, "loss": 0.6684, "step": 9710 }, { "epoch": 0.64, "grad_norm": 2.122615337371826, "learning_rate": 4.003160327914015e-06, "loss": 0.6544, "step": 9720 }, { "epoch": 0.64, "grad_norm": 1.6763825416564941, "learning_rate": 3.992473388371914e-06, "loss": 0.6747, "step": 9730 }, { "epoch": 0.64, "grad_norm": 1.469360589981079, "learning_rate": 3.981791244299113e-06, "loss": 0.659, "step": 9740 }, { "epoch": 0.65, "grad_norm": 1.746013879776001, "learning_rate": 3.971113946538826e-06, "loss": 0.6904, "step": 9750 }, { "epoch": 0.65, "grad_norm": 1.5463660955429077, "learning_rate": 3.960441545911205e-06, "loss": 0.6837, "step": 9760 }, { "epoch": 0.65, "grad_norm": 2.29986310005188, "learning_rate": 3.949774093213089e-06, "loss": 0.6809, "step": 9770 }, { "epoch": 0.65, "grad_norm": 1.6813066005706787, "learning_rate": 3.939111639217769e-06, "loss": 0.6804, "step": 9780 }, { "epoch": 0.65, "grad_norm": 1.5592458248138428, "learning_rate": 3.928454234674748e-06, "loss": 0.6664, "step": 9790 }, { "epoch": 0.65, "grad_norm": 2.1687581539154053, "learning_rate": 3.917801930309486e-06, "loss": 0.6589, "step": 9800 }, { "epoch": 0.65, "grad_norm": 1.744254469871521, "learning_rate": 3.907154776823179e-06, "loss": 0.6624, "step": 9810 }, { "epoch": 0.65, "grad_norm": 2.446599245071411, "learning_rate": 3.8965128248924956e-06, "loss": 0.661, "step": 9820 }, { "epoch": 0.65, "grad_norm": 2.0234007835388184, "learning_rate": 3.885876125169356e-06, "loss": 0.6654, "step": 9830 }, { "epoch": 0.65, "grad_norm": 1.8161579370498657, "learning_rate": 3.875244728280676e-06, "loss": 0.6779, "step": 9840 }, { "epoch": 0.65, "grad_norm": 1.6402438879013062, "learning_rate": 3.864618684828135e-06, "loss": 0.6477, "step": 9850 }, { "epoch": 0.65, "grad_norm": 2.011474370956421, "learning_rate": 3.85399804538793e-06, "loss": 0.649, "step": 9860 }, { "epoch": 0.65, "grad_norm": 1.7880947589874268, "learning_rate": 3.8433828605105385e-06, "loss": 0.6751, "step": 9870 }, { "epoch": 0.65, "grad_norm": 2.0285022258758545, "learning_rate": 3.832773180720475e-06, "loss": 0.662, "step": 9880 }, { "epoch": 0.65, "grad_norm": 1.7321562767028809, "learning_rate": 3.822169056516051e-06, "loss": 0.6698, "step": 9890 }, { "epoch": 0.66, "grad_norm": 1.807761788368225, "learning_rate": 3.8115705383691354e-06, "loss": 0.658, "step": 9900 }, { "epoch": 0.66, "grad_norm": 1.7578504085540771, "learning_rate": 3.800977676724919e-06, "loss": 0.6343, "step": 9910 }, { "epoch": 0.66, "grad_norm": 2.058124542236328, "learning_rate": 3.790390522001662e-06, "loss": 0.6572, "step": 9920 }, { "epoch": 0.66, "grad_norm": 2.71362566947937, "learning_rate": 3.7798091245904674e-06, "loss": 0.6728, "step": 9930 }, { "epoch": 0.66, "grad_norm": 2.2041659355163574, "learning_rate": 3.769233534855035e-06, "loss": 0.6564, "step": 9940 }, { "epoch": 0.66, "grad_norm": 1.7433922290802002, "learning_rate": 3.7586638031314182e-06, "loss": 0.6592, "step": 9950 }, { "epoch": 0.66, "grad_norm": 1.8850117921829224, "learning_rate": 3.748099979727792e-06, "loss": 0.6677, "step": 9960 }, { "epoch": 0.66, "grad_norm": 1.5975645780563354, "learning_rate": 3.7375421149242102e-06, "loss": 0.6874, "step": 9970 }, { "epoch": 0.66, "grad_norm": 2.081080198287964, "learning_rate": 3.7269902589723617e-06, "loss": 0.6508, "step": 9980 }, { "epoch": 0.66, "grad_norm": 1.5484403371810913, "learning_rate": 3.7164444620953397e-06, "loss": 0.6681, "step": 9990 }, { "epoch": 0.66, "grad_norm": 1.7869611978530884, "learning_rate": 3.705904774487396e-06, "loss": 0.6762, "step": 10000 }, { "epoch": 0.66, "grad_norm": 1.8547977209091187, "learning_rate": 3.6953712463137057e-06, "loss": 0.6572, "step": 10010 }, { "epoch": 0.66, "grad_norm": 2.011695146560669, "learning_rate": 3.6848439277101262e-06, "loss": 0.6574, "step": 10020 }, { "epoch": 0.66, "grad_norm": 2.0033137798309326, "learning_rate": 3.6743228687829596e-06, "loss": 0.6723, "step": 10030 }, { "epoch": 0.66, "grad_norm": 2.6469576358795166, "learning_rate": 3.663808119608716e-06, "loss": 0.6693, "step": 10040 }, { "epoch": 0.67, "grad_norm": 1.7878594398498535, "learning_rate": 3.6532997302338704e-06, "loss": 0.6653, "step": 10050 }, { "epoch": 0.67, "grad_norm": 1.6353760957717896, "learning_rate": 3.6427977506746293e-06, "loss": 0.6429, "step": 10060 }, { "epoch": 0.67, "grad_norm": 2.0853710174560547, "learning_rate": 3.63230223091669e-06, "loss": 0.6467, "step": 10070 }, { "epoch": 0.67, "grad_norm": 1.7960524559020996, "learning_rate": 3.6218132209150047e-06, "loss": 0.6812, "step": 10080 }, { "epoch": 0.67, "grad_norm": 1.5491673946380615, "learning_rate": 3.6113307705935398e-06, "loss": 0.6718, "step": 10090 }, { "epoch": 0.67, "grad_norm": 1.6183891296386719, "learning_rate": 3.6008549298450403e-06, "loss": 0.6671, "step": 10100 }, { "epoch": 0.67, "grad_norm": 1.508072018623352, "learning_rate": 3.5903857485307936e-06, "loss": 0.6755, "step": 10110 }, { "epoch": 0.67, "grad_norm": 1.8748259544372559, "learning_rate": 3.579923276480387e-06, "loss": 0.6664, "step": 10120 }, { "epoch": 0.67, "grad_norm": 1.6908599138259888, "learning_rate": 3.56946756349148e-06, "loss": 0.6732, "step": 10130 }, { "epoch": 0.67, "grad_norm": 1.944483995437622, "learning_rate": 3.559018659329554e-06, "loss": 0.6482, "step": 10140 }, { "epoch": 0.67, "grad_norm": 2.147538900375366, "learning_rate": 3.5485766137276894e-06, "loss": 0.6454, "step": 10150 }, { "epoch": 0.67, "grad_norm": 1.6188044548034668, "learning_rate": 3.538141476386317e-06, "loss": 0.6458, "step": 10160 }, { "epoch": 0.67, "grad_norm": 1.9587546586990356, "learning_rate": 3.527713296972991e-06, "loss": 0.662, "step": 10170 }, { "epoch": 0.67, "grad_norm": 1.46161949634552, "learning_rate": 3.517292125122146e-06, "loss": 0.6857, "step": 10180 }, { "epoch": 0.67, "grad_norm": 1.7037651538848877, "learning_rate": 3.5068780104348632e-06, "loss": 0.6547, "step": 10190 }, { "epoch": 0.67, "grad_norm": 1.446110486984253, "learning_rate": 3.4964710024786354e-06, "loss": 0.6737, "step": 10200 }, { "epoch": 0.68, "grad_norm": 2.2284021377563477, "learning_rate": 3.486071150787128e-06, "loss": 0.6568, "step": 10210 }, { "epoch": 0.68, "grad_norm": 1.4104644060134888, "learning_rate": 3.4756785048599464e-06, "loss": 0.6572, "step": 10220 }, { "epoch": 0.68, "grad_norm": 2.4139363765716553, "learning_rate": 3.4652931141624002e-06, "loss": 0.6568, "step": 10230 }, { "epoch": 0.68, "grad_norm": 1.8519604206085205, "learning_rate": 3.4549150281252635e-06, "loss": 0.6626, "step": 10240 }, { "epoch": 0.68, "grad_norm": 1.8511260747909546, "learning_rate": 3.444544296144546e-06, "loss": 0.688, "step": 10250 }, { "epoch": 0.68, "grad_norm": 1.454230785369873, "learning_rate": 3.4341809675812532e-06, "loss": 0.6693, "step": 10260 }, { "epoch": 0.68, "grad_norm": 1.7652877569198608, "learning_rate": 3.4238250917611533e-06, "loss": 0.6523, "step": 10270 }, { "epoch": 0.68, "grad_norm": 1.9757181406021118, "learning_rate": 3.4134767179745404e-06, "loss": 0.6672, "step": 10280 }, { "epoch": 0.68, "grad_norm": 1.6871960163116455, "learning_rate": 3.403135895476004e-06, "loss": 0.6604, "step": 10290 }, { "epoch": 0.68, "grad_norm": 1.3390458822250366, "learning_rate": 3.3928026734841935e-06, "loss": 0.6668, "step": 10300 }, { "epoch": 0.68, "grad_norm": 1.4733284711837769, "learning_rate": 3.3824771011815772e-06, "loss": 0.6843, "step": 10310 }, { "epoch": 0.68, "grad_norm": 1.77756667137146, "learning_rate": 3.372159227714218e-06, "loss": 0.6809, "step": 10320 }, { "epoch": 0.68, "grad_norm": 1.5924615859985352, "learning_rate": 3.3618491021915334e-06, "loss": 0.6658, "step": 10330 }, { "epoch": 0.68, "grad_norm": 1.7609484195709229, "learning_rate": 3.351546773686065e-06, "loss": 0.6797, "step": 10340 }, { "epoch": 0.68, "grad_norm": 1.5677520036697388, "learning_rate": 3.341252291233241e-06, "loss": 0.668, "step": 10350 }, { "epoch": 0.69, "grad_norm": 1.59241783618927, "learning_rate": 3.330965703831146e-06, "loss": 0.6561, "step": 10360 }, { "epoch": 0.69, "grad_norm": 1.4724187850952148, "learning_rate": 3.3206870604402897e-06, "loss": 0.6326, "step": 10370 }, { "epoch": 0.69, "grad_norm": 1.5326093435287476, "learning_rate": 3.3104164099833652e-06, "loss": 0.6764, "step": 10380 }, { "epoch": 0.69, "grad_norm": 1.5835292339324951, "learning_rate": 3.3001538013450285e-06, "loss": 0.6678, "step": 10390 }, { "epoch": 0.69, "grad_norm": 1.7297430038452148, "learning_rate": 3.289899283371657e-06, "loss": 0.6459, "step": 10400 }, { "epoch": 0.69, "grad_norm": 2.1451315879821777, "learning_rate": 3.2796529048711158e-06, "loss": 0.65, "step": 10410 }, { "epoch": 0.69, "grad_norm": 1.720035433769226, "learning_rate": 3.269414714612534e-06, "loss": 0.687, "step": 10420 }, { "epoch": 0.69, "grad_norm": 1.6080423593521118, "learning_rate": 3.259184761326068e-06, "loss": 0.677, "step": 10430 }, { "epoch": 0.69, "grad_norm": 2.1213626861572266, "learning_rate": 3.248963093702663e-06, "loss": 0.6687, "step": 10440 }, { "epoch": 0.69, "grad_norm": 1.6701629161834717, "learning_rate": 3.2387497603938327e-06, "loss": 0.6816, "step": 10450 }, { "epoch": 0.69, "grad_norm": 1.812422513961792, "learning_rate": 3.2285448100114208e-06, "loss": 0.6695, "step": 10460 }, { "epoch": 0.69, "grad_norm": 1.7101892232894897, "learning_rate": 3.218348291127371e-06, "loss": 0.6676, "step": 10470 }, { "epoch": 0.69, "grad_norm": 2.012482166290283, "learning_rate": 3.2081602522734987e-06, "loss": 0.6616, "step": 10480 }, { "epoch": 0.69, "grad_norm": 2.179459571838379, "learning_rate": 3.1979807419412523e-06, "loss": 0.6681, "step": 10490 }, { "epoch": 0.69, "grad_norm": 1.7281560897827148, "learning_rate": 3.1878098085814926e-06, "loss": 0.6808, "step": 10500 }, { "epoch": 0.7, "grad_norm": 2.1194772720336914, "learning_rate": 3.177647500604252e-06, "loss": 0.6662, "step": 10510 }, { "epoch": 0.7, "grad_norm": 1.462747573852539, "learning_rate": 3.167493866378514e-06, "loss": 0.6606, "step": 10520 }, { "epoch": 0.7, "grad_norm": 1.694826602935791, "learning_rate": 3.1573489542319754e-06, "loss": 0.6513, "step": 10530 }, { "epoch": 0.7, "grad_norm": 1.4207332134246826, "learning_rate": 3.147212812450819e-06, "loss": 0.6592, "step": 10540 }, { "epoch": 0.7, "grad_norm": 1.5110808610916138, "learning_rate": 3.1370854892794855e-06, "loss": 0.6504, "step": 10550 }, { "epoch": 0.7, "grad_norm": 3.3117215633392334, "learning_rate": 3.12696703292044e-06, "loss": 0.6538, "step": 10560 }, { "epoch": 0.7, "grad_norm": 1.655723214149475, "learning_rate": 3.1168574915339465e-06, "loss": 0.6742, "step": 10570 }, { "epoch": 0.7, "grad_norm": 1.747663974761963, "learning_rate": 3.1067569132378358e-06, "loss": 0.6694, "step": 10580 }, { "epoch": 0.7, "grad_norm": 1.6091103553771973, "learning_rate": 3.0966653461072778e-06, "loss": 0.6666, "step": 10590 }, { "epoch": 0.7, "grad_norm": 1.784797191619873, "learning_rate": 3.0865828381745515e-06, "loss": 0.6683, "step": 10600 }, { "epoch": 0.7, "grad_norm": 2.061067819595337, "learning_rate": 3.0765094374288197e-06, "loss": 0.666, "step": 10610 }, { "epoch": 0.7, "grad_norm": 1.6201362609863281, "learning_rate": 3.066445191815898e-06, "loss": 0.6738, "step": 10620 }, { "epoch": 0.7, "grad_norm": 1.7196377515792847, "learning_rate": 3.056390149238022e-06, "loss": 0.6561, "step": 10630 }, { "epoch": 0.7, "grad_norm": 1.8749741315841675, "learning_rate": 3.0463443575536324e-06, "loss": 0.6817, "step": 10640 }, { "epoch": 0.7, "grad_norm": 1.473533034324646, "learning_rate": 3.0363078645771303e-06, "loss": 0.6449, "step": 10650 }, { "epoch": 0.71, "grad_norm": 1.9876571893692017, "learning_rate": 3.0262807180786647e-06, "loss": 0.6657, "step": 10660 }, { "epoch": 0.71, "grad_norm": 1.949334979057312, "learning_rate": 3.0162629657838947e-06, "loss": 0.6462, "step": 10670 }, { "epoch": 0.71, "grad_norm": 3.1843159198760986, "learning_rate": 3.0062546553737692e-06, "loss": 0.6448, "step": 10680 }, { "epoch": 0.71, "grad_norm": 1.9066513776779175, "learning_rate": 2.9962558344842963e-06, "loss": 0.6639, "step": 10690 }, { "epoch": 0.71, "grad_norm": 3.2927820682525635, "learning_rate": 2.986266550706315e-06, "loss": 0.6532, "step": 10700 }, { "epoch": 0.71, "grad_norm": 1.4722814559936523, "learning_rate": 2.976286851585274e-06, "loss": 0.6665, "step": 10710 }, { "epoch": 0.71, "grad_norm": 2.8656790256500244, "learning_rate": 2.966316784621e-06, "loss": 0.6719, "step": 10720 }, { "epoch": 0.71, "grad_norm": 2.7842459678649902, "learning_rate": 2.956356397267477e-06, "loss": 0.6726, "step": 10730 }, { "epoch": 0.71, "grad_norm": 2.024540662765503, "learning_rate": 2.946405736932615e-06, "loss": 0.6587, "step": 10740 }, { "epoch": 0.71, "grad_norm": 2.1006271839141846, "learning_rate": 2.936464850978027e-06, "loss": 0.6585, "step": 10750 }, { "epoch": 0.71, "grad_norm": 1.8146822452545166, "learning_rate": 2.926533786718806e-06, "loss": 0.6355, "step": 10760 }, { "epoch": 0.71, "grad_norm": 1.8565179109573364, "learning_rate": 2.9166125914232935e-06, "loss": 0.6917, "step": 10770 }, { "epoch": 0.71, "grad_norm": 2.3497860431671143, "learning_rate": 2.906701312312861e-06, "loss": 0.6522, "step": 10780 }, { "epoch": 0.71, "grad_norm": 1.8022676706314087, "learning_rate": 2.8967999965616815e-06, "loss": 0.6548, "step": 10790 }, { "epoch": 0.71, "grad_norm": 1.768591284751892, "learning_rate": 2.886908691296504e-06, "loss": 0.6607, "step": 10800 }, { "epoch": 0.72, "grad_norm": 1.492566466331482, "learning_rate": 2.8770274435964356e-06, "loss": 0.6817, "step": 10810 }, { "epoch": 0.72, "grad_norm": 2.3266677856445312, "learning_rate": 2.8671563004927107e-06, "loss": 0.6659, "step": 10820 }, { "epoch": 0.72, "grad_norm": 2.3747525215148926, "learning_rate": 2.8572953089684654e-06, "loss": 0.6794, "step": 10830 }, { "epoch": 0.72, "grad_norm": 2.2825326919555664, "learning_rate": 2.8474445159585235e-06, "loss": 0.6911, "step": 10840 }, { "epoch": 0.72, "grad_norm": 1.5767141580581665, "learning_rate": 2.8376039683491683e-06, "loss": 0.6322, "step": 10850 }, { "epoch": 0.72, "grad_norm": 2.076864242553711, "learning_rate": 2.827773712977915e-06, "loss": 0.6793, "step": 10860 }, { "epoch": 0.72, "grad_norm": 1.8241063356399536, "learning_rate": 2.817953796633289e-06, "loss": 0.6607, "step": 10870 }, { "epoch": 0.72, "grad_norm": 1.8080527782440186, "learning_rate": 2.8081442660546126e-06, "loss": 0.6799, "step": 10880 }, { "epoch": 0.72, "grad_norm": 1.869896650314331, "learning_rate": 2.798345167931771e-06, "loss": 0.6643, "step": 10890 }, { "epoch": 0.72, "grad_norm": 1.5855712890625, "learning_rate": 2.7885565489049948e-06, "loss": 0.6549, "step": 10900 }, { "epoch": 0.72, "grad_norm": 1.8657243251800537, "learning_rate": 2.7787784555646363e-06, "loss": 0.6615, "step": 10910 }, { "epoch": 0.72, "grad_norm": 2.6397323608398438, "learning_rate": 2.7690109344509563e-06, "loss": 0.671, "step": 10920 }, { "epoch": 0.72, "grad_norm": 1.839142084121704, "learning_rate": 2.759254032053888e-06, "loss": 0.6648, "step": 10930 }, { "epoch": 0.72, "grad_norm": 2.0795602798461914, "learning_rate": 2.7495077948128245e-06, "loss": 0.6503, "step": 10940 }, { "epoch": 0.72, "grad_norm": 1.419287919998169, "learning_rate": 2.739772269116402e-06, "loss": 0.6647, "step": 10950 }, { "epoch": 0.73, "grad_norm": 1.8919901847839355, "learning_rate": 2.7300475013022666e-06, "loss": 0.6565, "step": 10960 }, { "epoch": 0.73, "grad_norm": 1.7516885995864868, "learning_rate": 2.720333537656865e-06, "loss": 0.6556, "step": 10970 }, { "epoch": 0.73, "grad_norm": 1.7074434757232666, "learning_rate": 2.710630424415216e-06, "loss": 0.6839, "step": 10980 }, { "epoch": 0.73, "grad_norm": 1.6884958744049072, "learning_rate": 2.700938207760701e-06, "loss": 0.6645, "step": 10990 }, { "epoch": 0.73, "grad_norm": 1.643620252609253, "learning_rate": 2.6912569338248317e-06, "loss": 0.6514, "step": 11000 }, { "epoch": 0.73, "grad_norm": 2.157878875732422, "learning_rate": 2.681586648687035e-06, "loss": 0.6604, "step": 11010 }, { "epoch": 0.73, "grad_norm": 1.6927180290222168, "learning_rate": 2.671927398374443e-06, "loss": 0.6996, "step": 11020 }, { "epoch": 0.73, "grad_norm": 1.435312032699585, "learning_rate": 2.6622792288616595e-06, "loss": 0.6723, "step": 11030 }, { "epoch": 0.73, "grad_norm": 1.5182280540466309, "learning_rate": 2.6526421860705474e-06, "loss": 0.6723, "step": 11040 }, { "epoch": 0.73, "grad_norm": 2.380584478378296, "learning_rate": 2.6430163158700116e-06, "loss": 0.666, "step": 11050 }, { "epoch": 0.73, "grad_norm": 1.787930965423584, "learning_rate": 2.6334016640757838e-06, "loss": 0.6569, "step": 11060 }, { "epoch": 0.73, "grad_norm": 1.6916040182113647, "learning_rate": 2.6237982764501936e-06, "loss": 0.6599, "step": 11070 }, { "epoch": 0.73, "grad_norm": 1.7497210502624512, "learning_rate": 2.614206198701958e-06, "loss": 0.6667, "step": 11080 }, { "epoch": 0.73, "grad_norm": 1.8590047359466553, "learning_rate": 2.6046254764859687e-06, "loss": 0.6574, "step": 11090 }, { "epoch": 0.73, "grad_norm": 1.6491410732269287, "learning_rate": 2.595056155403063e-06, "loss": 0.685, "step": 11100 }, { "epoch": 0.74, "grad_norm": 1.8310904502868652, "learning_rate": 2.5854982809998154e-06, "loss": 0.6726, "step": 11110 }, { "epoch": 0.74, "grad_norm": 1.8105839490890503, "learning_rate": 2.5759518987683154e-06, "loss": 0.6754, "step": 11120 }, { "epoch": 0.74, "grad_norm": 1.6606043577194214, "learning_rate": 2.56641705414596e-06, "loss": 0.6668, "step": 11130 }, { "epoch": 0.74, "grad_norm": 1.6423386335372925, "learning_rate": 2.5568937925152272e-06, "loss": 0.6726, "step": 11140 }, { "epoch": 0.74, "grad_norm": 2.008108377456665, "learning_rate": 2.5473821592034604e-06, "loss": 0.6687, "step": 11150 }, { "epoch": 0.74, "grad_norm": 1.9500596523284912, "learning_rate": 2.5378821994826654e-06, "loss": 0.6748, "step": 11160 }, { "epoch": 0.74, "grad_norm": 1.762308120727539, "learning_rate": 2.5283939585692787e-06, "loss": 0.664, "step": 11170 }, { "epoch": 0.74, "grad_norm": 2.1707937717437744, "learning_rate": 2.518917481623961e-06, "loss": 0.6662, "step": 11180 }, { "epoch": 0.74, "grad_norm": 2.0620369911193848, "learning_rate": 2.5094528137513797e-06, "loss": 0.6487, "step": 11190 }, { "epoch": 0.74, "grad_norm": 2.011185884475708, "learning_rate": 2.5000000000000015e-06, "loss": 0.6789, "step": 11200 }, { "epoch": 0.74, "eval_loss": 0.7809577584266663, "eval_runtime": 134.3045, "eval_samples_per_second": 81.903, "eval_steps_per_second": 10.238, "step": 11200 }, { "epoch": 0.74, "grad_norm": 2.186656951904297, "learning_rate": 2.490559085361863e-06, "loss": 0.6671, "step": 11210 }, { "epoch": 0.74, "grad_norm": 1.421526312828064, "learning_rate": 2.481130114772369e-06, "loss": 0.6711, "step": 11220 }, { "epoch": 0.74, "grad_norm": 1.6306933164596558, "learning_rate": 2.471713133110078e-06, "loss": 0.6659, "step": 11230 }, { "epoch": 0.74, "grad_norm": 3.0042200088500977, "learning_rate": 2.462308185196481e-06, "loss": 0.6408, "step": 11240 }, { "epoch": 0.74, "grad_norm": 1.5799850225448608, "learning_rate": 2.4529153157957913e-06, "loss": 0.658, "step": 11250 }, { "epoch": 0.75, "grad_norm": 2.0872106552124023, "learning_rate": 2.4435345696147404e-06, "loss": 0.6609, "step": 11260 }, { "epoch": 0.75, "grad_norm": 2.4066195487976074, "learning_rate": 2.43416599130235e-06, "loss": 0.6471, "step": 11270 }, { "epoch": 0.75, "grad_norm": 1.88292396068573, "learning_rate": 2.424809625449729e-06, "loss": 0.6777, "step": 11280 }, { "epoch": 0.75, "grad_norm": 2.071256160736084, "learning_rate": 2.4154655165898626e-06, "loss": 0.6567, "step": 11290 }, { "epoch": 0.75, "grad_norm": 1.9656749963760376, "learning_rate": 2.406133709197392e-06, "loss": 0.6701, "step": 11300 }, { "epoch": 0.75, "grad_norm": 1.7675248384475708, "learning_rate": 2.396814247688413e-06, "loss": 0.6567, "step": 11310 }, { "epoch": 0.75, "grad_norm": 2.7106385231018066, "learning_rate": 2.387507176420256e-06, "loss": 0.6772, "step": 11320 }, { "epoch": 0.75, "grad_norm": 2.049154281616211, "learning_rate": 2.3782125396912765e-06, "loss": 0.6517, "step": 11330 }, { "epoch": 0.75, "grad_norm": 2.511610984802246, "learning_rate": 2.3689303817406523e-06, "loss": 0.6672, "step": 11340 }, { "epoch": 0.75, "grad_norm": 2.223442554473877, "learning_rate": 2.3596607467481602e-06, "loss": 0.6525, "step": 11350 }, { "epoch": 0.75, "grad_norm": 1.762532114982605, "learning_rate": 2.3504036788339763e-06, "loss": 0.6863, "step": 11360 }, { "epoch": 0.75, "grad_norm": 2.122451066970825, "learning_rate": 2.3411592220584574e-06, "loss": 0.6608, "step": 11370 }, { "epoch": 0.75, "grad_norm": 1.7999773025512695, "learning_rate": 2.3319274204219427e-06, "loss": 0.6354, "step": 11380 }, { "epoch": 0.75, "grad_norm": 1.5697572231292725, "learning_rate": 2.3227083178645316e-06, "loss": 0.6651, "step": 11390 }, { "epoch": 0.75, "grad_norm": 2.309673309326172, "learning_rate": 2.3135019582658803e-06, "loss": 0.6694, "step": 11400 }, { "epoch": 0.76, "grad_norm": 2.324636220932007, "learning_rate": 2.304308385444999e-06, "loss": 0.6593, "step": 11410 }, { "epoch": 0.76, "grad_norm": 1.683274745941162, "learning_rate": 2.295127643160031e-06, "loss": 0.6747, "step": 11420 }, { "epoch": 0.76, "grad_norm": 1.913460373878479, "learning_rate": 2.2859597751080536e-06, "loss": 0.6553, "step": 11430 }, { "epoch": 0.76, "grad_norm": 1.3622136116027832, "learning_rate": 2.2768048249248648e-06, "loss": 0.6639, "step": 11440 }, { "epoch": 0.76, "grad_norm": 2.03890323638916, "learning_rate": 2.2676628361847834e-06, "loss": 0.663, "step": 11450 }, { "epoch": 0.76, "grad_norm": 1.982285737991333, "learning_rate": 2.258533852400432e-06, "loss": 0.663, "step": 11460 }, { "epoch": 0.76, "grad_norm": 1.8850750923156738, "learning_rate": 2.2494179170225333e-06, "loss": 0.6667, "step": 11470 }, { "epoch": 0.76, "grad_norm": 1.936200737953186, "learning_rate": 2.2403150734397095e-06, "loss": 0.6626, "step": 11480 }, { "epoch": 0.76, "grad_norm": 1.9150683879852295, "learning_rate": 2.2312253649782655e-06, "loss": 0.663, "step": 11490 }, { "epoch": 0.76, "grad_norm": 1.6289910078048706, "learning_rate": 2.2221488349019903e-06, "loss": 0.6719, "step": 11500 }, { "epoch": 0.76, "grad_norm": 1.7412772178649902, "learning_rate": 2.213085526411945e-06, "loss": 0.6666, "step": 11510 }, { "epoch": 0.76, "grad_norm": 2.1215097904205322, "learning_rate": 2.204035482646267e-06, "loss": 0.6705, "step": 11520 }, { "epoch": 0.76, "grad_norm": 1.5671436786651611, "learning_rate": 2.1949987466799524e-06, "loss": 0.6633, "step": 11530 }, { "epoch": 0.76, "grad_norm": 1.7265719175338745, "learning_rate": 2.185975361524657e-06, "loss": 0.6555, "step": 11540 }, { "epoch": 0.76, "grad_norm": 1.697858214378357, "learning_rate": 2.1769653701284983e-06, "loss": 0.6563, "step": 11550 }, { "epoch": 0.76, "grad_norm": 1.9649640321731567, "learning_rate": 2.1679688153758373e-06, "loss": 0.6479, "step": 11560 }, { "epoch": 0.77, "grad_norm": 2.096909523010254, "learning_rate": 2.1589857400870804e-06, "loss": 0.642, "step": 11570 }, { "epoch": 0.77, "grad_norm": 1.91984224319458, "learning_rate": 2.150016187018485e-06, "loss": 0.656, "step": 11580 }, { "epoch": 0.77, "grad_norm": 1.696636438369751, "learning_rate": 2.1410601988619394e-06, "loss": 0.6838, "step": 11590 }, { "epoch": 0.77, "grad_norm": 1.7546021938323975, "learning_rate": 2.132117818244771e-06, "loss": 0.6651, "step": 11600 }, { "epoch": 0.77, "grad_norm": 2.2934517860412598, "learning_rate": 2.1231890877295374e-06, "loss": 0.6734, "step": 11610 }, { "epoch": 0.77, "grad_norm": 1.7087199687957764, "learning_rate": 2.1142740498138327e-06, "loss": 0.6764, "step": 11620 }, { "epoch": 0.77, "grad_norm": 1.9262723922729492, "learning_rate": 2.105372746930073e-06, "loss": 0.6512, "step": 11630 }, { "epoch": 0.77, "grad_norm": 1.9107701778411865, "learning_rate": 2.096485221445301e-06, "loss": 0.6493, "step": 11640 }, { "epoch": 0.77, "grad_norm": 1.5462172031402588, "learning_rate": 2.08761151566099e-06, "loss": 0.6609, "step": 11650 }, { "epoch": 0.77, "grad_norm": 2.213968515396118, "learning_rate": 2.0787516718128294e-06, "loss": 0.6764, "step": 11660 }, { "epoch": 0.77, "grad_norm": 2.0844807624816895, "learning_rate": 2.0699057320705328e-06, "loss": 0.6553, "step": 11670 }, { "epoch": 0.77, "grad_norm": 1.6982314586639404, "learning_rate": 2.061073738537635e-06, "loss": 0.6562, "step": 11680 }, { "epoch": 0.77, "grad_norm": 1.8195136785507202, "learning_rate": 2.0522557332512953e-06, "loss": 0.6799, "step": 11690 }, { "epoch": 0.77, "grad_norm": 1.861391305923462, "learning_rate": 2.0434517581820893e-06, "loss": 0.6695, "step": 11700 }, { "epoch": 0.77, "grad_norm": 1.5561097860336304, "learning_rate": 2.034661855233815e-06, "loss": 0.6468, "step": 11710 }, { "epoch": 0.78, "grad_norm": 1.8640862703323364, "learning_rate": 2.0258860662432946e-06, "loss": 0.6619, "step": 11720 }, { "epoch": 0.78, "grad_norm": 2.2300779819488525, "learning_rate": 2.0171244329801677e-06, "loss": 0.6681, "step": 11730 }, { "epoch": 0.78, "grad_norm": 1.967496633529663, "learning_rate": 2.008376997146705e-06, "loss": 0.6359, "step": 11740 }, { "epoch": 0.78, "grad_norm": 1.6537777185440063, "learning_rate": 1.999643800377596e-06, "loss": 0.6803, "step": 11750 }, { "epoch": 0.78, "grad_norm": 1.5698853731155396, "learning_rate": 1.990924884239758e-06, "loss": 0.6403, "step": 11760 }, { "epoch": 0.78, "grad_norm": 1.752765417098999, "learning_rate": 1.982220290232143e-06, "loss": 0.6745, "step": 11770 }, { "epoch": 0.78, "grad_norm": 1.6937237977981567, "learning_rate": 1.9735300597855287e-06, "loss": 0.6675, "step": 11780 }, { "epoch": 0.78, "grad_norm": 1.685810923576355, "learning_rate": 1.9648542342623276e-06, "loss": 0.6527, "step": 11790 }, { "epoch": 0.78, "grad_norm": 2.0320212841033936, "learning_rate": 1.956192854956397e-06, "loss": 0.6843, "step": 11800 }, { "epoch": 0.78, "grad_norm": 1.7817845344543457, "learning_rate": 1.9475459630928263e-06, "loss": 0.667, "step": 11810 }, { "epoch": 0.78, "grad_norm": 1.8626519441604614, "learning_rate": 1.938913599827753e-06, "loss": 0.6823, "step": 11820 }, { "epoch": 0.78, "grad_norm": 1.6484193801879883, "learning_rate": 1.9302958062481673e-06, "loss": 0.6713, "step": 11830 }, { "epoch": 0.78, "grad_norm": 1.940313696861267, "learning_rate": 1.9216926233717087e-06, "loss": 0.6425, "step": 11840 }, { "epoch": 0.78, "grad_norm": 1.8807305097579956, "learning_rate": 1.913104092146476e-06, "loss": 0.6591, "step": 11850 }, { "epoch": 0.78, "grad_norm": 1.7303801774978638, "learning_rate": 1.9045302534508298e-06, "loss": 0.6569, "step": 11860 }, { "epoch": 0.79, "grad_norm": 2.3046600818634033, "learning_rate": 1.8959711480932042e-06, "loss": 0.6785, "step": 11870 }, { "epoch": 0.79, "grad_norm": 1.4674731492996216, "learning_rate": 1.887426816811903e-06, "loss": 0.6738, "step": 11880 }, { "epoch": 0.79, "grad_norm": 2.161259651184082, "learning_rate": 1.8788973002749112e-06, "loss": 0.6712, "step": 11890 }, { "epoch": 0.79, "grad_norm": 2.3141844272613525, "learning_rate": 1.8703826390797047e-06, "loss": 0.6787, "step": 11900 }, { "epoch": 0.79, "grad_norm": 1.51625657081604, "learning_rate": 1.8618828737530497e-06, "loss": 0.6646, "step": 11910 }, { "epoch": 0.79, "grad_norm": 1.8078784942626953, "learning_rate": 1.8533980447508138e-06, "loss": 0.6605, "step": 11920 }, { "epoch": 0.79, "grad_norm": 1.5935428142547607, "learning_rate": 1.8449281924577716e-06, "loss": 0.681, "step": 11930 }, { "epoch": 0.79, "grad_norm": 1.6151143312454224, "learning_rate": 1.83647335718742e-06, "loss": 0.6732, "step": 11940 }, { "epoch": 0.79, "grad_norm": 2.1067352294921875, "learning_rate": 1.8280335791817733e-06, "loss": 0.6642, "step": 11950 }, { "epoch": 0.79, "grad_norm": 1.8711411952972412, "learning_rate": 1.8196088986111798e-06, "loss": 0.6602, "step": 11960 }, { "epoch": 0.79, "grad_norm": 2.235433578491211, "learning_rate": 1.8111993555741342e-06, "loss": 0.662, "step": 11970 }, { "epoch": 0.79, "grad_norm": 2.0249688625335693, "learning_rate": 1.8028049900970768e-06, "loss": 0.6615, "step": 11980 }, { "epoch": 0.79, "grad_norm": 2.3812663555145264, "learning_rate": 1.7944258421342097e-06, "loss": 0.6833, "step": 11990 }, { "epoch": 0.79, "grad_norm": 1.5996013879776, "learning_rate": 1.7860619515673034e-06, "loss": 0.6463, "step": 12000 }, { "epoch": 0.79, "grad_norm": 1.5530682802200317, "learning_rate": 1.777713358205514e-06, "loss": 0.6812, "step": 12010 }, { "epoch": 0.8, "grad_norm": 2.069619655609131, "learning_rate": 1.7693801017851818e-06, "loss": 0.6601, "step": 12020 }, { "epoch": 0.8, "grad_norm": 1.6531022787094116, "learning_rate": 1.761062221969651e-06, "loss": 0.65, "step": 12030 }, { "epoch": 0.8, "grad_norm": 1.5818767547607422, "learning_rate": 1.7527597583490825e-06, "loss": 0.6664, "step": 12040 }, { "epoch": 0.8, "grad_norm": 1.9506007432937622, "learning_rate": 1.7444727504402554e-06, "loss": 0.6709, "step": 12050 }, { "epoch": 0.8, "grad_norm": 1.5742695331573486, "learning_rate": 1.736201237686389e-06, "loss": 0.6492, "step": 12060 }, { "epoch": 0.8, "grad_norm": 1.3699088096618652, "learning_rate": 1.7279452594569484e-06, "loss": 0.6909, "step": 12070 }, { "epoch": 0.8, "grad_norm": 2.039612293243408, "learning_rate": 1.7197048550474643e-06, "loss": 0.6643, "step": 12080 }, { "epoch": 0.8, "grad_norm": 1.7419607639312744, "learning_rate": 1.7114800636793378e-06, "loss": 0.6795, "step": 12090 }, { "epoch": 0.8, "grad_norm": 3.1645009517669678, "learning_rate": 1.7032709244996559e-06, "loss": 0.6452, "step": 12100 }, { "epoch": 0.8, "grad_norm": 2.338763952255249, "learning_rate": 1.695077476581013e-06, "loss": 0.6519, "step": 12110 }, { "epoch": 0.8, "grad_norm": 1.672167420387268, "learning_rate": 1.6868997589213138e-06, "loss": 0.6373, "step": 12120 }, { "epoch": 0.8, "grad_norm": 2.539010763168335, "learning_rate": 1.6787378104435931e-06, "loss": 0.6561, "step": 12130 }, { "epoch": 0.8, "grad_norm": 1.6293374300003052, "learning_rate": 1.6705916699958292e-06, "loss": 0.6846, "step": 12140 }, { "epoch": 0.8, "grad_norm": 2.007550001144409, "learning_rate": 1.662461376350764e-06, "loss": 0.6662, "step": 12150 }, { "epoch": 0.8, "grad_norm": 1.6836156845092773, "learning_rate": 1.6543469682057105e-06, "loss": 0.6571, "step": 12160 }, { "epoch": 0.81, "grad_norm": 1.7401918172836304, "learning_rate": 1.6462484841823712e-06, "loss": 0.6443, "step": 12170 }, { "epoch": 0.81, "grad_norm": 1.6568111181259155, "learning_rate": 1.6381659628266589e-06, "loss": 0.6764, "step": 12180 }, { "epoch": 0.81, "grad_norm": 1.7030593156814575, "learning_rate": 1.6300994426085103e-06, "loss": 0.6521, "step": 12190 }, { "epoch": 0.81, "grad_norm": 1.7264918088912964, "learning_rate": 1.6220489619216988e-06, "loss": 0.6512, "step": 12200 }, { "epoch": 0.81, "grad_norm": 1.9158384799957275, "learning_rate": 1.6140145590836554e-06, "loss": 0.6806, "step": 12210 }, { "epoch": 0.81, "grad_norm": 1.5750938653945923, "learning_rate": 1.6059962723352912e-06, "loss": 0.6463, "step": 12220 }, { "epoch": 0.81, "grad_norm": 2.226882219314575, "learning_rate": 1.5979941398408045e-06, "loss": 0.6607, "step": 12230 }, { "epoch": 0.81, "grad_norm": 1.5629756450653076, "learning_rate": 1.5900081996875083e-06, "loss": 0.6355, "step": 12240 }, { "epoch": 0.81, "grad_norm": 1.8792222738265991, "learning_rate": 1.5820384898856433e-06, "loss": 0.6825, "step": 12250 }, { "epoch": 0.81, "grad_norm": 1.9516578912734985, "learning_rate": 1.574085048368204e-06, "loss": 0.639, "step": 12260 }, { "epoch": 0.81, "grad_norm": 2.400202512741089, "learning_rate": 1.5661479129907508e-06, "loss": 0.6697, "step": 12270 }, { "epoch": 0.81, "grad_norm": 2.255854606628418, "learning_rate": 1.5582271215312294e-06, "loss": 0.6621, "step": 12280 }, { "epoch": 0.81, "grad_norm": 2.195754289627075, "learning_rate": 1.5503227116898017e-06, "loss": 0.6412, "step": 12290 }, { "epoch": 0.81, "grad_norm": 1.8883105516433716, "learning_rate": 1.5424347210886538e-06, "loss": 0.6692, "step": 12300 }, { "epoch": 0.81, "grad_norm": 1.464462161064148, "learning_rate": 1.5345631872718214e-06, "loss": 0.6669, "step": 12310 }, { "epoch": 0.82, "grad_norm": 2.4194180965423584, "learning_rate": 1.5267081477050132e-06, "loss": 0.6795, "step": 12320 }, { "epoch": 0.82, "grad_norm": 1.6516296863555908, "learning_rate": 1.5188696397754344e-06, "loss": 0.6681, "step": 12330 }, { "epoch": 0.82, "grad_norm": 2.128793716430664, "learning_rate": 1.5110477007916002e-06, "loss": 0.6767, "step": 12340 }, { "epoch": 0.82, "grad_norm": 1.7647573947906494, "learning_rate": 1.5032423679831642e-06, "loss": 0.6705, "step": 12350 }, { "epoch": 0.82, "grad_norm": 1.5265520811080933, "learning_rate": 1.4954536785007456e-06, "loss": 0.662, "step": 12360 }, { "epoch": 0.82, "grad_norm": 2.2167117595672607, "learning_rate": 1.487681669415742e-06, "loss": 0.6628, "step": 12370 }, { "epoch": 0.82, "grad_norm": 1.4803221225738525, "learning_rate": 1.4799263777201594e-06, "loss": 0.6688, "step": 12380 }, { "epoch": 0.82, "grad_norm": 1.5486646890640259, "learning_rate": 1.4721878403264344e-06, "loss": 0.66, "step": 12390 }, { "epoch": 0.82, "grad_norm": 2.1329715251922607, "learning_rate": 1.4644660940672628e-06, "loss": 0.6564, "step": 12400 }, { "epoch": 0.82, "grad_norm": 2.2044312953948975, "learning_rate": 1.456761175695417e-06, "loss": 0.6582, "step": 12410 }, { "epoch": 0.82, "grad_norm": 1.7977498769760132, "learning_rate": 1.449073121883573e-06, "loss": 0.6713, "step": 12420 }, { "epoch": 0.82, "grad_norm": 2.147284507751465, "learning_rate": 1.4414019692241437e-06, "loss": 0.6577, "step": 12430 }, { "epoch": 0.82, "grad_norm": 1.5342192649841309, "learning_rate": 1.433747754229093e-06, "loss": 0.6673, "step": 12440 }, { "epoch": 0.82, "grad_norm": 1.5152603387832642, "learning_rate": 1.4261105133297693e-06, "loss": 0.64, "step": 12450 }, { "epoch": 0.82, "grad_norm": 2.131248950958252, "learning_rate": 1.4184902828767288e-06, "loss": 0.6667, "step": 12460 }, { "epoch": 0.83, "grad_norm": 2.3400142192840576, "learning_rate": 1.410887099139569e-06, "loss": 0.6758, "step": 12470 }, { "epoch": 0.83, "grad_norm": 1.8582547903060913, "learning_rate": 1.4033009983067454e-06, "loss": 0.6707, "step": 12480 }, { "epoch": 0.83, "grad_norm": 1.4715774059295654, "learning_rate": 1.395732016485406e-06, "loss": 0.6738, "step": 12490 }, { "epoch": 0.83, "grad_norm": 1.6715984344482422, "learning_rate": 1.3881801897012225e-06, "loss": 0.6513, "step": 12500 }, { "epoch": 0.83, "grad_norm": 2.1860718727111816, "learning_rate": 1.3806455538982106e-06, "loss": 0.6655, "step": 12510 }, { "epoch": 0.83, "grad_norm": 1.513540506362915, "learning_rate": 1.373128144938563e-06, "loss": 0.6441, "step": 12520 }, { "epoch": 0.83, "grad_norm": 1.9079211950302124, "learning_rate": 1.3656279986024802e-06, "loss": 0.6635, "step": 12530 }, { "epoch": 0.83, "grad_norm": 1.9643515348434448, "learning_rate": 1.3581451505879995e-06, "loss": 0.6565, "step": 12540 }, { "epoch": 0.83, "grad_norm": 2.0323612689971924, "learning_rate": 1.3506796365108232e-06, "loss": 0.7029, "step": 12550 }, { "epoch": 0.83, "grad_norm": 2.054985761642456, "learning_rate": 1.3432314919041478e-06, "loss": 0.6354, "step": 12560 }, { "epoch": 0.83, "grad_norm": 1.635546326637268, "learning_rate": 1.3358007522185035e-06, "loss": 0.6554, "step": 12570 }, { "epoch": 0.83, "grad_norm": 1.7966934442520142, "learning_rate": 1.3283874528215735e-06, "loss": 0.653, "step": 12580 }, { "epoch": 0.83, "grad_norm": 2.113840341567993, "learning_rate": 1.3209916289980336e-06, "loss": 0.6667, "step": 12590 }, { "epoch": 0.83, "grad_norm": 1.962321400642395, "learning_rate": 1.3136133159493803e-06, "loss": 0.6496, "step": 12600 }, { "epoch": 0.83, "grad_norm": 1.854382038116455, "learning_rate": 1.30625254879377e-06, "loss": 0.6693, "step": 12610 }, { "epoch": 0.84, "grad_norm": 1.7110834121704102, "learning_rate": 1.2989093625658411e-06, "loss": 0.684, "step": 12620 }, { "epoch": 0.84, "grad_norm": 2.151606798171997, "learning_rate": 1.2915837922165547e-06, "loss": 0.6638, "step": 12630 }, { "epoch": 0.84, "grad_norm": 1.8074573278427124, "learning_rate": 1.2842758726130283e-06, "loss": 0.6783, "step": 12640 }, { "epoch": 0.84, "grad_norm": 1.9045308828353882, "learning_rate": 1.2769856385383689e-06, "loss": 0.6646, "step": 12650 }, { "epoch": 0.84, "grad_norm": 2.033857583999634, "learning_rate": 1.269713124691503e-06, "loss": 0.6633, "step": 12660 }, { "epoch": 0.84, "grad_norm": 2.591675281524658, "learning_rate": 1.2624583656870153e-06, "loss": 0.6656, "step": 12670 }, { "epoch": 0.84, "grad_norm": 1.8951250314712524, "learning_rate": 1.2552213960549891e-06, "loss": 0.6642, "step": 12680 }, { "epoch": 0.84, "grad_norm": 2.0589277744293213, "learning_rate": 1.2480022502408306e-06, "loss": 0.6691, "step": 12690 }, { "epoch": 0.84, "grad_norm": 1.7472984790802002, "learning_rate": 1.2408009626051137e-06, "loss": 0.6548, "step": 12700 }, { "epoch": 0.84, "grad_norm": 2.1019725799560547, "learning_rate": 1.2336175674234112e-06, "loss": 0.6853, "step": 12710 }, { "epoch": 0.84, "grad_norm": 1.7122173309326172, "learning_rate": 1.22645209888614e-06, "loss": 0.6643, "step": 12720 }, { "epoch": 0.84, "grad_norm": 1.4647725820541382, "learning_rate": 1.2193045910983864e-06, "loss": 0.6637, "step": 12730 }, { "epoch": 0.84, "grad_norm": 1.424286127090454, "learning_rate": 1.2121750780797514e-06, "loss": 0.6656, "step": 12740 }, { "epoch": 0.84, "grad_norm": 2.3147671222686768, "learning_rate": 1.2050635937641909e-06, "loss": 0.6865, "step": 12750 }, { "epoch": 0.84, "grad_norm": 2.1018872261047363, "learning_rate": 1.1979701719998454e-06, "loss": 0.6708, "step": 12760 }, { "epoch": 0.85, "grad_norm": 2.553410291671753, "learning_rate": 1.1908948465488878e-06, "loss": 0.6679, "step": 12770 }, { "epoch": 0.85, "grad_norm": 2.1423566341400146, "learning_rate": 1.1838376510873557e-06, "loss": 0.6652, "step": 12780 }, { "epoch": 0.85, "grad_norm": 1.8816630840301514, "learning_rate": 1.1767986192049986e-06, "loss": 0.6632, "step": 12790 }, { "epoch": 0.85, "grad_norm": 2.626129627227783, "learning_rate": 1.1697777844051105e-06, "loss": 0.6678, "step": 12800 }, { "epoch": 0.85, "eval_loss": 0.7841401100158691, "eval_runtime": 134.3092, "eval_samples_per_second": 81.901, "eval_steps_per_second": 10.238, "step": 12800 }, { "epoch": 0.85, "grad_norm": 2.2205734252929688, "learning_rate": 1.1627751801043736e-06, "loss": 0.6638, "step": 12810 }, { "epoch": 0.85, "grad_norm": 1.9543955326080322, "learning_rate": 1.1557908396327028e-06, "loss": 0.6792, "step": 12820 }, { "epoch": 0.85, "grad_norm": 2.1611430644989014, "learning_rate": 1.14882479623308e-06, "loss": 0.667, "step": 12830 }, { "epoch": 0.85, "grad_norm": 1.858528971672058, "learning_rate": 1.1418770830614012e-06, "loss": 0.6918, "step": 12840 }, { "epoch": 0.85, "grad_norm": 2.0636982917785645, "learning_rate": 1.134947733186315e-06, "loss": 0.664, "step": 12850 }, { "epoch": 0.85, "grad_norm": 1.9815447330474854, "learning_rate": 1.1280367795890724e-06, "loss": 0.6562, "step": 12860 }, { "epoch": 0.85, "grad_norm": 1.684668779373169, "learning_rate": 1.1211442551633595e-06, "loss": 0.6825, "step": 12870 }, { "epoch": 0.85, "grad_norm": 2.0503435134887695, "learning_rate": 1.1142701927151456e-06, "loss": 0.6696, "step": 12880 }, { "epoch": 0.85, "grad_norm": 1.9878673553466797, "learning_rate": 1.1074146249625334e-06, "loss": 0.6593, "step": 12890 }, { "epoch": 0.85, "grad_norm": 2.233013391494751, "learning_rate": 1.100577584535592e-06, "loss": 0.6665, "step": 12900 }, { "epoch": 0.85, "grad_norm": 1.5695899724960327, "learning_rate": 1.0937591039762086e-06, "loss": 0.6692, "step": 12910 }, { "epoch": 0.85, "grad_norm": 1.6886929273605347, "learning_rate": 1.0869592157379305e-06, "loss": 0.6749, "step": 12920 }, { "epoch": 0.86, "grad_norm": 1.81573486328125, "learning_rate": 1.0801779521858175e-06, "loss": 0.642, "step": 12930 }, { "epoch": 0.86, "grad_norm": 1.753726840019226, "learning_rate": 1.0734153455962765e-06, "loss": 0.6506, "step": 12940 }, { "epoch": 0.86, "grad_norm": 1.8571149110794067, "learning_rate": 1.0666714281569152e-06, "loss": 0.6661, "step": 12950 }, { "epoch": 0.86, "grad_norm": 1.9926731586456299, "learning_rate": 1.0599462319663906e-06, "loss": 0.6618, "step": 12960 }, { "epoch": 0.86, "grad_norm": 2.0180611610412598, "learning_rate": 1.0532397890342506e-06, "loss": 0.6381, "step": 12970 }, { "epoch": 0.86, "grad_norm": 1.8695250749588013, "learning_rate": 1.0465521312807846e-06, "loss": 0.6609, "step": 12980 }, { "epoch": 0.86, "grad_norm": 1.7631117105484009, "learning_rate": 1.0398832905368693e-06, "loss": 0.6786, "step": 12990 }, { "epoch": 0.86, "grad_norm": 1.9388374090194702, "learning_rate": 1.0332332985438248e-06, "loss": 0.6765, "step": 13000 }, { "epoch": 0.86, "grad_norm": 2.051642417907715, "learning_rate": 1.0266021869532527e-06, "loss": 0.6743, "step": 13010 }, { "epoch": 0.86, "grad_norm": 2.41388201713562, "learning_rate": 1.0199899873268903e-06, "loss": 0.6845, "step": 13020 }, { "epoch": 0.86, "grad_norm": 2.17926025390625, "learning_rate": 1.013396731136465e-06, "loss": 0.6765, "step": 13030 }, { "epoch": 0.86, "grad_norm": 2.158048391342163, "learning_rate": 1.006822449763537e-06, "loss": 0.66, "step": 13040 }, { "epoch": 0.86, "grad_norm": 2.1093921661376953, "learning_rate": 1.0002671744993519e-06, "loss": 0.6716, "step": 13050 }, { "epoch": 0.86, "grad_norm": 1.8924907445907593, "learning_rate": 9.937309365446973e-07, "loss": 0.6559, "step": 13060 }, { "epoch": 0.86, "grad_norm": 1.9550564289093018, "learning_rate": 9.872137670097465e-07, "loss": 0.6692, "step": 13070 }, { "epoch": 0.87, "grad_norm": 2.1951191425323486, "learning_rate": 9.807156969139136e-07, "loss": 0.6782, "step": 13080 }, { "epoch": 0.87, "grad_norm": 2.0226147174835205, "learning_rate": 9.742367571857092e-07, "loss": 0.6669, "step": 13090 }, { "epoch": 0.87, "grad_norm": 2.125762701034546, "learning_rate": 9.677769786625869e-07, "loss": 0.6674, "step": 13100 }, { "epoch": 0.87, "grad_norm": 1.7338881492614746, "learning_rate": 9.613363920908025e-07, "loss": 0.6481, "step": 13110 }, { "epoch": 0.87, "grad_norm": 1.5108554363250732, "learning_rate": 9.549150281252633e-07, "loss": 0.6305, "step": 13120 }, { "epoch": 0.87, "grad_norm": 1.688029408454895, "learning_rate": 9.485129173293823e-07, "loss": 0.6535, "step": 13130 }, { "epoch": 0.87, "grad_norm": 1.406068205833435, "learning_rate": 9.421300901749386e-07, "loss": 0.6413, "step": 13140 }, { "epoch": 0.87, "grad_norm": 1.9008201360702515, "learning_rate": 9.357665770419244e-07, "loss": 0.6724, "step": 13150 }, { "epoch": 0.87, "grad_norm": 2.259673833847046, "learning_rate": 9.294224082184045e-07, "loss": 0.6548, "step": 13160 }, { "epoch": 0.87, "grad_norm": 2.195957660675049, "learning_rate": 9.230976139003717e-07, "loss": 0.6643, "step": 13170 }, { "epoch": 0.87, "grad_norm": 1.6432466506958008, "learning_rate": 9.167922241916055e-07, "loss": 0.6644, "step": 13180 }, { "epoch": 0.87, "grad_norm": 1.8441758155822754, "learning_rate": 9.105062691035233e-07, "loss": 0.6508, "step": 13190 }, { "epoch": 0.87, "grad_norm": 2.337584972381592, "learning_rate": 9.042397785550405e-07, "loss": 0.6821, "step": 13200 }, { "epoch": 0.87, "grad_norm": 1.7856107950210571, "learning_rate": 8.979927823724321e-07, "loss": 0.6543, "step": 13210 }, { "epoch": 0.87, "grad_norm": 2.0911343097686768, "learning_rate": 8.917653102891822e-07, "loss": 0.6605, "step": 13220 }, { "epoch": 0.88, "grad_norm": 1.6215004920959473, "learning_rate": 8.855573919458494e-07, "loss": 0.6685, "step": 13230 }, { "epoch": 0.88, "grad_norm": 1.819148302078247, "learning_rate": 8.793690568899216e-07, "loss": 0.6603, "step": 13240 }, { "epoch": 0.88, "grad_norm": 2.3011019229888916, "learning_rate": 8.732003345756812e-07, "loss": 0.6738, "step": 13250 }, { "epoch": 0.88, "grad_norm": 1.663140892982483, "learning_rate": 8.670512543640574e-07, "loss": 0.6618, "step": 13260 }, { "epoch": 0.88, "grad_norm": 1.843697190284729, "learning_rate": 8.609218455224893e-07, "loss": 0.6638, "step": 13270 }, { "epoch": 0.88, "grad_norm": 2.5996809005737305, "learning_rate": 8.54812137224792e-07, "loss": 0.6497, "step": 13280 }, { "epoch": 0.88, "grad_norm": 1.8779548406600952, "learning_rate": 8.487221585510075e-07, "loss": 0.6613, "step": 13290 }, { "epoch": 0.88, "grad_norm": 2.365200996398926, "learning_rate": 8.426519384872733e-07, "loss": 0.665, "step": 13300 }, { "epoch": 0.88, "grad_norm": 1.9242346286773682, "learning_rate": 8.366015059256871e-07, "loss": 0.6502, "step": 13310 }, { "epoch": 0.88, "grad_norm": 1.5760527849197388, "learning_rate": 8.305708896641596e-07, "loss": 0.6626, "step": 13320 }, { "epoch": 0.88, "grad_norm": 2.014904022216797, "learning_rate": 8.245601184062851e-07, "loss": 0.6584, "step": 13330 }, { "epoch": 0.88, "grad_norm": 2.12705135345459, "learning_rate": 8.185692207612023e-07, "loss": 0.6799, "step": 13340 }, { "epoch": 0.88, "grad_norm": 2.026439905166626, "learning_rate": 8.125982252434611e-07, "loss": 0.6819, "step": 13350 }, { "epoch": 0.88, "grad_norm": 2.0270392894744873, "learning_rate": 8.066471602728804e-07, "loss": 0.6701, "step": 13360 }, { "epoch": 0.88, "grad_norm": 2.5948376655578613, "learning_rate": 8.007160541744186e-07, "loss": 0.657, "step": 13370 }, { "epoch": 0.89, "grad_norm": 1.7466539144515991, "learning_rate": 7.94804935178038e-07, "loss": 0.665, "step": 13380 }, { "epoch": 0.89, "grad_norm": 1.7160353660583496, "learning_rate": 7.88913831418568e-07, "loss": 0.6628, "step": 13390 }, { "epoch": 0.89, "grad_norm": 2.2262911796569824, "learning_rate": 7.830427709355726e-07, "loss": 0.6555, "step": 13400 }, { "epoch": 0.89, "grad_norm": 2.0489237308502197, "learning_rate": 7.771917816732161e-07, "loss": 0.6678, "step": 13410 }, { "epoch": 0.89, "grad_norm": 2.559640645980835, "learning_rate": 7.71360891480134e-07, "loss": 0.6431, "step": 13420 }, { "epoch": 0.89, "grad_norm": 2.476576089859009, "learning_rate": 7.655501281092953e-07, "loss": 0.6667, "step": 13430 }, { "epoch": 0.89, "grad_norm": 1.941450595855713, "learning_rate": 7.597595192178702e-07, "loss": 0.6734, "step": 13440 }, { "epoch": 0.89, "grad_norm": 2.5670228004455566, "learning_rate": 7.539890923671061e-07, "loss": 0.6781, "step": 13450 }, { "epoch": 0.89, "grad_norm": 2.5730628967285156, "learning_rate": 7.482388750221864e-07, "loss": 0.6738, "step": 13460 }, { "epoch": 0.89, "grad_norm": 1.977264165878296, "learning_rate": 7.425088945521064e-07, "loss": 0.6633, "step": 13470 }, { "epoch": 0.89, "grad_norm": 2.168454647064209, "learning_rate": 7.367991782295392e-07, "loss": 0.6604, "step": 13480 }, { "epoch": 0.89, "grad_norm": 1.8109989166259766, "learning_rate": 7.311097532307121e-07, "loss": 0.6542, "step": 13490 }, { "epoch": 0.89, "grad_norm": 1.8640021085739136, "learning_rate": 7.254406466352682e-07, "loss": 0.6736, "step": 13500 }, { "epoch": 0.89, "grad_norm": 2.002460241317749, "learning_rate": 7.197918854261432e-07, "loss": 0.6951, "step": 13510 }, { "epoch": 0.89, "grad_norm": 1.855389952659607, "learning_rate": 7.141634964894389e-07, "loss": 0.6749, "step": 13520 }, { "epoch": 0.9, "grad_norm": 2.7581233978271484, "learning_rate": 7.085555066142886e-07, "loss": 0.6567, "step": 13530 }, { "epoch": 0.9, "grad_norm": 1.7739157676696777, "learning_rate": 7.029679424927366e-07, "loss": 0.6697, "step": 13540 }, { "epoch": 0.9, "grad_norm": 1.4929020404815674, "learning_rate": 6.974008307196057e-07, "loss": 0.6861, "step": 13550 }, { "epoch": 0.9, "grad_norm": 2.034844398498535, "learning_rate": 6.918541977923709e-07, "loss": 0.6607, "step": 13560 }, { "epoch": 0.9, "grad_norm": 1.5017189979553223, "learning_rate": 6.863280701110409e-07, "loss": 0.6726, "step": 13570 }, { "epoch": 0.9, "grad_norm": 1.9159743785858154, "learning_rate": 6.808224739780217e-07, "loss": 0.6781, "step": 13580 }, { "epoch": 0.9, "grad_norm": 1.910868763923645, "learning_rate": 6.753374355979975e-07, "loss": 0.6363, "step": 13590 }, { "epoch": 0.9, "grad_norm": 2.351539373397827, "learning_rate": 6.698729810778065e-07, "loss": 0.6571, "step": 13600 }, { "epoch": 0.9, "grad_norm": 3.010953664779663, "learning_rate": 6.644291364263139e-07, "loss": 0.6714, "step": 13610 }, { "epoch": 0.9, "grad_norm": 1.7596651315689087, "learning_rate": 6.590059275542882e-07, "loss": 0.6746, "step": 13620 }, { "epoch": 0.9, "grad_norm": 2.1174886226654053, "learning_rate": 6.536033802742814e-07, "loss": 0.6606, "step": 13630 }, { "epoch": 0.9, "grad_norm": 1.700536847114563, "learning_rate": 6.482215203005016e-07, "loss": 0.6512, "step": 13640 }, { "epoch": 0.9, "grad_norm": 1.9607415199279785, "learning_rate": 6.428603732486938e-07, "loss": 0.6769, "step": 13650 }, { "epoch": 0.9, "grad_norm": 1.9792729616165161, "learning_rate": 6.375199646360142e-07, "loss": 0.6615, "step": 13660 }, { "epoch": 0.9, "grad_norm": 1.2958414554595947, "learning_rate": 6.322003198809162e-07, "loss": 0.6879, "step": 13670 }, { "epoch": 0.91, "grad_norm": 2.095305919647217, "learning_rate": 6.269014643030214e-07, "loss": 0.6753, "step": 13680 }, { "epoch": 0.91, "grad_norm": 2.2841269969940186, "learning_rate": 6.216234231230012e-07, "loss": 0.6827, "step": 13690 }, { "epoch": 0.91, "grad_norm": 1.9332698583602905, "learning_rate": 6.163662214624616e-07, "loss": 0.6721, "step": 13700 }, { "epoch": 0.91, "grad_norm": 2.1489346027374268, "learning_rate": 6.111298843438169e-07, "loss": 0.6442, "step": 13710 }, { "epoch": 0.91, "grad_norm": 1.6940524578094482, "learning_rate": 6.059144366901737e-07, "loss": 0.6637, "step": 13720 }, { "epoch": 0.91, "grad_norm": 1.6115096807479858, "learning_rate": 6.007199033252131e-07, "loss": 0.669, "step": 13730 }, { "epoch": 0.91, "grad_norm": 2.0993459224700928, "learning_rate": 5.955463089730723e-07, "loss": 0.6644, "step": 13740 }, { "epoch": 0.91, "grad_norm": 1.9472441673278809, "learning_rate": 5.903936782582253e-07, "loss": 0.6715, "step": 13750 }, { "epoch": 0.91, "grad_norm": 2.4469716548919678, "learning_rate": 5.852620357053651e-07, "loss": 0.6633, "step": 13760 }, { "epoch": 0.91, "grad_norm": 2.319772481918335, "learning_rate": 5.80151405739292e-07, "loss": 0.689, "step": 13770 }, { "epoch": 0.91, "grad_norm": 1.8905223608016968, "learning_rate": 5.750618126847912e-07, "loss": 0.6469, "step": 13780 }, { "epoch": 0.91, "grad_norm": 1.8720320463180542, "learning_rate": 5.699932807665198e-07, "loss": 0.6647, "step": 13790 }, { "epoch": 0.91, "grad_norm": 1.8318226337432861, "learning_rate": 5.649458341088915e-07, "loss": 0.6804, "step": 13800 }, { "epoch": 0.91, "grad_norm": 2.6785199642181396, "learning_rate": 5.599194967359639e-07, "loss": 0.6481, "step": 13810 }, { "epoch": 0.91, "grad_norm": 2.7276508808135986, "learning_rate": 5.549142925713186e-07, "loss": 0.6818, "step": 13820 }, { "epoch": 0.92, "grad_norm": 2.1352598667144775, "learning_rate": 5.499302454379512e-07, "loss": 0.6594, "step": 13830 }, { "epoch": 0.92, "grad_norm": 1.9602961540222168, "learning_rate": 5.449673790581611e-07, "loss": 0.6741, "step": 13840 }, { "epoch": 0.92, "grad_norm": 2.205834150314331, "learning_rate": 5.400257170534296e-07, "loss": 0.6669, "step": 13850 }, { "epoch": 0.92, "grad_norm": 2.2802295684814453, "learning_rate": 5.351052829443159e-07, "loss": 0.6576, "step": 13860 }, { "epoch": 0.92, "grad_norm": 1.6849833726882935, "learning_rate": 5.302061001503395e-07, "loss": 0.6648, "step": 13870 }, { "epoch": 0.92, "grad_norm": 2.004652976989746, "learning_rate": 5.253281919898751e-07, "loss": 0.6302, "step": 13880 }, { "epoch": 0.92, "grad_norm": 1.8887594938278198, "learning_rate": 5.204715816800343e-07, "loss": 0.6642, "step": 13890 }, { "epoch": 0.92, "grad_norm": 2.5696282386779785, "learning_rate": 5.156362923365587e-07, "loss": 0.6419, "step": 13900 }, { "epoch": 0.92, "grad_norm": 1.7427834272384644, "learning_rate": 5.108223469737117e-07, "loss": 0.6784, "step": 13910 }, { "epoch": 0.92, "grad_norm": 2.1054744720458984, "learning_rate": 5.06029768504166e-07, "loss": 0.6545, "step": 13920 }, { "epoch": 0.92, "grad_norm": 2.091437339782715, "learning_rate": 5.012585797388936e-07, "loss": 0.655, "step": 13930 }, { "epoch": 0.92, "grad_norm": 2.2512803077697754, "learning_rate": 4.965088033870608e-07, "loss": 0.6354, "step": 13940 }, { "epoch": 0.92, "grad_norm": 2.2140800952911377, "learning_rate": 4.917804620559202e-07, "loss": 0.6559, "step": 13950 }, { "epoch": 0.92, "grad_norm": 1.8559973239898682, "learning_rate": 4.87073578250698e-07, "loss": 0.6559, "step": 13960 }, { "epoch": 0.92, "grad_norm": 3.173664093017578, "learning_rate": 4.823881743744907e-07, "loss": 0.6733, "step": 13970 }, { "epoch": 0.93, "grad_norm": 2.235106945037842, "learning_rate": 4.777242727281594e-07, "loss": 0.6404, "step": 13980 }, { "epoch": 0.93, "grad_norm": 1.6829653978347778, "learning_rate": 4.730818955102234e-07, "loss": 0.6621, "step": 13990 }, { "epoch": 0.93, "grad_norm": 2.1085221767425537, "learning_rate": 4.6846106481675035e-07, "loss": 0.6712, "step": 14000 }, { "epoch": 0.93, "grad_norm": 2.800367593765259, "learning_rate": 4.638618026412539e-07, "loss": 0.6713, "step": 14010 }, { "epoch": 0.93, "grad_norm": 1.9451186656951904, "learning_rate": 4.5928413087459325e-07, "loss": 0.664, "step": 14020 }, { "epoch": 0.93, "grad_norm": 1.687585711479187, "learning_rate": 4.5472807130486075e-07, "loss": 0.6699, "step": 14030 }, { "epoch": 0.93, "grad_norm": 2.3365931510925293, "learning_rate": 4.501936456172845e-07, "loss": 0.6572, "step": 14040 }, { "epoch": 0.93, "grad_norm": 1.7187267541885376, "learning_rate": 4.456808753941205e-07, "loss": 0.6675, "step": 14050 }, { "epoch": 0.93, "grad_norm": 2.139224052429199, "learning_rate": 4.4118978211455723e-07, "loss": 0.6786, "step": 14060 }, { "epoch": 0.93, "grad_norm": 2.3092565536499023, "learning_rate": 4.367203871546039e-07, "loss": 0.6549, "step": 14070 }, { "epoch": 0.93, "grad_norm": 1.6337780952453613, "learning_rate": 4.322727117869951e-07, "loss": 0.6408, "step": 14080 }, { "epoch": 0.93, "grad_norm": 1.9629242420196533, "learning_rate": 4.278467771810896e-07, "loss": 0.6744, "step": 14090 }, { "epoch": 0.93, "grad_norm": 2.3631951808929443, "learning_rate": 4.2344260440276455e-07, "loss": 0.6699, "step": 14100 }, { "epoch": 0.93, "grad_norm": 1.8263096809387207, "learning_rate": 4.1906021441432074e-07, "loss": 0.6651, "step": 14110 }, { "epoch": 0.93, "grad_norm": 2.167978286743164, "learning_rate": 4.146996280743798e-07, "loss": 0.6738, "step": 14120 }, { "epoch": 0.94, "grad_norm": 1.9163544178009033, "learning_rate": 4.103608661377867e-07, "loss": 0.6654, "step": 14130 }, { "epoch": 0.94, "grad_norm": 3.2101268768310547, "learning_rate": 4.0604394925550906e-07, "loss": 0.6709, "step": 14140 }, { "epoch": 0.94, "grad_norm": 1.9373053312301636, "learning_rate": 4.0174889797453875e-07, "loss": 0.6675, "step": 14150 }, { "epoch": 0.94, "grad_norm": 2.323124647140503, "learning_rate": 3.9747573273779816e-07, "loss": 0.6869, "step": 14160 }, { "epoch": 0.94, "grad_norm": 2.504706621170044, "learning_rate": 3.9322447388403796e-07, "loss": 0.6576, "step": 14170 }, { "epoch": 0.94, "grad_norm": 2.201514959335327, "learning_rate": 3.8899514164774166e-07, "loss": 0.6594, "step": 14180 }, { "epoch": 0.94, "grad_norm": 2.347700834274292, "learning_rate": 3.8478775615902965e-07, "loss": 0.6843, "step": 14190 }, { "epoch": 0.94, "grad_norm": 2.5682637691497803, "learning_rate": 3.8060233744356634e-07, "loss": 0.6636, "step": 14200 }, { "epoch": 0.94, "grad_norm": 2.0093626976013184, "learning_rate": 3.7643890542245985e-07, "loss": 0.6518, "step": 14210 }, { "epoch": 0.94, "grad_norm": 2.4383957386016846, "learning_rate": 3.722974799121681e-07, "loss": 0.6349, "step": 14220 }, { "epoch": 0.94, "grad_norm": 3.2455692291259766, "learning_rate": 3.6817808062440953e-07, "loss": 0.6564, "step": 14230 }, { "epoch": 0.94, "grad_norm": 1.918816328048706, "learning_rate": 3.6408072716606346e-07, "loss": 0.6615, "step": 14240 }, { "epoch": 0.94, "grad_norm": 2.3905696868896484, "learning_rate": 3.600054390390778e-07, "loss": 0.6458, "step": 14250 }, { "epoch": 0.94, "grad_norm": 2.004607677459717, "learning_rate": 3.5595223564037884e-07, "loss": 0.679, "step": 14260 }, { "epoch": 0.94, "grad_norm": 1.7747329473495483, "learning_rate": 3.5192113626177806e-07, "loss": 0.6708, "step": 14270 }, { "epoch": 0.94, "grad_norm": 2.097956657409668, "learning_rate": 3.479121600898777e-07, "loss": 0.652, "step": 14280 }, { "epoch": 0.95, "grad_norm": 2.358074903488159, "learning_rate": 3.439253262059822e-07, "loss": 0.6811, "step": 14290 }, { "epoch": 0.95, "grad_norm": 2.267530679702759, "learning_rate": 3.399606535860078e-07, "loss": 0.6504, "step": 14300 }, { "epoch": 0.95, "grad_norm": 2.4041032791137695, "learning_rate": 3.360181611003893e-07, "loss": 0.6658, "step": 14310 }, { "epoch": 0.95, "grad_norm": 1.7667807340621948, "learning_rate": 3.320978675139919e-07, "loss": 0.6827, "step": 14320 }, { "epoch": 0.95, "grad_norm": 2.0427334308624268, "learning_rate": 3.2819979148602245e-07, "loss": 0.6617, "step": 14330 }, { "epoch": 0.95, "grad_norm": 2.4956207275390625, "learning_rate": 3.24323951569942e-07, "loss": 0.6386, "step": 14340 }, { "epoch": 0.95, "grad_norm": 2.769589900970459, "learning_rate": 3.204703662133724e-07, "loss": 0.6644, "step": 14350 }, { "epoch": 0.95, "grad_norm": 2.495105743408203, "learning_rate": 3.166390537580122e-07, "loss": 0.6637, "step": 14360 }, { "epoch": 0.95, "grad_norm": 2.220623731613159, "learning_rate": 3.128300324395517e-07, "loss": 0.6637, "step": 14370 }, { "epoch": 0.95, "grad_norm": 1.9806053638458252, "learning_rate": 3.0904332038757977e-07, "loss": 0.652, "step": 14380 }, { "epoch": 0.95, "grad_norm": 1.9179275035858154, "learning_rate": 3.052789356255037e-07, "loss": 0.6586, "step": 14390 }, { "epoch": 0.95, "grad_norm": 2.090574026107788, "learning_rate": 3.015368960704584e-07, "loss": 0.6507, "step": 14400 }, { "epoch": 0.95, "eval_loss": 0.7874770760536194, "eval_runtime": 133.7124, "eval_samples_per_second": 82.266, "eval_steps_per_second": 10.283, "step": 14400 }, { "epoch": 0.95, "grad_norm": 5.8708295822143555, "learning_rate": 2.9781721953322627e-07, "loss": 0.6516, "step": 14410 }, { "epoch": 0.95, "grad_norm": 1.9250444173812866, "learning_rate": 2.9411992371814744e-07, "loss": 0.6604, "step": 14420 }, { "epoch": 0.95, "grad_norm": 1.5833460092544556, "learning_rate": 2.904450262230385e-07, "loss": 0.6766, "step": 14430 }, { "epoch": 0.96, "grad_norm": 2.0383031368255615, "learning_rate": 2.867925445391079e-07, "loss": 0.6542, "step": 14440 }, { "epoch": 0.96, "grad_norm": 1.9491095542907715, "learning_rate": 2.8316249605087386e-07, "loss": 0.6496, "step": 14450 }, { "epoch": 0.96, "grad_norm": 1.9134398698806763, "learning_rate": 2.7955489803607907e-07, "loss": 0.6666, "step": 14460 }, { "epoch": 0.96, "grad_norm": 1.8715418577194214, "learning_rate": 2.7596976766560977e-07, "loss": 0.6934, "step": 14470 }, { "epoch": 0.96, "grad_norm": 2.399040699005127, "learning_rate": 2.724071220034158e-07, "loss": 0.6518, "step": 14480 }, { "epoch": 0.96, "grad_norm": 2.1483490467071533, "learning_rate": 2.688669780064268e-07, "loss": 0.6539, "step": 14490 }, { "epoch": 0.96, "grad_norm": 2.5796451568603516, "learning_rate": 2.653493525244721e-07, "loss": 0.6554, "step": 14500 }, { "epoch": 0.96, "grad_norm": 2.631964683532715, "learning_rate": 2.6185426230020074e-07, "loss": 0.6617, "step": 14510 }, { "epoch": 0.96, "grad_norm": 2.705181837081909, "learning_rate": 2.583817239690034e-07, "loss": 0.6644, "step": 14520 }, { "epoch": 0.96, "grad_norm": 2.420186758041382, "learning_rate": 2.5493175405893076e-07, "loss": 0.6518, "step": 14530 }, { "epoch": 0.96, "grad_norm": 1.8278932571411133, "learning_rate": 2.5150436899061494e-07, "loss": 0.6563, "step": 14540 }, { "epoch": 0.96, "grad_norm": 2.477051019668579, "learning_rate": 2.4809958507719444e-07, "loss": 0.6658, "step": 14550 }, { "epoch": 0.96, "grad_norm": 1.8384480476379395, "learning_rate": 2.447174185242324e-07, "loss": 0.6573, "step": 14560 }, { "epoch": 0.96, "grad_norm": 2.3078768253326416, "learning_rate": 2.413578854296417e-07, "loss": 0.6715, "step": 14570 }, { "epoch": 0.96, "grad_norm": 2.314739227294922, "learning_rate": 2.3802100178360822e-07, "loss": 0.6414, "step": 14580 }, { "epoch": 0.97, "grad_norm": 1.8191229104995728, "learning_rate": 2.3470678346851517e-07, "loss": 0.6532, "step": 14590 }, { "epoch": 0.97, "grad_norm": 2.201284170150757, "learning_rate": 2.314152462588659e-07, "loss": 0.6845, "step": 14600 }, { "epoch": 0.97, "grad_norm": 1.9875273704528809, "learning_rate": 2.2814640582120905e-07, "loss": 0.6565, "step": 14610 }, { "epoch": 0.97, "grad_norm": 2.196948289871216, "learning_rate": 2.2490027771406686e-07, "loss": 0.6689, "step": 14620 }, { "epoch": 0.97, "grad_norm": 2.2184298038482666, "learning_rate": 2.2167687738785748e-07, "loss": 0.6884, "step": 14630 }, { "epoch": 0.97, "grad_norm": 2.3369548320770264, "learning_rate": 2.1847622018482283e-07, "loss": 0.6681, "step": 14640 }, { "epoch": 0.97, "grad_norm": 2.206387996673584, "learning_rate": 2.152983213389559e-07, "loss": 0.6722, "step": 14650 }, { "epoch": 0.97, "grad_norm": 1.7726385593414307, "learning_rate": 2.1214319597592792e-07, "loss": 0.6777, "step": 14660 }, { "epoch": 0.97, "grad_norm": 2.626243829727173, "learning_rate": 2.090108591130169e-07, "loss": 0.6818, "step": 14670 }, { "epoch": 0.97, "grad_norm": 2.6065804958343506, "learning_rate": 2.0590132565903475e-07, "loss": 0.6521, "step": 14680 }, { "epoch": 0.97, "grad_norm": 3.029127359390259, "learning_rate": 2.028146104142581e-07, "loss": 0.6711, "step": 14690 }, { "epoch": 0.97, "grad_norm": 2.4063072204589844, "learning_rate": 1.99750728070357e-07, "loss": 0.6603, "step": 14700 }, { "epoch": 0.97, "grad_norm": 2.1884777545928955, "learning_rate": 1.9670969321032407e-07, "loss": 0.6469, "step": 14710 }, { "epoch": 0.97, "grad_norm": 1.968603253364563, "learning_rate": 1.9369152030840553e-07, "loss": 0.6654, "step": 14720 }, { "epoch": 0.97, "grad_norm": 2.769115686416626, "learning_rate": 1.9069622373003638e-07, "loss": 0.6621, "step": 14730 }, { "epoch": 0.98, "grad_norm": 2.50546932220459, "learning_rate": 1.8772381773176417e-07, "loss": 0.6727, "step": 14740 }, { "epoch": 0.98, "grad_norm": 1.992620587348938, "learning_rate": 1.8477431646118648e-07, "loss": 0.6462, "step": 14750 }, { "epoch": 0.98, "grad_norm": 1.9770318269729614, "learning_rate": 1.8184773395688527e-07, "loss": 0.6522, "step": 14760 }, { "epoch": 0.98, "grad_norm": 2.0310850143432617, "learning_rate": 1.7894408414835362e-07, "loss": 0.6826, "step": 14770 }, { "epoch": 0.98, "grad_norm": 2.093679904937744, "learning_rate": 1.7606338085593532e-07, "loss": 0.675, "step": 14780 }, { "epoch": 0.98, "grad_norm": 1.967199444770813, "learning_rate": 1.7320563779075595e-07, "loss": 0.6918, "step": 14790 }, { "epoch": 0.98, "grad_norm": 1.9936221837997437, "learning_rate": 1.7037086855465902e-07, "loss": 0.6603, "step": 14800 }, { "epoch": 0.98, "grad_norm": 2.7689085006713867, "learning_rate": 1.6755908664014054e-07, "loss": 0.659, "step": 14810 }, { "epoch": 0.98, "grad_norm": 2.3982479572296143, "learning_rate": 1.6477030543028462e-07, "loss": 0.6478, "step": 14820 }, { "epoch": 0.98, "grad_norm": 2.1643288135528564, "learning_rate": 1.6200453819870122e-07, "loss": 0.6382, "step": 14830 }, { "epoch": 0.98, "grad_norm": 1.9189598560333252, "learning_rate": 1.5926179810946185e-07, "loss": 0.6403, "step": 14840 }, { "epoch": 0.98, "grad_norm": 1.7351678609848022, "learning_rate": 1.5654209821703458e-07, "loss": 0.6642, "step": 14850 }, { "epoch": 0.98, "grad_norm": 1.879950761795044, "learning_rate": 1.5384545146622854e-07, "loss": 0.6572, "step": 14860 }, { "epoch": 0.98, "grad_norm": 2.0498573780059814, "learning_rate": 1.511718706921239e-07, "loss": 0.6626, "step": 14870 }, { "epoch": 0.98, "grad_norm": 2.135221004486084, "learning_rate": 1.4852136862001766e-07, "loss": 0.662, "step": 14880 }, { "epoch": 0.99, "grad_norm": 2.5622806549072266, "learning_rate": 1.4589395786535954e-07, "loss": 0.6911, "step": 14890 }, { "epoch": 0.99, "grad_norm": 3.4293711185455322, "learning_rate": 1.4328965093369284e-07, "loss": 0.6699, "step": 14900 }, { "epoch": 0.99, "grad_norm": 2.0343523025512695, "learning_rate": 1.4070846022059437e-07, "loss": 0.6785, "step": 14910 }, { "epoch": 0.99, "grad_norm": 2.4741287231445312, "learning_rate": 1.3815039801161723e-07, "loss": 0.6639, "step": 14920 }, { "epoch": 0.99, "grad_norm": 1.8715765476226807, "learning_rate": 1.3561547648222871e-07, "loss": 0.6485, "step": 14930 }, { "epoch": 0.99, "grad_norm": 2.449462890625, "learning_rate": 1.331037076977576e-07, "loss": 0.664, "step": 14940 }, { "epoch": 0.99, "grad_norm": 2.398143768310547, "learning_rate": 1.3061510361333186e-07, "loss": 0.6662, "step": 14950 }, { "epoch": 0.99, "grad_norm": 2.0606908798217773, "learning_rate": 1.2814967607382433e-07, "loss": 0.6564, "step": 14960 }, { "epoch": 0.99, "grad_norm": 2.3036439418792725, "learning_rate": 1.257074368137945e-07, "loss": 0.6524, "step": 14970 }, { "epoch": 0.99, "grad_norm": 2.2730531692504883, "learning_rate": 1.232883974574367e-07, "loss": 0.6413, "step": 14980 }, { "epoch": 0.99, "grad_norm": 2.5283987522125244, "learning_rate": 1.2089256951851923e-07, "loss": 0.6682, "step": 14990 }, { "epoch": 0.99, "grad_norm": 2.1954846382141113, "learning_rate": 1.185199644003332e-07, "loss": 0.6377, "step": 15000 }, { "epoch": 0.99, "grad_norm": 1.8074020147323608, "learning_rate": 1.1617059339563807e-07, "loss": 0.6575, "step": 15010 }, { "epoch": 0.99, "grad_norm": 1.7706438302993774, "learning_rate": 1.1384446768660572e-07, "loss": 0.666, "step": 15020 }, { "epoch": 0.99, "grad_norm": 2.4781429767608643, "learning_rate": 1.115415983447704e-07, "loss": 0.6667, "step": 15030 }, { "epoch": 1.0, "grad_norm": 2.2024221420288086, "learning_rate": 1.0926199633097156e-07, "loss": 0.662, "step": 15040 }, { "epoch": 1.0, "grad_norm": 2.2697882652282715, "learning_rate": 1.0700567249530835e-07, "loss": 0.6578, "step": 15050 }, { "epoch": 1.0, "grad_norm": 2.4836816787719727, "learning_rate": 1.0477263757708078e-07, "loss": 0.6539, "step": 15060 }, { "epoch": 1.0, "grad_norm": 2.1601076126098633, "learning_rate": 1.0256290220474308e-07, "loss": 0.6474, "step": 15070 }, { "epoch": 1.0, "grad_norm": 1.9255223274230957, "learning_rate": 1.0037647689585207e-07, "loss": 0.6431, "step": 15080 }, { "epoch": 1.0, "grad_norm": 1.622870922088623, "learning_rate": 9.821337205701664e-08, "loss": 0.6397, "step": 15090 }, { "epoch": 1.0, "grad_norm": 3.9312548637390137, "learning_rate": 9.607359798384785e-08, "loss": 0.6416, "step": 15100 }, { "epoch": 1.0, "grad_norm": 2.5819900035858154, "learning_rate": 9.395716486091222e-08, "loss": 0.6512, "step": 15110 }, { "epoch": 1.0, "grad_norm": 1.9174243211746216, "learning_rate": 9.186408276168012e-08, "loss": 0.6391, "step": 15120 }, { "epoch": 1.0, "grad_norm": 2.371643304824829, "learning_rate": 8.979436164848088e-08, "loss": 0.6606, "step": 15130 }, { "epoch": 1.0, "grad_norm": 2.4533822536468506, "learning_rate": 8.77480113724516e-08, "loss": 0.6642, "step": 15140 }, { "epoch": 1.0, "grad_norm": 2.1988747119903564, "learning_rate": 8.572504167349449e-08, "loss": 0.637, "step": 15150 }, { "epoch": 1.0, "grad_norm": 2.2295937538146973, "learning_rate": 8.372546218022747e-08, "loss": 0.6612, "step": 15160 }, { "epoch": 1.0, "grad_norm": 1.8617255687713623, "learning_rate": 8.174928240993917e-08, "loss": 0.6626, "step": 15170 }, { "epoch": 1.0, "grad_norm": 2.5846171379089355, "learning_rate": 7.979651176854564e-08, "loss": 0.6715, "step": 15180 }, { "epoch": 1.01, "grad_norm": 1.8019553422927856, "learning_rate": 7.786715955054202e-08, "loss": 0.6487, "step": 15190 }, { "epoch": 1.01, "grad_norm": 2.963513135910034, "learning_rate": 7.59612349389599e-08, "loss": 0.6428, "step": 15200 }, { "epoch": 1.01, "grad_norm": 2.1513736248016357, "learning_rate": 7.407874700532447e-08, "loss": 0.6655, "step": 15210 }, { "epoch": 1.01, "grad_norm": 1.9389243125915527, "learning_rate": 7.221970470961125e-08, "loss": 0.6522, "step": 15220 }, { "epoch": 1.01, "grad_norm": 2.251108407974243, "learning_rate": 7.03841169002023e-08, "loss": 0.6634, "step": 15230 }, { "epoch": 1.01, "grad_norm": 2.1588563919067383, "learning_rate": 6.857199231384282e-08, "loss": 0.6442, "step": 15240 }, { "epoch": 1.01, "grad_norm": 1.868743896484375, "learning_rate": 6.678333957560513e-08, "loss": 0.6628, "step": 15250 }, { "epoch": 1.01, "grad_norm": 1.8450775146484375, "learning_rate": 6.501816719884091e-08, "loss": 0.6362, "step": 15260 }, { "epoch": 1.01, "grad_norm": 1.971536636352539, "learning_rate": 6.327648358514404e-08, "loss": 0.655, "step": 15270 }, { "epoch": 1.01, "grad_norm": 2.585538625717163, "learning_rate": 6.15582970243117e-08, "loss": 0.6618, "step": 15280 }, { "epoch": 1.01, "grad_norm": 2.5381572246551514, "learning_rate": 5.986361569430166e-08, "loss": 0.6577, "step": 15290 }, { "epoch": 1.01, "grad_norm": 2.4174349308013916, "learning_rate": 5.8192447661196694e-08, "loss": 0.6678, "step": 15300 }, { "epoch": 1.01, "grad_norm": 1.9419946670532227, "learning_rate": 5.654480087916303e-08, "loss": 0.6855, "step": 15310 }, { "epoch": 1.01, "grad_norm": 2.323552131652832, "learning_rate": 5.492068319041588e-08, "loss": 0.6705, "step": 15320 }, { "epoch": 1.01, "grad_norm": 2.2662553787231445, "learning_rate": 5.332010232517892e-08, "loss": 0.6549, "step": 15330 }, { "epoch": 1.02, "grad_norm": 1.6378850936889648, "learning_rate": 5.174306590164879e-08, "loss": 0.6512, "step": 15340 }, { "epoch": 1.02, "grad_norm": 2.1355671882629395, "learning_rate": 5.0189581425960644e-08, "loss": 0.6373, "step": 15350 }, { "epoch": 1.02, "grad_norm": 2.738846778869629, "learning_rate": 4.865965629214819e-08, "loss": 0.6454, "step": 15360 }, { "epoch": 1.02, "grad_norm": 2.610119342803955, "learning_rate": 4.715329778211375e-08, "loss": 0.6471, "step": 15370 }, { "epoch": 1.02, "grad_norm": 2.306705951690674, "learning_rate": 4.5670513065588785e-08, "loss": 0.6486, "step": 15380 }, { "epoch": 1.02, "grad_norm": 2.4253878593444824, "learning_rate": 4.42113092001023e-08, "loss": 0.6577, "step": 15390 }, { "epoch": 1.02, "grad_norm": 2.209932565689087, "learning_rate": 4.2775693130948094e-08, "loss": 0.6449, "step": 15400 }, { "epoch": 1.02, "grad_norm": 2.779481887817383, "learning_rate": 4.1363671691148633e-08, "loss": 0.6569, "step": 15410 }, { "epoch": 1.02, "grad_norm": 2.4713690280914307, "learning_rate": 3.9975251601425126e-08, "loss": 0.6672, "step": 15420 }, { "epoch": 1.02, "grad_norm": 2.153374195098877, "learning_rate": 3.861043947016474e-08, "loss": 0.6839, "step": 15430 }, { "epoch": 1.02, "grad_norm": 1.948896884918213, "learning_rate": 3.726924179339009e-08, "loss": 0.6471, "step": 15440 }, { "epoch": 1.02, "grad_norm": 3.0682265758514404, "learning_rate": 3.59516649547248e-08, "loss": 0.6591, "step": 15450 }, { "epoch": 1.02, "grad_norm": 2.3854193687438965, "learning_rate": 3.465771522536854e-08, "loss": 0.6489, "step": 15460 }, { "epoch": 1.02, "grad_norm": 1.635633111000061, "learning_rate": 3.3387398764062603e-08, "loss": 0.6699, "step": 15470 }, { "epoch": 1.02, "grad_norm": 1.99565589427948, "learning_rate": 3.214072161706272e-08, "loss": 0.6636, "step": 15480 }, { "epoch": 1.03, "grad_norm": 1.8887382745742798, "learning_rate": 3.09176897181096e-08, "loss": 0.651, "step": 15490 }, { "epoch": 1.03, "grad_norm": 2.1598098278045654, "learning_rate": 2.971830888840177e-08, "loss": 0.6575, "step": 15500 }, { "epoch": 1.03, "grad_norm": 2.4078586101531982, "learning_rate": 2.854258483656669e-08, "loss": 0.6666, "step": 15510 }, { "epoch": 1.03, "grad_norm": 2.4672226905822754, "learning_rate": 2.7390523158633552e-08, "loss": 0.631, "step": 15520 }, { "epoch": 1.03, "grad_norm": 2.4110639095306396, "learning_rate": 2.6262129338006647e-08, "loss": 0.6673, "step": 15530 }, { "epoch": 1.03, "grad_norm": 1.6732667684555054, "learning_rate": 2.515740874544148e-08, "loss": 0.6444, "step": 15540 }, { "epoch": 1.03, "grad_norm": 2.4348301887512207, "learning_rate": 2.4076366639015914e-08, "loss": 0.6613, "step": 15550 }, { "epoch": 1.03, "grad_norm": 2.3126699924468994, "learning_rate": 2.301900816410574e-08, "loss": 0.6694, "step": 15560 }, { "epoch": 1.03, "grad_norm": 2.0329172611236572, "learning_rate": 2.198533835336414e-08, "loss": 0.6501, "step": 15570 }, { "epoch": 1.03, "grad_norm": 2.472811460494995, "learning_rate": 2.097536212669171e-08, "loss": 0.6594, "step": 15580 }, { "epoch": 1.03, "grad_norm": 2.291701555252075, "learning_rate": 1.9989084291216487e-08, "loss": 0.658, "step": 15590 }, { "epoch": 1.03, "grad_norm": 1.8039275407791138, "learning_rate": 1.9026509541272276e-08, "loss": 0.6577, "step": 15600 }, { "epoch": 1.03, "grad_norm": 2.6124210357666016, "learning_rate": 1.8087642458373135e-08, "loss": 0.6733, "step": 15610 }, { "epoch": 1.03, "grad_norm": 2.3919432163238525, "learning_rate": 1.7172487511192827e-08, "loss": 0.6597, "step": 15620 }, { "epoch": 1.03, "grad_norm": 1.9119877815246582, "learning_rate": 1.628104905554484e-08, "loss": 0.6733, "step": 15630 }, { "epoch": 1.03, "grad_norm": 2.052133083343506, "learning_rate": 1.541333133436018e-08, "loss": 0.6405, "step": 15640 }, { "epoch": 1.04, "grad_norm": 2.703890323638916, "learning_rate": 1.4569338477666838e-08, "loss": 0.6659, "step": 15650 }, { "epoch": 1.04, "grad_norm": 2.1235244274139404, "learning_rate": 1.3749074502572012e-08, "loss": 0.6451, "step": 15660 }, { "epoch": 1.04, "grad_norm": 3.593223810195923, "learning_rate": 1.2952543313240474e-08, "loss": 0.6553, "step": 15670 }, { "epoch": 1.04, "grad_norm": 1.7909396886825562, "learning_rate": 1.2179748700879013e-08, "loss": 0.6514, "step": 15680 }, { "epoch": 1.04, "grad_norm": 2.645784854888916, "learning_rate": 1.1430694343715354e-08, "loss": 0.6431, "step": 15690 }, { "epoch": 1.04, "grad_norm": 2.3556981086730957, "learning_rate": 1.0705383806982606e-08, "loss": 0.6731, "step": 15700 }, { "epoch": 1.04, "grad_norm": 2.335996389389038, "learning_rate": 1.00038205429015e-08, "loss": 0.6665, "step": 15710 }, { "epoch": 1.04, "grad_norm": 2.2005608081817627, "learning_rate": 9.3260078906654e-09, "loss": 0.6463, "step": 15720 }, { "epoch": 1.04, "grad_norm": 2.0543875694274902, "learning_rate": 8.671949076420883e-09, "loss": 0.6582, "step": 15730 }, { "epoch": 1.04, "grad_norm": 1.907952070236206, "learning_rate": 8.041647213256066e-09, "loss": 0.6501, "step": 15740 }, { "epoch": 1.04, "grad_norm": 3.233694076538086, "learning_rate": 7.43510530118452e-09, "loss": 0.6657, "step": 15750 }, { "epoch": 1.04, "grad_norm": 2.057260036468506, "learning_rate": 6.852326227130835e-09, "loss": 0.6804, "step": 15760 }, { "epoch": 1.04, "grad_norm": 2.1894454956054688, "learning_rate": 6.293312764916182e-09, "loss": 0.6562, "step": 15770 }, { "epoch": 1.04, "grad_norm": 2.1258082389831543, "learning_rate": 5.758067575246662e-09, "loss": 0.6551, "step": 15780 }, { "epoch": 1.04, "grad_norm": 1.4697284698486328, "learning_rate": 5.246593205699424e-09, "loss": 0.6638, "step": 15790 }, { "epoch": 1.05, "grad_norm": 2.231762409210205, "learning_rate": 4.758892090711009e-09, "loss": 0.6746, "step": 15800 }, { "epoch": 1.05, "grad_norm": 2.007432222366333, "learning_rate": 4.294966551566249e-09, "loss": 0.6382, "step": 15810 }, { "epoch": 1.05, "grad_norm": 2.1840968132019043, "learning_rate": 3.854818796385495e-09, "loss": 0.6444, "step": 15820 }, { "epoch": 1.05, "grad_norm": 2.2293922901153564, "learning_rate": 3.4384509201157435e-09, "loss": 0.6611, "step": 15830 }, { "epoch": 1.05, "grad_norm": 1.7496803998947144, "learning_rate": 3.0458649045211897e-09, "loss": 0.6644, "step": 15840 }, { "epoch": 1.05, "grad_norm": 2.6910362243652344, "learning_rate": 2.6770626181715776e-09, "loss": 0.6383, "step": 15850 }, { "epoch": 1.05, "grad_norm": 2.11971378326416, "learning_rate": 2.3320458164355352e-09, "loss": 0.6382, "step": 15860 }, { "epoch": 1.05, "grad_norm": 1.7193676233291626, "learning_rate": 2.0108161414694736e-09, "loss": 0.6607, "step": 15870 }, { "epoch": 1.05, "grad_norm": 2.7598321437835693, "learning_rate": 1.7133751222137007e-09, "loss": 0.6503, "step": 15880 }, { "epoch": 1.05, "grad_norm": 2.2920305728912354, "learning_rate": 1.4397241743813185e-09, "loss": 0.6791, "step": 15890 }, { "epoch": 1.05, "grad_norm": 2.376326560974121, "learning_rate": 1.189864600454338e-09, "loss": 0.666, "step": 15900 }, { "epoch": 1.05, "grad_norm": 2.610793352127075, "learning_rate": 9.637975896759077e-10, "loss": 0.6424, "step": 15910 }, { "epoch": 1.05, "grad_norm": 2.694457530975342, "learning_rate": 7.615242180436521e-10, "loss": 0.6644, "step": 15920 }, { "epoch": 1.05, "grad_norm": 2.2358672618865967, "learning_rate": 5.830454483085612e-10, "loss": 0.6916, "step": 15930 }, { "epoch": 1.05, "grad_norm": 1.7769392728805542, "learning_rate": 4.283621299649987e-10, "loss": 0.6549, "step": 15940 }, { "epoch": 1.06, "grad_norm": 2.700413227081299, "learning_rate": 2.974749992512571e-10, "loss": 0.6605, "step": 15950 }, { "epoch": 1.06, "grad_norm": 2.442838191986084, "learning_rate": 1.903846791434516e-10, "loss": 0.6544, "step": 15960 }, { "epoch": 1.06, "grad_norm": 1.9941034317016602, "learning_rate": 1.0709167935385456e-10, "loss": 0.6723, "step": 15970 }, { "epoch": 1.06, "grad_norm": 1.7016099691390991, "learning_rate": 4.7596396327564966e-11, "loss": 0.6498, "step": 15980 }, { "epoch": 1.06, "grad_norm": 2.3243091106414795, "learning_rate": 1.189911324084303e-11, "loss": 0.6521, "step": 15990 }, { "epoch": 1.06, "grad_norm": 2.1339619159698486, "learning_rate": 0.0, "loss": 0.6481, "step": 16000 }, { "epoch": 1.06, "eval_loss": 0.7880622148513794, "eval_runtime": 133.9187, "eval_samples_per_second": 82.139, "eval_steps_per_second": 10.267, "step": 16000 } ], "logging_steps": 10, "max_steps": 16000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1, "total_flos": 3.248711155315009e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }