{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 5102, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6.812536716461182, "learning_rate": 9.70873786407767e-08, "loss": 1.4108, "step": 10 }, { "epoch": 0.0, "grad_norm": 6.775967597961426, "learning_rate": 1.941747572815534e-07, "loss": 1.4065, "step": 20 }, { "epoch": 0.01, "grad_norm": 6.67765998840332, "learning_rate": 2.9126213592233014e-07, "loss": 1.4145, "step": 30 }, { "epoch": 0.01, "grad_norm": 6.1705145835876465, "learning_rate": 3.883495145631068e-07, "loss": 1.3955, "step": 40 }, { "epoch": 0.01, "grad_norm": 5.644610404968262, "learning_rate": 4.854368932038835e-07, "loss": 1.3986, "step": 50 }, { "epoch": 0.01, "grad_norm": 6.067681789398193, "learning_rate": 5.825242718446603e-07, "loss": 1.3721, "step": 60 }, { "epoch": 0.01, "grad_norm": 5.449668884277344, "learning_rate": 6.79611650485437e-07, "loss": 1.3672, "step": 70 }, { "epoch": 0.02, "grad_norm": 4.1965107917785645, "learning_rate": 7.766990291262136e-07, "loss": 1.3229, "step": 80 }, { "epoch": 0.02, "grad_norm": 5.0556440353393555, "learning_rate": 8.737864077669904e-07, "loss": 1.2846, "step": 90 }, { "epoch": 0.02, "grad_norm": 5.58712911605835, "learning_rate": 9.70873786407767e-07, "loss": 1.2561, "step": 100 }, { "epoch": 0.02, "grad_norm": 4.737099647521973, "learning_rate": 1.0679611650485437e-06, "loss": 1.2511, "step": 110 }, { "epoch": 0.02, "grad_norm": 4.916423320770264, "learning_rate": 1.1650485436893206e-06, "loss": 1.2378, "step": 120 }, { "epoch": 0.03, "grad_norm": 5.898449420928955, "learning_rate": 1.2621359223300972e-06, "loss": 1.2263, "step": 130 }, { "epoch": 0.03, "grad_norm": 4.9001641273498535, "learning_rate": 1.359223300970874e-06, "loss": 1.228, "step": 140 }, { "epoch": 0.03, "grad_norm": 4.939080238342285, "learning_rate": 1.4563106796116506e-06, "loss": 1.2006, "step": 150 }, { "epoch": 0.03, "grad_norm": 4.023560047149658, "learning_rate": 1.5533980582524272e-06, "loss": 1.1934, "step": 160 }, { "epoch": 0.03, "grad_norm": 4.546477317810059, "learning_rate": 1.650485436893204e-06, "loss": 1.1905, "step": 170 }, { "epoch": 0.04, "grad_norm": 4.045153617858887, "learning_rate": 1.7475728155339808e-06, "loss": 1.1642, "step": 180 }, { "epoch": 0.04, "grad_norm": 4.5985870361328125, "learning_rate": 1.8446601941747574e-06, "loss": 1.1674, "step": 190 }, { "epoch": 0.04, "grad_norm": 4.283823490142822, "learning_rate": 1.941747572815534e-06, "loss": 1.157, "step": 200 }, { "epoch": 0.04, "grad_norm": 4.753105640411377, "learning_rate": 2.0388349514563107e-06, "loss": 1.164, "step": 210 }, { "epoch": 0.04, "grad_norm": 4.330531597137451, "learning_rate": 2.1359223300970874e-06, "loss": 1.169, "step": 220 }, { "epoch": 0.05, "grad_norm": 4.404067516326904, "learning_rate": 2.2330097087378645e-06, "loss": 1.1493, "step": 230 }, { "epoch": 0.05, "grad_norm": 4.143772602081299, "learning_rate": 2.330097087378641e-06, "loss": 1.1388, "step": 240 }, { "epoch": 0.05, "grad_norm": 3.965195417404175, "learning_rate": 2.427184466019418e-06, "loss": 1.147, "step": 250 }, { "epoch": 0.05, "grad_norm": 4.0882039070129395, "learning_rate": 2.5242718446601945e-06, "loss": 1.1487, "step": 260 }, { "epoch": 0.05, "grad_norm": 4.03162956237793, "learning_rate": 2.621359223300971e-06, "loss": 1.136, "step": 270 }, { "epoch": 0.05, "grad_norm": 4.2308855056762695, "learning_rate": 2.718446601941748e-06, "loss": 1.1378, "step": 280 }, { "epoch": 0.06, "grad_norm": 4.1606221199035645, "learning_rate": 2.8155339805825245e-06, "loss": 1.1471, "step": 290 }, { "epoch": 0.06, "grad_norm": 4.03283166885376, "learning_rate": 2.912621359223301e-06, "loss": 1.1185, "step": 300 }, { "epoch": 0.06, "grad_norm": 4.2199859619140625, "learning_rate": 3.0097087378640778e-06, "loss": 1.1432, "step": 310 }, { "epoch": 0.06, "grad_norm": 4.150729656219482, "learning_rate": 3.1067961165048544e-06, "loss": 1.1155, "step": 320 }, { "epoch": 0.06, "grad_norm": 3.9178309440612793, "learning_rate": 3.2038834951456315e-06, "loss": 1.1219, "step": 330 }, { "epoch": 0.07, "grad_norm": 4.3575663566589355, "learning_rate": 3.300970873786408e-06, "loss": 1.108, "step": 340 }, { "epoch": 0.07, "grad_norm": 4.251282215118408, "learning_rate": 3.398058252427185e-06, "loss": 1.1052, "step": 350 }, { "epoch": 0.07, "grad_norm": 4.297124862670898, "learning_rate": 3.4951456310679615e-06, "loss": 1.112, "step": 360 }, { "epoch": 0.07, "grad_norm": 4.4219465255737305, "learning_rate": 3.592233009708738e-06, "loss": 1.12, "step": 370 }, { "epoch": 0.07, "grad_norm": 4.0326948165893555, "learning_rate": 3.689320388349515e-06, "loss": 1.1286, "step": 380 }, { "epoch": 0.08, "grad_norm": 4.3179030418396, "learning_rate": 3.7864077669902915e-06, "loss": 1.1255, "step": 390 }, { "epoch": 0.08, "grad_norm": 4.20508337020874, "learning_rate": 3.883495145631068e-06, "loss": 1.1116, "step": 400 }, { "epoch": 0.08, "grad_norm": 4.361403942108154, "learning_rate": 3.980582524271845e-06, "loss": 1.1061, "step": 410 }, { "epoch": 0.08, "grad_norm": 4.279186248779297, "learning_rate": 4.0776699029126215e-06, "loss": 1.1031, "step": 420 }, { "epoch": 0.08, "grad_norm": 4.163991928100586, "learning_rate": 4.1747572815533986e-06, "loss": 1.1097, "step": 430 }, { "epoch": 0.09, "grad_norm": 4.068342685699463, "learning_rate": 4.271844660194175e-06, "loss": 1.1048, "step": 440 }, { "epoch": 0.09, "grad_norm": 4.233316421508789, "learning_rate": 4.368932038834952e-06, "loss": 1.11, "step": 450 }, { "epoch": 0.09, "grad_norm": 4.018556594848633, "learning_rate": 4.466019417475729e-06, "loss": 1.1, "step": 460 }, { "epoch": 0.09, "grad_norm": 4.257734298706055, "learning_rate": 4.563106796116505e-06, "loss": 1.1102, "step": 470 }, { "epoch": 0.09, "grad_norm": 4.20249605178833, "learning_rate": 4.660194174757282e-06, "loss": 1.0938, "step": 480 }, { "epoch": 0.1, "grad_norm": 4.196409225463867, "learning_rate": 4.7572815533980585e-06, "loss": 1.1153, "step": 490 }, { "epoch": 0.1, "grad_norm": 3.7823269367218018, "learning_rate": 4.854368932038836e-06, "loss": 1.105, "step": 500 }, { "epoch": 0.1, "grad_norm": 3.8932242393493652, "learning_rate": 4.951456310679612e-06, "loss": 1.0977, "step": 510 }, { "epoch": 0.1, "grad_norm": 4.169296741485596, "learning_rate": 5.048543689320389e-06, "loss": 1.1022, "step": 520 }, { "epoch": 0.1, "grad_norm": 4.110784530639648, "learning_rate": 5.145631067961165e-06, "loss": 1.0954, "step": 530 }, { "epoch": 0.11, "grad_norm": 4.476339340209961, "learning_rate": 5.242718446601942e-06, "loss": 1.104, "step": 540 }, { "epoch": 0.11, "grad_norm": 3.7383415699005127, "learning_rate": 5.3398058252427185e-06, "loss": 1.106, "step": 550 }, { "epoch": 0.11, "grad_norm": 4.040097236633301, "learning_rate": 5.436893203883496e-06, "loss": 1.1006, "step": 560 }, { "epoch": 0.11, "grad_norm": 4.430188179016113, "learning_rate": 5.533980582524272e-06, "loss": 1.1087, "step": 570 }, { "epoch": 0.11, "grad_norm": 4.724983215332031, "learning_rate": 5.631067961165049e-06, "loss": 1.1004, "step": 580 }, { "epoch": 0.12, "grad_norm": 4.040498733520508, "learning_rate": 5.728155339805825e-06, "loss": 1.0944, "step": 590 }, { "epoch": 0.12, "grad_norm": 4.504246234893799, "learning_rate": 5.825242718446602e-06, "loss": 1.0915, "step": 600 }, { "epoch": 0.12, "grad_norm": 4.28185510635376, "learning_rate": 5.9223300970873785e-06, "loss": 1.0951, "step": 610 }, { "epoch": 0.12, "grad_norm": 3.7988641262054443, "learning_rate": 6.0194174757281556e-06, "loss": 1.1052, "step": 620 }, { "epoch": 0.12, "grad_norm": 4.40338134765625, "learning_rate": 6.116504854368932e-06, "loss": 1.0931, "step": 630 }, { "epoch": 0.13, "grad_norm": 4.3256659507751465, "learning_rate": 6.213592233009709e-06, "loss": 1.0863, "step": 640 }, { "epoch": 0.13, "grad_norm": 4.146902561187744, "learning_rate": 6.310679611650487e-06, "loss": 1.0916, "step": 650 }, { "epoch": 0.13, "grad_norm": 4.871020317077637, "learning_rate": 6.407766990291263e-06, "loss": 1.0853, "step": 660 }, { "epoch": 0.13, "grad_norm": 3.6339454650878906, "learning_rate": 6.50485436893204e-06, "loss": 1.0907, "step": 670 }, { "epoch": 0.13, "grad_norm": 4.507898330688477, "learning_rate": 6.601941747572816e-06, "loss": 1.1049, "step": 680 }, { "epoch": 0.14, "grad_norm": 4.15292501449585, "learning_rate": 6.6990291262135935e-06, "loss": 1.0928, "step": 690 }, { "epoch": 0.14, "grad_norm": 4.081875324249268, "learning_rate": 6.79611650485437e-06, "loss": 1.1104, "step": 700 }, { "epoch": 0.14, "grad_norm": 3.9496753215789795, "learning_rate": 6.893203883495147e-06, "loss": 1.0916, "step": 710 }, { "epoch": 0.14, "grad_norm": 4.676784992218018, "learning_rate": 6.990291262135923e-06, "loss": 1.087, "step": 720 }, { "epoch": 0.14, "grad_norm": 3.889897584915161, "learning_rate": 7.0873786407767e-06, "loss": 1.0889, "step": 730 }, { "epoch": 0.15, "grad_norm": 3.654224157333374, "learning_rate": 7.184466019417476e-06, "loss": 1.0929, "step": 740 }, { "epoch": 0.15, "grad_norm": 5.457578182220459, "learning_rate": 7.2815533980582534e-06, "loss": 1.0855, "step": 750 }, { "epoch": 0.15, "grad_norm": 4.403715133666992, "learning_rate": 7.37864077669903e-06, "loss": 1.0793, "step": 760 }, { "epoch": 0.15, "grad_norm": 4.498962879180908, "learning_rate": 7.475728155339807e-06, "loss": 1.0946, "step": 770 }, { "epoch": 0.15, "grad_norm": 4.233016014099121, "learning_rate": 7.572815533980583e-06, "loss": 1.0753, "step": 780 }, { "epoch": 0.15, "grad_norm": 4.361231327056885, "learning_rate": 7.66990291262136e-06, "loss": 1.0865, "step": 790 }, { "epoch": 0.16, "grad_norm": 4.116058349609375, "learning_rate": 7.766990291262136e-06, "loss": 1.0874, "step": 800 }, { "epoch": 0.16, "grad_norm": 3.87738037109375, "learning_rate": 7.864077669902913e-06, "loss": 1.0897, "step": 810 }, { "epoch": 0.16, "grad_norm": 4.444067478179932, "learning_rate": 7.96116504854369e-06, "loss": 1.0926, "step": 820 }, { "epoch": 0.16, "grad_norm": 3.9490249156951904, "learning_rate": 8.058252427184466e-06, "loss": 1.0678, "step": 830 }, { "epoch": 0.16, "grad_norm": 5.674923896789551, "learning_rate": 8.155339805825243e-06, "loss": 1.0923, "step": 840 }, { "epoch": 0.17, "grad_norm": 4.707766056060791, "learning_rate": 8.25242718446602e-06, "loss": 1.0874, "step": 850 }, { "epoch": 0.17, "grad_norm": 4.693995475769043, "learning_rate": 8.349514563106797e-06, "loss": 1.0558, "step": 860 }, { "epoch": 0.17, "grad_norm": 4.561864376068115, "learning_rate": 8.446601941747573e-06, "loss": 1.0961, "step": 870 }, { "epoch": 0.17, "grad_norm": 4.7990922927856445, "learning_rate": 8.54368932038835e-06, "loss": 1.0877, "step": 880 }, { "epoch": 0.17, "grad_norm": 5.014951705932617, "learning_rate": 8.640776699029127e-06, "loss": 1.0866, "step": 890 }, { "epoch": 0.18, "grad_norm": 5.007348537445068, "learning_rate": 8.737864077669904e-06, "loss": 1.0795, "step": 900 }, { "epoch": 0.18, "grad_norm": 4.876419544219971, "learning_rate": 8.834951456310681e-06, "loss": 1.0826, "step": 910 }, { "epoch": 0.18, "grad_norm": 5.8001556396484375, "learning_rate": 8.932038834951458e-06, "loss": 1.0828, "step": 920 }, { "epoch": 0.18, "grad_norm": 4.761294364929199, "learning_rate": 9.029126213592233e-06, "loss": 1.0897, "step": 930 }, { "epoch": 0.18, "grad_norm": 4.832348346710205, "learning_rate": 9.12621359223301e-06, "loss": 1.0826, "step": 940 }, { "epoch": 0.19, "grad_norm": 5.325027942657471, "learning_rate": 9.223300970873788e-06, "loss": 1.0705, "step": 950 }, { "epoch": 0.19, "grad_norm": 3.859924554824829, "learning_rate": 9.320388349514565e-06, "loss": 1.0691, "step": 960 }, { "epoch": 0.19, "grad_norm": 3.8470637798309326, "learning_rate": 9.41747572815534e-06, "loss": 1.0808, "step": 970 }, { "epoch": 0.19, "grad_norm": 4.887052536010742, "learning_rate": 9.514563106796117e-06, "loss": 1.0688, "step": 980 }, { "epoch": 0.19, "grad_norm": 3.891918897628784, "learning_rate": 9.611650485436894e-06, "loss": 1.0748, "step": 990 }, { "epoch": 0.2, "grad_norm": 4.252170562744141, "learning_rate": 9.708737864077671e-06, "loss": 1.0889, "step": 1000 }, { "epoch": 0.2, "eval_loss": 1.0763322114944458, "eval_runtime": 12.4536, "eval_samples_per_second": 52.515, "eval_steps_per_second": 6.584, "step": 1000 }, { "epoch": 0.2, "grad_norm": 4.76177453994751, "learning_rate": 9.805825242718447e-06, "loss": 1.0925, "step": 1010 }, { "epoch": 0.2, "grad_norm": 5.820517063140869, "learning_rate": 9.902912621359224e-06, "loss": 1.0879, "step": 1020 }, { "epoch": 0.2, "grad_norm": 4.241539478302002, "learning_rate": 1e-05, "loss": 1.0688, "step": 1030 }, { "epoch": 0.2, "grad_norm": 4.468598365783691, "learning_rate": 9.999971286914108e-06, "loss": 1.0782, "step": 1040 }, { "epoch": 0.21, "grad_norm": 4.580018043518066, "learning_rate": 9.999885147986207e-06, "loss": 1.0793, "step": 1050 }, { "epoch": 0.21, "grad_norm": 5.351871490478516, "learning_rate": 9.999741584205621e-06, "loss": 1.0746, "step": 1060 }, { "epoch": 0.21, "grad_norm": 4.939090251922607, "learning_rate": 9.999540597221217e-06, "loss": 1.0814, "step": 1070 }, { "epoch": 0.21, "grad_norm": 4.497158050537109, "learning_rate": 9.999282189341374e-06, "loss": 1.076, "step": 1080 }, { "epoch": 0.21, "grad_norm": 3.999101161956787, "learning_rate": 9.998966363533972e-06, "loss": 1.0826, "step": 1090 }, { "epoch": 0.22, "grad_norm": 5.041301727294922, "learning_rate": 9.99859312342634e-06, "loss": 1.0675, "step": 1100 }, { "epoch": 0.22, "grad_norm": 4.890156269073486, "learning_rate": 9.998162473305229e-06, "loss": 1.0724, "step": 1110 }, { "epoch": 0.22, "grad_norm": 6.730093955993652, "learning_rate": 9.997674418116759e-06, "loss": 1.0679, "step": 1120 }, { "epoch": 0.22, "grad_norm": 5.929831504821777, "learning_rate": 9.997128963466355e-06, "loss": 1.064, "step": 1130 }, { "epoch": 0.22, "grad_norm": 6.288021087646484, "learning_rate": 9.996526115618694e-06, "loss": 1.0895, "step": 1140 }, { "epoch": 0.23, "grad_norm": 4.647055149078369, "learning_rate": 9.995865881497621e-06, "loss": 1.0768, "step": 1150 }, { "epoch": 0.23, "grad_norm": 5.861464977264404, "learning_rate": 9.995148268686086e-06, "loss": 1.073, "step": 1160 }, { "epoch": 0.23, "grad_norm": 5.072396278381348, "learning_rate": 9.994373285426034e-06, "loss": 1.0679, "step": 1170 }, { "epoch": 0.23, "grad_norm": 4.59302282333374, "learning_rate": 9.993540940618334e-06, "loss": 1.0999, "step": 1180 }, { "epoch": 0.23, "grad_norm": 4.47246789932251, "learning_rate": 9.992651243822658e-06, "loss": 1.0797, "step": 1190 }, { "epoch": 0.24, "grad_norm": 3.917146682739258, "learning_rate": 9.991704205257383e-06, "loss": 1.0786, "step": 1200 }, { "epoch": 0.24, "grad_norm": 5.097021102905273, "learning_rate": 9.99069983579947e-06, "loss": 1.0709, "step": 1210 }, { "epoch": 0.24, "grad_norm": 5.385958671569824, "learning_rate": 9.989638146984337e-06, "loss": 1.0878, "step": 1220 }, { "epoch": 0.24, "grad_norm": 4.534526824951172, "learning_rate": 9.988519151005728e-06, "loss": 1.0753, "step": 1230 }, { "epoch": 0.24, "grad_norm": 5.479306697845459, "learning_rate": 9.987342860715575e-06, "loss": 1.0638, "step": 1240 }, { "epoch": 0.25, "grad_norm": 4.557966232299805, "learning_rate": 9.986109289623848e-06, "loss": 1.0841, "step": 1250 }, { "epoch": 0.25, "grad_norm": 4.882152080535889, "learning_rate": 9.984818451898399e-06, "loss": 1.0678, "step": 1260 }, { "epoch": 0.25, "grad_norm": 5.506185054779053, "learning_rate": 9.983470362364803e-06, "loss": 1.0766, "step": 1270 }, { "epoch": 0.25, "grad_norm": 4.027829170227051, "learning_rate": 9.982065036506183e-06, "loss": 1.0825, "step": 1280 }, { "epoch": 0.25, "grad_norm": 5.385190010070801, "learning_rate": 9.980602490463037e-06, "loss": 1.0709, "step": 1290 }, { "epoch": 0.25, "grad_norm": 6.701911449432373, "learning_rate": 9.979082741033047e-06, "loss": 1.0813, "step": 1300 }, { "epoch": 0.26, "grad_norm": 4.571600437164307, "learning_rate": 9.977505805670895e-06, "loss": 1.0678, "step": 1310 }, { "epoch": 0.26, "grad_norm": 6.041151523590088, "learning_rate": 9.97587170248805e-06, "loss": 1.08, "step": 1320 }, { "epoch": 0.26, "grad_norm": 5.1545729637146, "learning_rate": 9.97418045025257e-06, "loss": 1.071, "step": 1330 }, { "epoch": 0.26, "grad_norm": 4.163790702819824, "learning_rate": 9.972432068388885e-06, "loss": 1.068, "step": 1340 }, { "epoch": 0.26, "grad_norm": 5.824156284332275, "learning_rate": 9.97062657697757e-06, "loss": 1.0718, "step": 1350 }, { "epoch": 0.27, "grad_norm": 6.520030975341797, "learning_rate": 9.968763996755115e-06, "loss": 1.0779, "step": 1360 }, { "epoch": 0.27, "grad_norm": 4.418417453765869, "learning_rate": 9.966844349113695e-06, "loss": 1.0677, "step": 1370 }, { "epoch": 0.27, "grad_norm": 4.918398380279541, "learning_rate": 9.96486765610091e-06, "loss": 1.0942, "step": 1380 }, { "epoch": 0.27, "grad_norm": 5.05225944519043, "learning_rate": 9.96283394041954e-06, "loss": 1.0852, "step": 1390 }, { "epoch": 0.27, "grad_norm": 4.492018699645996, "learning_rate": 9.96074322542729e-06, "loss": 1.0728, "step": 1400 }, { "epoch": 0.28, "grad_norm": 4.9885711669921875, "learning_rate": 9.958595535136511e-06, "loss": 1.0618, "step": 1410 }, { "epoch": 0.28, "grad_norm": 5.547979831695557, "learning_rate": 9.95639089421393e-06, "loss": 1.0753, "step": 1420 }, { "epoch": 0.28, "grad_norm": 4.2231621742248535, "learning_rate": 9.954129327980362e-06, "loss": 1.0573, "step": 1430 }, { "epoch": 0.28, "grad_norm": 4.681105613708496, "learning_rate": 9.951810862410426e-06, "loss": 1.0833, "step": 1440 }, { "epoch": 0.28, "grad_norm": 5.339748382568359, "learning_rate": 9.949435524132245e-06, "loss": 1.0712, "step": 1450 }, { "epoch": 0.29, "grad_norm": 4.844823837280273, "learning_rate": 9.947003340427134e-06, "loss": 1.0803, "step": 1460 }, { "epoch": 0.29, "grad_norm": 3.774728536605835, "learning_rate": 9.944514339229292e-06, "loss": 1.0787, "step": 1470 }, { "epoch": 0.29, "grad_norm": 4.437483310699463, "learning_rate": 9.941968549125481e-06, "loss": 1.0757, "step": 1480 }, { "epoch": 0.29, "grad_norm": 5.675827503204346, "learning_rate": 9.9393659993547e-06, "loss": 1.0635, "step": 1490 }, { "epoch": 0.29, "grad_norm": 5.193900108337402, "learning_rate": 9.936706719807839e-06, "loss": 1.0527, "step": 1500 }, { "epoch": 0.3, "grad_norm": 6.871400833129883, "learning_rate": 9.93399074102735e-06, "loss": 1.0683, "step": 1510 }, { "epoch": 0.3, "grad_norm": 6.098533630371094, "learning_rate": 9.931218094206882e-06, "loss": 1.0704, "step": 1520 }, { "epoch": 0.3, "grad_norm": 7.03292989730835, "learning_rate": 9.928388811190938e-06, "loss": 1.0839, "step": 1530 }, { "epoch": 0.3, "grad_norm": 4.196811199188232, "learning_rate": 9.925502924474495e-06, "loss": 1.0749, "step": 1540 }, { "epoch": 0.3, "grad_norm": 5.939242362976074, "learning_rate": 9.922560467202638e-06, "loss": 1.0761, "step": 1550 }, { "epoch": 0.31, "grad_norm": 5.359401702880859, "learning_rate": 9.919561473170178e-06, "loss": 1.0625, "step": 1560 }, { "epoch": 0.31, "grad_norm": 5.799266338348389, "learning_rate": 9.916505976821262e-06, "loss": 1.0691, "step": 1570 }, { "epoch": 0.31, "grad_norm": 4.341587543487549, "learning_rate": 9.913394013248987e-06, "loss": 1.0737, "step": 1580 }, { "epoch": 0.31, "grad_norm": 5.106640815734863, "learning_rate": 9.91022561819498e-06, "loss": 1.0805, "step": 1590 }, { "epoch": 0.31, "grad_norm": 4.987344741821289, "learning_rate": 9.907000828049001e-06, "loss": 1.0569, "step": 1600 }, { "epoch": 0.32, "grad_norm": 5.510980606079102, "learning_rate": 9.903719679848522e-06, "loss": 1.078, "step": 1610 }, { "epoch": 0.32, "grad_norm": 4.564126968383789, "learning_rate": 9.9003822112783e-06, "loss": 1.0726, "step": 1620 }, { "epoch": 0.32, "grad_norm": 4.666530132293701, "learning_rate": 9.89698846066994e-06, "loss": 1.0635, "step": 1630 }, { "epoch": 0.32, "grad_norm": 4.472270488739014, "learning_rate": 9.893538467001466e-06, "loss": 1.0529, "step": 1640 }, { "epoch": 0.32, "grad_norm": 4.534265995025635, "learning_rate": 9.890032269896862e-06, "loss": 1.08, "step": 1650 }, { "epoch": 0.33, "grad_norm": 4.677114963531494, "learning_rate": 9.886469909625624e-06, "loss": 1.0719, "step": 1660 }, { "epoch": 0.33, "grad_norm": 4.84233283996582, "learning_rate": 9.882851427102299e-06, "loss": 1.0665, "step": 1670 }, { "epoch": 0.33, "grad_norm": 4.839110374450684, "learning_rate": 9.879176863885997e-06, "loss": 1.0635, "step": 1680 }, { "epoch": 0.33, "grad_norm": 4.322112560272217, "learning_rate": 9.875446262179948e-06, "loss": 1.0755, "step": 1690 }, { "epoch": 0.33, "grad_norm": 4.2779130935668945, "learning_rate": 9.87165966483098e-06, "loss": 1.0745, "step": 1700 }, { "epoch": 0.34, "grad_norm": 5.142566204071045, "learning_rate": 9.867817115329055e-06, "loss": 1.0725, "step": 1710 }, { "epoch": 0.34, "grad_norm": 4.487438678741455, "learning_rate": 9.863918657806752e-06, "loss": 1.0538, "step": 1720 }, { "epoch": 0.34, "grad_norm": 4.5258588790893555, "learning_rate": 9.85996433703877e-06, "loss": 1.0559, "step": 1730 }, { "epoch": 0.34, "grad_norm": 4.927529811859131, "learning_rate": 9.855954198441411e-06, "loss": 1.0661, "step": 1740 }, { "epoch": 0.34, "grad_norm": 3.9455795288085938, "learning_rate": 9.851888288072053e-06, "loss": 1.0769, "step": 1750 }, { "epoch": 0.34, "grad_norm": 4.094160556793213, "learning_rate": 9.847766652628635e-06, "loss": 1.0767, "step": 1760 }, { "epoch": 0.35, "grad_norm": 4.2093424797058105, "learning_rate": 9.843589339449102e-06, "loss": 1.0635, "step": 1770 }, { "epoch": 0.35, "grad_norm": 4.292007923126221, "learning_rate": 9.839356396510875e-06, "loss": 1.0593, "step": 1780 }, { "epoch": 0.35, "grad_norm": 4.241662502288818, "learning_rate": 9.835067872430297e-06, "loss": 1.0627, "step": 1790 }, { "epoch": 0.35, "grad_norm": 3.465583324432373, "learning_rate": 9.830723816462071e-06, "loss": 1.0551, "step": 1800 }, { "epoch": 0.35, "grad_norm": 4.395579814910889, "learning_rate": 9.8263242784987e-06, "loss": 1.0691, "step": 1810 }, { "epoch": 0.36, "grad_norm": 4.91081428527832, "learning_rate": 9.821869309069907e-06, "loss": 1.0632, "step": 1820 }, { "epoch": 0.36, "grad_norm": 3.9647531509399414, "learning_rate": 9.817358959342057e-06, "loss": 1.0693, "step": 1830 }, { "epoch": 0.36, "grad_norm": 4.396968364715576, "learning_rate": 9.81279328111758e-06, "loss": 1.0869, "step": 1840 }, { "epoch": 0.36, "grad_norm": 4.095502853393555, "learning_rate": 9.808172326834356e-06, "loss": 1.0636, "step": 1850 }, { "epoch": 0.36, "grad_norm": 4.527934551239014, "learning_rate": 9.80349614956513e-06, "loss": 1.0679, "step": 1860 }, { "epoch": 0.37, "grad_norm": 5.237788200378418, "learning_rate": 9.798764803016892e-06, "loss": 1.05, "step": 1870 }, { "epoch": 0.37, "grad_norm": 4.45642614364624, "learning_rate": 9.793978341530265e-06, "loss": 1.0697, "step": 1880 }, { "epoch": 0.37, "grad_norm": 3.550489664077759, "learning_rate": 9.789136820078884e-06, "loss": 1.079, "step": 1890 }, { "epoch": 0.37, "grad_norm": 3.879551887512207, "learning_rate": 9.784240294268756e-06, "loss": 1.0649, "step": 1900 }, { "epoch": 0.37, "grad_norm": 4.333283424377441, "learning_rate": 9.779288820337628e-06, "loss": 1.0668, "step": 1910 }, { "epoch": 0.38, "grad_norm": 3.761474370956421, "learning_rate": 9.774282455154338e-06, "loss": 1.0738, "step": 1920 }, { "epoch": 0.38, "grad_norm": 3.7375967502593994, "learning_rate": 9.769221256218165e-06, "loss": 1.0574, "step": 1930 }, { "epoch": 0.38, "grad_norm": 3.852424144744873, "learning_rate": 9.764105281658161e-06, "loss": 1.0536, "step": 1940 }, { "epoch": 0.38, "grad_norm": 4.345045566558838, "learning_rate": 9.758934590232495e-06, "loss": 1.0898, "step": 1950 }, { "epoch": 0.38, "grad_norm": 4.355409622192383, "learning_rate": 9.753709241327773e-06, "loss": 1.0657, "step": 1960 }, { "epoch": 0.39, "grad_norm": 4.160512924194336, "learning_rate": 9.748429294958345e-06, "loss": 1.0699, "step": 1970 }, { "epoch": 0.39, "grad_norm": 4.259660243988037, "learning_rate": 9.74309481176564e-06, "loss": 1.0703, "step": 1980 }, { "epoch": 0.39, "grad_norm": 4.260087013244629, "learning_rate": 9.737705853017442e-06, "loss": 1.0816, "step": 1990 }, { "epoch": 0.39, "grad_norm": 4.026108264923096, "learning_rate": 9.732262480607207e-06, "loss": 1.0556, "step": 2000 }, { "epoch": 0.39, "eval_loss": 1.0583994388580322, "eval_runtime": 12.5254, "eval_samples_per_second": 52.214, "eval_steps_per_second": 6.547, "step": 2000 }, { "epoch": 0.39, "grad_norm": 4.157490253448486, "learning_rate": 9.726764757053343e-06, "loss": 1.0735, "step": 2010 }, { "epoch": 0.4, "grad_norm": 4.293674468994141, "learning_rate": 9.721212745498493e-06, "loss": 1.0697, "step": 2020 }, { "epoch": 0.4, "grad_norm": 3.8046298027038574, "learning_rate": 9.715606509708812e-06, "loss": 1.0635, "step": 2030 }, { "epoch": 0.4, "grad_norm": 3.55698561668396, "learning_rate": 9.709946114073231e-06, "loss": 1.0685, "step": 2040 }, { "epoch": 0.4, "grad_norm": 3.9492263793945312, "learning_rate": 9.704231623602721e-06, "loss": 1.0692, "step": 2050 }, { "epoch": 0.4, "grad_norm": 2.995659351348877, "learning_rate": 9.698463103929542e-06, "loss": 1.069, "step": 2060 }, { "epoch": 0.41, "grad_norm": 4.452322483062744, "learning_rate": 9.692640621306497e-06, "loss": 1.0728, "step": 2070 }, { "epoch": 0.41, "grad_norm": 4.006470680236816, "learning_rate": 9.686764242606164e-06, "loss": 1.0616, "step": 2080 }, { "epoch": 0.41, "grad_norm": 4.041233539581299, "learning_rate": 9.680834035320127e-06, "loss": 1.0712, "step": 2090 }, { "epoch": 0.41, "grad_norm": 3.9690816402435303, "learning_rate": 9.674850067558209e-06, "loss": 1.0682, "step": 2100 }, { "epoch": 0.41, "grad_norm": 4.229337215423584, "learning_rate": 9.66881240804768e-06, "loss": 1.0734, "step": 2110 }, { "epoch": 0.42, "grad_norm": 3.3140339851379395, "learning_rate": 9.662721126132473e-06, "loss": 1.0665, "step": 2120 }, { "epoch": 0.42, "grad_norm": 4.23881721496582, "learning_rate": 9.656576291772392e-06, "loss": 1.0535, "step": 2130 }, { "epoch": 0.42, "grad_norm": 3.9377248287200928, "learning_rate": 9.650377975542298e-06, "loss": 1.068, "step": 2140 }, { "epoch": 0.42, "grad_norm": 3.9194915294647217, "learning_rate": 9.644126248631306e-06, "loss": 1.0803, "step": 2150 }, { "epoch": 0.42, "grad_norm": 3.5396149158477783, "learning_rate": 9.637821182841965e-06, "loss": 1.0574, "step": 2160 }, { "epoch": 0.43, "grad_norm": 4.14241361618042, "learning_rate": 9.631462850589432e-06, "loss": 1.0643, "step": 2170 }, { "epoch": 0.43, "grad_norm": 3.7068986892700195, "learning_rate": 9.625051324900645e-06, "loss": 1.0519, "step": 2180 }, { "epoch": 0.43, "grad_norm": 3.9636712074279785, "learning_rate": 9.618586679413477e-06, "loss": 1.054, "step": 2190 }, { "epoch": 0.43, "grad_norm": 3.6051032543182373, "learning_rate": 9.612068988375898e-06, "loss": 1.0534, "step": 2200 }, { "epoch": 0.43, "grad_norm": 3.969806671142578, "learning_rate": 9.605498326645115e-06, "loss": 1.0851, "step": 2210 }, { "epoch": 0.44, "grad_norm": 4.08690071105957, "learning_rate": 9.598874769686721e-06, "loss": 1.0645, "step": 2220 }, { "epoch": 0.44, "grad_norm": 3.7391505241394043, "learning_rate": 9.592198393573816e-06, "loss": 1.0693, "step": 2230 }, { "epoch": 0.44, "grad_norm": 4.053625583648682, "learning_rate": 9.585469274986148e-06, "loss": 1.0682, "step": 2240 }, { "epoch": 0.44, "grad_norm": 3.923471212387085, "learning_rate": 9.578687491209219e-06, "loss": 1.0606, "step": 2250 }, { "epoch": 0.44, "grad_norm": 4.096630573272705, "learning_rate": 9.571853120133406e-06, "loss": 1.064, "step": 2260 }, { "epoch": 0.44, "grad_norm": 3.9412453174591064, "learning_rate": 9.564966240253062e-06, "loss": 1.0786, "step": 2270 }, { "epoch": 0.45, "grad_norm": 3.6600263118743896, "learning_rate": 9.558026930665614e-06, "loss": 1.0622, "step": 2280 }, { "epoch": 0.45, "grad_norm": 4.281722545623779, "learning_rate": 9.551035271070665e-06, "loss": 1.0542, "step": 2290 }, { "epoch": 0.45, "grad_norm": 3.6689794063568115, "learning_rate": 9.543991341769057e-06, "loss": 1.0496, "step": 2300 }, { "epoch": 0.45, "grad_norm": 4.148682594299316, "learning_rate": 9.536895223661975e-06, "loss": 1.0648, "step": 2310 }, { "epoch": 0.45, "grad_norm": 4.112921237945557, "learning_rate": 9.529746998249994e-06, "loss": 1.0632, "step": 2320 }, { "epoch": 0.46, "grad_norm": 4.144115447998047, "learning_rate": 9.52254674763216e-06, "loss": 1.0555, "step": 2330 }, { "epoch": 0.46, "grad_norm": 4.198709011077881, "learning_rate": 9.515294554505039e-06, "loss": 1.049, "step": 2340 }, { "epoch": 0.46, "grad_norm": 3.7727572917938232, "learning_rate": 9.507990502161769e-06, "loss": 1.0428, "step": 2350 }, { "epoch": 0.46, "grad_norm": 4.66112756729126, "learning_rate": 9.500634674491099e-06, "loss": 1.0666, "step": 2360 }, { "epoch": 0.46, "grad_norm": 4.313396453857422, "learning_rate": 9.49322715597644e-06, "loss": 1.0658, "step": 2370 }, { "epoch": 0.47, "grad_norm": 3.8979835510253906, "learning_rate": 9.485768031694872e-06, "loss": 1.0516, "step": 2380 }, { "epoch": 0.47, "grad_norm": 4.047280788421631, "learning_rate": 9.478257387316189e-06, "loss": 1.0708, "step": 2390 }, { "epoch": 0.47, "grad_norm": 3.9112637042999268, "learning_rate": 9.470695309101903e-06, "loss": 1.0576, "step": 2400 }, { "epoch": 0.47, "grad_norm": 3.9057672023773193, "learning_rate": 9.463081883904251e-06, "loss": 1.0653, "step": 2410 }, { "epoch": 0.47, "grad_norm": 4.1275105476379395, "learning_rate": 9.455417199165209e-06, "loss": 1.0454, "step": 2420 }, { "epoch": 0.48, "grad_norm": 3.834106922149658, "learning_rate": 9.447701342915473e-06, "loss": 1.0593, "step": 2430 }, { "epoch": 0.48, "grad_norm": 3.673773765563965, "learning_rate": 9.439934403773468e-06, "loss": 1.0543, "step": 2440 }, { "epoch": 0.48, "grad_norm": 4.011989593505859, "learning_rate": 9.4321164709443e-06, "loss": 1.0468, "step": 2450 }, { "epoch": 0.48, "grad_norm": 4.221887588500977, "learning_rate": 9.42424763421877e-06, "loss": 1.0699, "step": 2460 }, { "epoch": 0.48, "grad_norm": 3.603546142578125, "learning_rate": 9.416327983972304e-06, "loss": 1.0525, "step": 2470 }, { "epoch": 0.49, "grad_norm": 4.173734188079834, "learning_rate": 9.408357611163945e-06, "loss": 1.0678, "step": 2480 }, { "epoch": 0.49, "grad_norm": 3.5241010189056396, "learning_rate": 9.400336607335294e-06, "loss": 1.0536, "step": 2490 }, { "epoch": 0.49, "grad_norm": 3.8831429481506348, "learning_rate": 9.392265064609455e-06, "loss": 1.0367, "step": 2500 }, { "epoch": 0.49, "grad_norm": 3.8573215007781982, "learning_rate": 9.384143075689992e-06, "loss": 1.0474, "step": 2510 }, { "epoch": 0.49, "grad_norm": 3.8050475120544434, "learning_rate": 9.375970733859848e-06, "loss": 1.0508, "step": 2520 }, { "epoch": 0.5, "grad_norm": 4.164061546325684, "learning_rate": 9.367748132980286e-06, "loss": 1.0629, "step": 2530 }, { "epoch": 0.5, "grad_norm": 3.8858516216278076, "learning_rate": 9.359475367489805e-06, "loss": 1.0616, "step": 2540 }, { "epoch": 0.5, "grad_norm": 4.033576965332031, "learning_rate": 9.351152532403054e-06, "loss": 1.0687, "step": 2550 }, { "epoch": 0.5, "grad_norm": 4.06900691986084, "learning_rate": 9.342779723309746e-06, "loss": 1.0519, "step": 2560 }, { "epoch": 0.5, "grad_norm": 3.8570592403411865, "learning_rate": 9.334357036373552e-06, "loss": 1.0482, "step": 2570 }, { "epoch": 0.51, "grad_norm": 4.30549430847168, "learning_rate": 9.32588456833101e-06, "loss": 1.0714, "step": 2580 }, { "epoch": 0.51, "grad_norm": 3.6463634967803955, "learning_rate": 9.317362416490396e-06, "loss": 1.055, "step": 2590 }, { "epoch": 0.51, "grad_norm": 4.366295337677002, "learning_rate": 9.308790678730627e-06, "loss": 1.0502, "step": 2600 }, { "epoch": 0.51, "grad_norm": 3.8573434352874756, "learning_rate": 9.300169453500117e-06, "loss": 1.0597, "step": 2610 }, { "epoch": 0.51, "grad_norm": 3.8393239974975586, "learning_rate": 9.291498839815658e-06, "loss": 1.0553, "step": 2620 }, { "epoch": 0.52, "grad_norm": 4.246349334716797, "learning_rate": 9.282778937261279e-06, "loss": 1.0734, "step": 2630 }, { "epoch": 0.52, "grad_norm": 4.076491355895996, "learning_rate": 9.274009845987106e-06, "loss": 1.0643, "step": 2640 }, { "epoch": 0.52, "grad_norm": 3.7121007442474365, "learning_rate": 9.26519166670821e-06, "loss": 1.0491, "step": 2650 }, { "epoch": 0.52, "grad_norm": 3.8223936557769775, "learning_rate": 9.256324500703439e-06, "loss": 1.0713, "step": 2660 }, { "epoch": 0.52, "grad_norm": 3.4132473468780518, "learning_rate": 9.247408449814281e-06, "loss": 1.0541, "step": 2670 }, { "epoch": 0.53, "grad_norm": 3.846742868423462, "learning_rate": 9.238443616443666e-06, "loss": 1.0573, "step": 2680 }, { "epoch": 0.53, "grad_norm": 4.1583123207092285, "learning_rate": 9.229430103554808e-06, "loss": 1.038, "step": 2690 }, { "epoch": 0.53, "grad_norm": 4.192993640899658, "learning_rate": 9.22036801467001e-06, "loss": 1.0645, "step": 2700 }, { "epoch": 0.53, "grad_norm": 4.137371063232422, "learning_rate": 9.211257453869495e-06, "loss": 1.058, "step": 2710 }, { "epoch": 0.53, "grad_norm": 4.091256618499756, "learning_rate": 9.202098525790182e-06, "loss": 1.0702, "step": 2720 }, { "epoch": 0.54, "grad_norm": 3.9540719985961914, "learning_rate": 9.192891335624508e-06, "loss": 1.0406, "step": 2730 }, { "epoch": 0.54, "grad_norm": 4.149806022644043, "learning_rate": 9.183635989119211e-06, "loss": 1.0558, "step": 2740 }, { "epoch": 0.54, "grad_norm": 3.950911045074463, "learning_rate": 9.174332592574115e-06, "loss": 1.0446, "step": 2750 }, { "epoch": 0.54, "grad_norm": 3.7937111854553223, "learning_rate": 9.164981252840908e-06, "loss": 1.0608, "step": 2760 }, { "epoch": 0.54, "grad_norm": 4.06609582901001, "learning_rate": 9.155582077321918e-06, "loss": 1.0653, "step": 2770 }, { "epoch": 0.54, "grad_norm": 4.201600074768066, "learning_rate": 9.146135173968881e-06, "loss": 1.0651, "step": 2780 }, { "epoch": 0.55, "grad_norm": 3.8813118934631348, "learning_rate": 9.136640651281694e-06, "loss": 1.0567, "step": 2790 }, { "epoch": 0.55, "grad_norm": 3.4754064083099365, "learning_rate": 9.127098618307177e-06, "loss": 1.0632, "step": 2800 }, { "epoch": 0.55, "grad_norm": 3.5580170154571533, "learning_rate": 9.117509184637814e-06, "loss": 1.057, "step": 2810 }, { "epoch": 0.55, "grad_norm": 3.7533814907073975, "learning_rate": 9.107872460410496e-06, "loss": 1.0398, "step": 2820 }, { "epoch": 0.55, "grad_norm": 4.086584091186523, "learning_rate": 9.098188556305262e-06, "loss": 1.0633, "step": 2830 }, { "epoch": 0.56, "grad_norm": 4.2910237312316895, "learning_rate": 9.088457583544022e-06, "loss": 1.0334, "step": 2840 }, { "epoch": 0.56, "grad_norm": 4.132745742797852, "learning_rate": 9.078679653889273e-06, "loss": 1.0595, "step": 2850 }, { "epoch": 0.56, "grad_norm": 3.9243948459625244, "learning_rate": 9.068854879642833e-06, "loss": 1.0641, "step": 2860 }, { "epoch": 0.56, "grad_norm": 3.943058490753174, "learning_rate": 9.058983373644532e-06, "loss": 1.0493, "step": 2870 }, { "epoch": 0.56, "grad_norm": 3.724886417388916, "learning_rate": 9.049065249270936e-06, "loss": 1.0374, "step": 2880 }, { "epoch": 0.57, "grad_norm": 3.8636670112609863, "learning_rate": 9.039100620434025e-06, "loss": 1.0634, "step": 2890 }, { "epoch": 0.57, "grad_norm": 3.841193675994873, "learning_rate": 9.029089601579895e-06, "loss": 1.0433, "step": 2900 }, { "epoch": 0.57, "grad_norm": 3.6147212982177734, "learning_rate": 9.019032307687446e-06, "loss": 1.0416, "step": 2910 }, { "epoch": 0.57, "grad_norm": 3.8550570011138916, "learning_rate": 9.008928854267054e-06, "loss": 1.064, "step": 2920 }, { "epoch": 0.57, "grad_norm": 3.5188698768615723, "learning_rate": 8.99877935735925e-06, "loss": 1.0472, "step": 2930 }, { "epoch": 0.58, "grad_norm": 4.188703536987305, "learning_rate": 8.988583933533384e-06, "loss": 1.0688, "step": 2940 }, { "epoch": 0.58, "grad_norm": 3.8695075511932373, "learning_rate": 8.978342699886289e-06, "loss": 1.0391, "step": 2950 }, { "epoch": 0.58, "grad_norm": 3.8634023666381836, "learning_rate": 8.968055774040932e-06, "loss": 1.0422, "step": 2960 }, { "epoch": 0.58, "grad_norm": 4.071281909942627, "learning_rate": 8.95772327414507e-06, "loss": 1.0442, "step": 2970 }, { "epoch": 0.58, "grad_norm": 4.14091157913208, "learning_rate": 8.947345318869883e-06, "loss": 1.0541, "step": 2980 }, { "epoch": 0.59, "grad_norm": 4.287483215332031, "learning_rate": 8.936922027408618e-06, "loss": 1.0391, "step": 2990 }, { "epoch": 0.59, "grad_norm": 4.207295894622803, "learning_rate": 8.926453519475225e-06, "loss": 1.0455, "step": 3000 }, { "epoch": 0.59, "eval_loss": 1.046447992324829, "eval_runtime": 12.4637, "eval_samples_per_second": 52.472, "eval_steps_per_second": 6.579, "step": 3000 }, { "epoch": 0.59, "grad_norm": 4.189324855804443, "learning_rate": 8.91593991530297e-06, "loss": 1.0559, "step": 3010 }, { "epoch": 0.59, "grad_norm": 4.109393119812012, "learning_rate": 8.905381335643056e-06, "loss": 1.0524, "step": 3020 }, { "epoch": 0.59, "grad_norm": 3.9047772884368896, "learning_rate": 8.89477790176325e-06, "loss": 1.059, "step": 3030 }, { "epoch": 0.6, "grad_norm": 4.3157196044921875, "learning_rate": 8.884129735446471e-06, "loss": 1.0494, "step": 3040 }, { "epoch": 0.6, "grad_norm": 4.147355079650879, "learning_rate": 8.873436958989409e-06, "loss": 1.0517, "step": 3050 }, { "epoch": 0.6, "grad_norm": 4.453030586242676, "learning_rate": 8.862699695201107e-06, "loss": 1.0538, "step": 3060 }, { "epoch": 0.6, "grad_norm": 4.24041223526001, "learning_rate": 8.851918067401552e-06, "loss": 1.0425, "step": 3070 }, { "epoch": 0.6, "grad_norm": 3.6763038635253906, "learning_rate": 8.84109219942027e-06, "loss": 1.0558, "step": 3080 }, { "epoch": 0.61, "grad_norm": 3.736586093902588, "learning_rate": 8.83022221559489e-06, "loss": 1.0589, "step": 3090 }, { "epoch": 0.61, "grad_norm": 3.622150182723999, "learning_rate": 8.819308240769726e-06, "loss": 1.0428, "step": 3100 }, { "epoch": 0.61, "grad_norm": 3.96793794631958, "learning_rate": 8.808350400294332e-06, "loss": 1.0245, "step": 3110 }, { "epoch": 0.61, "grad_norm": 3.7367801666259766, "learning_rate": 8.797348820022079e-06, "loss": 1.0551, "step": 3120 }, { "epoch": 0.61, "grad_norm": 3.7123868465423584, "learning_rate": 8.78630362630869e-06, "loss": 1.0381, "step": 3130 }, { "epoch": 0.62, "grad_norm": 3.651548385620117, "learning_rate": 8.775214946010806e-06, "loss": 1.0428, "step": 3140 }, { "epoch": 0.62, "grad_norm": 4.124800205230713, "learning_rate": 8.764082906484518e-06, "loss": 1.0638, "step": 3150 }, { "epoch": 0.62, "grad_norm": 3.599874496459961, "learning_rate": 8.752907635583911e-06, "loss": 1.0441, "step": 3160 }, { "epoch": 0.62, "grad_norm": 3.769707441329956, "learning_rate": 8.74168926165959e-06, "loss": 1.0526, "step": 3170 }, { "epoch": 0.62, "grad_norm": 4.018752098083496, "learning_rate": 8.730427913557205e-06, "loss": 1.0672, "step": 3180 }, { "epoch": 0.63, "grad_norm": 3.963313341140747, "learning_rate": 8.71912372061598e-06, "loss": 1.0606, "step": 3190 }, { "epoch": 0.63, "grad_norm": 4.098948001861572, "learning_rate": 8.707776812667224e-06, "loss": 1.0383, "step": 3200 }, { "epoch": 0.63, "grad_norm": 3.441176652908325, "learning_rate": 8.696387320032827e-06, "loss": 1.0629, "step": 3210 }, { "epoch": 0.63, "grad_norm": 3.6925058364868164, "learning_rate": 8.684955373523787e-06, "loss": 1.0555, "step": 3220 }, { "epoch": 0.63, "grad_norm": 3.5602104663848877, "learning_rate": 8.673481104438685e-06, "loss": 1.0421, "step": 3230 }, { "epoch": 0.64, "grad_norm": 4.177275657653809, "learning_rate": 8.661964644562194e-06, "loss": 1.0504, "step": 3240 }, { "epoch": 0.64, "grad_norm": 3.9053499698638916, "learning_rate": 8.650406126163553e-06, "loss": 1.0508, "step": 3250 }, { "epoch": 0.64, "grad_norm": 3.4393839836120605, "learning_rate": 8.638805681995052e-06, "loss": 1.0375, "step": 3260 }, { "epoch": 0.64, "grad_norm": 3.890512228012085, "learning_rate": 8.627163445290514e-06, "loss": 1.0453, "step": 3270 }, { "epoch": 0.64, "grad_norm": 4.122755527496338, "learning_rate": 8.615479549763756e-06, "loss": 1.0427, "step": 3280 }, { "epoch": 0.64, "grad_norm": 3.3840408325195312, "learning_rate": 8.603754129607055e-06, "loss": 1.0454, "step": 3290 }, { "epoch": 0.65, "grad_norm": 4.196717262268066, "learning_rate": 8.591987319489612e-06, "loss": 1.0594, "step": 3300 }, { "epoch": 0.65, "grad_norm": 3.9698941707611084, "learning_rate": 8.580179254555997e-06, "loss": 1.0431, "step": 3310 }, { "epoch": 0.65, "grad_norm": 3.883592128753662, "learning_rate": 8.5683300704246e-06, "loss": 1.0257, "step": 3320 }, { "epoch": 0.65, "grad_norm": 3.783325672149658, "learning_rate": 8.556439903186082e-06, "loss": 1.0445, "step": 3330 }, { "epoch": 0.65, "grad_norm": 4.042956352233887, "learning_rate": 8.544508889401799e-06, "loss": 1.0507, "step": 3340 }, { "epoch": 0.66, "grad_norm": 3.7064452171325684, "learning_rate": 8.53253716610224e-06, "loss": 1.0604, "step": 3350 }, { "epoch": 0.66, "grad_norm": 3.835759401321411, "learning_rate": 8.520524870785453e-06, "loss": 1.0526, "step": 3360 }, { "epoch": 0.66, "grad_norm": 3.9541969299316406, "learning_rate": 8.508472141415468e-06, "loss": 1.0365, "step": 3370 }, { "epoch": 0.66, "grad_norm": 4.182323932647705, "learning_rate": 8.4963791164207e-06, "loss": 1.0292, "step": 3380 }, { "epoch": 0.66, "grad_norm": 4.205846309661865, "learning_rate": 8.484245934692379e-06, "loss": 1.0236, "step": 3390 }, { "epoch": 0.67, "grad_norm": 3.628838300704956, "learning_rate": 8.472072735582942e-06, "loss": 1.0457, "step": 3400 }, { "epoch": 0.67, "grad_norm": 3.7961418628692627, "learning_rate": 8.45985965890443e-06, "loss": 1.0491, "step": 3410 }, { "epoch": 0.67, "grad_norm": 3.6906325817108154, "learning_rate": 8.447606844926895e-06, "loss": 1.0315, "step": 3420 }, { "epoch": 0.67, "grad_norm": 4.307608604431152, "learning_rate": 8.435314434376773e-06, "loss": 1.0498, "step": 3430 }, { "epoch": 0.67, "grad_norm": 3.8979074954986572, "learning_rate": 8.422982568435283e-06, "loss": 1.0637, "step": 3440 }, { "epoch": 0.68, "grad_norm": 4.03852653503418, "learning_rate": 8.410611388736793e-06, "loss": 1.06, "step": 3450 }, { "epoch": 0.68, "grad_norm": 3.238548994064331, "learning_rate": 8.398201037367202e-06, "loss": 1.0385, "step": 3460 }, { "epoch": 0.68, "grad_norm": 4.223562240600586, "learning_rate": 8.385751656862305e-06, "loss": 1.039, "step": 3470 }, { "epoch": 0.68, "grad_norm": 3.7112879753112793, "learning_rate": 8.373263390206155e-06, "loss": 1.0412, "step": 3480 }, { "epoch": 0.68, "grad_norm": 3.801882743835449, "learning_rate": 8.36073638082942e-06, "loss": 1.0455, "step": 3490 }, { "epoch": 0.69, "grad_norm": 4.619334697723389, "learning_rate": 8.348170772607737e-06, "loss": 1.054, "step": 3500 }, { "epoch": 0.69, "grad_norm": 3.967297077178955, "learning_rate": 8.335566709860065e-06, "loss": 1.0369, "step": 3510 }, { "epoch": 0.69, "grad_norm": 3.9955484867095947, "learning_rate": 8.322924337347016e-06, "loss": 1.0631, "step": 3520 }, { "epoch": 0.69, "grad_norm": 3.6488661766052246, "learning_rate": 8.3102438002692e-06, "loss": 1.0427, "step": 3530 }, { "epoch": 0.69, "grad_norm": 4.1679534912109375, "learning_rate": 8.29752524426556e-06, "loss": 1.0396, "step": 3540 }, { "epoch": 0.7, "grad_norm": 3.7516090869903564, "learning_rate": 8.284768815411693e-06, "loss": 1.0457, "step": 3550 }, { "epoch": 0.7, "grad_norm": 3.902599811553955, "learning_rate": 8.27197466021817e-06, "loss": 1.0354, "step": 3560 }, { "epoch": 0.7, "grad_norm": 3.828345775604248, "learning_rate": 8.259142925628862e-06, "loss": 1.0359, "step": 3570 }, { "epoch": 0.7, "grad_norm": 4.114760875701904, "learning_rate": 8.246273759019252e-06, "loss": 1.0346, "step": 3580 }, { "epoch": 0.7, "grad_norm": 4.289566993713379, "learning_rate": 8.233367308194735e-06, "loss": 1.038, "step": 3590 }, { "epoch": 0.71, "grad_norm": 3.713040828704834, "learning_rate": 8.220423721388918e-06, "loss": 1.0442, "step": 3600 }, { "epoch": 0.71, "grad_norm": 4.1226630210876465, "learning_rate": 8.20744314726193e-06, "loss": 1.0502, "step": 3610 }, { "epoch": 0.71, "grad_norm": 3.980717182159424, "learning_rate": 8.19442573489871e-06, "loss": 1.0398, "step": 3620 }, { "epoch": 0.71, "grad_norm": 3.998352527618408, "learning_rate": 8.181371633807289e-06, "loss": 1.0558, "step": 3630 }, { "epoch": 0.71, "grad_norm": 4.392803192138672, "learning_rate": 8.168280993917078e-06, "loss": 1.0508, "step": 3640 }, { "epoch": 0.72, "grad_norm": 4.483020305633545, "learning_rate": 8.155153965577139e-06, "loss": 1.028, "step": 3650 }, { "epoch": 0.72, "grad_norm": 3.593369960784912, "learning_rate": 8.141990699554476e-06, "loss": 1.0591, "step": 3660 }, { "epoch": 0.72, "grad_norm": 3.9365193843841553, "learning_rate": 8.12879134703228e-06, "loss": 1.0496, "step": 3670 }, { "epoch": 0.72, "grad_norm": 4.063488960266113, "learning_rate": 8.115556059608208e-06, "loss": 1.0554, "step": 3680 }, { "epoch": 0.72, "grad_norm": 3.916815996170044, "learning_rate": 8.102284989292639e-06, "loss": 1.0382, "step": 3690 }, { "epoch": 0.73, "grad_norm": 3.987957000732422, "learning_rate": 8.088978288506923e-06, "loss": 1.0668, "step": 3700 }, { "epoch": 0.73, "grad_norm": 4.158136367797852, "learning_rate": 8.075636110081643e-06, "loss": 1.0346, "step": 3710 }, { "epoch": 0.73, "grad_norm": 3.6939709186553955, "learning_rate": 8.062258607254841e-06, "loss": 1.0401, "step": 3720 }, { "epoch": 0.73, "grad_norm": 4.190096378326416, "learning_rate": 8.048845933670274e-06, "loss": 1.0285, "step": 3730 }, { "epoch": 0.73, "grad_norm": 3.59887957572937, "learning_rate": 8.035398243375636e-06, "loss": 1.036, "step": 3740 }, { "epoch": 0.74, "grad_norm": 3.9749696254730225, "learning_rate": 8.021915690820808e-06, "loss": 1.0555, "step": 3750 }, { "epoch": 0.74, "grad_norm": 3.9689297676086426, "learning_rate": 8.008398430856064e-06, "loss": 1.038, "step": 3760 }, { "epoch": 0.74, "grad_norm": 3.9846508502960205, "learning_rate": 7.994846618730301e-06, "loss": 1.0523, "step": 3770 }, { "epoch": 0.74, "grad_norm": 4.488775730133057, "learning_rate": 7.981260410089258e-06, "loss": 1.0244, "step": 3780 }, { "epoch": 0.74, "grad_norm": 4.135215759277344, "learning_rate": 7.967639960973727e-06, "loss": 1.0653, "step": 3790 }, { "epoch": 0.74, "grad_norm": 4.3885884284973145, "learning_rate": 7.953985427817757e-06, "loss": 1.0531, "step": 3800 }, { "epoch": 0.75, "grad_norm": 3.9444169998168945, "learning_rate": 7.94029696744686e-06, "loss": 1.04, "step": 3810 }, { "epoch": 0.75, "grad_norm": 4.401015758514404, "learning_rate": 7.92657473707621e-06, "loss": 1.0498, "step": 3820 }, { "epoch": 0.75, "grad_norm": 4.35683012008667, "learning_rate": 7.912818894308845e-06, "loss": 1.0288, "step": 3830 }, { "epoch": 0.75, "grad_norm": 4.314106464385986, "learning_rate": 7.899029597133836e-06, "loss": 1.0413, "step": 3840 }, { "epoch": 0.75, "grad_norm": 3.9266910552978516, "learning_rate": 7.885207003924498e-06, "loss": 1.0319, "step": 3850 }, { "epoch": 0.76, "grad_norm": 3.997091054916382, "learning_rate": 7.87135127343655e-06, "loss": 1.0324, "step": 3860 }, { "epoch": 0.76, "grad_norm": 4.264638423919678, "learning_rate": 7.857462564806306e-06, "loss": 1.0328, "step": 3870 }, { "epoch": 0.76, "grad_norm": 4.032344818115234, "learning_rate": 7.84354103754884e-06, "loss": 1.0415, "step": 3880 }, { "epoch": 0.76, "grad_norm": 4.569589614868164, "learning_rate": 7.82958685155615e-06, "loss": 1.0566, "step": 3890 }, { "epoch": 0.76, "grad_norm": 4.405215740203857, "learning_rate": 7.815600167095338e-06, "loss": 1.0508, "step": 3900 }, { "epoch": 0.77, "grad_norm": 3.7878050804138184, "learning_rate": 7.801581144806752e-06, "loss": 1.0365, "step": 3910 }, { "epoch": 0.77, "grad_norm": 3.773585319519043, "learning_rate": 7.787529945702145e-06, "loss": 1.0366, "step": 3920 }, { "epoch": 0.77, "grad_norm": 4.027467727661133, "learning_rate": 7.773446731162835e-06, "loss": 1.0285, "step": 3930 }, { "epoch": 0.77, "grad_norm": 3.831883430480957, "learning_rate": 7.759331662937841e-06, "loss": 1.0342, "step": 3940 }, { "epoch": 0.77, "grad_norm": 4.330446243286133, "learning_rate": 7.745184903142029e-06, "loss": 1.0398, "step": 3950 }, { "epoch": 0.78, "grad_norm": 4.389279842376709, "learning_rate": 7.731006614254252e-06, "loss": 1.017, "step": 3960 }, { "epoch": 0.78, "grad_norm": 4.518781661987305, "learning_rate": 7.716796959115479e-06, "loss": 1.0465, "step": 3970 }, { "epoch": 0.78, "grad_norm": 4.294104099273682, "learning_rate": 7.70255610092693e-06, "loss": 1.0328, "step": 3980 }, { "epoch": 0.78, "grad_norm": 3.937368154525757, "learning_rate": 7.688284203248197e-06, "loss": 1.0496, "step": 3990 }, { "epoch": 0.78, "grad_norm": 4.222658157348633, "learning_rate": 7.673981429995372e-06, "loss": 1.032, "step": 4000 }, { "epoch": 0.78, "eval_loss": 1.0327889919281006, "eval_runtime": 12.4602, "eval_samples_per_second": 52.487, "eval_steps_per_second": 6.581, "step": 4000 }, { "epoch": 0.79, "grad_norm": 3.9882421493530273, "learning_rate": 7.659647945439157e-06, "loss": 1.0262, "step": 4010 }, { "epoch": 0.79, "grad_norm": 4.082265377044678, "learning_rate": 7.645283914202981e-06, "loss": 1.03, "step": 4020 }, { "epoch": 0.79, "grad_norm": 4.47793436050415, "learning_rate": 7.63088950126111e-06, "loss": 1.0402, "step": 4030 }, { "epoch": 0.79, "grad_norm": 4.539676189422607, "learning_rate": 7.616464871936748e-06, "loss": 1.0441, "step": 4040 }, { "epoch": 0.79, "grad_norm": 4.070407867431641, "learning_rate": 7.602010191900147e-06, "loss": 1.0298, "step": 4050 }, { "epoch": 0.8, "grad_norm": 4.478466510772705, "learning_rate": 7.587525627166691e-06, "loss": 1.0298, "step": 4060 }, { "epoch": 0.8, "grad_norm": 4.1451005935668945, "learning_rate": 7.573011344095002e-06, "loss": 1.0411, "step": 4070 }, { "epoch": 0.8, "grad_norm": 3.8588812351226807, "learning_rate": 7.558467509385023e-06, "loss": 1.0312, "step": 4080 }, { "epoch": 0.8, "grad_norm": 4.136762619018555, "learning_rate": 7.5438942900761035e-06, "loss": 1.0436, "step": 4090 }, { "epoch": 0.8, "grad_norm": 4.054186820983887, "learning_rate": 7.529291853545082e-06, "loss": 1.0421, "step": 4100 }, { "epoch": 0.81, "grad_norm": 4.862720012664795, "learning_rate": 7.514660367504368e-06, "loss": 1.0355, "step": 4110 }, { "epoch": 0.81, "grad_norm": 38.17692565917969, "learning_rate": 7.500000000000001e-06, "loss": 1.045, "step": 4120 }, { "epoch": 0.81, "grad_norm": 4.037071228027344, "learning_rate": 7.485310919409742e-06, "loss": 1.0382, "step": 4130 }, { "epoch": 0.81, "grad_norm": 4.044297218322754, "learning_rate": 7.470593294441124e-06, "loss": 1.0354, "step": 4140 }, { "epoch": 0.81, "grad_norm": 3.8578081130981445, "learning_rate": 7.455847294129519e-06, "loss": 1.0475, "step": 4150 }, { "epoch": 0.82, "grad_norm": 4.1042094230651855, "learning_rate": 7.4410730878361936e-06, "loss": 1.0302, "step": 4160 }, { "epoch": 0.82, "grad_norm": 4.391599178314209, "learning_rate": 7.426270845246373e-06, "loss": 1.0317, "step": 4170 }, { "epoch": 0.82, "grad_norm": 4.354910373687744, "learning_rate": 7.411440736367281e-06, "loss": 1.0291, "step": 4180 }, { "epoch": 0.82, "grad_norm": 4.061986923217773, "learning_rate": 7.396582931526194e-06, "loss": 1.0434, "step": 4190 }, { "epoch": 0.82, "grad_norm": 3.731538772583008, "learning_rate": 7.381697601368481e-06, "loss": 1.0472, "step": 4200 }, { "epoch": 0.83, "grad_norm": 4.0257887840271, "learning_rate": 7.36678491685565e-06, "loss": 1.0399, "step": 4210 }, { "epoch": 0.83, "grad_norm": 4.179793834686279, "learning_rate": 7.351845049263374e-06, "loss": 1.0518, "step": 4220 }, { "epoch": 0.83, "grad_norm": 4.212937355041504, "learning_rate": 7.3368781701795365e-06, "loss": 1.0381, "step": 4230 }, { "epoch": 0.83, "grad_norm": 4.426169395446777, "learning_rate": 7.321884451502252e-06, "loss": 1.0338, "step": 4240 }, { "epoch": 0.83, "grad_norm": 4.190229415893555, "learning_rate": 7.30686406543789e-06, "loss": 1.0482, "step": 4250 }, { "epoch": 0.83, "grad_norm": 3.897801160812378, "learning_rate": 7.291817184499107e-06, "loss": 1.0331, "step": 4260 }, { "epoch": 0.84, "grad_norm": 4.616969585418701, "learning_rate": 7.276743981502856e-06, "loss": 1.0515, "step": 4270 }, { "epoch": 0.84, "grad_norm": 3.8713490962982178, "learning_rate": 7.2616446295684075e-06, "loss": 1.0222, "step": 4280 }, { "epoch": 0.84, "grad_norm": 3.93888783454895, "learning_rate": 7.246519302115355e-06, "loss": 1.0355, "step": 4290 }, { "epoch": 0.84, "grad_norm": 4.489087104797363, "learning_rate": 7.23136817286163e-06, "loss": 1.0316, "step": 4300 }, { "epoch": 0.84, "grad_norm": 3.9029769897460938, "learning_rate": 7.216191415821503e-06, "loss": 1.0212, "step": 4310 }, { "epoch": 0.85, "grad_norm": 4.405784606933594, "learning_rate": 7.200989205303583e-06, "loss": 1.0421, "step": 4320 }, { "epoch": 0.85, "grad_norm": 4.0875701904296875, "learning_rate": 7.185761715908826e-06, "loss": 1.0468, "step": 4330 }, { "epoch": 0.85, "grad_norm": 4.10852575302124, "learning_rate": 7.170509122528511e-06, "loss": 1.0307, "step": 4340 }, { "epoch": 0.85, "grad_norm": 4.401843547821045, "learning_rate": 7.15523160034225e-06, "loss": 1.0265, "step": 4350 }, { "epoch": 0.85, "grad_norm": 4.106047630310059, "learning_rate": 7.139929324815965e-06, "loss": 1.021, "step": 4360 }, { "epoch": 0.86, "grad_norm": 4.1536407470703125, "learning_rate": 7.124602471699878e-06, "loss": 1.0409, "step": 4370 }, { "epoch": 0.86, "grad_norm": 4.14933443069458, "learning_rate": 7.109251217026487e-06, "loss": 1.0385, "step": 4380 }, { "epoch": 0.86, "grad_norm": 4.064835071563721, "learning_rate": 7.0938757371085485e-06, "loss": 1.0312, "step": 4390 }, { "epoch": 0.86, "grad_norm": 3.811549425125122, "learning_rate": 7.078476208537057e-06, "loss": 1.0359, "step": 4400 }, { "epoch": 0.86, "grad_norm": 4.325003623962402, "learning_rate": 7.063052808179205e-06, "loss": 1.0483, "step": 4410 }, { "epoch": 0.87, "grad_norm": 3.5266337394714355, "learning_rate": 7.04760571317636e-06, "loss": 1.0228, "step": 4420 }, { "epoch": 0.87, "grad_norm": 4.071694850921631, "learning_rate": 7.032135100942027e-06, "loss": 1.0353, "step": 4430 }, { "epoch": 0.87, "grad_norm": 4.121958255767822, "learning_rate": 7.016641149159816e-06, "loss": 1.049, "step": 4440 }, { "epoch": 0.87, "grad_norm": 4.714683532714844, "learning_rate": 7.00112403578139e-06, "loss": 1.0361, "step": 4450 }, { "epoch": 0.87, "grad_norm": 4.453790664672852, "learning_rate": 6.985583939024436e-06, "loss": 1.033, "step": 4460 }, { "epoch": 0.88, "grad_norm": 4.712753772735596, "learning_rate": 6.970021037370609e-06, "loss": 1.0462, "step": 4470 }, { "epoch": 0.88, "grad_norm": 4.329601287841797, "learning_rate": 6.9544355095634775e-06, "loss": 1.0459, "step": 4480 }, { "epoch": 0.88, "grad_norm": 4.669638156890869, "learning_rate": 6.938827534606484e-06, "loss": 1.0335, "step": 4490 }, { "epoch": 0.88, "grad_norm": 3.9964518547058105, "learning_rate": 6.923197291760876e-06, "loss": 1.0433, "step": 4500 }, { "epoch": 0.88, "grad_norm": 3.998533248901367, "learning_rate": 6.907544960543659e-06, "loss": 1.035, "step": 4510 }, { "epoch": 0.89, "grad_norm": 4.344484329223633, "learning_rate": 6.891870720725522e-06, "loss": 1.0405, "step": 4520 }, { "epoch": 0.89, "grad_norm": 4.392531871795654, "learning_rate": 6.8761747523287845e-06, "loss": 1.0339, "step": 4530 }, { "epoch": 0.89, "grad_norm": 4.274383544921875, "learning_rate": 6.860457235625322e-06, "loss": 1.0337, "step": 4540 }, { "epoch": 0.89, "grad_norm": 4.2484846115112305, "learning_rate": 6.844718351134496e-06, "loss": 1.0433, "step": 4550 }, { "epoch": 0.89, "grad_norm": 3.707181692123413, "learning_rate": 6.828958279621085e-06, "loss": 1.0497, "step": 4560 }, { "epoch": 0.9, "grad_norm": 4.188033103942871, "learning_rate": 6.813177202093203e-06, "loss": 1.0274, "step": 4570 }, { "epoch": 0.9, "grad_norm": 3.837230682373047, "learning_rate": 6.797375299800224e-06, "loss": 1.0395, "step": 4580 }, { "epoch": 0.9, "grad_norm": 3.9512484073638916, "learning_rate": 6.7815527542307e-06, "loss": 1.0516, "step": 4590 }, { "epoch": 0.9, "grad_norm": 4.2635297775268555, "learning_rate": 6.765709747110274e-06, "loss": 1.057, "step": 4600 }, { "epoch": 0.9, "grad_norm": 4.248997211456299, "learning_rate": 6.749846460399594e-06, "loss": 1.0296, "step": 4610 }, { "epoch": 0.91, "grad_norm": 4.210043430328369, "learning_rate": 6.7339630762922295e-06, "loss": 1.0291, "step": 4620 }, { "epoch": 0.91, "grad_norm": 3.8999147415161133, "learning_rate": 6.7180597772125665e-06, "loss": 1.0375, "step": 4630 }, { "epoch": 0.91, "grad_norm": 4.221770286560059, "learning_rate": 6.702136745813721e-06, "loss": 1.0206, "step": 4640 }, { "epoch": 0.91, "grad_norm": 4.14971399307251, "learning_rate": 6.686194164975446e-06, "loss": 1.0283, "step": 4650 }, { "epoch": 0.91, "grad_norm": 3.6616663932800293, "learning_rate": 6.670232217802011e-06, "loss": 1.0299, "step": 4660 }, { "epoch": 0.92, "grad_norm": 4.623802661895752, "learning_rate": 6.654251087620125e-06, "loss": 1.0325, "step": 4670 }, { "epoch": 0.92, "grad_norm": 3.6086490154266357, "learning_rate": 6.638250957976813e-06, "loss": 1.0299, "step": 4680 }, { "epoch": 0.92, "grad_norm": 4.8812456130981445, "learning_rate": 6.6222320126373105e-06, "loss": 1.0436, "step": 4690 }, { "epoch": 0.92, "grad_norm": 3.9015376567840576, "learning_rate": 6.6061944355829634e-06, "loss": 1.0093, "step": 4700 }, { "epoch": 0.92, "grad_norm": 4.15576171875, "learning_rate": 6.590138411009099e-06, "loss": 1.0378, "step": 4710 }, { "epoch": 0.93, "grad_norm": 4.204216957092285, "learning_rate": 6.574064123322925e-06, "loss": 1.032, "step": 4720 }, { "epoch": 0.93, "grad_norm": 4.158588409423828, "learning_rate": 6.557971757141402e-06, "loss": 1.0182, "step": 4730 }, { "epoch": 0.93, "grad_norm": 4.28289270401001, "learning_rate": 6.541861497289126e-06, "loss": 1.0324, "step": 4740 }, { "epoch": 0.93, "grad_norm": 4.406084060668945, "learning_rate": 6.525733528796207e-06, "loss": 1.0311, "step": 4750 }, { "epoch": 0.93, "grad_norm": 3.9430246353149414, "learning_rate": 6.509588036896144e-06, "loss": 1.0365, "step": 4760 }, { "epoch": 0.93, "grad_norm": 3.8312675952911377, "learning_rate": 6.493425207023693e-06, "loss": 1.0313, "step": 4770 }, { "epoch": 0.94, "grad_norm": 4.555315017700195, "learning_rate": 6.477245224812746e-06, "loss": 1.0336, "step": 4780 }, { "epoch": 0.94, "grad_norm": 4.399374961853027, "learning_rate": 6.46104827609419e-06, "loss": 1.0309, "step": 4790 }, { "epoch": 0.94, "grad_norm": 4.963261604309082, "learning_rate": 6.444834546893773e-06, "loss": 1.0401, "step": 4800 }, { "epoch": 0.94, "grad_norm": 4.5317559242248535, "learning_rate": 6.42860422342998e-06, "loss": 1.0287, "step": 4810 }, { "epoch": 0.94, "grad_norm": 4.08970308303833, "learning_rate": 6.412357492111877e-06, "loss": 1.0314, "step": 4820 }, { "epoch": 0.95, "grad_norm": 4.869360446929932, "learning_rate": 6.396094539536981e-06, "loss": 1.0426, "step": 4830 }, { "epoch": 0.95, "grad_norm": 4.279962539672852, "learning_rate": 6.379815552489112e-06, "loss": 1.044, "step": 4840 }, { "epoch": 0.95, "grad_norm": 4.379662990570068, "learning_rate": 6.363520717936256e-06, "loss": 1.022, "step": 4850 }, { "epoch": 0.95, "grad_norm": 4.329278469085693, "learning_rate": 6.347210223028403e-06, "loss": 1.0295, "step": 4860 }, { "epoch": 0.95, "grad_norm": 4.4202423095703125, "learning_rate": 6.330884255095409e-06, "loss": 1.0391, "step": 4870 }, { "epoch": 0.96, "grad_norm": 4.681463718414307, "learning_rate": 6.3145430016448435e-06, "loss": 1.0326, "step": 4880 }, { "epoch": 0.96, "grad_norm": 4.008312225341797, "learning_rate": 6.298186650359832e-06, "loss": 1.0459, "step": 4890 }, { "epoch": 0.96, "grad_norm": 4.1975884437561035, "learning_rate": 6.281815389096903e-06, "loss": 1.032, "step": 4900 }, { "epoch": 0.96, "grad_norm": 4.011014461517334, "learning_rate": 6.265429405883825e-06, "loss": 1.0537, "step": 4910 }, { "epoch": 0.96, "grad_norm": 4.628488063812256, "learning_rate": 6.24902888891746e-06, "loss": 1.0296, "step": 4920 }, { "epoch": 0.97, "grad_norm": 4.286167621612549, "learning_rate": 6.232614026561586e-06, "loss": 1.0251, "step": 4930 }, { "epoch": 0.97, "grad_norm": 4.162431240081787, "learning_rate": 6.216185007344745e-06, "loss": 1.0231, "step": 4940 }, { "epoch": 0.97, "grad_norm": 4.599613189697266, "learning_rate": 6.199742019958074e-06, "loss": 1.0259, "step": 4950 }, { "epoch": 0.97, "grad_norm": 3.7376463413238525, "learning_rate": 6.183285253253135e-06, "loss": 1.0308, "step": 4960 }, { "epoch": 0.97, "grad_norm": 4.396124362945557, "learning_rate": 6.1668148962397525e-06, "loss": 1.0383, "step": 4970 }, { "epoch": 0.98, "grad_norm": 4.382174015045166, "learning_rate": 6.150331138083833e-06, "loss": 1.0269, "step": 4980 }, { "epoch": 0.98, "grad_norm": 4.524794578552246, "learning_rate": 6.133834168105206e-06, "loss": 1.0381, "step": 4990 }, { "epoch": 0.98, "grad_norm": 4.226146221160889, "learning_rate": 6.117324175775435e-06, "loss": 1.0449, "step": 5000 }, { "epoch": 0.98, "eval_loss": 1.023166298866272, "eval_runtime": 12.4375, "eval_samples_per_second": 52.583, "eval_steps_per_second": 6.593, "step": 5000 }, { "epoch": 0.98, "grad_norm": 4.120533466339111, "learning_rate": 6.100801350715652e-06, "loss": 1.0285, "step": 5010 }, { "epoch": 0.98, "grad_norm": 3.9948532581329346, "learning_rate": 6.084265882694378e-06, "loss": 1.0411, "step": 5020 }, { "epoch": 0.99, "grad_norm": 4.175631999969482, "learning_rate": 6.0677179616253345e-06, "loss": 1.0347, "step": 5030 }, { "epoch": 0.99, "grad_norm": 4.19612455368042, "learning_rate": 6.0511577775652744e-06, "loss": 1.0367, "step": 5040 }, { "epoch": 0.99, "grad_norm": 4.379330158233643, "learning_rate": 6.034585520711792e-06, "loss": 1.0314, "step": 5050 }, { "epoch": 0.99, "grad_norm": 4.5682902336120605, "learning_rate": 6.018001381401143e-06, "loss": 1.0333, "step": 5060 }, { "epoch": 0.99, "grad_norm": 4.473631381988525, "learning_rate": 6.001405550106052e-06, "loss": 1.0397, "step": 5070 }, { "epoch": 1.0, "grad_norm": 4.200445175170898, "learning_rate": 5.9847982174335314e-06, "loss": 1.0262, "step": 5080 }, { "epoch": 1.0, "grad_norm": 3.9142019748687744, "learning_rate": 5.96817957412269e-06, "loss": 1.034, "step": 5090 }, { "epoch": 1.0, "grad_norm": 4.04217004776001, "learning_rate": 5.951549811042539e-06, "loss": 1.0466, "step": 5100 } ], "logging_steps": 10, "max_steps": 10300, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1, "total_flos": 1.9350228034513797e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }