|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 1000, |
|
"global_step": 5102, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 6.812536716461182, |
|
"learning_rate": 9.70873786407767e-08, |
|
"loss": 1.4108, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 6.775967597961426, |
|
"learning_rate": 1.941747572815534e-07, |
|
"loss": 1.4065, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.67765998840332, |
|
"learning_rate": 2.9126213592233014e-07, |
|
"loss": 1.4145, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.1705145835876465, |
|
"learning_rate": 3.883495145631068e-07, |
|
"loss": 1.3955, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.644610404968262, |
|
"learning_rate": 4.854368932038835e-07, |
|
"loss": 1.3986, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.067681789398193, |
|
"learning_rate": 5.825242718446603e-07, |
|
"loss": 1.3721, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.449668884277344, |
|
"learning_rate": 6.79611650485437e-07, |
|
"loss": 1.3672, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.1965107917785645, |
|
"learning_rate": 7.766990291262136e-07, |
|
"loss": 1.3229, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.0556440353393555, |
|
"learning_rate": 8.737864077669904e-07, |
|
"loss": 1.2846, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.58712911605835, |
|
"learning_rate": 9.70873786407767e-07, |
|
"loss": 1.2561, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.737099647521973, |
|
"learning_rate": 1.0679611650485437e-06, |
|
"loss": 1.2511, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.916423320770264, |
|
"learning_rate": 1.1650485436893206e-06, |
|
"loss": 1.2378, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.898449420928955, |
|
"learning_rate": 1.2621359223300972e-06, |
|
"loss": 1.2263, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.9001641273498535, |
|
"learning_rate": 1.359223300970874e-06, |
|
"loss": 1.228, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.939080238342285, |
|
"learning_rate": 1.4563106796116506e-06, |
|
"loss": 1.2006, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.023560047149658, |
|
"learning_rate": 1.5533980582524272e-06, |
|
"loss": 1.1934, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.546477317810059, |
|
"learning_rate": 1.650485436893204e-06, |
|
"loss": 1.1905, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.045153617858887, |
|
"learning_rate": 1.7475728155339808e-06, |
|
"loss": 1.1642, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.5985870361328125, |
|
"learning_rate": 1.8446601941747574e-06, |
|
"loss": 1.1674, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.283823490142822, |
|
"learning_rate": 1.941747572815534e-06, |
|
"loss": 1.157, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.753105640411377, |
|
"learning_rate": 2.0388349514563107e-06, |
|
"loss": 1.164, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.330531597137451, |
|
"learning_rate": 2.1359223300970874e-06, |
|
"loss": 1.169, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.404067516326904, |
|
"learning_rate": 2.2330097087378645e-06, |
|
"loss": 1.1493, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.143772602081299, |
|
"learning_rate": 2.330097087378641e-06, |
|
"loss": 1.1388, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.965195417404175, |
|
"learning_rate": 2.427184466019418e-06, |
|
"loss": 1.147, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.0882039070129395, |
|
"learning_rate": 2.5242718446601945e-06, |
|
"loss": 1.1487, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.03162956237793, |
|
"learning_rate": 2.621359223300971e-06, |
|
"loss": 1.136, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.2308855056762695, |
|
"learning_rate": 2.718446601941748e-06, |
|
"loss": 1.1378, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.1606221199035645, |
|
"learning_rate": 2.8155339805825245e-06, |
|
"loss": 1.1471, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.03283166885376, |
|
"learning_rate": 2.912621359223301e-06, |
|
"loss": 1.1185, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.2199859619140625, |
|
"learning_rate": 3.0097087378640778e-06, |
|
"loss": 1.1432, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.150729656219482, |
|
"learning_rate": 3.1067961165048544e-06, |
|
"loss": 1.1155, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.9178309440612793, |
|
"learning_rate": 3.2038834951456315e-06, |
|
"loss": 1.1219, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.3575663566589355, |
|
"learning_rate": 3.300970873786408e-06, |
|
"loss": 1.108, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.251282215118408, |
|
"learning_rate": 3.398058252427185e-06, |
|
"loss": 1.1052, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.297124862670898, |
|
"learning_rate": 3.4951456310679615e-06, |
|
"loss": 1.112, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.4219465255737305, |
|
"learning_rate": 3.592233009708738e-06, |
|
"loss": 1.12, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.0326948165893555, |
|
"learning_rate": 3.689320388349515e-06, |
|
"loss": 1.1286, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.3179030418396, |
|
"learning_rate": 3.7864077669902915e-06, |
|
"loss": 1.1255, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.20508337020874, |
|
"learning_rate": 3.883495145631068e-06, |
|
"loss": 1.1116, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.361403942108154, |
|
"learning_rate": 3.980582524271845e-06, |
|
"loss": 1.1061, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.279186248779297, |
|
"learning_rate": 4.0776699029126215e-06, |
|
"loss": 1.1031, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.163991928100586, |
|
"learning_rate": 4.1747572815533986e-06, |
|
"loss": 1.1097, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.068342685699463, |
|
"learning_rate": 4.271844660194175e-06, |
|
"loss": 1.1048, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.233316421508789, |
|
"learning_rate": 4.368932038834952e-06, |
|
"loss": 1.11, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.018556594848633, |
|
"learning_rate": 4.466019417475729e-06, |
|
"loss": 1.1, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.257734298706055, |
|
"learning_rate": 4.563106796116505e-06, |
|
"loss": 1.1102, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.20249605178833, |
|
"learning_rate": 4.660194174757282e-06, |
|
"loss": 1.0938, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.196409225463867, |
|
"learning_rate": 4.7572815533980585e-06, |
|
"loss": 1.1153, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.7823269367218018, |
|
"learning_rate": 4.854368932038836e-06, |
|
"loss": 1.105, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.8932242393493652, |
|
"learning_rate": 4.951456310679612e-06, |
|
"loss": 1.0977, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.169296741485596, |
|
"learning_rate": 5.048543689320389e-06, |
|
"loss": 1.1022, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.110784530639648, |
|
"learning_rate": 5.145631067961165e-06, |
|
"loss": 1.0954, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.476339340209961, |
|
"learning_rate": 5.242718446601942e-06, |
|
"loss": 1.104, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.7383415699005127, |
|
"learning_rate": 5.3398058252427185e-06, |
|
"loss": 1.106, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.040097236633301, |
|
"learning_rate": 5.436893203883496e-06, |
|
"loss": 1.1006, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.430188179016113, |
|
"learning_rate": 5.533980582524272e-06, |
|
"loss": 1.1087, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.724983215332031, |
|
"learning_rate": 5.631067961165049e-06, |
|
"loss": 1.1004, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.040498733520508, |
|
"learning_rate": 5.728155339805825e-06, |
|
"loss": 1.0944, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.504246234893799, |
|
"learning_rate": 5.825242718446602e-06, |
|
"loss": 1.0915, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.28185510635376, |
|
"learning_rate": 5.9223300970873785e-06, |
|
"loss": 1.0951, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.7988641262054443, |
|
"learning_rate": 6.0194174757281556e-06, |
|
"loss": 1.1052, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.40338134765625, |
|
"learning_rate": 6.116504854368932e-06, |
|
"loss": 1.0931, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.3256659507751465, |
|
"learning_rate": 6.213592233009709e-06, |
|
"loss": 1.0863, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.146902561187744, |
|
"learning_rate": 6.310679611650487e-06, |
|
"loss": 1.0916, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.871020317077637, |
|
"learning_rate": 6.407766990291263e-06, |
|
"loss": 1.0853, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.6339454650878906, |
|
"learning_rate": 6.50485436893204e-06, |
|
"loss": 1.0907, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.507898330688477, |
|
"learning_rate": 6.601941747572816e-06, |
|
"loss": 1.1049, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.15292501449585, |
|
"learning_rate": 6.6990291262135935e-06, |
|
"loss": 1.0928, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.081875324249268, |
|
"learning_rate": 6.79611650485437e-06, |
|
"loss": 1.1104, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.9496753215789795, |
|
"learning_rate": 6.893203883495147e-06, |
|
"loss": 1.0916, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.676784992218018, |
|
"learning_rate": 6.990291262135923e-06, |
|
"loss": 1.087, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.889897584915161, |
|
"learning_rate": 7.0873786407767e-06, |
|
"loss": 1.0889, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.654224157333374, |
|
"learning_rate": 7.184466019417476e-06, |
|
"loss": 1.0929, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.457578182220459, |
|
"learning_rate": 7.2815533980582534e-06, |
|
"loss": 1.0855, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.403715133666992, |
|
"learning_rate": 7.37864077669903e-06, |
|
"loss": 1.0793, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.498962879180908, |
|
"learning_rate": 7.475728155339807e-06, |
|
"loss": 1.0946, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.233016014099121, |
|
"learning_rate": 7.572815533980583e-06, |
|
"loss": 1.0753, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.361231327056885, |
|
"learning_rate": 7.66990291262136e-06, |
|
"loss": 1.0865, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.116058349609375, |
|
"learning_rate": 7.766990291262136e-06, |
|
"loss": 1.0874, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.87738037109375, |
|
"learning_rate": 7.864077669902913e-06, |
|
"loss": 1.0897, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.444067478179932, |
|
"learning_rate": 7.96116504854369e-06, |
|
"loss": 1.0926, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.9490249156951904, |
|
"learning_rate": 8.058252427184466e-06, |
|
"loss": 1.0678, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.674923896789551, |
|
"learning_rate": 8.155339805825243e-06, |
|
"loss": 1.0923, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.707766056060791, |
|
"learning_rate": 8.25242718446602e-06, |
|
"loss": 1.0874, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.693995475769043, |
|
"learning_rate": 8.349514563106797e-06, |
|
"loss": 1.0558, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.561864376068115, |
|
"learning_rate": 8.446601941747573e-06, |
|
"loss": 1.0961, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.7990922927856445, |
|
"learning_rate": 8.54368932038835e-06, |
|
"loss": 1.0877, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.014951705932617, |
|
"learning_rate": 8.640776699029127e-06, |
|
"loss": 1.0866, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.007348537445068, |
|
"learning_rate": 8.737864077669904e-06, |
|
"loss": 1.0795, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.876419544219971, |
|
"learning_rate": 8.834951456310681e-06, |
|
"loss": 1.0826, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.8001556396484375, |
|
"learning_rate": 8.932038834951458e-06, |
|
"loss": 1.0828, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.761294364929199, |
|
"learning_rate": 9.029126213592233e-06, |
|
"loss": 1.0897, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.832348346710205, |
|
"learning_rate": 9.12621359223301e-06, |
|
"loss": 1.0826, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.325027942657471, |
|
"learning_rate": 9.223300970873788e-06, |
|
"loss": 1.0705, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.859924554824829, |
|
"learning_rate": 9.320388349514565e-06, |
|
"loss": 1.0691, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.8470637798309326, |
|
"learning_rate": 9.41747572815534e-06, |
|
"loss": 1.0808, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.887052536010742, |
|
"learning_rate": 9.514563106796117e-06, |
|
"loss": 1.0688, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.891918897628784, |
|
"learning_rate": 9.611650485436894e-06, |
|
"loss": 1.0748, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.252170562744141, |
|
"learning_rate": 9.708737864077671e-06, |
|
"loss": 1.0889, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 1.0763322114944458, |
|
"eval_runtime": 12.4536, |
|
"eval_samples_per_second": 52.515, |
|
"eval_steps_per_second": 6.584, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.76177453994751, |
|
"learning_rate": 9.805825242718447e-06, |
|
"loss": 1.0925, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.820517063140869, |
|
"learning_rate": 9.902912621359224e-06, |
|
"loss": 1.0879, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.241539478302002, |
|
"learning_rate": 1e-05, |
|
"loss": 1.0688, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.468598365783691, |
|
"learning_rate": 9.999971286914108e-06, |
|
"loss": 1.0782, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.580018043518066, |
|
"learning_rate": 9.999885147986207e-06, |
|
"loss": 1.0793, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.351871490478516, |
|
"learning_rate": 9.999741584205621e-06, |
|
"loss": 1.0746, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.939090251922607, |
|
"learning_rate": 9.999540597221217e-06, |
|
"loss": 1.0814, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.497158050537109, |
|
"learning_rate": 9.999282189341374e-06, |
|
"loss": 1.076, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.999101161956787, |
|
"learning_rate": 9.998966363533972e-06, |
|
"loss": 1.0826, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.041301727294922, |
|
"learning_rate": 9.99859312342634e-06, |
|
"loss": 1.0675, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.890156269073486, |
|
"learning_rate": 9.998162473305229e-06, |
|
"loss": 1.0724, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.730093955993652, |
|
"learning_rate": 9.997674418116759e-06, |
|
"loss": 1.0679, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.929831504821777, |
|
"learning_rate": 9.997128963466355e-06, |
|
"loss": 1.064, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.288021087646484, |
|
"learning_rate": 9.996526115618694e-06, |
|
"loss": 1.0895, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.647055149078369, |
|
"learning_rate": 9.995865881497621e-06, |
|
"loss": 1.0768, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.861464977264404, |
|
"learning_rate": 9.995148268686086e-06, |
|
"loss": 1.073, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.072396278381348, |
|
"learning_rate": 9.994373285426034e-06, |
|
"loss": 1.0679, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.59302282333374, |
|
"learning_rate": 9.993540940618334e-06, |
|
"loss": 1.0999, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.47246789932251, |
|
"learning_rate": 9.992651243822658e-06, |
|
"loss": 1.0797, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.917146682739258, |
|
"learning_rate": 9.991704205257383e-06, |
|
"loss": 1.0786, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.097021102905273, |
|
"learning_rate": 9.99069983579947e-06, |
|
"loss": 1.0709, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.385958671569824, |
|
"learning_rate": 9.989638146984337e-06, |
|
"loss": 1.0878, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.534526824951172, |
|
"learning_rate": 9.988519151005728e-06, |
|
"loss": 1.0753, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.479306697845459, |
|
"learning_rate": 9.987342860715575e-06, |
|
"loss": 1.0638, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.557966232299805, |
|
"learning_rate": 9.986109289623848e-06, |
|
"loss": 1.0841, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.882152080535889, |
|
"learning_rate": 9.984818451898399e-06, |
|
"loss": 1.0678, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.506185054779053, |
|
"learning_rate": 9.983470362364803e-06, |
|
"loss": 1.0766, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.027829170227051, |
|
"learning_rate": 9.982065036506183e-06, |
|
"loss": 1.0825, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.385190010070801, |
|
"learning_rate": 9.980602490463037e-06, |
|
"loss": 1.0709, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 6.701911449432373, |
|
"learning_rate": 9.979082741033047e-06, |
|
"loss": 1.0813, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.571600437164307, |
|
"learning_rate": 9.977505805670895e-06, |
|
"loss": 1.0678, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.041151523590088, |
|
"learning_rate": 9.97587170248805e-06, |
|
"loss": 1.08, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.1545729637146, |
|
"learning_rate": 9.97418045025257e-06, |
|
"loss": 1.071, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.163790702819824, |
|
"learning_rate": 9.972432068388885e-06, |
|
"loss": 1.068, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.824156284332275, |
|
"learning_rate": 9.97062657697757e-06, |
|
"loss": 1.0718, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 6.520030975341797, |
|
"learning_rate": 9.968763996755115e-06, |
|
"loss": 1.0779, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.418417453765869, |
|
"learning_rate": 9.966844349113695e-06, |
|
"loss": 1.0677, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.918398380279541, |
|
"learning_rate": 9.96486765610091e-06, |
|
"loss": 1.0942, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 5.05225944519043, |
|
"learning_rate": 9.96283394041954e-06, |
|
"loss": 1.0852, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 4.492018699645996, |
|
"learning_rate": 9.96074322542729e-06, |
|
"loss": 1.0728, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.9885711669921875, |
|
"learning_rate": 9.958595535136511e-06, |
|
"loss": 1.0618, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.547979831695557, |
|
"learning_rate": 9.95639089421393e-06, |
|
"loss": 1.0753, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.2231621742248535, |
|
"learning_rate": 9.954129327980362e-06, |
|
"loss": 1.0573, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.681105613708496, |
|
"learning_rate": 9.951810862410426e-06, |
|
"loss": 1.0833, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.339748382568359, |
|
"learning_rate": 9.949435524132245e-06, |
|
"loss": 1.0712, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.844823837280273, |
|
"learning_rate": 9.947003340427134e-06, |
|
"loss": 1.0803, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 3.774728536605835, |
|
"learning_rate": 9.944514339229292e-06, |
|
"loss": 1.0787, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.437483310699463, |
|
"learning_rate": 9.941968549125481e-06, |
|
"loss": 1.0757, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.675827503204346, |
|
"learning_rate": 9.9393659993547e-06, |
|
"loss": 1.0635, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 5.193900108337402, |
|
"learning_rate": 9.936706719807839e-06, |
|
"loss": 1.0527, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.871400833129883, |
|
"learning_rate": 9.93399074102735e-06, |
|
"loss": 1.0683, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 6.098533630371094, |
|
"learning_rate": 9.931218094206882e-06, |
|
"loss": 1.0704, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 7.03292989730835, |
|
"learning_rate": 9.928388811190938e-06, |
|
"loss": 1.0839, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 4.196811199188232, |
|
"learning_rate": 9.925502924474495e-06, |
|
"loss": 1.0749, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.939242362976074, |
|
"learning_rate": 9.922560467202638e-06, |
|
"loss": 1.0761, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.359401702880859, |
|
"learning_rate": 9.919561473170178e-06, |
|
"loss": 1.0625, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.799266338348389, |
|
"learning_rate": 9.916505976821262e-06, |
|
"loss": 1.0691, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 4.341587543487549, |
|
"learning_rate": 9.913394013248987e-06, |
|
"loss": 1.0737, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.106640815734863, |
|
"learning_rate": 9.91022561819498e-06, |
|
"loss": 1.0805, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 4.987344741821289, |
|
"learning_rate": 9.907000828049001e-06, |
|
"loss": 1.0569, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.510980606079102, |
|
"learning_rate": 9.903719679848522e-06, |
|
"loss": 1.078, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.564126968383789, |
|
"learning_rate": 9.9003822112783e-06, |
|
"loss": 1.0726, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.666530132293701, |
|
"learning_rate": 9.89698846066994e-06, |
|
"loss": 1.0635, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.472270488739014, |
|
"learning_rate": 9.893538467001466e-06, |
|
"loss": 1.0529, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.534265995025635, |
|
"learning_rate": 9.890032269896862e-06, |
|
"loss": 1.08, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 4.677114963531494, |
|
"learning_rate": 9.886469909625624e-06, |
|
"loss": 1.0719, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 4.84233283996582, |
|
"learning_rate": 9.882851427102299e-06, |
|
"loss": 1.0665, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 4.839110374450684, |
|
"learning_rate": 9.879176863885997e-06, |
|
"loss": 1.0635, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 4.322112560272217, |
|
"learning_rate": 9.875446262179948e-06, |
|
"loss": 1.0755, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 4.2779130935668945, |
|
"learning_rate": 9.87165966483098e-06, |
|
"loss": 1.0745, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.142566204071045, |
|
"learning_rate": 9.867817115329055e-06, |
|
"loss": 1.0725, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.487438678741455, |
|
"learning_rate": 9.863918657806752e-06, |
|
"loss": 1.0538, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.5258588790893555, |
|
"learning_rate": 9.85996433703877e-06, |
|
"loss": 1.0559, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.927529811859131, |
|
"learning_rate": 9.855954198441411e-06, |
|
"loss": 1.0661, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.9455795288085938, |
|
"learning_rate": 9.851888288072053e-06, |
|
"loss": 1.0769, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.094160556793213, |
|
"learning_rate": 9.847766652628635e-06, |
|
"loss": 1.0767, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.2093424797058105, |
|
"learning_rate": 9.843589339449102e-06, |
|
"loss": 1.0635, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.292007923126221, |
|
"learning_rate": 9.839356396510875e-06, |
|
"loss": 1.0593, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.241662502288818, |
|
"learning_rate": 9.835067872430297e-06, |
|
"loss": 1.0627, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.465583324432373, |
|
"learning_rate": 9.830723816462071e-06, |
|
"loss": 1.0551, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.395579814910889, |
|
"learning_rate": 9.8263242784987e-06, |
|
"loss": 1.0691, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.91081428527832, |
|
"learning_rate": 9.821869309069907e-06, |
|
"loss": 1.0632, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.9647531509399414, |
|
"learning_rate": 9.817358959342057e-06, |
|
"loss": 1.0693, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.396968364715576, |
|
"learning_rate": 9.81279328111758e-06, |
|
"loss": 1.0869, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.095502853393555, |
|
"learning_rate": 9.808172326834356e-06, |
|
"loss": 1.0636, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.527934551239014, |
|
"learning_rate": 9.80349614956513e-06, |
|
"loss": 1.0679, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.237788200378418, |
|
"learning_rate": 9.798764803016892e-06, |
|
"loss": 1.05, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 4.45642614364624, |
|
"learning_rate": 9.793978341530265e-06, |
|
"loss": 1.0697, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.550489664077759, |
|
"learning_rate": 9.789136820078884e-06, |
|
"loss": 1.079, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.879551887512207, |
|
"learning_rate": 9.784240294268756e-06, |
|
"loss": 1.0649, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 4.333283424377441, |
|
"learning_rate": 9.779288820337628e-06, |
|
"loss": 1.0668, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.761474370956421, |
|
"learning_rate": 9.774282455154338e-06, |
|
"loss": 1.0738, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.7375967502593994, |
|
"learning_rate": 9.769221256218165e-06, |
|
"loss": 1.0574, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.852424144744873, |
|
"learning_rate": 9.764105281658161e-06, |
|
"loss": 1.0536, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.345045566558838, |
|
"learning_rate": 9.758934590232495e-06, |
|
"loss": 1.0898, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.355409622192383, |
|
"learning_rate": 9.753709241327773e-06, |
|
"loss": 1.0657, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.160512924194336, |
|
"learning_rate": 9.748429294958345e-06, |
|
"loss": 1.0699, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.259660243988037, |
|
"learning_rate": 9.74309481176564e-06, |
|
"loss": 1.0703, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.260087013244629, |
|
"learning_rate": 9.737705853017442e-06, |
|
"loss": 1.0816, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.026108264923096, |
|
"learning_rate": 9.732262480607207e-06, |
|
"loss": 1.0556, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_loss": 1.0583994388580322, |
|
"eval_runtime": 12.5254, |
|
"eval_samples_per_second": 52.214, |
|
"eval_steps_per_second": 6.547, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.157490253448486, |
|
"learning_rate": 9.726764757053343e-06, |
|
"loss": 1.0735, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 4.293674468994141, |
|
"learning_rate": 9.721212745498493e-06, |
|
"loss": 1.0697, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.8046298027038574, |
|
"learning_rate": 9.715606509708812e-06, |
|
"loss": 1.0635, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.55698561668396, |
|
"learning_rate": 9.709946114073231e-06, |
|
"loss": 1.0685, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.9492263793945312, |
|
"learning_rate": 9.704231623602721e-06, |
|
"loss": 1.0692, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.995659351348877, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 1.069, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 4.452322483062744, |
|
"learning_rate": 9.692640621306497e-06, |
|
"loss": 1.0728, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 4.006470680236816, |
|
"learning_rate": 9.686764242606164e-06, |
|
"loss": 1.0616, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 4.041233539581299, |
|
"learning_rate": 9.680834035320127e-06, |
|
"loss": 1.0712, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.9690816402435303, |
|
"learning_rate": 9.674850067558209e-06, |
|
"loss": 1.0682, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 4.229337215423584, |
|
"learning_rate": 9.66881240804768e-06, |
|
"loss": 1.0734, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.3140339851379395, |
|
"learning_rate": 9.662721126132473e-06, |
|
"loss": 1.0665, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 4.23881721496582, |
|
"learning_rate": 9.656576291772392e-06, |
|
"loss": 1.0535, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.9377248287200928, |
|
"learning_rate": 9.650377975542298e-06, |
|
"loss": 1.068, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.9194915294647217, |
|
"learning_rate": 9.644126248631306e-06, |
|
"loss": 1.0803, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.5396149158477783, |
|
"learning_rate": 9.637821182841965e-06, |
|
"loss": 1.0574, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 4.14241361618042, |
|
"learning_rate": 9.631462850589432e-06, |
|
"loss": 1.0643, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.7068986892700195, |
|
"learning_rate": 9.625051324900645e-06, |
|
"loss": 1.0519, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.9636712074279785, |
|
"learning_rate": 9.618586679413477e-06, |
|
"loss": 1.054, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.6051032543182373, |
|
"learning_rate": 9.612068988375898e-06, |
|
"loss": 1.0534, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.969806671142578, |
|
"learning_rate": 9.605498326645115e-06, |
|
"loss": 1.0851, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 4.08690071105957, |
|
"learning_rate": 9.598874769686721e-06, |
|
"loss": 1.0645, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.7391505241394043, |
|
"learning_rate": 9.592198393573816e-06, |
|
"loss": 1.0693, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 4.053625583648682, |
|
"learning_rate": 9.585469274986148e-06, |
|
"loss": 1.0682, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.923471212387085, |
|
"learning_rate": 9.578687491209219e-06, |
|
"loss": 1.0606, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 4.096630573272705, |
|
"learning_rate": 9.571853120133406e-06, |
|
"loss": 1.064, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.9412453174591064, |
|
"learning_rate": 9.564966240253062e-06, |
|
"loss": 1.0786, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 3.6600263118743896, |
|
"learning_rate": 9.558026930665614e-06, |
|
"loss": 1.0622, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.281722545623779, |
|
"learning_rate": 9.551035271070665e-06, |
|
"loss": 1.0542, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 3.6689794063568115, |
|
"learning_rate": 9.543991341769057e-06, |
|
"loss": 1.0496, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.148682594299316, |
|
"learning_rate": 9.536895223661975e-06, |
|
"loss": 1.0648, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.112921237945557, |
|
"learning_rate": 9.529746998249994e-06, |
|
"loss": 1.0632, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.144115447998047, |
|
"learning_rate": 9.52254674763216e-06, |
|
"loss": 1.0555, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.198709011077881, |
|
"learning_rate": 9.515294554505039e-06, |
|
"loss": 1.049, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 3.7727572917938232, |
|
"learning_rate": 9.507990502161769e-06, |
|
"loss": 1.0428, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.66112756729126, |
|
"learning_rate": 9.500634674491099e-06, |
|
"loss": 1.0666, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.313396453857422, |
|
"learning_rate": 9.49322715597644e-06, |
|
"loss": 1.0658, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 3.8979835510253906, |
|
"learning_rate": 9.485768031694872e-06, |
|
"loss": 1.0516, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 4.047280788421631, |
|
"learning_rate": 9.478257387316189e-06, |
|
"loss": 1.0708, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 3.9112637042999268, |
|
"learning_rate": 9.470695309101903e-06, |
|
"loss": 1.0576, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 3.9057672023773193, |
|
"learning_rate": 9.463081883904251e-06, |
|
"loss": 1.0653, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 4.1275105476379395, |
|
"learning_rate": 9.455417199165209e-06, |
|
"loss": 1.0454, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.834106922149658, |
|
"learning_rate": 9.447701342915473e-06, |
|
"loss": 1.0593, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.673773765563965, |
|
"learning_rate": 9.439934403773468e-06, |
|
"loss": 1.0543, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.011989593505859, |
|
"learning_rate": 9.4321164709443e-06, |
|
"loss": 1.0468, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.221887588500977, |
|
"learning_rate": 9.42424763421877e-06, |
|
"loss": 1.0699, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.603546142578125, |
|
"learning_rate": 9.416327983972304e-06, |
|
"loss": 1.0525, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 4.173734188079834, |
|
"learning_rate": 9.408357611163945e-06, |
|
"loss": 1.0678, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.5241010189056396, |
|
"learning_rate": 9.400336607335294e-06, |
|
"loss": 1.0536, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.8831429481506348, |
|
"learning_rate": 9.392265064609455e-06, |
|
"loss": 1.0367, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.8573215007781982, |
|
"learning_rate": 9.384143075689992e-06, |
|
"loss": 1.0474, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.8050475120544434, |
|
"learning_rate": 9.375970733859848e-06, |
|
"loss": 1.0508, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.164061546325684, |
|
"learning_rate": 9.367748132980286e-06, |
|
"loss": 1.0629, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.8858516216278076, |
|
"learning_rate": 9.359475367489805e-06, |
|
"loss": 1.0616, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.033576965332031, |
|
"learning_rate": 9.351152532403054e-06, |
|
"loss": 1.0687, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.06900691986084, |
|
"learning_rate": 9.342779723309746e-06, |
|
"loss": 1.0519, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.8570592403411865, |
|
"learning_rate": 9.334357036373552e-06, |
|
"loss": 1.0482, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.30549430847168, |
|
"learning_rate": 9.32588456833101e-06, |
|
"loss": 1.0714, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.6463634967803955, |
|
"learning_rate": 9.317362416490396e-06, |
|
"loss": 1.055, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.366295337677002, |
|
"learning_rate": 9.308790678730627e-06, |
|
"loss": 1.0502, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.8573434352874756, |
|
"learning_rate": 9.300169453500117e-06, |
|
"loss": 1.0597, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.8393239974975586, |
|
"learning_rate": 9.291498839815658e-06, |
|
"loss": 1.0553, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.246349334716797, |
|
"learning_rate": 9.282778937261279e-06, |
|
"loss": 1.0734, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.076491355895996, |
|
"learning_rate": 9.274009845987106e-06, |
|
"loss": 1.0643, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.7121007442474365, |
|
"learning_rate": 9.26519166670821e-06, |
|
"loss": 1.0491, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.8223936557769775, |
|
"learning_rate": 9.256324500703439e-06, |
|
"loss": 1.0713, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.4132473468780518, |
|
"learning_rate": 9.247408449814281e-06, |
|
"loss": 1.0541, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.846742868423462, |
|
"learning_rate": 9.238443616443666e-06, |
|
"loss": 1.0573, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 4.1583123207092285, |
|
"learning_rate": 9.229430103554808e-06, |
|
"loss": 1.038, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 4.192993640899658, |
|
"learning_rate": 9.22036801467001e-06, |
|
"loss": 1.0645, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 4.137371063232422, |
|
"learning_rate": 9.211257453869495e-06, |
|
"loss": 1.058, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 4.091256618499756, |
|
"learning_rate": 9.202098525790182e-06, |
|
"loss": 1.0702, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.9540719985961914, |
|
"learning_rate": 9.192891335624508e-06, |
|
"loss": 1.0406, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 4.149806022644043, |
|
"learning_rate": 9.183635989119211e-06, |
|
"loss": 1.0558, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.950911045074463, |
|
"learning_rate": 9.174332592574115e-06, |
|
"loss": 1.0446, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.7937111854553223, |
|
"learning_rate": 9.164981252840908e-06, |
|
"loss": 1.0608, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 4.06609582901001, |
|
"learning_rate": 9.155582077321918e-06, |
|
"loss": 1.0653, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 4.201600074768066, |
|
"learning_rate": 9.146135173968881e-06, |
|
"loss": 1.0651, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.8813118934631348, |
|
"learning_rate": 9.136640651281694e-06, |
|
"loss": 1.0567, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.4754064083099365, |
|
"learning_rate": 9.127098618307177e-06, |
|
"loss": 1.0632, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.5580170154571533, |
|
"learning_rate": 9.117509184637814e-06, |
|
"loss": 1.057, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.7533814907073975, |
|
"learning_rate": 9.107872460410496e-06, |
|
"loss": 1.0398, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 4.086584091186523, |
|
"learning_rate": 9.098188556305262e-06, |
|
"loss": 1.0633, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 4.2910237312316895, |
|
"learning_rate": 9.088457583544022e-06, |
|
"loss": 1.0334, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 4.132745742797852, |
|
"learning_rate": 9.078679653889273e-06, |
|
"loss": 1.0595, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.9243948459625244, |
|
"learning_rate": 9.068854879642833e-06, |
|
"loss": 1.0641, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.943058490753174, |
|
"learning_rate": 9.058983373644532e-06, |
|
"loss": 1.0493, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.724886417388916, |
|
"learning_rate": 9.049065249270936e-06, |
|
"loss": 1.0374, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.8636670112609863, |
|
"learning_rate": 9.039100620434025e-06, |
|
"loss": 1.0634, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.841193675994873, |
|
"learning_rate": 9.029089601579895e-06, |
|
"loss": 1.0433, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.6147212982177734, |
|
"learning_rate": 9.019032307687446e-06, |
|
"loss": 1.0416, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.8550570011138916, |
|
"learning_rate": 9.008928854267054e-06, |
|
"loss": 1.064, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.5188698768615723, |
|
"learning_rate": 8.99877935735925e-06, |
|
"loss": 1.0472, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 4.188703536987305, |
|
"learning_rate": 8.988583933533384e-06, |
|
"loss": 1.0688, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.8695075511932373, |
|
"learning_rate": 8.978342699886289e-06, |
|
"loss": 1.0391, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.8634023666381836, |
|
"learning_rate": 8.968055774040932e-06, |
|
"loss": 1.0422, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 4.071281909942627, |
|
"learning_rate": 8.95772327414507e-06, |
|
"loss": 1.0442, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 4.14091157913208, |
|
"learning_rate": 8.947345318869883e-06, |
|
"loss": 1.0541, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.287483215332031, |
|
"learning_rate": 8.936922027408618e-06, |
|
"loss": 1.0391, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.207295894622803, |
|
"learning_rate": 8.926453519475225e-06, |
|
"loss": 1.0455, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 1.046447992324829, |
|
"eval_runtime": 12.4637, |
|
"eval_samples_per_second": 52.472, |
|
"eval_steps_per_second": 6.579, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.189324855804443, |
|
"learning_rate": 8.91593991530297e-06, |
|
"loss": 1.0559, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.109393119812012, |
|
"learning_rate": 8.905381335643056e-06, |
|
"loss": 1.0524, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 3.9047772884368896, |
|
"learning_rate": 8.89477790176325e-06, |
|
"loss": 1.059, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.3157196044921875, |
|
"learning_rate": 8.884129735446471e-06, |
|
"loss": 1.0494, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.147355079650879, |
|
"learning_rate": 8.873436958989409e-06, |
|
"loss": 1.0517, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.453030586242676, |
|
"learning_rate": 8.862699695201107e-06, |
|
"loss": 1.0538, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.24041223526001, |
|
"learning_rate": 8.851918067401552e-06, |
|
"loss": 1.0425, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.6763038635253906, |
|
"learning_rate": 8.84109219942027e-06, |
|
"loss": 1.0558, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.736586093902588, |
|
"learning_rate": 8.83022221559489e-06, |
|
"loss": 1.0589, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.622150182723999, |
|
"learning_rate": 8.819308240769726e-06, |
|
"loss": 1.0428, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.96793794631958, |
|
"learning_rate": 8.808350400294332e-06, |
|
"loss": 1.0245, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.7367801666259766, |
|
"learning_rate": 8.797348820022079e-06, |
|
"loss": 1.0551, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.7123868465423584, |
|
"learning_rate": 8.78630362630869e-06, |
|
"loss": 1.0381, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 3.651548385620117, |
|
"learning_rate": 8.775214946010806e-06, |
|
"loss": 1.0428, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 4.124800205230713, |
|
"learning_rate": 8.764082906484518e-06, |
|
"loss": 1.0638, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 3.599874496459961, |
|
"learning_rate": 8.752907635583911e-06, |
|
"loss": 1.0441, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 3.769707441329956, |
|
"learning_rate": 8.74168926165959e-06, |
|
"loss": 1.0526, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 4.018752098083496, |
|
"learning_rate": 8.730427913557205e-06, |
|
"loss": 1.0672, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.963313341140747, |
|
"learning_rate": 8.71912372061598e-06, |
|
"loss": 1.0606, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 4.098948001861572, |
|
"learning_rate": 8.707776812667224e-06, |
|
"loss": 1.0383, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.441176652908325, |
|
"learning_rate": 8.696387320032827e-06, |
|
"loss": 1.0629, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.6925058364868164, |
|
"learning_rate": 8.684955373523787e-06, |
|
"loss": 1.0555, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.5602104663848877, |
|
"learning_rate": 8.673481104438685e-06, |
|
"loss": 1.0421, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4.177275657653809, |
|
"learning_rate": 8.661964644562194e-06, |
|
"loss": 1.0504, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.9053499698638916, |
|
"learning_rate": 8.650406126163553e-06, |
|
"loss": 1.0508, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.4393839836120605, |
|
"learning_rate": 8.638805681995052e-06, |
|
"loss": 1.0375, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.890512228012085, |
|
"learning_rate": 8.627163445290514e-06, |
|
"loss": 1.0453, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4.122755527496338, |
|
"learning_rate": 8.615479549763756e-06, |
|
"loss": 1.0427, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.3840408325195312, |
|
"learning_rate": 8.603754129607055e-06, |
|
"loss": 1.0454, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 4.196717262268066, |
|
"learning_rate": 8.591987319489612e-06, |
|
"loss": 1.0594, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.9698941707611084, |
|
"learning_rate": 8.580179254555997e-06, |
|
"loss": 1.0431, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.883592128753662, |
|
"learning_rate": 8.5683300704246e-06, |
|
"loss": 1.0257, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.783325672149658, |
|
"learning_rate": 8.556439903186082e-06, |
|
"loss": 1.0445, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 4.042956352233887, |
|
"learning_rate": 8.544508889401799e-06, |
|
"loss": 1.0507, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.7064452171325684, |
|
"learning_rate": 8.53253716610224e-06, |
|
"loss": 1.0604, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.835759401321411, |
|
"learning_rate": 8.520524870785453e-06, |
|
"loss": 1.0526, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.9541969299316406, |
|
"learning_rate": 8.508472141415468e-06, |
|
"loss": 1.0365, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.182323932647705, |
|
"learning_rate": 8.4963791164207e-06, |
|
"loss": 1.0292, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.205846309661865, |
|
"learning_rate": 8.484245934692379e-06, |
|
"loss": 1.0236, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.628838300704956, |
|
"learning_rate": 8.472072735582942e-06, |
|
"loss": 1.0457, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.7961418628692627, |
|
"learning_rate": 8.45985965890443e-06, |
|
"loss": 1.0491, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.6906325817108154, |
|
"learning_rate": 8.447606844926895e-06, |
|
"loss": 1.0315, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 4.307608604431152, |
|
"learning_rate": 8.435314434376773e-06, |
|
"loss": 1.0498, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.8979074954986572, |
|
"learning_rate": 8.422982568435283e-06, |
|
"loss": 1.0637, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 4.03852653503418, |
|
"learning_rate": 8.410611388736793e-06, |
|
"loss": 1.06, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.238548994064331, |
|
"learning_rate": 8.398201037367202e-06, |
|
"loss": 1.0385, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 4.223562240600586, |
|
"learning_rate": 8.385751656862305e-06, |
|
"loss": 1.039, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.7112879753112793, |
|
"learning_rate": 8.373263390206155e-06, |
|
"loss": 1.0412, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.801882743835449, |
|
"learning_rate": 8.36073638082942e-06, |
|
"loss": 1.0455, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 4.619334697723389, |
|
"learning_rate": 8.348170772607737e-06, |
|
"loss": 1.054, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.967297077178955, |
|
"learning_rate": 8.335566709860065e-06, |
|
"loss": 1.0369, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.9955484867095947, |
|
"learning_rate": 8.322924337347016e-06, |
|
"loss": 1.0631, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.6488661766052246, |
|
"learning_rate": 8.3102438002692e-06, |
|
"loss": 1.0427, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 4.1679534912109375, |
|
"learning_rate": 8.29752524426556e-06, |
|
"loss": 1.0396, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.7516090869903564, |
|
"learning_rate": 8.284768815411693e-06, |
|
"loss": 1.0457, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.902599811553955, |
|
"learning_rate": 8.27197466021817e-06, |
|
"loss": 1.0354, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.828345775604248, |
|
"learning_rate": 8.259142925628862e-06, |
|
"loss": 1.0359, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.114760875701904, |
|
"learning_rate": 8.246273759019252e-06, |
|
"loss": 1.0346, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.289566993713379, |
|
"learning_rate": 8.233367308194735e-06, |
|
"loss": 1.038, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.713040828704834, |
|
"learning_rate": 8.220423721388918e-06, |
|
"loss": 1.0442, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 4.1226630210876465, |
|
"learning_rate": 8.20744314726193e-06, |
|
"loss": 1.0502, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.980717182159424, |
|
"learning_rate": 8.19442573489871e-06, |
|
"loss": 1.0398, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.998352527618408, |
|
"learning_rate": 8.181371633807289e-06, |
|
"loss": 1.0558, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 4.392803192138672, |
|
"learning_rate": 8.168280993917078e-06, |
|
"loss": 1.0508, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 4.483020305633545, |
|
"learning_rate": 8.155153965577139e-06, |
|
"loss": 1.028, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.593369960784912, |
|
"learning_rate": 8.141990699554476e-06, |
|
"loss": 1.0591, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.9365193843841553, |
|
"learning_rate": 8.12879134703228e-06, |
|
"loss": 1.0496, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 4.063488960266113, |
|
"learning_rate": 8.115556059608208e-06, |
|
"loss": 1.0554, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.916815996170044, |
|
"learning_rate": 8.102284989292639e-06, |
|
"loss": 1.0382, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.987957000732422, |
|
"learning_rate": 8.088978288506923e-06, |
|
"loss": 1.0668, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 4.158136367797852, |
|
"learning_rate": 8.075636110081643e-06, |
|
"loss": 1.0346, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.6939709186553955, |
|
"learning_rate": 8.062258607254841e-06, |
|
"loss": 1.0401, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 4.190096378326416, |
|
"learning_rate": 8.048845933670274e-06, |
|
"loss": 1.0285, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.59887957572937, |
|
"learning_rate": 8.035398243375636e-06, |
|
"loss": 1.036, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.9749696254730225, |
|
"learning_rate": 8.021915690820808e-06, |
|
"loss": 1.0555, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.9689297676086426, |
|
"learning_rate": 8.008398430856064e-06, |
|
"loss": 1.038, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.9846508502960205, |
|
"learning_rate": 7.994846618730301e-06, |
|
"loss": 1.0523, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 4.488775730133057, |
|
"learning_rate": 7.981260410089258e-06, |
|
"loss": 1.0244, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 4.135215759277344, |
|
"learning_rate": 7.967639960973727e-06, |
|
"loss": 1.0653, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 4.3885884284973145, |
|
"learning_rate": 7.953985427817757e-06, |
|
"loss": 1.0531, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.9444169998168945, |
|
"learning_rate": 7.94029696744686e-06, |
|
"loss": 1.04, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.401015758514404, |
|
"learning_rate": 7.92657473707621e-06, |
|
"loss": 1.0498, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.35683012008667, |
|
"learning_rate": 7.912818894308845e-06, |
|
"loss": 1.0288, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.314106464385986, |
|
"learning_rate": 7.899029597133836e-06, |
|
"loss": 1.0413, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.9266910552978516, |
|
"learning_rate": 7.885207003924498e-06, |
|
"loss": 1.0319, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.997091054916382, |
|
"learning_rate": 7.87135127343655e-06, |
|
"loss": 1.0324, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 4.264638423919678, |
|
"learning_rate": 7.857462564806306e-06, |
|
"loss": 1.0328, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 4.032344818115234, |
|
"learning_rate": 7.84354103754884e-06, |
|
"loss": 1.0415, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 4.569589614868164, |
|
"learning_rate": 7.82958685155615e-06, |
|
"loss": 1.0566, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 4.405215740203857, |
|
"learning_rate": 7.815600167095338e-06, |
|
"loss": 1.0508, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.7878050804138184, |
|
"learning_rate": 7.801581144806752e-06, |
|
"loss": 1.0365, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.773585319519043, |
|
"learning_rate": 7.787529945702145e-06, |
|
"loss": 1.0366, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 4.027467727661133, |
|
"learning_rate": 7.773446731162835e-06, |
|
"loss": 1.0285, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.831883430480957, |
|
"learning_rate": 7.759331662937841e-06, |
|
"loss": 1.0342, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 4.330446243286133, |
|
"learning_rate": 7.745184903142029e-06, |
|
"loss": 1.0398, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.389279842376709, |
|
"learning_rate": 7.731006614254252e-06, |
|
"loss": 1.017, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.518781661987305, |
|
"learning_rate": 7.716796959115479e-06, |
|
"loss": 1.0465, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.294104099273682, |
|
"learning_rate": 7.70255610092693e-06, |
|
"loss": 1.0328, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.937368154525757, |
|
"learning_rate": 7.688284203248197e-06, |
|
"loss": 1.0496, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.222658157348633, |
|
"learning_rate": 7.673981429995372e-06, |
|
"loss": 1.032, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 1.0327889919281006, |
|
"eval_runtime": 12.4602, |
|
"eval_samples_per_second": 52.487, |
|
"eval_steps_per_second": 6.581, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.9882421493530273, |
|
"learning_rate": 7.659647945439157e-06, |
|
"loss": 1.0262, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 4.082265377044678, |
|
"learning_rate": 7.645283914202981e-06, |
|
"loss": 1.03, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 4.47793436050415, |
|
"learning_rate": 7.63088950126111e-06, |
|
"loss": 1.0402, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 4.539676189422607, |
|
"learning_rate": 7.616464871936748e-06, |
|
"loss": 1.0441, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 4.070407867431641, |
|
"learning_rate": 7.602010191900147e-06, |
|
"loss": 1.0298, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.478466510772705, |
|
"learning_rate": 7.587525627166691e-06, |
|
"loss": 1.0298, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.1451005935668945, |
|
"learning_rate": 7.573011344095002e-06, |
|
"loss": 1.0411, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.8588812351226807, |
|
"learning_rate": 7.558467509385023e-06, |
|
"loss": 1.0312, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.136762619018555, |
|
"learning_rate": 7.5438942900761035e-06, |
|
"loss": 1.0436, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.054186820983887, |
|
"learning_rate": 7.529291853545082e-06, |
|
"loss": 1.0421, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 4.862720012664795, |
|
"learning_rate": 7.514660367504368e-06, |
|
"loss": 1.0355, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 38.17692565917969, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 1.045, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 4.037071228027344, |
|
"learning_rate": 7.485310919409742e-06, |
|
"loss": 1.0382, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 4.044297218322754, |
|
"learning_rate": 7.470593294441124e-06, |
|
"loss": 1.0354, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.8578081130981445, |
|
"learning_rate": 7.455847294129519e-06, |
|
"loss": 1.0475, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.1042094230651855, |
|
"learning_rate": 7.4410730878361936e-06, |
|
"loss": 1.0302, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.391599178314209, |
|
"learning_rate": 7.426270845246373e-06, |
|
"loss": 1.0317, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.354910373687744, |
|
"learning_rate": 7.411440736367281e-06, |
|
"loss": 1.0291, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.061986923217773, |
|
"learning_rate": 7.396582931526194e-06, |
|
"loss": 1.0434, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.731538772583008, |
|
"learning_rate": 7.381697601368481e-06, |
|
"loss": 1.0472, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.0257887840271, |
|
"learning_rate": 7.36678491685565e-06, |
|
"loss": 1.0399, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.179793834686279, |
|
"learning_rate": 7.351845049263374e-06, |
|
"loss": 1.0518, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.212937355041504, |
|
"learning_rate": 7.3368781701795365e-06, |
|
"loss": 1.0381, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.426169395446777, |
|
"learning_rate": 7.321884451502252e-06, |
|
"loss": 1.0338, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.190229415893555, |
|
"learning_rate": 7.30686406543789e-06, |
|
"loss": 1.0482, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.897801160812378, |
|
"learning_rate": 7.291817184499107e-06, |
|
"loss": 1.0331, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.616969585418701, |
|
"learning_rate": 7.276743981502856e-06, |
|
"loss": 1.0515, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.8713490962982178, |
|
"learning_rate": 7.2616446295684075e-06, |
|
"loss": 1.0222, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.93888783454895, |
|
"learning_rate": 7.246519302115355e-06, |
|
"loss": 1.0355, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.489087104797363, |
|
"learning_rate": 7.23136817286163e-06, |
|
"loss": 1.0316, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.9029769897460938, |
|
"learning_rate": 7.216191415821503e-06, |
|
"loss": 1.0212, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.405784606933594, |
|
"learning_rate": 7.200989205303583e-06, |
|
"loss": 1.0421, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.0875701904296875, |
|
"learning_rate": 7.185761715908826e-06, |
|
"loss": 1.0468, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.10852575302124, |
|
"learning_rate": 7.170509122528511e-06, |
|
"loss": 1.0307, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.401843547821045, |
|
"learning_rate": 7.15523160034225e-06, |
|
"loss": 1.0265, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.106047630310059, |
|
"learning_rate": 7.139929324815965e-06, |
|
"loss": 1.021, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.1536407470703125, |
|
"learning_rate": 7.124602471699878e-06, |
|
"loss": 1.0409, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.14933443069458, |
|
"learning_rate": 7.109251217026487e-06, |
|
"loss": 1.0385, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.064835071563721, |
|
"learning_rate": 7.0938757371085485e-06, |
|
"loss": 1.0312, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 3.811549425125122, |
|
"learning_rate": 7.078476208537057e-06, |
|
"loss": 1.0359, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.325003623962402, |
|
"learning_rate": 7.063052808179205e-06, |
|
"loss": 1.0483, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.5266337394714355, |
|
"learning_rate": 7.04760571317636e-06, |
|
"loss": 1.0228, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.071694850921631, |
|
"learning_rate": 7.032135100942027e-06, |
|
"loss": 1.0353, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.121958255767822, |
|
"learning_rate": 7.016641149159816e-06, |
|
"loss": 1.049, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.714683532714844, |
|
"learning_rate": 7.00112403578139e-06, |
|
"loss": 1.0361, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.453790664672852, |
|
"learning_rate": 6.985583939024436e-06, |
|
"loss": 1.033, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 4.712753772735596, |
|
"learning_rate": 6.970021037370609e-06, |
|
"loss": 1.0462, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 4.329601287841797, |
|
"learning_rate": 6.9544355095634775e-06, |
|
"loss": 1.0459, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 4.669638156890869, |
|
"learning_rate": 6.938827534606484e-06, |
|
"loss": 1.0335, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.9964518547058105, |
|
"learning_rate": 6.923197291760876e-06, |
|
"loss": 1.0433, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.998533248901367, |
|
"learning_rate": 6.907544960543659e-06, |
|
"loss": 1.035, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 4.344484329223633, |
|
"learning_rate": 6.891870720725522e-06, |
|
"loss": 1.0405, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 4.392531871795654, |
|
"learning_rate": 6.8761747523287845e-06, |
|
"loss": 1.0339, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 4.274383544921875, |
|
"learning_rate": 6.860457235625322e-06, |
|
"loss": 1.0337, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 4.2484846115112305, |
|
"learning_rate": 6.844718351134496e-06, |
|
"loss": 1.0433, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.707181692123413, |
|
"learning_rate": 6.828958279621085e-06, |
|
"loss": 1.0497, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.188033103942871, |
|
"learning_rate": 6.813177202093203e-06, |
|
"loss": 1.0274, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.837230682373047, |
|
"learning_rate": 6.797375299800224e-06, |
|
"loss": 1.0395, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.9512484073638916, |
|
"learning_rate": 6.7815527542307e-06, |
|
"loss": 1.0516, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.2635297775268555, |
|
"learning_rate": 6.765709747110274e-06, |
|
"loss": 1.057, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.248997211456299, |
|
"learning_rate": 6.749846460399594e-06, |
|
"loss": 1.0296, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.210043430328369, |
|
"learning_rate": 6.7339630762922295e-06, |
|
"loss": 1.0291, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 3.8999147415161133, |
|
"learning_rate": 6.7180597772125665e-06, |
|
"loss": 1.0375, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.221770286560059, |
|
"learning_rate": 6.702136745813721e-06, |
|
"loss": 1.0206, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.14971399307251, |
|
"learning_rate": 6.686194164975446e-06, |
|
"loss": 1.0283, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 3.6616663932800293, |
|
"learning_rate": 6.670232217802011e-06, |
|
"loss": 1.0299, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.623802661895752, |
|
"learning_rate": 6.654251087620125e-06, |
|
"loss": 1.0325, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.6086490154266357, |
|
"learning_rate": 6.638250957976813e-06, |
|
"loss": 1.0299, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.8812456130981445, |
|
"learning_rate": 6.6222320126373105e-06, |
|
"loss": 1.0436, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.9015376567840576, |
|
"learning_rate": 6.6061944355829634e-06, |
|
"loss": 1.0093, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.15576171875, |
|
"learning_rate": 6.590138411009099e-06, |
|
"loss": 1.0378, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.204216957092285, |
|
"learning_rate": 6.574064123322925e-06, |
|
"loss": 1.032, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.158588409423828, |
|
"learning_rate": 6.557971757141402e-06, |
|
"loss": 1.0182, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.28289270401001, |
|
"learning_rate": 6.541861497289126e-06, |
|
"loss": 1.0324, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.406084060668945, |
|
"learning_rate": 6.525733528796207e-06, |
|
"loss": 1.0311, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 3.9430246353149414, |
|
"learning_rate": 6.509588036896144e-06, |
|
"loss": 1.0365, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 3.8312675952911377, |
|
"learning_rate": 6.493425207023693e-06, |
|
"loss": 1.0313, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.555315017700195, |
|
"learning_rate": 6.477245224812746e-06, |
|
"loss": 1.0336, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.399374961853027, |
|
"learning_rate": 6.46104827609419e-06, |
|
"loss": 1.0309, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.963261604309082, |
|
"learning_rate": 6.444834546893773e-06, |
|
"loss": 1.0401, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.5317559242248535, |
|
"learning_rate": 6.42860422342998e-06, |
|
"loss": 1.0287, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.08970308303833, |
|
"learning_rate": 6.412357492111877e-06, |
|
"loss": 1.0314, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.869360446929932, |
|
"learning_rate": 6.396094539536981e-06, |
|
"loss": 1.0426, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.279962539672852, |
|
"learning_rate": 6.379815552489112e-06, |
|
"loss": 1.044, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.379662990570068, |
|
"learning_rate": 6.363520717936256e-06, |
|
"loss": 1.022, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.329278469085693, |
|
"learning_rate": 6.347210223028403e-06, |
|
"loss": 1.0295, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.4202423095703125, |
|
"learning_rate": 6.330884255095409e-06, |
|
"loss": 1.0391, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.681463718414307, |
|
"learning_rate": 6.3145430016448435e-06, |
|
"loss": 1.0326, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.008312225341797, |
|
"learning_rate": 6.298186650359832e-06, |
|
"loss": 1.0459, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.1975884437561035, |
|
"learning_rate": 6.281815389096903e-06, |
|
"loss": 1.032, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.011014461517334, |
|
"learning_rate": 6.265429405883825e-06, |
|
"loss": 1.0537, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.628488063812256, |
|
"learning_rate": 6.24902888891746e-06, |
|
"loss": 1.0296, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 4.286167621612549, |
|
"learning_rate": 6.232614026561586e-06, |
|
"loss": 1.0251, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 4.162431240081787, |
|
"learning_rate": 6.216185007344745e-06, |
|
"loss": 1.0231, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 4.599613189697266, |
|
"learning_rate": 6.199742019958074e-06, |
|
"loss": 1.0259, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 3.7376463413238525, |
|
"learning_rate": 6.183285253253135e-06, |
|
"loss": 1.0308, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 4.396124362945557, |
|
"learning_rate": 6.1668148962397525e-06, |
|
"loss": 1.0383, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.382174015045166, |
|
"learning_rate": 6.150331138083833e-06, |
|
"loss": 1.0269, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.524794578552246, |
|
"learning_rate": 6.133834168105206e-06, |
|
"loss": 1.0381, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.226146221160889, |
|
"learning_rate": 6.117324175775435e-06, |
|
"loss": 1.0449, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"eval_loss": 1.023166298866272, |
|
"eval_runtime": 12.4375, |
|
"eval_samples_per_second": 52.583, |
|
"eval_steps_per_second": 6.593, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.120533466339111, |
|
"learning_rate": 6.100801350715652e-06, |
|
"loss": 1.0285, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.9948532581329346, |
|
"learning_rate": 6.084265882694378e-06, |
|
"loss": 1.0411, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.175631999969482, |
|
"learning_rate": 6.0677179616253345e-06, |
|
"loss": 1.0347, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.19612455368042, |
|
"learning_rate": 6.0511577775652744e-06, |
|
"loss": 1.0367, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.379330158233643, |
|
"learning_rate": 6.034585520711792e-06, |
|
"loss": 1.0314, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.5682902336120605, |
|
"learning_rate": 6.018001381401143e-06, |
|
"loss": 1.0333, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.473631381988525, |
|
"learning_rate": 6.001405550106052e-06, |
|
"loss": 1.0397, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.200445175170898, |
|
"learning_rate": 5.9847982174335314e-06, |
|
"loss": 1.0262, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.9142019748687744, |
|
"learning_rate": 5.96817957412269e-06, |
|
"loss": 1.034, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.04217004776001, |
|
"learning_rate": 5.951549811042539e-06, |
|
"loss": 1.0466, |
|
"step": 5100 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 10300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 1, |
|
"total_flos": 1.9350228034513797e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|