|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 60.0, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.939609527587891, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 1.4893, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.7112133502960205, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 1.2055, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.128227710723877, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 1.0467, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.812015175819397, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.959, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.001788854598999, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.8838, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.275044322013855, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.8455, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.9736475348472595, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 0.8018, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.2415788173675537, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.7934, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.08917236328125, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 0.7508, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.7156405448913574, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.6881, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.8509552478790283, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.6296, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.1180299520492554, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.599, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.9558420181274414, |
|
"learning_rate": 5.2e-06, |
|
"loss": 0.6189, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.513689637184143, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 0.6507, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.101956844329834, |
|
"learning_rate": 6e-06, |
|
"loss": 0.6767, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.2160413265228271, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.5263, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.2328133583068848, |
|
"learning_rate": 6.800000000000001e-06, |
|
"loss": 0.5254, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.8659697771072388, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 0.526, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 3.3415005207061768, |
|
"learning_rate": 7.600000000000001e-06, |
|
"loss": 0.5359, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.0912972688674927, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5413, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 3.15617299079895, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.5434, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 2.088568687438965, |
|
"learning_rate": 8.8e-06, |
|
"loss": 0.4533, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 5.438419342041016, |
|
"learning_rate": 9.200000000000002e-06, |
|
"loss": 0.4051, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 1.167334794998169, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.5039, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.4560151100158691, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4845, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 2.4988670349121094, |
|
"learning_rate": 1.04e-05, |
|
"loss": 0.4003, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": Infinity, |
|
"learning_rate": 1.0760000000000002e-05, |
|
"loss": 0.358, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 1.568748950958252, |
|
"learning_rate": 1.1160000000000002e-05, |
|
"loss": 0.438, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 2.2470250129699707, |
|
"learning_rate": 1.156e-05, |
|
"loss": 0.4482, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 2.1772513389587402, |
|
"learning_rate": 1.196e-05, |
|
"loss": 0.4083, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 2.48183536529541, |
|
"learning_rate": 1.236e-05, |
|
"loss": 0.3352, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 2.4329469203948975, |
|
"learning_rate": 1.2760000000000001e-05, |
|
"loss": 0.3115, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 3.1733779907226562, |
|
"learning_rate": 1.3160000000000001e-05, |
|
"loss": 0.3295, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 1.5031541585922241, |
|
"learning_rate": 1.3560000000000002e-05, |
|
"loss": 0.3617, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 1.4660649299621582, |
|
"learning_rate": 1.396e-05, |
|
"loss": 0.3997, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 2.0959041118621826, |
|
"learning_rate": 1.4360000000000001e-05, |
|
"loss": 0.2882, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 2.1866960525512695, |
|
"learning_rate": 1.4760000000000001e-05, |
|
"loss": 0.1761, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 3.808837652206421, |
|
"learning_rate": 1.516e-05, |
|
"loss": 0.3476, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 2.1240618228912354, |
|
"learning_rate": 1.556e-05, |
|
"loss": 0.2462, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 4.748623847961426, |
|
"learning_rate": 1.5960000000000003e-05, |
|
"loss": 0.2915, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 1.3342574834823608, |
|
"learning_rate": 1.636e-05, |
|
"loss": 0.2063, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 4.858347415924072, |
|
"learning_rate": 1.6760000000000002e-05, |
|
"loss": 0.1655, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 7.156524658203125, |
|
"learning_rate": 1.7160000000000002e-05, |
|
"loss": 0.3072, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 1.796271562576294, |
|
"learning_rate": 1.756e-05, |
|
"loss": 0.2161, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.3102816641330719, |
|
"learning_rate": 1.796e-05, |
|
"loss": 0.1655, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 2.9082212448120117, |
|
"learning_rate": 1.8360000000000004e-05, |
|
"loss": 0.1096, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 2.176609754562378, |
|
"learning_rate": 1.876e-05, |
|
"loss": 0.1441, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 4.31577205657959, |
|
"learning_rate": 1.916e-05, |
|
"loss": 0.1766, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"grad_norm": 3.6560301780700684, |
|
"learning_rate": 1.9560000000000002e-05, |
|
"loss": 0.2017, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 2.5466725826263428, |
|
"learning_rate": 1.9960000000000002e-05, |
|
"loss": 0.2644, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 10.2, |
|
"grad_norm": 1.722813606262207, |
|
"learning_rate": 1.9999844036286483e-05, |
|
"loss": 0.1303, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 10.4, |
|
"grad_norm": 1.9171879291534424, |
|
"learning_rate": 1.9999210442038164e-05, |
|
"loss": 0.1545, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"grad_norm": 5.039783477783203, |
|
"learning_rate": 1.999808950037968e-05, |
|
"loss": 0.1664, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"grad_norm": 1.7670263051986694, |
|
"learning_rate": 1.9996481265944146e-05, |
|
"loss": 0.1468, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 2.348796844482422, |
|
"learning_rate": 1.9994385817114644e-05, |
|
"loss": 0.1501, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"grad_norm": 0.6991663575172424, |
|
"learning_rate": 1.9991803256020393e-05, |
|
"loss": 0.0598, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 0.9841463565826416, |
|
"learning_rate": 1.9988733708531772e-05, |
|
"loss": 0.0975, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"grad_norm": 0.5151156783103943, |
|
"learning_rate": 1.99851773242542e-05, |
|
"loss": 0.1535, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 11.8, |
|
"grad_norm": 0.2328900247812271, |
|
"learning_rate": 1.9981134276520828e-05, |
|
"loss": 0.1044, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 6.50738525390625, |
|
"learning_rate": 1.99766047623841e-05, |
|
"loss": 0.1007, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 12.2, |
|
"grad_norm": 1.2217416763305664, |
|
"learning_rate": 1.997158900260614e-05, |
|
"loss": 0.0586, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"grad_norm": 0.12162764370441437, |
|
"learning_rate": 1.996608724164801e-05, |
|
"loss": 0.0689, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 12.6, |
|
"grad_norm": 3.3592309951782227, |
|
"learning_rate": 1.9960099747657774e-05, |
|
"loss": 0.0902, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 1.7452058792114258, |
|
"learning_rate": 1.995362681245744e-05, |
|
"loss": 0.1005, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 4.02609920501709, |
|
"learning_rate": 1.9946668751528745e-05, |
|
"loss": 0.0921, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 13.2, |
|
"grad_norm": 0.6409686207771301, |
|
"learning_rate": 1.9939225903997748e-05, |
|
"loss": 0.0846, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 13.4, |
|
"grad_norm": 0.5872007608413696, |
|
"learning_rate": 1.9931298632618355e-05, |
|
"loss": 0.0758, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"grad_norm": 0.45962050557136536, |
|
"learning_rate": 1.992288732375458e-05, |
|
"loss": 0.0706, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 13.8, |
|
"grad_norm": 0.5623728632926941, |
|
"learning_rate": 1.9913992387361747e-05, |
|
"loss": 0.0872, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 1.1793012619018555, |
|
"learning_rate": 1.9904614256966514e-05, |
|
"loss": 0.0889, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"grad_norm": 1.6852632761001587, |
|
"learning_rate": 1.9894753389645723e-05, |
|
"loss": 0.0352, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 14.4, |
|
"grad_norm": 0.9700791835784912, |
|
"learning_rate": 1.9884410266004134e-05, |
|
"loss": 0.0595, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 14.6, |
|
"grad_norm": 1.602258324623108, |
|
"learning_rate": 1.9873585390151003e-05, |
|
"loss": 0.1022, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 14.8, |
|
"grad_norm": 1.9688293933868408, |
|
"learning_rate": 1.986227928967551e-05, |
|
"loss": 0.0556, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 1.102990746498108, |
|
"learning_rate": 1.9850492515621038e-05, |
|
"loss": 0.0393, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 15.2, |
|
"grad_norm": 0.3437381684780121, |
|
"learning_rate": 1.983822564245833e-05, |
|
"loss": 0.0701, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 15.4, |
|
"grad_norm": 1.0362671613693237, |
|
"learning_rate": 1.982547926805747e-05, |
|
"loss": 0.0564, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"grad_norm": 0.20574238896369934, |
|
"learning_rate": 1.981225401365877e-05, |
|
"loss": 0.0525, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 15.8, |
|
"grad_norm": 2.8671562671661377, |
|
"learning_rate": 1.979855052384247e-05, |
|
"loss": 0.0604, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 1.1551826000213623, |
|
"learning_rate": 1.9784369466497333e-05, |
|
"loss": 0.1029, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 16.2, |
|
"grad_norm": 1.6653599739074707, |
|
"learning_rate": 1.9769711532788083e-05, |
|
"loss": 0.0592, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"grad_norm": 0.7432021498680115, |
|
"learning_rate": 1.9754577437121733e-05, |
|
"loss": 0.0465, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 16.6, |
|
"grad_norm": 0.7289913892745972, |
|
"learning_rate": 1.9738967917112752e-05, |
|
"loss": 0.0338, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 16.8, |
|
"grad_norm": 0.05203216150403023, |
|
"learning_rate": 1.9722883733547128e-05, |
|
"loss": 0.1039, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.6313950419425964, |
|
"learning_rate": 1.9706325670345276e-05, |
|
"loss": 0.0514, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"grad_norm": 0.6369919180870056, |
|
"learning_rate": 1.968929453452383e-05, |
|
"loss": 0.0358, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 17.4, |
|
"grad_norm": 1.4422746896743774, |
|
"learning_rate": 1.967179115615633e-05, |
|
"loss": 0.0274, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 17.6, |
|
"grad_norm": 0.514399528503418, |
|
"learning_rate": 1.965381638833274e-05, |
|
"loss": 0.0279, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"grad_norm": 0.05869906023144722, |
|
"learning_rate": 1.963537110711789e-05, |
|
"loss": 0.0477, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.21828149259090424, |
|
"learning_rate": 1.9616456211508756e-05, |
|
"loss": 0.0343, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 18.2, |
|
"grad_norm": 0.0298184622079134, |
|
"learning_rate": 1.9597072623390668e-05, |
|
"loss": 0.0167, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 18.4, |
|
"grad_norm": 0.378787100315094, |
|
"learning_rate": 1.9577221287492368e-05, |
|
"loss": 0.0215, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 18.6, |
|
"grad_norm": 4.263318061828613, |
|
"learning_rate": 1.9556903171339963e-05, |
|
"loss": 0.0161, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 18.8, |
|
"grad_norm": 2.174684524536133, |
|
"learning_rate": 1.9536119265209763e-05, |
|
"loss": 0.0134, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.03400244563817978, |
|
"learning_rate": 1.951487058208003e-05, |
|
"loss": 0.015, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"grad_norm": 0.05419604852795601, |
|
"learning_rate": 1.9493158157581617e-05, |
|
"loss": 0.0112, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 19.4, |
|
"grad_norm": 0.058521125465631485, |
|
"learning_rate": 1.9470983049947446e-05, |
|
"loss": 0.0033, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 19.6, |
|
"grad_norm": 0.016082677990198135, |
|
"learning_rate": 1.9448346339960984e-05, |
|
"loss": 0.0192, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 19.8, |
|
"grad_norm": 6.2492570877075195, |
|
"learning_rate": 1.9425249130903544e-05, |
|
"loss": 0.0519, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 2.238447666168213, |
|
"learning_rate": 1.9401692548500504e-05, |
|
"loss": 0.0251, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 20.2, |
|
"grad_norm": 0.035032495856285095, |
|
"learning_rate": 1.937767774086646e-05, |
|
"loss": 0.0127, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 20.4, |
|
"grad_norm": 0.01225972082465887, |
|
"learning_rate": 1.935320587844926e-05, |
|
"loss": 0.004, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 20.6, |
|
"grad_norm": 0.004545880481600761, |
|
"learning_rate": 1.9328278153972947e-05, |
|
"loss": 0.0356, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 20.8, |
|
"grad_norm": 3.91441011428833, |
|
"learning_rate": 1.9302895782379648e-05, |
|
"loss": 0.0355, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 0.48417937755584717, |
|
"learning_rate": 1.9277060000770342e-05, |
|
"loss": 0.0043, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 21.2, |
|
"grad_norm": 3.8912832736968994, |
|
"learning_rate": 1.925077206834458e-05, |
|
"loss": 0.0242, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 21.4, |
|
"grad_norm": 0.013348647393286228, |
|
"learning_rate": 1.9224033266339103e-05, |
|
"loss": 0.0076, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 21.6, |
|
"grad_norm": 0.11039220541715622, |
|
"learning_rate": 1.9196844897965393e-05, |
|
"loss": 0.0145, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 21.8, |
|
"grad_norm": 3.4672281742095947, |
|
"learning_rate": 1.9169208288346168e-05, |
|
"loss": 0.0033, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.006285279057919979, |
|
"learning_rate": 1.914112478445079e-05, |
|
"loss": 0.0107, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 22.2, |
|
"grad_norm": 0.04273529723286629, |
|
"learning_rate": 1.9112595755029625e-05, |
|
"loss": 0.0483, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 22.4, |
|
"grad_norm": 0.007396802771836519, |
|
"learning_rate": 1.9083622590547313e-05, |
|
"loss": 0.0021, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 22.6, |
|
"grad_norm": 0.2701977491378784, |
|
"learning_rate": 1.905420670311502e-05, |
|
"loss": 0.0027, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 22.8, |
|
"grad_norm": 0.016060367226600647, |
|
"learning_rate": 1.9024349526421596e-05, |
|
"loss": 0.0217, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.017666257917881012, |
|
"learning_rate": 1.899405251566371e-05, |
|
"loss": 0.0458, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 23.2, |
|
"grad_norm": 0.043862633407115936, |
|
"learning_rate": 1.896331714747493e-05, |
|
"loss": 0.0011, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 23.4, |
|
"grad_norm": 0.031046241521835327, |
|
"learning_rate": 1.893214491985374e-05, |
|
"loss": 0.0128, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 23.6, |
|
"grad_norm": 0.005483916029334068, |
|
"learning_rate": 1.8900537352090523e-05, |
|
"loss": 0.0108, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 23.8, |
|
"grad_norm": 0.03962108865380287, |
|
"learning_rate": 1.886849598469356e-05, |
|
"loss": 0.0007, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.06565005332231522, |
|
"learning_rate": 1.8836022379313884e-05, |
|
"loss": 0.0173, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 24.2, |
|
"grad_norm": 0.006141431163996458, |
|
"learning_rate": 1.8803118118669203e-05, |
|
"loss": 0.001, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 24.4, |
|
"grad_norm": 0.008573741652071476, |
|
"learning_rate": 1.8769784806466768e-05, |
|
"loss": 0.0003, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 24.6, |
|
"grad_norm": 0.016578705981373787, |
|
"learning_rate": 1.8736024067325188e-05, |
|
"loss": 0.0043, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 24.8, |
|
"grad_norm": 0.006640794221311808, |
|
"learning_rate": 1.870183754669526e-05, |
|
"loss": 0.0002, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.8556722402572632, |
|
"learning_rate": 1.8667226910779767e-05, |
|
"loss": 0.0041, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 25.2, |
|
"grad_norm": 0.0022739043924957514, |
|
"learning_rate": 1.863219384645227e-05, |
|
"loss": 0.0003, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 25.4, |
|
"grad_norm": 0.012654612772166729, |
|
"learning_rate": 1.8596740061174912e-05, |
|
"loss": 0.0005, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 25.6, |
|
"grad_norm": 0.004489220213145018, |
|
"learning_rate": 1.8560867282915164e-05, |
|
"loss": 0.0041, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 25.8, |
|
"grad_norm": 0.006669621914625168, |
|
"learning_rate": 1.8524577260061628e-05, |
|
"loss": 0.0044, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 0.12849736213684082, |
|
"learning_rate": 1.848787176133882e-05, |
|
"loss": 0.0006, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 26.2, |
|
"grad_norm": 0.003363212803378701, |
|
"learning_rate": 1.8450752575720967e-05, |
|
"loss": 0.0002, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 26.4, |
|
"grad_norm": 0.005168403964489698, |
|
"learning_rate": 1.8413221512344805e-05, |
|
"loss": 0.0001, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 26.6, |
|
"grad_norm": 0.006153238005936146, |
|
"learning_rate": 1.837528040042142e-05, |
|
"loss": 0.0005, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 26.8, |
|
"grad_norm": 0.005247591994702816, |
|
"learning_rate": 1.8336931089147076e-05, |
|
"loss": 0.0002, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.008389480412006378, |
|
"learning_rate": 1.82981754476131e-05, |
|
"loss": 0.0061, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 27.2, |
|
"grad_norm": 0.001621560426428914, |
|
"learning_rate": 1.8259015364714786e-05, |
|
"loss": 0.0002, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 27.4, |
|
"grad_norm": 0.012759492732584476, |
|
"learning_rate": 1.8219452749059332e-05, |
|
"loss": 0.0009, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 27.6, |
|
"grad_norm": 0.0008200361044146121, |
|
"learning_rate": 1.8179489528872808e-05, |
|
"loss": 0.0002, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 27.8, |
|
"grad_norm": 0.010921397246420383, |
|
"learning_rate": 1.8139127651906183e-05, |
|
"loss": 0.0001, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.009295720607042313, |
|
"learning_rate": 1.80983690853404e-05, |
|
"loss": 0.0001, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 28.2, |
|
"grad_norm": 0.003324209712445736, |
|
"learning_rate": 1.8057215815690494e-05, |
|
"loss": 0.0001, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 28.4, |
|
"grad_norm": 0.19921115040779114, |
|
"learning_rate": 1.8015669848708768e-05, |
|
"loss": 0.0003, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 28.6, |
|
"grad_norm": 0.005366879981011152, |
|
"learning_rate": 1.7973733209287036e-05, |
|
"loss": 0.0001, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 28.8, |
|
"grad_norm": 0.0017453564796596766, |
|
"learning_rate": 1.793140794135795e-05, |
|
"loss": 0.0029, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"grad_norm": 0.013708272948861122, |
|
"learning_rate": 1.7888696107795343e-05, |
|
"loss": 0.0001, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 29.2, |
|
"grad_norm": 0.0043100654147565365, |
|
"learning_rate": 1.7845599790313735e-05, |
|
"loss": 0.0003, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 29.4, |
|
"grad_norm": 0.02302490547299385, |
|
"learning_rate": 1.780212108936684e-05, |
|
"loss": 0.0011, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 29.6, |
|
"grad_norm": 0.004419188946485519, |
|
"learning_rate": 1.7758262124045195e-05, |
|
"loss": 0.0001, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 29.8, |
|
"grad_norm": 0.0013392951805144548, |
|
"learning_rate": 1.7714025031972904e-05, |
|
"loss": 0.0003, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.0014274229761213064, |
|
"learning_rate": 1.7669411969203417e-05, |
|
"loss": 0.0002, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 30.2, |
|
"grad_norm": 0.001878496608696878, |
|
"learning_rate": 1.762442511011448e-05, |
|
"loss": 0.0001, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 30.4, |
|
"grad_norm": 0.005069437436759472, |
|
"learning_rate": 1.7579066647302134e-05, |
|
"loss": 0.0001, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 30.6, |
|
"grad_norm": 0.0014868683647364378, |
|
"learning_rate": 1.7533338791473872e-05, |
|
"loss": 0.0001, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 30.8, |
|
"grad_norm": 0.003168011549860239, |
|
"learning_rate": 1.7487243771340862e-05, |
|
"loss": 0.0001, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"grad_norm": 0.004825201351195574, |
|
"learning_rate": 1.7440783833509366e-05, |
|
"loss": 0.0001, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 31.2, |
|
"grad_norm": 0.006287866272032261, |
|
"learning_rate": 1.7393961242371203e-05, |
|
"loss": 0.0001, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 31.4, |
|
"grad_norm": 0.005126502364873886, |
|
"learning_rate": 1.7346778279993417e-05, |
|
"loss": 0.0001, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 31.6, |
|
"grad_norm": 0.0005843121325597167, |
|
"learning_rate": 1.7299237246007018e-05, |
|
"loss": 0.0001, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 31.8, |
|
"grad_norm": 0.0023757207673043013, |
|
"learning_rate": 1.7251340457494934e-05, |
|
"loss": 0.0001, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 0.004019857384264469, |
|
"learning_rate": 1.720309024887907e-05, |
|
"loss": 0.0001, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 32.2, |
|
"grad_norm": 0.006111355032771826, |
|
"learning_rate": 1.715448897180652e-05, |
|
"loss": 0.0001, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 32.4, |
|
"grad_norm": 0.000667453627102077, |
|
"learning_rate": 1.710553899503496e-05, |
|
"loss": 0.0001, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 32.6, |
|
"grad_norm": 0.0010104542598128319, |
|
"learning_rate": 1.705624270431721e-05, |
|
"loss": 0.0001, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 32.8, |
|
"grad_norm": 0.004804989788681269, |
|
"learning_rate": 1.700660250228492e-05, |
|
"loss": 0.0001, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"grad_norm": 0.004056953825056553, |
|
"learning_rate": 1.695662080833151e-05, |
|
"loss": 0.0001, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 33.2, |
|
"grad_norm": 0.0015452856896445155, |
|
"learning_rate": 1.690630005849423e-05, |
|
"loss": 0.0001, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 33.4, |
|
"grad_norm": 0.0012899715220555663, |
|
"learning_rate": 1.6855642705335438e-05, |
|
"loss": 0.0001, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 33.6, |
|
"grad_norm": 0.0010839162860065699, |
|
"learning_rate": 1.6804651217823055e-05, |
|
"loss": 0.0001, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 33.8, |
|
"grad_norm": 0.0011267494410276413, |
|
"learning_rate": 1.6753328081210244e-05, |
|
"loss": 0.0001, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"grad_norm": 0.0027927302289754152, |
|
"learning_rate": 1.6701675796914284e-05, |
|
"loss": 0.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 34.2, |
|
"grad_norm": 0.0011741920607164502, |
|
"learning_rate": 1.6649696882394635e-05, |
|
"loss": 0.0001, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 34.4, |
|
"grad_norm": 0.0019491278799250722, |
|
"learning_rate": 1.6597393871030264e-05, |
|
"loss": 0.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 34.6, |
|
"grad_norm": 0.0015785045688971877, |
|
"learning_rate": 1.654476931199615e-05, |
|
"loss": 0.0, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 34.8, |
|
"grad_norm": 0.0008629497606307268, |
|
"learning_rate": 1.649182577013906e-05, |
|
"loss": 0.0001, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"grad_norm": 0.003077285597100854, |
|
"learning_rate": 1.643856582585254e-05, |
|
"loss": 0.0001, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 35.2, |
|
"grad_norm": 0.0013041843194514513, |
|
"learning_rate": 1.6384992074951124e-05, |
|
"loss": 0.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 35.4, |
|
"grad_norm": 0.0008168126805685461, |
|
"learning_rate": 1.6331107128543856e-05, |
|
"loss": 0.0001, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 35.6, |
|
"grad_norm": 0.0013320008292794228, |
|
"learning_rate": 1.6276913612907005e-05, |
|
"loss": 0.0001, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 35.8, |
|
"grad_norm": 0.0011947667226195335, |
|
"learning_rate": 1.6222414169356066e-05, |
|
"loss": 0.0001, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 0.0013605114072561264, |
|
"learning_rate": 1.6167611454117027e-05, |
|
"loss": 0.0001, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 36.2, |
|
"grad_norm": 0.00024013180518522859, |
|
"learning_rate": 1.611250813819692e-05, |
|
"loss": 0.0, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 36.4, |
|
"grad_norm": 0.002005120040848851, |
|
"learning_rate": 1.6057106907253617e-05, |
|
"loss": 0.0, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 36.6, |
|
"grad_norm": 0.0015634404262527823, |
|
"learning_rate": 1.6001410461464955e-05, |
|
"loss": 0.0001, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 36.8, |
|
"grad_norm": 0.0009015756659209728, |
|
"learning_rate": 1.5945421515397135e-05, |
|
"loss": 0.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"grad_norm": 0.001351571292616427, |
|
"learning_rate": 1.5889142797872387e-05, |
|
"loss": 0.0, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 37.2, |
|
"grad_norm": 0.0014051242033019662, |
|
"learning_rate": 1.5832577051836016e-05, |
|
"loss": 0.0, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 37.4, |
|
"grad_norm": 0.0011573507217690349, |
|
"learning_rate": 1.5775727034222675e-05, |
|
"loss": 0.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 37.6, |
|
"grad_norm": 0.0010998898651450872, |
|
"learning_rate": 1.5718595515822027e-05, |
|
"loss": 0.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 37.8, |
|
"grad_norm": 0.001832842011936009, |
|
"learning_rate": 1.5661185281143666e-05, |
|
"loss": 0.0001, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"grad_norm": 0.0008399503421969712, |
|
"learning_rate": 1.5603499128281447e-05, |
|
"loss": 0.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 38.2, |
|
"grad_norm": 0.0016098183114081621, |
|
"learning_rate": 1.5545539868777075e-05, |
|
"loss": 0.0, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 38.4, |
|
"grad_norm": 0.0031196950003504753, |
|
"learning_rate": 1.5487310327483087e-05, |
|
"loss": 0.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 38.6, |
|
"grad_norm": 0.0025677145458757877, |
|
"learning_rate": 1.5428813342425177e-05, |
|
"loss": 0.0, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 38.8, |
|
"grad_norm": 0.00022408693621400744, |
|
"learning_rate": 1.5370051764663872e-05, |
|
"loss": 0.0, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"grad_norm": 0.001144173787906766, |
|
"learning_rate": 1.5311028458155567e-05, |
|
"loss": 0.0, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 39.2, |
|
"grad_norm": 0.0006699798977933824, |
|
"learning_rate": 1.5251746299612959e-05, |
|
"loss": 0.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 39.4, |
|
"grad_norm": 0.0007150991004891694, |
|
"learning_rate": 1.5192208178364815e-05, |
|
"loss": 0.0, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 39.6, |
|
"grad_norm": 0.0018977021099999547, |
|
"learning_rate": 1.5132416996215171e-05, |
|
"loss": 0.0, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 39.8, |
|
"grad_norm": 0.0005208357470110059, |
|
"learning_rate": 1.5072375667301893e-05, |
|
"loss": 0.0, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.0018139018211513758, |
|
"learning_rate": 1.5012087117954643e-05, |
|
"loss": 0.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 40.2, |
|
"grad_norm": 0.0009972749976441264, |
|
"learning_rate": 1.4951554286552266e-05, |
|
"loss": 0.0, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 40.4, |
|
"grad_norm": 0.0007164340931922197, |
|
"learning_rate": 1.4890780123379565e-05, |
|
"loss": 0.0, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 40.6, |
|
"grad_norm": 0.0004905250389128923, |
|
"learning_rate": 1.4829767590483508e-05, |
|
"loss": 0.0, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 40.8, |
|
"grad_norm": 0.001772880437783897, |
|
"learning_rate": 1.4768519661528879e-05, |
|
"loss": 0.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"grad_norm": 0.0008068050374276936, |
|
"learning_rate": 1.470703932165333e-05, |
|
"loss": 0.0, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 41.2, |
|
"grad_norm": 0.0004466329119168222, |
|
"learning_rate": 1.464532956732188e-05, |
|
"loss": 0.0, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 41.4, |
|
"grad_norm": 0.0007810869137756526, |
|
"learning_rate": 1.4583393406180898e-05, |
|
"loss": 0.0, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 41.6, |
|
"grad_norm": 0.000590488372836262, |
|
"learning_rate": 1.4521233856911507e-05, |
|
"loss": 0.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 41.8, |
|
"grad_norm": 0.0007621172117069364, |
|
"learning_rate": 1.4458853949082443e-05, |
|
"loss": 0.0, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"grad_norm": 0.0016841273754835129, |
|
"learning_rate": 1.43962567230024e-05, |
|
"loss": 0.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 42.2, |
|
"grad_norm": 0.0006738647352904081, |
|
"learning_rate": 1.4333445229571874e-05, |
|
"loss": 0.0, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 42.4, |
|
"grad_norm": 0.0022500527556985617, |
|
"learning_rate": 1.4270422530134433e-05, |
|
"loss": 0.0, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 42.6, |
|
"grad_norm": 0.000947895459830761, |
|
"learning_rate": 1.420719169632755e-05, |
|
"loss": 0.0, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 42.8, |
|
"grad_norm": 0.0006658710190095007, |
|
"learning_rate": 1.4143755809932843e-05, |
|
"loss": 0.0, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"grad_norm": 0.0005016120849177241, |
|
"learning_rate": 1.4080117962725929e-05, |
|
"loss": 0.0, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 43.2, |
|
"grad_norm": 0.000756660767365247, |
|
"learning_rate": 1.4016281256325702e-05, |
|
"loss": 0.0, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 43.4, |
|
"grad_norm": 0.0005679208552464843, |
|
"learning_rate": 1.3952248802043166e-05, |
|
"loss": 0.0, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 43.6, |
|
"grad_norm": 0.0014170549111440778, |
|
"learning_rate": 1.388802372072981e-05, |
|
"loss": 0.0, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 43.8, |
|
"grad_norm": 0.0006305701099336147, |
|
"learning_rate": 1.3823609142625492e-05, |
|
"loss": 0.0, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 0.0015643569640815258, |
|
"learning_rate": 1.3759008207205869e-05, |
|
"loss": 0.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 44.2, |
|
"grad_norm": 0.000876879261340946, |
|
"learning_rate": 1.3694224063029396e-05, |
|
"loss": 0.0, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 44.4, |
|
"grad_norm": 0.0015212204307317734, |
|
"learning_rate": 1.3629259867583864e-05, |
|
"loss": 0.0, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 44.6, |
|
"grad_norm": 0.000732761574909091, |
|
"learning_rate": 1.3564118787132507e-05, |
|
"loss": 0.0, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 44.8, |
|
"grad_norm": 0.0007465777453035116, |
|
"learning_rate": 1.349880399655969e-05, |
|
"loss": 0.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"grad_norm": 0.0010901193600147963, |
|
"learning_rate": 1.3433318679216154e-05, |
|
"loss": 0.0, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 45.2, |
|
"grad_norm": 0.0006251283921301365, |
|
"learning_rate": 1.3367666026763884e-05, |
|
"loss": 0.0, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 45.4, |
|
"grad_norm": 0.0005420687957666814, |
|
"learning_rate": 1.3301849239020537e-05, |
|
"loss": 0.0, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 45.6, |
|
"grad_norm": 0.0010151707101613283, |
|
"learning_rate": 1.3235871523803496e-05, |
|
"loss": 0.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 45.8, |
|
"grad_norm": 0.0009046559571288526, |
|
"learning_rate": 1.316973609677352e-05, |
|
"loss": 0.0, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"grad_norm": 0.0015346037689596415, |
|
"learning_rate": 1.3103446181278015e-05, |
|
"loss": 0.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 46.2, |
|
"grad_norm": 0.00042417130316607654, |
|
"learning_rate": 1.3037005008193944e-05, |
|
"loss": 0.0, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 46.4, |
|
"grad_norm": 0.000657816359307617, |
|
"learning_rate": 1.297041581577035e-05, |
|
"loss": 0.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 46.6, |
|
"grad_norm": 0.0005462078843265772, |
|
"learning_rate": 1.2903681849470528e-05, |
|
"loss": 0.0, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 46.8, |
|
"grad_norm": 0.0008825630648061633, |
|
"learning_rate": 1.2836806361813846e-05, |
|
"loss": 0.0, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"grad_norm": 0.0012507627252489328, |
|
"learning_rate": 1.2769792612217224e-05, |
|
"loss": 0.0, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 47.2, |
|
"grad_norm": 0.0010354206897318363, |
|
"learning_rate": 1.270264386683628e-05, |
|
"loss": 0.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 47.4, |
|
"grad_norm": 0.000858921732287854, |
|
"learning_rate": 1.263536339840613e-05, |
|
"loss": 0.0, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 47.6, |
|
"grad_norm": 0.0004237701359670609, |
|
"learning_rate": 1.256795448608188e-05, |
|
"loss": 0.0, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 47.8, |
|
"grad_norm": 0.001207050052471459, |
|
"learning_rate": 1.2500420415278822e-05, |
|
"loss": 0.0, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 0.0013371013337746263, |
|
"learning_rate": 1.2432764477512294e-05, |
|
"loss": 0.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 48.2, |
|
"grad_norm": 0.0011320069897919893, |
|
"learning_rate": 1.236498997023725e-05, |
|
"loss": 0.0, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 48.4, |
|
"grad_norm": 0.0005478229722939432, |
|
"learning_rate": 1.2297100196687557e-05, |
|
"loss": 0.0, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 48.6, |
|
"grad_norm": 0.0010939686326310039, |
|
"learning_rate": 1.2229098465715005e-05, |
|
"loss": 0.0, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 48.8, |
|
"grad_norm": 0.0005616277339868248, |
|
"learning_rate": 1.2160988091628023e-05, |
|
"loss": 0.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"grad_norm": 0.003290074411779642, |
|
"learning_rate": 1.2092772394030153e-05, |
|
"loss": 0.0, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 49.2, |
|
"grad_norm": 0.000533553131390363, |
|
"learning_rate": 1.202445469765826e-05, |
|
"loss": 0.0, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 49.4, |
|
"grad_norm": 0.0006738762021996081, |
|
"learning_rate": 1.1956038332220484e-05, |
|
"loss": 0.0, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 49.6, |
|
"grad_norm": 0.0005617621936835349, |
|
"learning_rate": 1.1887526632233954e-05, |
|
"loss": 0.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 49.8, |
|
"grad_norm": 0.0004871887213084847, |
|
"learning_rate": 1.181892293686227e-05, |
|
"loss": 0.0, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 0.0006230998551473022, |
|
"learning_rate": 1.1750230589752763e-05, |
|
"loss": 0.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 50.2, |
|
"grad_norm": 0.0003269360458943993, |
|
"learning_rate": 1.1681452938873516e-05, |
|
"loss": 0.0, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 50.4, |
|
"grad_norm": 0.0002793700259644538, |
|
"learning_rate": 1.1612593336350209e-05, |
|
"loss": 0.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 50.6, |
|
"grad_norm": 0.0009224207024089992, |
|
"learning_rate": 1.1543655138302714e-05, |
|
"loss": 0.0, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 50.8, |
|
"grad_norm": 0.0006281372043304145, |
|
"learning_rate": 1.1474641704681551e-05, |
|
"loss": 0.0, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"grad_norm": 0.0026524649001657963, |
|
"learning_rate": 1.140555639910411e-05, |
|
"loss": 0.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 51.2, |
|
"grad_norm": 0.00017076131189242005, |
|
"learning_rate": 1.1336402588690727e-05, |
|
"loss": 0.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 51.4, |
|
"grad_norm": 0.00033115188125520945, |
|
"learning_rate": 1.1267183643900548e-05, |
|
"loss": 0.0, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 51.6, |
|
"grad_norm": 0.0005470161559060216, |
|
"learning_rate": 1.1197902938367297e-05, |
|
"loss": 0.0, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 51.8, |
|
"grad_norm": 0.000852386059705168, |
|
"learning_rate": 1.1128563848734817e-05, |
|
"loss": 0.0, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"grad_norm": 0.0011009910376742482, |
|
"learning_rate": 1.105916975449252e-05, |
|
"loss": 0.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 52.2, |
|
"grad_norm": 0.0002607049827929586, |
|
"learning_rate": 1.0989724037810651e-05, |
|
"loss": 0.0, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 52.4, |
|
"grad_norm": 0.0009237987687811255, |
|
"learning_rate": 1.0920230083375474e-05, |
|
"loss": 0.0, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 52.6, |
|
"grad_norm": 0.0008139739511534572, |
|
"learning_rate": 1.0850691278224282e-05, |
|
"loss": 0.0, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 52.8, |
|
"grad_norm": 0.00035912173916585743, |
|
"learning_rate": 1.0781111011580336e-05, |
|
"loss": 0.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"grad_norm": 0.0004961323575116694, |
|
"learning_rate": 1.071149267468767e-05, |
|
"loss": 0.0, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 53.2, |
|
"grad_norm": 0.00048558932030573487, |
|
"learning_rate": 1.0641839660645806e-05, |
|
"loss": 0.0, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 53.4, |
|
"grad_norm": 0.0002258286694996059, |
|
"learning_rate": 1.0572155364244383e-05, |
|
"loss": 0.0, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 53.6, |
|
"grad_norm": 0.0002879631647374481, |
|
"learning_rate": 1.0502443181797696e-05, |
|
"loss": 0.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 53.8, |
|
"grad_norm": 0.0004035363963339478, |
|
"learning_rate": 1.0432706510979172e-05, |
|
"loss": 0.0, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"grad_norm": 0.0009778700768947601, |
|
"learning_rate": 1.036294875065576e-05, |
|
"loss": 0.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 54.2, |
|
"grad_norm": 0.0002560973516665399, |
|
"learning_rate": 1.0293173300722286e-05, |
|
"loss": 0.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 54.4, |
|
"grad_norm": 0.00044633098877966404, |
|
"learning_rate": 1.0223383561935738e-05, |
|
"loss": 0.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 54.6, |
|
"grad_norm": 0.00038400889025069773, |
|
"learning_rate": 1.0153582935749531e-05, |
|
"loss": 0.0, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 54.8, |
|
"grad_norm": 0.00039862972334958613, |
|
"learning_rate": 1.0083774824147707e-05, |
|
"loss": 0.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"grad_norm": 0.000944117724429816, |
|
"learning_rate": 1.0013962629479145e-05, |
|
"loss": 0.0, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 55.2, |
|
"grad_norm": 0.0006676441989839077, |
|
"learning_rate": 9.944149754291719e-06, |
|
"loss": 0.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 55.4, |
|
"grad_norm": 0.0003962357295677066, |
|
"learning_rate": 9.874339601166474e-06, |
|
"loss": 0.0, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 55.6, |
|
"grad_norm": 0.0003374406660441309, |
|
"learning_rate": 9.80453557255179e-06, |
|
"loss": 0.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 55.8, |
|
"grad_norm": 0.000748621707316488, |
|
"learning_rate": 9.73474107059754e-06, |
|
"loss": 0.0, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"grad_norm": 0.0006823341245763004, |
|
"learning_rate": 9.664959496989286e-06, |
|
"loss": 0.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 56.2, |
|
"grad_norm": 0.00021607705275528133, |
|
"learning_rate": 9.595194252782476e-06, |
|
"loss": 0.0, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 56.4, |
|
"grad_norm": 0.0003680960217025131, |
|
"learning_rate": 9.525448738236691e-06, |
|
"loss": 0.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 56.6, |
|
"grad_norm": 0.0009641498327255249, |
|
"learning_rate": 9.45572635264991e-06, |
|
"loss": 0.0, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 56.8, |
|
"grad_norm": 0.0002777623012661934, |
|
"learning_rate": 9.386030494192847e-06, |
|
"loss": 0.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"grad_norm": 0.0002459356328472495, |
|
"learning_rate": 9.316364559743315e-06, |
|
"loss": 0.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 57.2, |
|
"grad_norm": 0.0004620914114639163, |
|
"learning_rate": 9.246731944720675e-06, |
|
"loss": 0.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 57.4, |
|
"grad_norm": 0.0006009451462887228, |
|
"learning_rate": 9.177136042920344e-06, |
|
"loss": 0.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 57.6, |
|
"grad_norm": 0.00014700352039653808, |
|
"learning_rate": 9.107580246348395e-06, |
|
"loss": 0.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 57.8, |
|
"grad_norm": 0.000799352303147316, |
|
"learning_rate": 9.038067945056229e-06, |
|
"loss": 0.0, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"grad_norm": 0.0003337472153361887, |
|
"learning_rate": 8.968602526975329e-06, |
|
"loss": 0.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 58.2, |
|
"grad_norm": 0.0003684649127535522, |
|
"learning_rate": 8.89918737775218e-06, |
|
"loss": 0.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 58.4, |
|
"grad_norm": 0.0007150989840738475, |
|
"learning_rate": 8.829825880583228e-06, |
|
"loss": 0.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 58.6, |
|
"grad_norm": 0.0007625590660609305, |
|
"learning_rate": 8.760521416049983e-06, |
|
"loss": 0.0, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 58.8, |
|
"grad_norm": 0.0006776860682293773, |
|
"learning_rate": 8.69127736195428e-06, |
|
"loss": 0.0, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"grad_norm": 0.0006930575473234057, |
|
"learning_rate": 8.62209709315362e-06, |
|
"loss": 0.0, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 59.2, |
|
"grad_norm": 0.00034225499257445335, |
|
"learning_rate": 8.552983981396709e-06, |
|
"loss": 0.0, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 59.4, |
|
"grad_norm": 0.0020495259668678045, |
|
"learning_rate": 8.483941395159114e-06, |
|
"loss": 0.0, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 59.6, |
|
"grad_norm": 0.00036732573062181473, |
|
"learning_rate": 8.414972699479076e-06, |
|
"loss": 0.0, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 59.8, |
|
"grad_norm": 0.0004440142656676471, |
|
"learning_rate": 8.346081255793524e-06, |
|
"loss": 0.0, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 0.0002861985703930259, |
|
"learning_rate": 8.277270421774234e-06, |
|
"loss": 0.0, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.0184896961970176e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|