{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.004092034400369192, "eval_steps": 500, "global_step": 2250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.818681955719641e-05, "grad_norm": 1.608859896659851, "learning_rate": 0.0002, "loss": 3.3372, "step": 10 }, { "epoch": 3.637363911439282e-05, "grad_norm": 0.9594001770019531, "learning_rate": 0.0002, "loss": 0.1895, "step": 20 }, { "epoch": 5.4560458671589234e-05, "grad_norm": 0.7858404517173767, "learning_rate": 0.0002, "loss": 0.1819, "step": 30 }, { "epoch": 7.274727822878565e-05, "grad_norm": 0.05236278474330902, "learning_rate": 0.0002, "loss": 0.0967, "step": 40 }, { "epoch": 9.093409778598205e-05, "grad_norm": 0.00239331996999681, "learning_rate": 0.0002, "loss": 0.0001, "step": 50 }, { "epoch": 0.00010912091734317847, "grad_norm": 0.6015797853469849, "learning_rate": 0.0002, "loss": 0.9137, "step": 60 }, { "epoch": 0.00012730773690037487, "grad_norm": 0.2916141152381897, "learning_rate": 0.0002, "loss": 0.1584, "step": 70 }, { "epoch": 0.0001454945564575713, "grad_norm": 0.22034427523612976, "learning_rate": 0.0002, "loss": 0.1212, "step": 80 }, { "epoch": 0.0001636813760147677, "grad_norm": 0.05342680215835571, "learning_rate": 0.0002, "loss": 0.0933, "step": 90 }, { "epoch": 0.0001818681955719641, "grad_norm": 0.0009122246992774308, "learning_rate": 0.0002, "loss": 0.0002, "step": 100 }, { "epoch": 0.00020005501512916052, "grad_norm": 0.30845287442207336, "learning_rate": 0.0002, "loss": 0.8141, "step": 110 }, { "epoch": 0.00021824183468635694, "grad_norm": 0.1849660873413086, "learning_rate": 0.0002, "loss": 0.1604, "step": 120 }, { "epoch": 0.00023642865424355333, "grad_norm": 0.09605516493320465, "learning_rate": 0.0002, "loss": 0.1163, "step": 130 }, { "epoch": 0.00025461547380074975, "grad_norm": 0.4438878893852234, "learning_rate": 0.0002, "loss": 0.1043, "step": 140 }, { "epoch": 0.00027280229335794617, "grad_norm": 0.0012718827929347754, "learning_rate": 0.0002, "loss": 0.0018, "step": 150 }, { "epoch": 0.0002909891129151426, "grad_norm": 0.19092628359794617, "learning_rate": 0.0002, "loss": 0.6226, "step": 160 }, { "epoch": 0.000309175932472339, "grad_norm": 0.04102358967065811, "learning_rate": 0.0002, "loss": 0.1575, "step": 170 }, { "epoch": 0.0003273627520295354, "grad_norm": 0.06057624891400337, "learning_rate": 0.0002, "loss": 0.1119, "step": 180 }, { "epoch": 0.0003455495715867318, "grad_norm": 0.13942261040210724, "learning_rate": 0.0002, "loss": 0.0781, "step": 190 }, { "epoch": 0.0003637363911439282, "grad_norm": 0.005650315433740616, "learning_rate": 0.0002, "loss": 0.0013, "step": 200 }, { "epoch": 0.0003819232107011246, "grad_norm": 0.6245204210281372, "learning_rate": 0.0002, "loss": 0.7091, "step": 210 }, { "epoch": 0.00040011003025832104, "grad_norm": 0.058550119400024414, "learning_rate": 0.0002, "loss": 0.1411, "step": 220 }, { "epoch": 0.00041829684981551746, "grad_norm": 0.08625461906194687, "learning_rate": 0.0002, "loss": 0.1138, "step": 230 }, { "epoch": 0.0004364836693727139, "grad_norm": 0.06455521285533905, "learning_rate": 0.0002, "loss": 0.0805, "step": 240 }, { "epoch": 0.0004546704889299103, "grad_norm": 0.0020822149235755205, "learning_rate": 0.0002, "loss": 0.0013, "step": 250 }, { "epoch": 0.00047285730848710666, "grad_norm": 0.1977258324623108, "learning_rate": 0.0002, "loss": 0.5558, "step": 260 }, { "epoch": 0.0004910441280443031, "grad_norm": 0.20794034004211426, "learning_rate": 0.0002, "loss": 0.1263, "step": 270 }, { "epoch": 0.0005092309476014995, "grad_norm": 0.08760973066091537, "learning_rate": 0.0002, "loss": 0.1118, "step": 280 }, { "epoch": 0.0005274177671586959, "grad_norm": 0.299059122800827, "learning_rate": 0.0002, "loss": 0.0804, "step": 290 }, { "epoch": 0.0005456045867158923, "grad_norm": 0.002420844743028283, "learning_rate": 0.0002, "loss": 0.0022, "step": 300 }, { "epoch": 0.0005637914062730887, "grad_norm": 2.2061026096343994, "learning_rate": 0.0002, "loss": 0.5624, "step": 310 }, { "epoch": 0.0005819782258302852, "grad_norm": 0.7011717557907104, "learning_rate": 0.0002, "loss": 0.1177, "step": 320 }, { "epoch": 0.0006001650453874816, "grad_norm": 0.37657421827316284, "learning_rate": 0.0002, "loss": 0.1005, "step": 330 }, { "epoch": 0.000618351864944678, "grad_norm": 0.048011403530836105, "learning_rate": 0.0002, "loss": 0.079, "step": 340 }, { "epoch": 0.0006365386845018744, "grad_norm": 0.002076848642900586, "learning_rate": 0.0002, "loss": 0.0018, "step": 350 }, { "epoch": 0.0006547255040590708, "grad_norm": 0.721218466758728, "learning_rate": 0.0002, "loss": 0.5647, "step": 360 }, { "epoch": 0.0006729123236162671, "grad_norm": 0.09965512156486511, "learning_rate": 0.0002, "loss": 0.1403, "step": 370 }, { "epoch": 0.0006910991431734636, "grad_norm": 0.21733985841274261, "learning_rate": 0.0002, "loss": 0.1024, "step": 380 }, { "epoch": 0.00070928596273066, "grad_norm": 0.003134253202006221, "learning_rate": 0.0002, "loss": 0.0707, "step": 390 }, { "epoch": 0.0007274727822878564, "grad_norm": 0.0011866611894220114, "learning_rate": 0.0002, "loss": 0.0003, "step": 400 }, { "epoch": 0.0007456596018450528, "grad_norm": 0.37573525309562683, "learning_rate": 0.0002, "loss": 0.6122, "step": 410 }, { "epoch": 0.0007638464214022492, "grad_norm": 0.34029653668403625, "learning_rate": 0.0002, "loss": 0.1149, "step": 420 }, { "epoch": 0.0007820332409594457, "grad_norm": 0.35701191425323486, "learning_rate": 0.0002, "loss": 0.0972, "step": 430 }, { "epoch": 0.0008002200605166421, "grad_norm": 0.06324547529220581, "learning_rate": 0.0002, "loss": 0.0722, "step": 440 }, { "epoch": 0.0008184068800738385, "grad_norm": 0.0011920438846573234, "learning_rate": 0.0002, "loss": 0.0011, "step": 450 }, { "epoch": 0.0008365936996310349, "grad_norm": 0.861393392086029, "learning_rate": 0.0002, "loss": 0.4433, "step": 460 }, { "epoch": 0.0008547805191882313, "grad_norm": 0.16104361414909363, "learning_rate": 0.0002, "loss": 0.1176, "step": 470 }, { "epoch": 0.0008729673387454278, "grad_norm": 0.28712376952171326, "learning_rate": 0.0002, "loss": 0.0983, "step": 480 }, { "epoch": 0.0008911541583026242, "grad_norm": 0.07980292290449142, "learning_rate": 0.0002, "loss": 0.0721, "step": 490 }, { "epoch": 0.0009093409778598206, "grad_norm": 0.0018368292367085814, "learning_rate": 0.0002, "loss": 0.0019, "step": 500 }, { "epoch": 0.000927527797417017, "grad_norm": 0.05152284353971481, "learning_rate": 0.0002, "loss": 0.2823, "step": 510 }, { "epoch": 0.0009457146169742133, "grad_norm": 0.04693318158388138, "learning_rate": 0.0002, "loss": 0.1179, "step": 520 }, { "epoch": 0.0009639014365314097, "grad_norm": 0.10586889833211899, "learning_rate": 0.0002, "loss": 0.1094, "step": 530 }, { "epoch": 0.0009820882560886062, "grad_norm": 0.006325385998934507, "learning_rate": 0.0002, "loss": 0.0706, "step": 540 }, { "epoch": 0.0010002750756458027, "grad_norm": 4.665973028750159e-05, "learning_rate": 0.0002, "loss": 0.0002, "step": 550 }, { "epoch": 0.001018461895202999, "grad_norm": 0.293944776058197, "learning_rate": 0.0002, "loss": 0.732, "step": 560 }, { "epoch": 0.0010366487147601955, "grad_norm": 0.22614754736423492, "learning_rate": 0.0002, "loss": 0.1226, "step": 570 }, { "epoch": 0.0010548355343173918, "grad_norm": 0.10801248252391815, "learning_rate": 0.0002, "loss": 0.1065, "step": 580 }, { "epoch": 0.0010730223538745881, "grad_norm": 0.04501640051603317, "learning_rate": 0.0002, "loss": 0.0759, "step": 590 }, { "epoch": 0.0010912091734317847, "grad_norm": 0.00014656950952485204, "learning_rate": 0.0002, "loss": 0.0001, "step": 600 }, { "epoch": 0.001109395992988981, "grad_norm": 0.2490423321723938, "learning_rate": 0.0002, "loss": 0.4868, "step": 610 }, { "epoch": 0.0011275828125461775, "grad_norm": 0.026224857196211815, "learning_rate": 0.0002, "loss": 0.1242, "step": 620 }, { "epoch": 0.0011457696321033738, "grad_norm": 0.11845973134040833, "learning_rate": 0.0002, "loss": 0.1153, "step": 630 }, { "epoch": 0.0011639564516605703, "grad_norm": 0.8349707722663879, "learning_rate": 0.0002, "loss": 0.1229, "step": 640 }, { "epoch": 0.0011821432712177666, "grad_norm": 0.00733955716714263, "learning_rate": 0.0002, "loss": 0.0016, "step": 650 }, { "epoch": 0.0012003300907749632, "grad_norm": 0.3534531891345978, "learning_rate": 0.0002, "loss": 0.4961, "step": 660 }, { "epoch": 0.0012185169103321595, "grad_norm": 0.3938736915588379, "learning_rate": 0.0002, "loss": 0.126, "step": 670 }, { "epoch": 0.001236703729889356, "grad_norm": 0.16779105365276337, "learning_rate": 0.0002, "loss": 0.1217, "step": 680 }, { "epoch": 0.0012548905494465523, "grad_norm": 0.6998353600502014, "learning_rate": 0.0002, "loss": 0.1171, "step": 690 }, { "epoch": 0.0012730773690037488, "grad_norm": 0.0005113715888001025, "learning_rate": 0.0002, "loss": 0.0143, "step": 700 }, { "epoch": 0.0012912641885609452, "grad_norm": 0.4034057557582855, "learning_rate": 0.0002, "loss": 0.6274, "step": 710 }, { "epoch": 0.0013094510081181417, "grad_norm": 0.08985241502523422, "learning_rate": 0.0002, "loss": 0.1295, "step": 720 }, { "epoch": 0.001327637827675338, "grad_norm": 0.20418916642665863, "learning_rate": 0.0002, "loss": 0.1234, "step": 730 }, { "epoch": 0.0013458246472325343, "grad_norm": 1.0206961631774902, "learning_rate": 0.0002, "loss": 0.1107, "step": 740 }, { "epoch": 0.0013640114667897308, "grad_norm": 0.0008244478958658874, "learning_rate": 0.0002, "loss": 0.0087, "step": 750 }, { "epoch": 0.0013821982863469271, "grad_norm": 3.696362018585205, "learning_rate": 0.0002, "loss": 0.9078, "step": 760 }, { "epoch": 0.0014003851059041237, "grad_norm": 0.8782555460929871, "learning_rate": 0.0002, "loss": 0.1732, "step": 770 }, { "epoch": 0.00141857192546132, "grad_norm": 0.18350496888160706, "learning_rate": 0.0002, "loss": 0.1205, "step": 780 }, { "epoch": 0.0014367587450185165, "grad_norm": 0.634567141532898, "learning_rate": 0.0002, "loss": 0.0997, "step": 790 }, { "epoch": 0.0014549455645757128, "grad_norm": 0.01041293516755104, "learning_rate": 0.0002, "loss": 0.0065, "step": 800 }, { "epoch": 0.0014731323841329093, "grad_norm": 3.0739810466766357, "learning_rate": 0.0002, "loss": 0.7739, "step": 810 }, { "epoch": 0.0014913192036901056, "grad_norm": 0.4407779276371002, "learning_rate": 0.0002, "loss": 0.2848, "step": 820 }, { "epoch": 0.0015095060232473022, "grad_norm": 0.25743165612220764, "learning_rate": 0.0002, "loss": 0.1318, "step": 830 }, { "epoch": 0.0015276928428044985, "grad_norm": 2.0397753715515137, "learning_rate": 0.0002, "loss": 0.1385, "step": 840 }, { "epoch": 0.001545879662361695, "grad_norm": 0.060638878494501114, "learning_rate": 0.0002, "loss": 0.047, "step": 850 }, { "epoch": 0.0015640664819188913, "grad_norm": 2.5641930103302, "learning_rate": 0.0002, "loss": 0.5497, "step": 860 }, { "epoch": 0.0015822533014760878, "grad_norm": 0.9419782161712646, "learning_rate": 0.0002, "loss": 0.1626, "step": 870 }, { "epoch": 0.0016004401210332842, "grad_norm": 0.1152188628911972, "learning_rate": 0.0002, "loss": 0.1063, "step": 880 }, { "epoch": 0.0016186269405904805, "grad_norm": 0.6502537131309509, "learning_rate": 0.0002, "loss": 0.0871, "step": 890 }, { "epoch": 0.001636813760147677, "grad_norm": 0.023487605154514313, "learning_rate": 0.0002, "loss": 0.0094, "step": 900 }, { "epoch": 0.0016550005797048733, "grad_norm": 1.9080859422683716, "learning_rate": 0.0002, "loss": 0.5073, "step": 910 }, { "epoch": 0.0016731873992620698, "grad_norm": 0.44722509384155273, "learning_rate": 0.0002, "loss": 0.167, "step": 920 }, { "epoch": 0.0016913742188192661, "grad_norm": 0.24151289463043213, "learning_rate": 0.0002, "loss": 0.1237, "step": 930 }, { "epoch": 0.0017095610383764627, "grad_norm": 1.1394294500350952, "learning_rate": 0.0002, "loss": 0.1014, "step": 940 }, { "epoch": 0.001727747857933659, "grad_norm": 0.011057032272219658, "learning_rate": 0.0002, "loss": 0.0069, "step": 950 }, { "epoch": 0.0017459346774908555, "grad_norm": 4.32397985458374, "learning_rate": 0.0002, "loss": 0.7672, "step": 960 }, { "epoch": 0.0017641214970480518, "grad_norm": 0.9529788494110107, "learning_rate": 0.0002, "loss": 0.3286, "step": 970 }, { "epoch": 0.0017823083166052483, "grad_norm": 0.27676528692245483, "learning_rate": 0.0002, "loss": 0.126, "step": 980 }, { "epoch": 0.0018004951361624446, "grad_norm": 0.62413090467453, "learning_rate": 0.0002, "loss": 0.0844, "step": 990 }, { "epoch": 0.0018186819557196412, "grad_norm": 0.010768013074994087, "learning_rate": 0.0002, "loss": 0.0038, "step": 1000 }, { "epoch": 0.0018368687752768375, "grad_norm": 4.500253200531006, "learning_rate": 0.0002, "loss": 0.8415, "step": 1010 }, { "epoch": 0.001855055594834034, "grad_norm": 0.4661908447742462, "learning_rate": 0.0002, "loss": 0.2552, "step": 1020 }, { "epoch": 0.0018732424143912303, "grad_norm": 0.17337530851364136, "learning_rate": 0.0002, "loss": 0.1032, "step": 1030 }, { "epoch": 0.0018914292339484266, "grad_norm": 0.3994196355342865, "learning_rate": 0.0002, "loss": 0.0814, "step": 1040 }, { "epoch": 0.0019096160535056232, "grad_norm": 0.025604812428355217, "learning_rate": 0.0002, "loss": 0.0077, "step": 1050 }, { "epoch": 0.0019278028730628195, "grad_norm": 3.7293856143951416, "learning_rate": 0.0002, "loss": 0.6543, "step": 1060 }, { "epoch": 0.001945989692620016, "grad_norm": 1.2915587425231934, "learning_rate": 0.0002, "loss": 0.3782, "step": 1070 }, { "epoch": 0.0019641765121772123, "grad_norm": 1.0336438417434692, "learning_rate": 0.0002, "loss": 0.1515, "step": 1080 }, { "epoch": 0.0019823633317344086, "grad_norm": 1.6816803216934204, "learning_rate": 0.0002, "loss": 0.1173, "step": 1090 }, { "epoch": 0.0020005501512916054, "grad_norm": 0.015431606210768223, "learning_rate": 0.0002, "loss": 0.0165, "step": 1100 }, { "epoch": 0.0020187369708488017, "grad_norm": 3.059936046600342, "learning_rate": 0.0002, "loss": 0.6981, "step": 1110 }, { "epoch": 0.002036923790405998, "grad_norm": 0.5564419031143188, "learning_rate": 0.0002, "loss": 0.2193, "step": 1120 }, { "epoch": 0.0020551106099631943, "grad_norm": 0.11465179920196533, "learning_rate": 0.0002, "loss": 0.1157, "step": 1130 }, { "epoch": 0.002073297429520391, "grad_norm": 1.7084763050079346, "learning_rate": 0.0002, "loss": 0.0997, "step": 1140 }, { "epoch": 0.0020914842490775873, "grad_norm": 0.00997951254248619, "learning_rate": 0.0002, "loss": 0.0185, "step": 1150 }, { "epoch": 0.0021096710686347836, "grad_norm": 4.252767086029053, "learning_rate": 0.0002, "loss": 0.6719, "step": 1160 }, { "epoch": 0.00212785788819198, "grad_norm": 0.7261558175086975, "learning_rate": 0.0002, "loss": 0.1939, "step": 1170 }, { "epoch": 0.0021460447077491763, "grad_norm": 0.3190513253211975, "learning_rate": 0.0002, "loss": 0.0981, "step": 1180 }, { "epoch": 0.002164231527306373, "grad_norm": 0.5305098295211792, "learning_rate": 0.0002, "loss": 0.0756, "step": 1190 }, { "epoch": 0.0021824183468635693, "grad_norm": 0.03356161713600159, "learning_rate": 0.0002, "loss": 0.0064, "step": 1200 }, { "epoch": 0.0022006051664207656, "grad_norm": 3.8724617958068848, "learning_rate": 0.0002, "loss": 0.6282, "step": 1210 }, { "epoch": 0.002218791985977962, "grad_norm": 1.3044495582580566, "learning_rate": 0.0002, "loss": 0.3827, "step": 1220 }, { "epoch": 0.0022369788055351587, "grad_norm": 0.18937312066555023, "learning_rate": 0.0002, "loss": 0.1412, "step": 1230 }, { "epoch": 0.002255165625092355, "grad_norm": 2.488002061843872, "learning_rate": 0.0002, "loss": 0.1299, "step": 1240 }, { "epoch": 0.0022733524446495513, "grad_norm": 0.05979600548744202, "learning_rate": 0.0002, "loss": 0.0276, "step": 1250 }, { "epoch": 0.0022915392642067476, "grad_norm": 4.089362144470215, "learning_rate": 0.0002, "loss": 0.6152, "step": 1260 }, { "epoch": 0.0023097260837639444, "grad_norm": 0.2717827260494232, "learning_rate": 0.0002, "loss": 0.277, "step": 1270 }, { "epoch": 0.0023279129033211407, "grad_norm": 0.40145063400268555, "learning_rate": 0.0002, "loss": 0.113, "step": 1280 }, { "epoch": 0.002346099722878337, "grad_norm": 0.8193599581718445, "learning_rate": 0.0002, "loss": 0.0952, "step": 1290 }, { "epoch": 0.0023642865424355333, "grad_norm": 0.03877554461359978, "learning_rate": 0.0002, "loss": 0.0187, "step": 1300 }, { "epoch": 0.00238247336199273, "grad_norm": 3.7022697925567627, "learning_rate": 0.0002, "loss": 0.6711, "step": 1310 }, { "epoch": 0.0024006601815499263, "grad_norm": 1.0773606300354004, "learning_rate": 0.0002, "loss": 0.3495, "step": 1320 }, { "epoch": 0.0024188470011071227, "grad_norm": 0.46499383449554443, "learning_rate": 0.0002, "loss": 0.1181, "step": 1330 }, { "epoch": 0.002437033820664319, "grad_norm": 0.7035688757896423, "learning_rate": 0.0002, "loss": 0.08, "step": 1340 }, { "epoch": 0.0024552206402215153, "grad_norm": 0.022035669535398483, "learning_rate": 0.0002, "loss": 0.0102, "step": 1350 }, { "epoch": 0.002473407459778712, "grad_norm": 3.3636128902435303, "learning_rate": 0.0002, "loss": 0.6578, "step": 1360 }, { "epoch": 0.0024915942793359083, "grad_norm": 0.21912692487239838, "learning_rate": 0.0002, "loss": 0.2275, "step": 1370 }, { "epoch": 0.0025097810988931046, "grad_norm": 0.1632055938243866, "learning_rate": 0.0002, "loss": 0.1015, "step": 1380 }, { "epoch": 0.002527967918450301, "grad_norm": 0.44282346963882446, "learning_rate": 0.0002, "loss": 0.103, "step": 1390 }, { "epoch": 0.0025461547380074977, "grad_norm": 0.17366968095302582, "learning_rate": 0.0002, "loss": 0.0322, "step": 1400 }, { "epoch": 0.002564341557564694, "grad_norm": 1.816606879234314, "learning_rate": 0.0002, "loss": 0.3914, "step": 1410 }, { "epoch": 0.0025825283771218903, "grad_norm": 0.6741718649864197, "learning_rate": 0.0002, "loss": 0.2593, "step": 1420 }, { "epoch": 0.0026007151966790866, "grad_norm": 0.580172598361969, "learning_rate": 0.0002, "loss": 0.151, "step": 1430 }, { "epoch": 0.0026189020162362834, "grad_norm": 1.500544548034668, "learning_rate": 0.0002, "loss": 0.1266, "step": 1440 }, { "epoch": 0.0026370888357934797, "grad_norm": 0.03482064977288246, "learning_rate": 0.0002, "loss": 0.0209, "step": 1450 }, { "epoch": 0.002655275655350676, "grad_norm": 1.9266266822814941, "learning_rate": 0.0002, "loss": 0.5335, "step": 1460 }, { "epoch": 0.0026734624749078723, "grad_norm": 0.6076328158378601, "learning_rate": 0.0002, "loss": 0.1771, "step": 1470 }, { "epoch": 0.0026916492944650686, "grad_norm": 0.047803062945604324, "learning_rate": 0.0002, "loss": 0.1312, "step": 1480 }, { "epoch": 0.0027098361140222653, "grad_norm": 2.2670884132385254, "learning_rate": 0.0002, "loss": 0.1303, "step": 1490 }, { "epoch": 0.0027280229335794617, "grad_norm": 0.4342607259750366, "learning_rate": 0.0002, "loss": 0.0709, "step": 1500 }, { "epoch": 0.002746209753136658, "grad_norm": 1.5955005884170532, "learning_rate": 0.0002, "loss": 0.2632, "step": 1510 }, { "epoch": 0.0027643965726938543, "grad_norm": 0.20393006503582, "learning_rate": 0.0002, "loss": 0.1324, "step": 1520 }, { "epoch": 0.002782583392251051, "grad_norm": 0.2312391996383667, "learning_rate": 0.0002, "loss": 0.1056, "step": 1530 }, { "epoch": 0.0028007702118082473, "grad_norm": 1.2107295989990234, "learning_rate": 0.0002, "loss": 0.1038, "step": 1540 }, { "epoch": 0.0028189570313654436, "grad_norm": 0.07030847668647766, "learning_rate": 0.0002, "loss": 0.032, "step": 1550 }, { "epoch": 0.00283714385092264, "grad_norm": 3.563960552215576, "learning_rate": 0.0002, "loss": 0.5389, "step": 1560 }, { "epoch": 0.0028553306704798367, "grad_norm": 0.6965789794921875, "learning_rate": 0.0002, "loss": 0.2889, "step": 1570 }, { "epoch": 0.002873517490037033, "grad_norm": 0.5975427031517029, "learning_rate": 0.0002, "loss": 0.1235, "step": 1580 }, { "epoch": 0.0028917043095942293, "grad_norm": 1.371771216392517, "learning_rate": 0.0002, "loss": 0.0971, "step": 1590 }, { "epoch": 0.0029098911291514256, "grad_norm": 0.01906588114798069, "learning_rate": 0.0002, "loss": 0.0172, "step": 1600 }, { "epoch": 0.002928077948708622, "grad_norm": 3.8812315464019775, "learning_rate": 0.0002, "loss": 0.621, "step": 1610 }, { "epoch": 0.0029462647682658187, "grad_norm": 0.41589802503585815, "learning_rate": 0.0002, "loss": 0.2029, "step": 1620 }, { "epoch": 0.002964451587823015, "grad_norm": 0.24198026955127716, "learning_rate": 0.0002, "loss": 0.1042, "step": 1630 }, { "epoch": 0.0029826384073802113, "grad_norm": 0.8711221814155579, "learning_rate": 0.0002, "loss": 0.0904, "step": 1640 }, { "epoch": 0.0030008252269374076, "grad_norm": 0.06849978119134903, "learning_rate": 0.0002, "loss": 0.0226, "step": 1650 }, { "epoch": 0.0030190120464946043, "grad_norm": 3.912189483642578, "learning_rate": 0.0002, "loss": 0.5554, "step": 1660 }, { "epoch": 0.0030371988660518007, "grad_norm": 1.076832890510559, "learning_rate": 0.0002, "loss": 0.2968, "step": 1670 }, { "epoch": 0.003055385685608997, "grad_norm": 0.3734837472438812, "learning_rate": 0.0002, "loss": 0.1151, "step": 1680 }, { "epoch": 0.0030735725051661933, "grad_norm": 0.8407588005065918, "learning_rate": 0.0002, "loss": 0.0897, "step": 1690 }, { "epoch": 0.00309175932472339, "grad_norm": 0.023632407188415527, "learning_rate": 0.0002, "loss": 0.0113, "step": 1700 }, { "epoch": 0.0031099461442805863, "grad_norm": 4.268885612487793, "learning_rate": 0.0002, "loss": 0.6717, "step": 1710 }, { "epoch": 0.0031281329638377826, "grad_norm": 0.3088800013065338, "learning_rate": 0.0002, "loss": 0.2635, "step": 1720 }, { "epoch": 0.003146319783394979, "grad_norm": 0.05659230053424835, "learning_rate": 0.0002, "loss": 0.0987, "step": 1730 }, { "epoch": 0.0031645066029521757, "grad_norm": 0.5756633281707764, "learning_rate": 0.0002, "loss": 0.0949, "step": 1740 }, { "epoch": 0.003182693422509372, "grad_norm": 0.23241274058818817, "learning_rate": 0.0002, "loss": 0.044, "step": 1750 }, { "epoch": 0.0032008802420665683, "grad_norm": 2.2380006313323975, "learning_rate": 0.0002, "loss": 0.3458, "step": 1760 }, { "epoch": 0.0032190670616237646, "grad_norm": 0.4196106493473053, "learning_rate": 0.0002, "loss": 0.2116, "step": 1770 }, { "epoch": 0.003237253881180961, "grad_norm": 0.3544403612613678, "learning_rate": 0.0002, "loss": 0.1011, "step": 1780 }, { "epoch": 0.0032554407007381577, "grad_norm": 0.6422521471977234, "learning_rate": 0.0002, "loss": 0.084, "step": 1790 }, { "epoch": 0.003273627520295354, "grad_norm": 0.03676289692521095, "learning_rate": 0.0002, "loss": 0.0115, "step": 1800 }, { "epoch": 0.0032918143398525503, "grad_norm": 3.173424243927002, "learning_rate": 0.0002, "loss": 0.5644, "step": 1810 }, { "epoch": 0.0033100011594097466, "grad_norm": 0.14629468321800232, "learning_rate": 0.0002, "loss": 0.2249, "step": 1820 }, { "epoch": 0.0033281879789669433, "grad_norm": 0.27524232864379883, "learning_rate": 0.0002, "loss": 0.0965, "step": 1830 }, { "epoch": 0.0033463747985241397, "grad_norm": 0.5685613751411438, "learning_rate": 0.0002, "loss": 0.0949, "step": 1840 }, { "epoch": 0.003364561618081336, "grad_norm": 0.19684627652168274, "learning_rate": 0.0002, "loss": 0.0423, "step": 1850 }, { "epoch": 0.0033827484376385323, "grad_norm": 2.0270469188690186, "learning_rate": 0.0002, "loss": 0.3322, "step": 1860 }, { "epoch": 0.003400935257195729, "grad_norm": 0.3960348963737488, "learning_rate": 0.0002, "loss": 0.1933, "step": 1870 }, { "epoch": 0.0034191220767529253, "grad_norm": 0.8636507391929626, "learning_rate": 0.0002, "loss": 0.1056, "step": 1880 }, { "epoch": 0.0034373088963101216, "grad_norm": 0.7978588342666626, "learning_rate": 0.0002, "loss": 0.0812, "step": 1890 }, { "epoch": 0.003455495715867318, "grad_norm": 0.020584411919116974, "learning_rate": 0.0002, "loss": 0.0098, "step": 1900 }, { "epoch": 0.0034736825354245143, "grad_norm": 2.648928165435791, "learning_rate": 0.0002, "loss": 0.5485, "step": 1910 }, { "epoch": 0.003491869354981711, "grad_norm": 0.5433089137077332, "learning_rate": 0.0002, "loss": 0.1546, "step": 1920 }, { "epoch": 0.0035100561745389073, "grad_norm": 0.2638677656650543, "learning_rate": 0.0002, "loss": 0.0935, "step": 1930 }, { "epoch": 0.0035282429940961036, "grad_norm": 0.4292812943458557, "learning_rate": 0.0002, "loss": 0.0879, "step": 1940 }, { "epoch": 0.0035464298136533, "grad_norm": 0.09974557906389236, "learning_rate": 0.0002, "loss": 0.0234, "step": 1950 }, { "epoch": 0.0035646166332104967, "grad_norm": 1.626259446144104, "learning_rate": 0.0002, "loss": 0.4046, "step": 1960 }, { "epoch": 0.003582803452767693, "grad_norm": 0.7747110724449158, "learning_rate": 0.0002, "loss": 0.2436, "step": 1970 }, { "epoch": 0.0036009902723248893, "grad_norm": 1.130542516708374, "learning_rate": 0.0002, "loss": 0.1371, "step": 1980 }, { "epoch": 0.0036191770918820856, "grad_norm": 2.542160987854004, "learning_rate": 0.0002, "loss": 0.1204, "step": 1990 }, { "epoch": 0.0036373639114392823, "grad_norm": 0.1563112586736679, "learning_rate": 0.0002, "loss": 0.0343, "step": 2000 }, { "epoch": 0.0036555507309964787, "grad_norm": 3.1544902324676514, "learning_rate": 0.0002, "loss": 0.4769, "step": 2010 }, { "epoch": 0.003673737550553675, "grad_norm": 1.0212864875793457, "learning_rate": 0.0002, "loss": 0.2462, "step": 2020 }, { "epoch": 0.0036919243701108713, "grad_norm": 0.3565104305744171, "learning_rate": 0.0002, "loss": 0.1209, "step": 2030 }, { "epoch": 0.003710111189668068, "grad_norm": 1.3275020122528076, "learning_rate": 0.0002, "loss": 0.1064, "step": 2040 }, { "epoch": 0.0037282980092252643, "grad_norm": 0.11180760711431503, "learning_rate": 0.0002, "loss": 0.0326, "step": 2050 }, { "epoch": 0.0037464848287824606, "grad_norm": 1.9683802127838135, "learning_rate": 0.0002, "loss": 0.3878, "step": 2060 }, { "epoch": 0.003764671648339657, "grad_norm": 0.7875238060951233, "learning_rate": 0.0002, "loss": 0.1145, "step": 2070 }, { "epoch": 0.0037828584678968533, "grad_norm": 0.4307851195335388, "learning_rate": 0.0002, "loss": 0.0891, "step": 2080 }, { "epoch": 0.00380104528745405, "grad_norm": 0.6907076239585876, "learning_rate": 0.0002, "loss": 0.0801, "step": 2090 }, { "epoch": 0.0038192321070112463, "grad_norm": 0.04466943070292473, "learning_rate": 0.0002, "loss": 0.0148, "step": 2100 }, { "epoch": 0.0038374189265684426, "grad_norm": 2.8212766647338867, "learning_rate": 0.0002, "loss": 0.501, "step": 2110 }, { "epoch": 0.003855605746125639, "grad_norm": 0.4052332639694214, "learning_rate": 0.0002, "loss": 0.2379, "step": 2120 }, { "epoch": 0.0038737925656828357, "grad_norm": 0.5726248621940613, "learning_rate": 0.0002, "loss": 0.0925, "step": 2130 }, { "epoch": 0.003891979385240032, "grad_norm": 0.7385726571083069, "learning_rate": 0.0002, "loss": 0.0744, "step": 2140 }, { "epoch": 0.003910166204797228, "grad_norm": 0.01478211022913456, "learning_rate": 0.0002, "loss": 0.0094, "step": 2150 }, { "epoch": 0.003928353024354425, "grad_norm": 4.001941204071045, "learning_rate": 0.0002, "loss": 0.5398, "step": 2160 }, { "epoch": 0.003946539843911621, "grad_norm": 0.5501906275749207, "learning_rate": 0.0002, "loss": 0.1502, "step": 2170 }, { "epoch": 0.003964726663468817, "grad_norm": 0.05887573957443237, "learning_rate": 0.0002, "loss": 0.1137, "step": 2180 }, { "epoch": 0.003982913483026014, "grad_norm": 0.6087843179702759, "learning_rate": 0.0002, "loss": 0.0738, "step": 2190 }, { "epoch": 0.004001100302583211, "grad_norm": 0.027440447360277176, "learning_rate": 0.0002, "loss": 0.0122, "step": 2200 }, { "epoch": 0.004019287122140407, "grad_norm": 3.8189752101898193, "learning_rate": 0.0002, "loss": 0.5507, "step": 2210 }, { "epoch": 0.004037473941697603, "grad_norm": 0.7837066054344177, "learning_rate": 0.0002, "loss": 0.2931, "step": 2220 }, { "epoch": 0.0040556607612548, "grad_norm": 0.4113297462463379, "learning_rate": 0.0002, "loss": 0.1169, "step": 2230 }, { "epoch": 0.004073847580811996, "grad_norm": 0.9759702086448669, "learning_rate": 0.0002, "loss": 0.0902, "step": 2240 }, { "epoch": 0.004092034400369192, "grad_norm": 0.03002658113837242, "learning_rate": 0.0002, "loss": 0.0165, "step": 2250 } ], "logging_steps": 10, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3899519044675994e+17, "train_batch_size": 24, "trial_name": null, "trial_params": null }