|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.004092034400369192, |
|
"eval_steps": 500, |
|
"global_step": 2250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.818681955719641e-05, |
|
"grad_norm": 1.608859896659851, |
|
"learning_rate": 0.0002, |
|
"loss": 3.3372, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 3.637363911439282e-05, |
|
"grad_norm": 0.9594001770019531, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1895, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 5.4560458671589234e-05, |
|
"grad_norm": 0.7858404517173767, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1819, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 7.274727822878565e-05, |
|
"grad_norm": 0.05236278474330902, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0967, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 9.093409778598205e-05, |
|
"grad_norm": 0.00239331996999681, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0001, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00010912091734317847, |
|
"grad_norm": 0.6015797853469849, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9137, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.00012730773690037487, |
|
"grad_norm": 0.2916141152381897, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1584, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0001454945564575713, |
|
"grad_norm": 0.22034427523612976, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1212, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0001636813760147677, |
|
"grad_norm": 0.05342680215835571, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0933, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0001818681955719641, |
|
"grad_norm": 0.0009122246992774308, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0002, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.00020005501512916052, |
|
"grad_norm": 0.30845287442207336, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8141, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.00021824183468635694, |
|
"grad_norm": 0.1849660873413086, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1604, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.00023642865424355333, |
|
"grad_norm": 0.09605516493320465, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1163, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.00025461547380074975, |
|
"grad_norm": 0.4438878893852234, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1043, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.00027280229335794617, |
|
"grad_norm": 0.0012718827929347754, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0018, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0002909891129151426, |
|
"grad_norm": 0.19092628359794617, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6226, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.000309175932472339, |
|
"grad_norm": 0.04102358967065811, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1575, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0003273627520295354, |
|
"grad_norm": 0.06057624891400337, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1119, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0003455495715867318, |
|
"grad_norm": 0.13942261040210724, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0781, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0003637363911439282, |
|
"grad_norm": 0.005650315433740616, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0013, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0003819232107011246, |
|
"grad_norm": 0.6245204210281372, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7091, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.00040011003025832104, |
|
"grad_norm": 0.058550119400024414, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1411, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.00041829684981551746, |
|
"grad_norm": 0.08625461906194687, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1138, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0004364836693727139, |
|
"grad_norm": 0.06455521285533905, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0805, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0004546704889299103, |
|
"grad_norm": 0.0020822149235755205, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0013, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.00047285730848710666, |
|
"grad_norm": 0.1977258324623108, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5558, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0004910441280443031, |
|
"grad_norm": 0.20794034004211426, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1263, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0005092309476014995, |
|
"grad_norm": 0.08760973066091537, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1118, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0005274177671586959, |
|
"grad_norm": 0.299059122800827, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0804, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0005456045867158923, |
|
"grad_norm": 0.002420844743028283, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0022, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0005637914062730887, |
|
"grad_norm": 2.2061026096343994, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5624, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0005819782258302852, |
|
"grad_norm": 0.7011717557907104, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1177, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0006001650453874816, |
|
"grad_norm": 0.37657421827316284, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1005, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.000618351864944678, |
|
"grad_norm": 0.048011403530836105, |
|
"learning_rate": 0.0002, |
|
"loss": 0.079, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0006365386845018744, |
|
"grad_norm": 0.002076848642900586, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0018, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0006547255040590708, |
|
"grad_norm": 0.721218466758728, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5647, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0006729123236162671, |
|
"grad_norm": 0.09965512156486511, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1403, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.0006910991431734636, |
|
"grad_norm": 0.21733985841274261, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1024, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.00070928596273066, |
|
"grad_norm": 0.003134253202006221, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0707, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.0007274727822878564, |
|
"grad_norm": 0.0011866611894220114, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0003, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0007456596018450528, |
|
"grad_norm": 0.37573525309562683, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6122, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.0007638464214022492, |
|
"grad_norm": 0.34029653668403625, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1149, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.0007820332409594457, |
|
"grad_norm": 0.35701191425323486, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0972, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.0008002200605166421, |
|
"grad_norm": 0.06324547529220581, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0722, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.0008184068800738385, |
|
"grad_norm": 0.0011920438846573234, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0011, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.0008365936996310349, |
|
"grad_norm": 0.861393392086029, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4433, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.0008547805191882313, |
|
"grad_norm": 0.16104361414909363, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1176, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.0008729673387454278, |
|
"grad_norm": 0.28712376952171326, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0983, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.0008911541583026242, |
|
"grad_norm": 0.07980292290449142, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0721, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.0009093409778598206, |
|
"grad_norm": 0.0018368292367085814, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0019, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.000927527797417017, |
|
"grad_norm": 0.05152284353971481, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2823, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.0009457146169742133, |
|
"grad_norm": 0.04693318158388138, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1179, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.0009639014365314097, |
|
"grad_norm": 0.10586889833211899, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1094, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.0009820882560886062, |
|
"grad_norm": 0.006325385998934507, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0706, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.0010002750756458027, |
|
"grad_norm": 4.665973028750159e-05, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0002, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.001018461895202999, |
|
"grad_norm": 0.293944776058197, |
|
"learning_rate": 0.0002, |
|
"loss": 0.732, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.0010366487147601955, |
|
"grad_norm": 0.22614754736423492, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1226, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.0010548355343173918, |
|
"grad_norm": 0.10801248252391815, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1065, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.0010730223538745881, |
|
"grad_norm": 0.04501640051603317, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0759, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.0010912091734317847, |
|
"grad_norm": 0.00014656950952485204, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0001, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.001109395992988981, |
|
"grad_norm": 0.2490423321723938, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4868, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.0011275828125461775, |
|
"grad_norm": 0.026224857196211815, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1242, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.0011457696321033738, |
|
"grad_norm": 0.11845973134040833, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1153, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.0011639564516605703, |
|
"grad_norm": 0.8349707722663879, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1229, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.0011821432712177666, |
|
"grad_norm": 0.00733955716714263, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0016, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.0012003300907749632, |
|
"grad_norm": 0.3534531891345978, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4961, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.0012185169103321595, |
|
"grad_norm": 0.3938736915588379, |
|
"learning_rate": 0.0002, |
|
"loss": 0.126, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.001236703729889356, |
|
"grad_norm": 0.16779105365276337, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1217, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.0012548905494465523, |
|
"grad_norm": 0.6998353600502014, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1171, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.0012730773690037488, |
|
"grad_norm": 0.0005113715888001025, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0143, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0012912641885609452, |
|
"grad_norm": 0.4034057557582855, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6274, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.0013094510081181417, |
|
"grad_norm": 0.08985241502523422, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1295, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.001327637827675338, |
|
"grad_norm": 0.20418916642665863, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1234, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.0013458246472325343, |
|
"grad_norm": 1.0206961631774902, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1107, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.0013640114667897308, |
|
"grad_norm": 0.0008244478958658874, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0087, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.0013821982863469271, |
|
"grad_norm": 3.696362018585205, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9078, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.0014003851059041237, |
|
"grad_norm": 0.8782555460929871, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1732, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.00141857192546132, |
|
"grad_norm": 0.18350496888160706, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1205, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.0014367587450185165, |
|
"grad_norm": 0.634567141532898, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0997, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.0014549455645757128, |
|
"grad_norm": 0.01041293516755104, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0065, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.0014731323841329093, |
|
"grad_norm": 3.0739810466766357, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7739, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.0014913192036901056, |
|
"grad_norm": 0.4407779276371002, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2848, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.0015095060232473022, |
|
"grad_norm": 0.25743165612220764, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1318, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.0015276928428044985, |
|
"grad_norm": 2.0397753715515137, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1385, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.001545879662361695, |
|
"grad_norm": 0.060638878494501114, |
|
"learning_rate": 0.0002, |
|
"loss": 0.047, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.0015640664819188913, |
|
"grad_norm": 2.5641930103302, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5497, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.0015822533014760878, |
|
"grad_norm": 0.9419782161712646, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1626, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.0016004401210332842, |
|
"grad_norm": 0.1152188628911972, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1063, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.0016186269405904805, |
|
"grad_norm": 0.6502537131309509, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0871, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.001636813760147677, |
|
"grad_norm": 0.023487605154514313, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0094, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.0016550005797048733, |
|
"grad_norm": 1.9080859422683716, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5073, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.0016731873992620698, |
|
"grad_norm": 0.44722509384155273, |
|
"learning_rate": 0.0002, |
|
"loss": 0.167, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.0016913742188192661, |
|
"grad_norm": 0.24151289463043213, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1237, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.0017095610383764627, |
|
"grad_norm": 1.1394294500350952, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1014, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.001727747857933659, |
|
"grad_norm": 0.011057032272219658, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0069, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.0017459346774908555, |
|
"grad_norm": 4.32397985458374, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7672, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.0017641214970480518, |
|
"grad_norm": 0.9529788494110107, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3286, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.0017823083166052483, |
|
"grad_norm": 0.27676528692245483, |
|
"learning_rate": 0.0002, |
|
"loss": 0.126, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.0018004951361624446, |
|
"grad_norm": 0.62413090467453, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0844, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.0018186819557196412, |
|
"grad_norm": 0.010768013074994087, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0038, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0018368687752768375, |
|
"grad_norm": 4.500253200531006, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8415, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.001855055594834034, |
|
"grad_norm": 0.4661908447742462, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2552, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.0018732424143912303, |
|
"grad_norm": 0.17337530851364136, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1032, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.0018914292339484266, |
|
"grad_norm": 0.3994196355342865, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0814, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.0019096160535056232, |
|
"grad_norm": 0.025604812428355217, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0077, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.0019278028730628195, |
|
"grad_norm": 3.7293856143951416, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6543, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.001945989692620016, |
|
"grad_norm": 1.2915587425231934, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3782, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.0019641765121772123, |
|
"grad_norm": 1.0336438417434692, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1515, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.0019823633317344086, |
|
"grad_norm": 1.6816803216934204, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1173, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.0020005501512916054, |
|
"grad_norm": 0.015431606210768223, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0165, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.0020187369708488017, |
|
"grad_norm": 3.059936046600342, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6981, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.002036923790405998, |
|
"grad_norm": 0.5564419031143188, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2193, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.0020551106099631943, |
|
"grad_norm": 0.11465179920196533, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1157, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.002073297429520391, |
|
"grad_norm": 1.7084763050079346, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0997, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.0020914842490775873, |
|
"grad_norm": 0.00997951254248619, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0185, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.0021096710686347836, |
|
"grad_norm": 4.252767086029053, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6719, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.00212785788819198, |
|
"grad_norm": 0.7261558175086975, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1939, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.0021460447077491763, |
|
"grad_norm": 0.3190513253211975, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0981, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.002164231527306373, |
|
"grad_norm": 0.5305098295211792, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0756, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.0021824183468635693, |
|
"grad_norm": 0.03356161713600159, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0064, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.0022006051664207656, |
|
"grad_norm": 3.8724617958068848, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6282, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.002218791985977962, |
|
"grad_norm": 1.3044495582580566, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3827, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.0022369788055351587, |
|
"grad_norm": 0.18937312066555023, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1412, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.002255165625092355, |
|
"grad_norm": 2.488002061843872, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1299, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.0022733524446495513, |
|
"grad_norm": 0.05979600548744202, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0276, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.0022915392642067476, |
|
"grad_norm": 4.089362144470215, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6152, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.0023097260837639444, |
|
"grad_norm": 0.2717827260494232, |
|
"learning_rate": 0.0002, |
|
"loss": 0.277, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.0023279129033211407, |
|
"grad_norm": 0.40145063400268555, |
|
"learning_rate": 0.0002, |
|
"loss": 0.113, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.002346099722878337, |
|
"grad_norm": 0.8193599581718445, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0952, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.0023642865424355333, |
|
"grad_norm": 0.03877554461359978, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0187, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.00238247336199273, |
|
"grad_norm": 3.7022697925567627, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6711, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.0024006601815499263, |
|
"grad_norm": 1.0773606300354004, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3495, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.0024188470011071227, |
|
"grad_norm": 0.46499383449554443, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1181, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.002437033820664319, |
|
"grad_norm": 0.7035688757896423, |
|
"learning_rate": 0.0002, |
|
"loss": 0.08, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.0024552206402215153, |
|
"grad_norm": 0.022035669535398483, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0102, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.002473407459778712, |
|
"grad_norm": 3.3636128902435303, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6578, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.0024915942793359083, |
|
"grad_norm": 0.21912692487239838, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2275, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.0025097810988931046, |
|
"grad_norm": 0.1632055938243866, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1015, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.002527967918450301, |
|
"grad_norm": 0.44282346963882446, |
|
"learning_rate": 0.0002, |
|
"loss": 0.103, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.0025461547380074977, |
|
"grad_norm": 0.17366968095302582, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0322, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.002564341557564694, |
|
"grad_norm": 1.816606879234314, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3914, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.0025825283771218903, |
|
"grad_norm": 0.6741718649864197, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2593, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.0026007151966790866, |
|
"grad_norm": 0.580172598361969, |
|
"learning_rate": 0.0002, |
|
"loss": 0.151, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.0026189020162362834, |
|
"grad_norm": 1.500544548034668, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1266, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.0026370888357934797, |
|
"grad_norm": 0.03482064977288246, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0209, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.002655275655350676, |
|
"grad_norm": 1.9266266822814941, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5335, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.0026734624749078723, |
|
"grad_norm": 0.6076328158378601, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1771, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.0026916492944650686, |
|
"grad_norm": 0.047803062945604324, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1312, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.0027098361140222653, |
|
"grad_norm": 2.2670884132385254, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1303, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.0027280229335794617, |
|
"grad_norm": 0.4342607259750366, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0709, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.002746209753136658, |
|
"grad_norm": 1.5955005884170532, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2632, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.0027643965726938543, |
|
"grad_norm": 0.20393006503582, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1324, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.002782583392251051, |
|
"grad_norm": 0.2312391996383667, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1056, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.0028007702118082473, |
|
"grad_norm": 1.2107295989990234, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1038, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.0028189570313654436, |
|
"grad_norm": 0.07030847668647766, |
|
"learning_rate": 0.0002, |
|
"loss": 0.032, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.00283714385092264, |
|
"grad_norm": 3.563960552215576, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5389, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.0028553306704798367, |
|
"grad_norm": 0.6965789794921875, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2889, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.002873517490037033, |
|
"grad_norm": 0.5975427031517029, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1235, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.0028917043095942293, |
|
"grad_norm": 1.371771216392517, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0971, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.0029098911291514256, |
|
"grad_norm": 0.01906588114798069, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0172, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.002928077948708622, |
|
"grad_norm": 3.8812315464019775, |
|
"learning_rate": 0.0002, |
|
"loss": 0.621, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.0029462647682658187, |
|
"grad_norm": 0.41589802503585815, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2029, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.002964451587823015, |
|
"grad_norm": 0.24198026955127716, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1042, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.0029826384073802113, |
|
"grad_norm": 0.8711221814155579, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0904, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.0030008252269374076, |
|
"grad_norm": 0.06849978119134903, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0226, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.0030190120464946043, |
|
"grad_norm": 3.912189483642578, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5554, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.0030371988660518007, |
|
"grad_norm": 1.076832890510559, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2968, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.003055385685608997, |
|
"grad_norm": 0.3734837472438812, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1151, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.0030735725051661933, |
|
"grad_norm": 0.8407588005065918, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0897, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.00309175932472339, |
|
"grad_norm": 0.023632407188415527, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0113, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.0031099461442805863, |
|
"grad_norm": 4.268885612487793, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6717, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.0031281329638377826, |
|
"grad_norm": 0.3088800013065338, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2635, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.003146319783394979, |
|
"grad_norm": 0.05659230053424835, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0987, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.0031645066029521757, |
|
"grad_norm": 0.5756633281707764, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0949, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.003182693422509372, |
|
"grad_norm": 0.23241274058818817, |
|
"learning_rate": 0.0002, |
|
"loss": 0.044, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.0032008802420665683, |
|
"grad_norm": 2.2380006313323975, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3458, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.0032190670616237646, |
|
"grad_norm": 0.4196106493473053, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2116, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.003237253881180961, |
|
"grad_norm": 0.3544403612613678, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1011, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.0032554407007381577, |
|
"grad_norm": 0.6422521471977234, |
|
"learning_rate": 0.0002, |
|
"loss": 0.084, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.003273627520295354, |
|
"grad_norm": 0.03676289692521095, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0115, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.0032918143398525503, |
|
"grad_norm": 3.173424243927002, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5644, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.0033100011594097466, |
|
"grad_norm": 0.14629468321800232, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2249, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.0033281879789669433, |
|
"grad_norm": 0.27524232864379883, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0965, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.0033463747985241397, |
|
"grad_norm": 0.5685613751411438, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0949, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.003364561618081336, |
|
"grad_norm": 0.19684627652168274, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0423, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.0033827484376385323, |
|
"grad_norm": 2.0270469188690186, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3322, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.003400935257195729, |
|
"grad_norm": 0.3960348963737488, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1933, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.0034191220767529253, |
|
"grad_norm": 0.8636507391929626, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1056, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.0034373088963101216, |
|
"grad_norm": 0.7978588342666626, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0812, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.003455495715867318, |
|
"grad_norm": 0.020584411919116974, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0098, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.0034736825354245143, |
|
"grad_norm": 2.648928165435791, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5485, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.003491869354981711, |
|
"grad_norm": 0.5433089137077332, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1546, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.0035100561745389073, |
|
"grad_norm": 0.2638677656650543, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0935, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.0035282429940961036, |
|
"grad_norm": 0.4292812943458557, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0879, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.0035464298136533, |
|
"grad_norm": 0.09974557906389236, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0234, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.0035646166332104967, |
|
"grad_norm": 1.626259446144104, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4046, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.003582803452767693, |
|
"grad_norm": 0.7747110724449158, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2436, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.0036009902723248893, |
|
"grad_norm": 1.130542516708374, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1371, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.0036191770918820856, |
|
"grad_norm": 2.542160987854004, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1204, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.0036373639114392823, |
|
"grad_norm": 0.1563112586736679, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0343, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.0036555507309964787, |
|
"grad_norm": 3.1544902324676514, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4769, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.003673737550553675, |
|
"grad_norm": 1.0212864875793457, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2462, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.0036919243701108713, |
|
"grad_norm": 0.3565104305744171, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1209, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.003710111189668068, |
|
"grad_norm": 1.3275020122528076, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1064, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.0037282980092252643, |
|
"grad_norm": 0.11180760711431503, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0326, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.0037464848287824606, |
|
"grad_norm": 1.9683802127838135, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3878, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.003764671648339657, |
|
"grad_norm": 0.7875238060951233, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1145, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.0037828584678968533, |
|
"grad_norm": 0.4307851195335388, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0891, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.00380104528745405, |
|
"grad_norm": 0.6907076239585876, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0801, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.0038192321070112463, |
|
"grad_norm": 0.04466943070292473, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0148, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.0038374189265684426, |
|
"grad_norm": 2.8212766647338867, |
|
"learning_rate": 0.0002, |
|
"loss": 0.501, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.003855605746125639, |
|
"grad_norm": 0.4052332639694214, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2379, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.0038737925656828357, |
|
"grad_norm": 0.5726248621940613, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0925, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.003891979385240032, |
|
"grad_norm": 0.7385726571083069, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0744, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.003910166204797228, |
|
"grad_norm": 0.01478211022913456, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0094, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.003928353024354425, |
|
"grad_norm": 4.001941204071045, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5398, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.003946539843911621, |
|
"grad_norm": 0.5501906275749207, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1502, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.003964726663468817, |
|
"grad_norm": 0.05887573957443237, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1137, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.003982913483026014, |
|
"grad_norm": 0.6087843179702759, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0738, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.004001100302583211, |
|
"grad_norm": 0.027440447360277176, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0122, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.004019287122140407, |
|
"grad_norm": 3.8189752101898193, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5507, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.004037473941697603, |
|
"grad_norm": 0.7837066054344177, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2931, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.0040556607612548, |
|
"grad_norm": 0.4113297462463379, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1169, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.004073847580811996, |
|
"grad_norm": 0.9759702086448669, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0902, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.004092034400369192, |
|
"grad_norm": 0.03002658113837242, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0165, |
|
"step": 2250 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.3899519044675994e+17, |
|
"train_batch_size": 24, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|