|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.999438727782975,
  "eval_steps": 500,
  "global_step": 1002,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.029934518241347054,
      "grad_norm": 1.4403637427480462,
      "learning_rate": 2e-06,
      "loss": 0.786,
      "step": 10
    },
    {
      "epoch": 0.05986903648269411,
      "grad_norm": 0.7585646246007425,
      "learning_rate": 2e-06,
      "loss": 0.7103,
      "step": 20
    },
    {
      "epoch": 0.08980355472404115,
      "grad_norm": 0.7877019895147521,
      "learning_rate": 2e-06,
      "loss": 0.6845,
      "step": 30
    },
    {
      "epoch": 0.11973807296538821,
      "grad_norm": 0.7526284526378991,
      "learning_rate": 2e-06,
      "loss": 0.679,
      "step": 40
    },
    {
      "epoch": 0.14967259120673526,
      "grad_norm": 0.6462593569528299,
      "learning_rate": 2e-06,
      "loss": 0.6695,
      "step": 50
    },
    {
      "epoch": 0.1796071094480823,
      "grad_norm": 0.7389173448475503,
      "learning_rate": 2e-06,
      "loss": 0.6608,
      "step": 60
    },
    {
      "epoch": 0.20954162768942938,
      "grad_norm": 0.6921347775545471,
      "learning_rate": 2e-06,
      "loss": 0.6569,
      "step": 70
    },
    {
      "epoch": 0.23947614593077643,
      "grad_norm": 0.6664400127292619,
      "learning_rate": 2e-06,
      "loss": 0.6581,
      "step": 80
    },
    {
      "epoch": 0.2694106641721235,
      "grad_norm": 0.6931891241440936,
      "learning_rate": 2e-06,
      "loss": 0.6491,
      "step": 90
    },
    {
      "epoch": 0.2993451824134705,
      "grad_norm": 0.7790977620270173,
      "learning_rate": 2e-06,
      "loss": 0.6497,
      "step": 100
    },
    {
      "epoch": 0.3292797006548176,
      "grad_norm": 0.6948688988620945,
      "learning_rate": 2e-06,
      "loss": 0.6451,
      "step": 110
    },
    {
      "epoch": 0.3592142188961646,
      "grad_norm": 0.723431615671195,
      "learning_rate": 2e-06,
      "loss": 0.6439,
      "step": 120
    },
    {
      "epoch": 0.3891487371375117,
      "grad_norm": 0.6738418075637761,
      "learning_rate": 2e-06,
      "loss": 0.6436,
      "step": 130
    },
    {
      "epoch": 0.41908325537885877,
      "grad_norm": 0.6653975851597016,
      "learning_rate": 2e-06,
      "loss": 0.6372,
      "step": 140
    },
    {
      "epoch": 0.4490177736202058,
      "grad_norm": 0.6601441553159767,
      "learning_rate": 2e-06,
      "loss": 0.6387,
      "step": 150
    },
    {
      "epoch": 0.47895229186155286,
      "grad_norm": 0.7054970289254373,
      "learning_rate": 2e-06,
      "loss": 0.6375,
      "step": 160
    },
    {
      "epoch": 0.5088868101028999,
      "grad_norm": 0.7199826049159775,
      "learning_rate": 2e-06,
      "loss": 0.6317,
      "step": 170
    },
    {
      "epoch": 0.538821328344247,
      "grad_norm": 0.7113756352568565,
      "learning_rate": 2e-06,
      "loss": 0.6332,
      "step": 180
    },
    {
      "epoch": 0.568755846585594,
      "grad_norm": 0.7044231675783028,
      "learning_rate": 2e-06,
      "loss": 0.6308,
      "step": 190
    },
    {
      "epoch": 0.598690364826941,
      "grad_norm": 0.7309238834285012,
      "learning_rate": 2e-06,
      "loss": 0.6387,
      "step": 200
    },
    {
      "epoch": 0.6286248830682881,
      "grad_norm": 0.7095992221953259,
      "learning_rate": 2e-06,
      "loss": 0.6324,
      "step": 210
    },
    {
      "epoch": 0.6585594013096352,
      "grad_norm": 0.641738749873779,
      "learning_rate": 2e-06,
      "loss": 0.6304,
      "step": 220
    },
    {
      "epoch": 0.6884939195509823,
      "grad_norm": 0.7754539446943373,
      "learning_rate": 2e-06,
      "loss": 0.6329,
      "step": 230
    },
    {
      "epoch": 0.7184284377923292,
      "grad_norm": 0.6924947810387134,
      "learning_rate": 2e-06,
      "loss": 0.6317,
      "step": 240
    },
    {
      "epoch": 0.7483629560336763,
      "grad_norm": 0.832705575892092,
      "learning_rate": 2e-06,
      "loss": 0.628,
      "step": 250
    },
    {
      "epoch": 0.7782974742750234,
      "grad_norm": 0.7232676849968064,
      "learning_rate": 2e-06,
      "loss": 0.6267,
      "step": 260
    },
    {
      "epoch": 0.8082319925163705,
      "grad_norm": 0.7419413773726808,
      "learning_rate": 2e-06,
      "loss": 0.634,
      "step": 270
    },
    {
      "epoch": 0.8381665107577175,
      "grad_norm": 0.7199244688464713,
      "learning_rate": 2e-06,
      "loss": 0.6244,
      "step": 280
    },
    {
      "epoch": 0.8681010289990645,
      "grad_norm": 0.6947896790690724,
      "learning_rate": 2e-06,
      "loss": 0.621,
      "step": 290
    },
    {
      "epoch": 0.8980355472404116,
      "grad_norm": 0.7851259698790731,
      "learning_rate": 2e-06,
      "loss": 0.6222,
      "step": 300
    },
    {
      "epoch": 0.9279700654817586,
      "grad_norm": 0.7028052394028984,
      "learning_rate": 2e-06,
      "loss": 0.6207,
      "step": 310
    },
    {
      "epoch": 0.9579045837231057,
      "grad_norm": 0.8063430875369427,
      "learning_rate": 2e-06,
      "loss": 0.6227,
      "step": 320
    },
    {
      "epoch": 0.9878391019644528,
      "grad_norm": 0.8830953748187379,
      "learning_rate": 2e-06,
      "loss": 0.6157,
      "step": 330
    },
    {
      "epoch": 0.9998129092609915,
      "eval_loss": 0.6270928382873535,
      "eval_runtime": 518.3743,
      "eval_samples_per_second": 17.364,
      "eval_steps_per_second": 0.544,
      "step": 334
    },
    {
      "epoch": 1.0177736202057999,
      "grad_norm": 0.7847590939808942,
      "learning_rate": 2e-06,
      "loss": 0.6651,
      "step": 340
    },
    {
      "epoch": 1.047708138447147,
      "grad_norm": 0.7143067638653315,
      "learning_rate": 2e-06,
      "loss": 0.5899,
      "step": 350
    },
    {
      "epoch": 1.077642656688494,
      "grad_norm": 0.6616124745271351,
      "learning_rate": 2e-06,
      "loss": 0.5856,
      "step": 360
    },
    {
      "epoch": 1.1075771749298409,
      "grad_norm": 0.6761369846237072,
      "learning_rate": 2e-06,
      "loss": 0.5848,
      "step": 370
    },
    {
      "epoch": 1.137511693171188,
      "grad_norm": 0.7226956240435726,
      "learning_rate": 2e-06,
      "loss": 0.5847,
      "step": 380
    },
    {
      "epoch": 1.167446211412535,
      "grad_norm": 0.6492986790737924,
      "learning_rate": 2e-06,
      "loss": 0.586,
      "step": 390
    },
    {
      "epoch": 1.197380729653882,
      "grad_norm": 0.9238463038340056,
      "learning_rate": 2e-06,
      "loss": 0.5862,
      "step": 400
    },
    {
      "epoch": 1.2273152478952292,
      "grad_norm": 0.7384372135206262,
      "learning_rate": 2e-06,
      "loss": 0.5875,
      "step": 410
    },
    {
      "epoch": 1.2572497661365762,
      "grad_norm": 0.7798681474247887,
      "learning_rate": 2e-06,
      "loss": 0.5922,
      "step": 420
    },
    {
      "epoch": 1.2871842843779233,
      "grad_norm": 0.6769000626695846,
      "learning_rate": 2e-06,
      "loss": 0.5856,
      "step": 430
    },
    {
      "epoch": 1.3171188026192704,
      "grad_norm": 0.7081906344223899,
      "learning_rate": 2e-06,
      "loss": 0.5873,
      "step": 440
    },
    {
      "epoch": 1.3470533208606175,
      "grad_norm": 0.6669059541200351,
      "learning_rate": 2e-06,
      "loss": 0.5807,
      "step": 450
    },
    {
      "epoch": 1.3769878391019645,
      "grad_norm": 0.7803535308998839,
      "learning_rate": 2e-06,
      "loss": 0.5913,
      "step": 460
    },
    {
      "epoch": 1.4069223573433116,
      "grad_norm": 0.7695489200166007,
      "learning_rate": 2e-06,
      "loss": 0.59,
      "step": 470
    },
    {
      "epoch": 1.4368568755846587,
      "grad_norm": 0.7503675606232418,
      "learning_rate": 2e-06,
      "loss": 0.5917,
      "step": 480
    },
    {
      "epoch": 1.4667913938260055,
      "grad_norm": 0.7163429476040114,
      "learning_rate": 2e-06,
      "loss": 0.5904,
      "step": 490
    },
    {
      "epoch": 1.4967259120673526,
      "grad_norm": 0.7665811370018359,
      "learning_rate": 2e-06,
      "loss": 0.5866,
      "step": 500
    },
    {
      "epoch": 1.5266604303086997,
      "grad_norm": 0.8742449064402781,
      "learning_rate": 2e-06,
      "loss": 0.5892,
      "step": 510
    },
    {
      "epoch": 1.5565949485500468,
      "grad_norm": 0.6515777532874945,
      "learning_rate": 2e-06,
      "loss": 0.5867,
      "step": 520
    },
    {
      "epoch": 1.5865294667913938,
      "grad_norm": 0.7364233764205356,
      "learning_rate": 2e-06,
      "loss": 0.5871,
      "step": 530
    },
    {
      "epoch": 1.616463985032741,
      "grad_norm": 0.6869489380724798,
      "learning_rate": 2e-06,
      "loss": 0.5907,
      "step": 540
    },
    {
      "epoch": 1.646398503274088,
      "grad_norm": 0.6905962679241299,
      "learning_rate": 2e-06,
      "loss": 0.5896,
      "step": 550
    },
    {
      "epoch": 1.6763330215154348,
      "grad_norm": 0.722590813324787,
      "learning_rate": 2e-06,
      "loss": 0.5795,
      "step": 560
    },
    {
      "epoch": 1.706267539756782,
      "grad_norm": 0.7155444269064662,
      "learning_rate": 2e-06,
      "loss": 0.5812,
      "step": 570
    },
    {
      "epoch": 1.736202057998129,
      "grad_norm": 0.6934837112832971,
      "learning_rate": 2e-06,
      "loss": 0.5821,
      "step": 580
    },
    {
      "epoch": 1.766136576239476,
      "grad_norm": 0.6890374087051357,
      "learning_rate": 2e-06,
      "loss": 0.5851,
      "step": 590
    },
    {
      "epoch": 1.7960710944808231,
      "grad_norm": 0.6987803144035127,
      "learning_rate": 2e-06,
      "loss": 0.5857,
      "step": 600
    },
    {
      "epoch": 1.8260056127221702,
      "grad_norm": 0.8108212865561982,
      "learning_rate": 2e-06,
      "loss": 0.5829,
      "step": 610
    },
    {
      "epoch": 1.8559401309635173,
      "grad_norm": 0.6612306296879438,
      "learning_rate": 2e-06,
      "loss": 0.5807,
      "step": 620
    },
    {
      "epoch": 1.8858746492048644,
      "grad_norm": 0.7409886326024834,
      "learning_rate": 2e-06,
      "loss": 0.5829,
      "step": 630
    },
    {
      "epoch": 1.9158091674462114,
      "grad_norm": 0.9188602740383207,
      "learning_rate": 2e-06,
      "loss": 0.5842,
      "step": 640
    },
    {
      "epoch": 1.9457436856875585,
      "grad_norm": 0.730641006811515,
      "learning_rate": 2e-06,
      "loss": 0.5823,
      "step": 650
    },
    {
      "epoch": 1.9756782039289056,
      "grad_norm": 0.6773026445013379,
      "learning_rate": 2e-06,
      "loss": 0.5868,
      "step": 660
    },
    {
      "epoch": 1.999625818521983,
      "eval_loss": 0.6162874102592468,
      "eval_runtime": 518.1783,
      "eval_samples_per_second": 17.37,
      "eval_steps_per_second": 0.544,
      "step": 668
    },
    {
      "epoch": 2.0056127221702527,
      "grad_norm": 0.9331327758390977,
      "learning_rate": 2e-06,
      "loss": 0.6381,
      "step": 670
    },
    {
      "epoch": 2.0355472404115997,
      "grad_norm": 0.7895988754181943,
      "learning_rate": 2e-06,
      "loss": 0.5476,
      "step": 680
    },
    {
      "epoch": 2.065481758652947,
      "grad_norm": 0.8235146344949044,
      "learning_rate": 2e-06,
      "loss": 0.5451,
      "step": 690
    },
    {
      "epoch": 2.095416276894294,
      "grad_norm": 0.7267162587943428,
      "learning_rate": 2e-06,
      "loss": 0.5498,
      "step": 700
    },
    {
      "epoch": 2.125350795135641,
      "grad_norm": 0.7345843211419183,
      "learning_rate": 2e-06,
      "loss": 0.5495,
      "step": 710
    },
    {
      "epoch": 2.155285313376988,
      "grad_norm": 0.6908318018947618,
      "learning_rate": 2e-06,
      "loss": 0.5524,
      "step": 720
    },
    {
      "epoch": 2.185219831618335,
      "grad_norm": 0.6981065999228409,
      "learning_rate": 2e-06,
      "loss": 0.5516,
      "step": 730
    },
    {
      "epoch": 2.2151543498596817,
      "grad_norm": 0.7445453069101049,
      "learning_rate": 2e-06,
      "loss": 0.5517,
      "step": 740
    },
    {
      "epoch": 2.245088868101029,
      "grad_norm": 0.7278674464439252,
      "learning_rate": 2e-06,
      "loss": 0.5538,
      "step": 750
    },
    {
      "epoch": 2.275023386342376,
      "grad_norm": 0.6879321927261636,
      "learning_rate": 2e-06,
      "loss": 0.5478,
      "step": 760
    },
    {
      "epoch": 2.304957904583723,
      "grad_norm": 0.7343459951201352,
      "learning_rate": 2e-06,
      "loss": 0.5538,
      "step": 770
    },
    {
      "epoch": 2.33489242282507,
      "grad_norm": 0.7607710355221491,
      "learning_rate": 2e-06,
      "loss": 0.5481,
      "step": 780
    },
    {
      "epoch": 2.364826941066417,
      "grad_norm": 0.7417516698375253,
      "learning_rate": 2e-06,
      "loss": 0.549,
      "step": 790
    },
    {
      "epoch": 2.394761459307764,
      "grad_norm": 0.6975464703626868,
      "learning_rate": 2e-06,
      "loss": 0.552,
      "step": 800
    },
    {
      "epoch": 2.4246959775491113,
      "grad_norm": 0.7361057536866448,
      "learning_rate": 2e-06,
      "loss": 0.5593,
      "step": 810
    },
    {
      "epoch": 2.4546304957904583,
      "grad_norm": 0.7452745025276496,
      "learning_rate": 2e-06,
      "loss": 0.5552,
      "step": 820
    },
    {
      "epoch": 2.4845650140318054,
      "grad_norm": 0.7760152871621997,
      "learning_rate": 2e-06,
      "loss": 0.5542,
      "step": 830
    },
    {
      "epoch": 2.5144995322731525,
      "grad_norm": 0.7081762034349137,
      "learning_rate": 2e-06,
      "loss": 0.5512,
      "step": 840
    },
    {
      "epoch": 2.5444340505144996,
      "grad_norm": 0.7832350493954435,
      "learning_rate": 2e-06,
      "loss": 0.5545,
      "step": 850
    },
    {
      "epoch": 2.5743685687558466,
      "grad_norm": 0.767598031204084,
      "learning_rate": 2e-06,
      "loss": 0.5527,
      "step": 860
    },
    {
      "epoch": 2.6043030869971937,
      "grad_norm": 0.6862395207363299,
      "learning_rate": 2e-06,
      "loss": 0.5487,
      "step": 870
    },
    {
      "epoch": 2.634237605238541,
      "grad_norm": 0.6888763524013458,
      "learning_rate": 2e-06,
      "loss": 0.5478,
      "step": 880
    },
    {
      "epoch": 2.664172123479888,
      "grad_norm": 0.8203400036669106,
      "learning_rate": 2e-06,
      "loss": 0.5534,
      "step": 890
    },
    {
      "epoch": 2.694106641721235,
      "grad_norm": 0.8260483366154581,
      "learning_rate": 2e-06,
      "loss": 0.5561,
      "step": 900
    },
    {
      "epoch": 2.724041159962582,
      "grad_norm": 0.7295963125166559,
      "learning_rate": 2e-06,
      "loss": 0.5488,
      "step": 910
    },
    {
      "epoch": 2.753975678203929,
      "grad_norm": 0.6857495144156721,
      "learning_rate": 2e-06,
      "loss": 0.55,
      "step": 920
    },
    {
      "epoch": 2.7839101964452757,
      "grad_norm": 0.7241224249227611,
      "learning_rate": 2e-06,
      "loss": 0.5554,
      "step": 930
    },
    {
      "epoch": 2.8138447146866232,
      "grad_norm": 0.6688797316878076,
      "learning_rate": 2e-06,
      "loss": 0.5544,
      "step": 940
    },
    {
      "epoch": 2.84377923292797,
      "grad_norm": 0.7283714304791777,
      "learning_rate": 2e-06,
      "loss": 0.5526,
      "step": 950
    },
    {
      "epoch": 2.8737137511693174,
      "grad_norm": 0.7216563707506914,
      "learning_rate": 2e-06,
      "loss": 0.5548,
      "step": 960
    },
    {
      "epoch": 2.903648269410664,
      "grad_norm": 0.7405760586708118,
      "learning_rate": 2e-06,
      "loss": 0.5491,
      "step": 970
    },
    {
      "epoch": 2.933582787652011,
      "grad_norm": 0.7101741115648686,
      "learning_rate": 2e-06,
      "loss": 0.5552,
      "step": 980
    },
    {
      "epoch": 2.963517305893358,
      "grad_norm": 0.687213640945178,
      "learning_rate": 2e-06,
      "loss": 0.5568,
      "step": 990
    },
    {
      "epoch": 2.9934518241347052,
      "grad_norm": 0.7118013404841623,
      "learning_rate": 2e-06,
      "loss": 0.5572,
      "step": 1000
    },
    {
      "epoch": 2.999438727782975,
      "eval_loss": 0.6159842014312744,
      "eval_runtime": 517.5485,
      "eval_samples_per_second": 17.392,
      "eval_steps_per_second": 0.545,
      "step": 1002
    },
    {
      "epoch": 2.999438727782975,
      "step": 1002,
      "total_flos": 3818092983484416.0,
      "train_loss": 0.5959667034015922,
      "train_runtime": 91515.2633,
      "train_samples_per_second": 5.606,
      "train_steps_per_second": 0.011
    }
  ],
  "logging_steps": 10,
  "max_steps": 1002,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3818092983484416.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|