|
{ |
|
"best_metric": 0.0047513521276414394, |
|
"best_model_checkpoint": "finetuned-arsenic/checkpoint-2000", |
|
"epoch": 4.0, |
|
"eval_steps": 100, |
|
"global_step": 2164, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.018484288354898338, |
|
"grad_norm": 4.949392795562744, |
|
"learning_rate": 0.0001990757855822551, |
|
"loss": 0.5368, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.036968576709796676, |
|
"grad_norm": 3.3969953060150146, |
|
"learning_rate": 0.00019815157116451017, |
|
"loss": 0.3313, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05545286506469501, |
|
"grad_norm": 0.859575629234314, |
|
"learning_rate": 0.00019722735674676528, |
|
"loss": 0.5003, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07393715341959335, |
|
"grad_norm": 5.522923946380615, |
|
"learning_rate": 0.00019630314232902034, |
|
"loss": 0.2564, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09242144177449169, |
|
"grad_norm": 4.462332248687744, |
|
"learning_rate": 0.00019537892791127544, |
|
"loss": 0.3339, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11090573012939002, |
|
"grad_norm": 1.6224160194396973, |
|
"learning_rate": 0.0001944547134935305, |
|
"loss": 0.3965, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12939001848428835, |
|
"grad_norm": 6.097796440124512, |
|
"learning_rate": 0.0001935304990757856, |
|
"loss": 0.3319, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1478743068391867, |
|
"grad_norm": 3.9769697189331055, |
|
"learning_rate": 0.00019260628465804066, |
|
"loss": 0.4012, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16635859519408502, |
|
"grad_norm": 2.335510730743408, |
|
"learning_rate": 0.00019168207024029577, |
|
"loss": 0.4584, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18484288354898337, |
|
"grad_norm": 3.8701980113983154, |
|
"learning_rate": 0.00019075785582255082, |
|
"loss": 0.1855, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18484288354898337, |
|
"eval_accuracy": 0.931237721021611, |
|
"eval_loss": 0.1917603313922882, |
|
"eval_runtime": 57.9367, |
|
"eval_samples_per_second": 26.356, |
|
"eval_steps_per_second": 3.297, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2033271719038817, |
|
"grad_norm": 2.2155282497406006, |
|
"learning_rate": 0.00018983364140480593, |
|
"loss": 0.2331, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.22181146025878004, |
|
"grad_norm": 0.9634373188018799, |
|
"learning_rate": 0.000188909426987061, |
|
"loss": 0.209, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.24029574861367836, |
|
"grad_norm": 0.2715567648410797, |
|
"learning_rate": 0.0001879852125693161, |
|
"loss": 0.1486, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2587800369685767, |
|
"grad_norm": 12.090089797973633, |
|
"learning_rate": 0.00018706099815157118, |
|
"loss": 0.1629, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.27726432532347506, |
|
"grad_norm": 1.551562786102295, |
|
"learning_rate": 0.00018613678373382626, |
|
"loss": 0.1852, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2957486136783734, |
|
"grad_norm": 0.775977373123169, |
|
"learning_rate": 0.00018521256931608134, |
|
"loss": 0.3179, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3142329020332717, |
|
"grad_norm": 3.0043396949768066, |
|
"learning_rate": 0.00018428835489833642, |
|
"loss": 0.3842, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.33271719038817005, |
|
"grad_norm": 1.2949095964431763, |
|
"learning_rate": 0.0001833641404805915, |
|
"loss": 0.2534, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3512014787430684, |
|
"grad_norm": 9.545828819274902, |
|
"learning_rate": 0.00018243992606284658, |
|
"loss": 0.2031, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.36968576709796674, |
|
"grad_norm": 0.29387930035591125, |
|
"learning_rate": 0.0001815157116451017, |
|
"loss": 0.1792, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.36968576709796674, |
|
"eval_accuracy": 0.9364767518009168, |
|
"eval_loss": 0.17399875819683075, |
|
"eval_runtime": 52.9831, |
|
"eval_samples_per_second": 28.821, |
|
"eval_steps_per_second": 3.605, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.38817005545286504, |
|
"grad_norm": 2.138578414916992, |
|
"learning_rate": 0.00018059149722735675, |
|
"loss": 0.2129, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4066543438077634, |
|
"grad_norm": 2.022083282470703, |
|
"learning_rate": 0.00017966728280961186, |
|
"loss": 0.1577, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.42513863216266173, |
|
"grad_norm": 2.8811872005462646, |
|
"learning_rate": 0.0001787430683918669, |
|
"loss": 0.21, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4436229205175601, |
|
"grad_norm": 1.491790771484375, |
|
"learning_rate": 0.00017781885397412202, |
|
"loss": 0.2498, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.46210720887245843, |
|
"grad_norm": 2.5274643898010254, |
|
"learning_rate": 0.00017689463955637707, |
|
"loss": 0.149, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4805914972273567, |
|
"grad_norm": 0.6268563270568848, |
|
"learning_rate": 0.00017597042513863218, |
|
"loss": 0.1306, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.49907578558225507, |
|
"grad_norm": 6.4418511390686035, |
|
"learning_rate": 0.00017504621072088724, |
|
"loss": 0.1889, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5175600739371534, |
|
"grad_norm": 0.13176225125789642, |
|
"learning_rate": 0.00017412199630314234, |
|
"loss": 0.1304, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5360443622920518, |
|
"grad_norm": 1.4023276567459106, |
|
"learning_rate": 0.00017319778188539743, |
|
"loss": 0.0872, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5545286506469501, |
|
"grad_norm": 5.165181636810303, |
|
"learning_rate": 0.0001722735674676525, |
|
"loss": 0.1688, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5545286506469501, |
|
"eval_accuracy": 0.9692206941715783, |
|
"eval_loss": 0.078226737678051, |
|
"eval_runtime": 52.9719, |
|
"eval_samples_per_second": 28.827, |
|
"eval_steps_per_second": 3.606, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5730129390018485, |
|
"grad_norm": 4.743193626403809, |
|
"learning_rate": 0.0001713493530499076, |
|
"loss": 0.1222, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5914972273567468, |
|
"grad_norm": 3.3770973682403564, |
|
"learning_rate": 0.00017042513863216267, |
|
"loss": 0.2799, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.609981515711645, |
|
"grad_norm": 1.9085370302200317, |
|
"learning_rate": 0.00016950092421441775, |
|
"loss": 0.1779, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6284658040665434, |
|
"grad_norm": 2.592458963394165, |
|
"learning_rate": 0.00016857670979667283, |
|
"loss": 0.1619, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6469500924214417, |
|
"grad_norm": 1.1735055446624756, |
|
"learning_rate": 0.00016765249537892791, |
|
"loss": 0.4249, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6654343807763401, |
|
"grad_norm": 3.8289904594421387, |
|
"learning_rate": 0.000166728280961183, |
|
"loss": 0.1009, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6839186691312384, |
|
"grad_norm": 2.531283378601074, |
|
"learning_rate": 0.00016580406654343808, |
|
"loss": 0.1494, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7024029574861368, |
|
"grad_norm": 0.21572425961494446, |
|
"learning_rate": 0.00016487985212569316, |
|
"loss": 0.0824, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7208872458410351, |
|
"grad_norm": 3.6041758060455322, |
|
"learning_rate": 0.00016395563770794827, |
|
"loss": 0.1145, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7393715341959335, |
|
"grad_norm": 0.6018674969673157, |
|
"learning_rate": 0.00016303142329020332, |
|
"loss": 0.1238, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7393715341959335, |
|
"eval_accuracy": 0.922724296005239, |
|
"eval_loss": 0.21575002372264862, |
|
"eval_runtime": 52.6224, |
|
"eval_samples_per_second": 29.018, |
|
"eval_steps_per_second": 3.63, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7578558225508318, |
|
"grad_norm": 0.25093191862106323, |
|
"learning_rate": 0.00016210720887245843, |
|
"loss": 0.0724, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7763401109057301, |
|
"grad_norm": 0.2480381280183792, |
|
"learning_rate": 0.00016118299445471348, |
|
"loss": 0.106, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7948243992606284, |
|
"grad_norm": 8.212138175964355, |
|
"learning_rate": 0.0001602587800369686, |
|
"loss": 0.1665, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8133086876155268, |
|
"grad_norm": 0.6615661382675171, |
|
"learning_rate": 0.00015933456561922367, |
|
"loss": 0.0547, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8317929759704251, |
|
"grad_norm": 4.98212194442749, |
|
"learning_rate": 0.00015841035120147876, |
|
"loss": 0.1982, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8502772643253235, |
|
"grad_norm": 1.7662006616592407, |
|
"learning_rate": 0.00015748613678373384, |
|
"loss": 0.1402, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8687615526802218, |
|
"grad_norm": 5.664543151855469, |
|
"learning_rate": 0.00015656192236598892, |
|
"loss": 0.1606, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8872458410351202, |
|
"grad_norm": 5.662344932556152, |
|
"learning_rate": 0.000155637707948244, |
|
"loss": 0.0869, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9057301293900185, |
|
"grad_norm": 1.1777679920196533, |
|
"learning_rate": 0.00015471349353049908, |
|
"loss": 0.0827, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9242144177449169, |
|
"grad_norm": 0.06051797419786453, |
|
"learning_rate": 0.00015378927911275416, |
|
"loss": 0.0969, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9242144177449169, |
|
"eval_accuracy": 0.9842829076620825, |
|
"eval_loss": 0.04485374689102173, |
|
"eval_runtime": 52.5355, |
|
"eval_samples_per_second": 29.066, |
|
"eval_steps_per_second": 3.636, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9426987060998152, |
|
"grad_norm": 9.434717178344727, |
|
"learning_rate": 0.00015286506469500925, |
|
"loss": 0.1921, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9611829944547134, |
|
"grad_norm": 1.619040846824646, |
|
"learning_rate": 0.00015194085027726433, |
|
"loss": 0.1906, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9796672828096118, |
|
"grad_norm": 0.5532277226448059, |
|
"learning_rate": 0.0001510166358595194, |
|
"loss": 0.1082, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9981515711645101, |
|
"grad_norm": 0.0866900086402893, |
|
"learning_rate": 0.0001500924214417745, |
|
"loss": 0.1119, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.0166358595194085, |
|
"grad_norm": 2.668076276779175, |
|
"learning_rate": 0.00014916820702402957, |
|
"loss": 0.143, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0351201478743068, |
|
"grad_norm": 0.15896956622600555, |
|
"learning_rate": 0.00014824399260628468, |
|
"loss": 0.0378, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0536044362292052, |
|
"grad_norm": 0.12053361535072327, |
|
"learning_rate": 0.00014731977818853976, |
|
"loss": 0.0528, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.0720887245841035, |
|
"grad_norm": 0.06896385550498962, |
|
"learning_rate": 0.00014639556377079484, |
|
"loss": 0.1663, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.0905730129390019, |
|
"grad_norm": 7.400400638580322, |
|
"learning_rate": 0.00014547134935304992, |
|
"loss": 0.081, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.1090573012939002, |
|
"grad_norm": 0.04029673710465431, |
|
"learning_rate": 0.000144547134935305, |
|
"loss": 0.0326, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1090573012939002, |
|
"eval_accuracy": 0.9574328749181401, |
|
"eval_loss": 0.1554253250360489, |
|
"eval_runtime": 52.4665, |
|
"eval_samples_per_second": 29.104, |
|
"eval_steps_per_second": 3.64, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1275415896487986, |
|
"grad_norm": 1.2735309600830078, |
|
"learning_rate": 0.0001436229205175601, |
|
"loss": 0.1339, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.146025878003697, |
|
"grad_norm": 2.2266452312469482, |
|
"learning_rate": 0.00014269870609981517, |
|
"loss": 0.1443, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.1645101663585953, |
|
"grad_norm": 2.932450294494629, |
|
"learning_rate": 0.00014177449168207025, |
|
"loss": 0.0869, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.1829944547134936, |
|
"grad_norm": 5.688024520874023, |
|
"learning_rate": 0.00014085027726432533, |
|
"loss": 0.091, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.201478743068392, |
|
"grad_norm": 0.04643339663743973, |
|
"learning_rate": 0.0001399260628465804, |
|
"loss": 0.0433, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.21996303142329, |
|
"grad_norm": 0.38614460825920105, |
|
"learning_rate": 0.0001390018484288355, |
|
"loss": 0.0514, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.2384473197781884, |
|
"grad_norm": 0.03372357785701752, |
|
"learning_rate": 0.00013807763401109058, |
|
"loss": 0.0826, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.2569316081330868, |
|
"grad_norm": 0.7059990763664246, |
|
"learning_rate": 0.00013715341959334566, |
|
"loss": 0.1309, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.2754158964879851, |
|
"grad_norm": 1.5385607481002808, |
|
"learning_rate": 0.00013622920517560074, |
|
"loss": 0.115, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.2939001848428835, |
|
"grad_norm": 1.647644281387329, |
|
"learning_rate": 0.00013530499075785582, |
|
"loss": 0.1057, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.2939001848428835, |
|
"eval_accuracy": 0.9738048461034708, |
|
"eval_loss": 0.08448445796966553, |
|
"eval_runtime": 52.7705, |
|
"eval_samples_per_second": 28.937, |
|
"eval_steps_per_second": 3.619, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.3123844731977818, |
|
"grad_norm": 0.8896564841270447, |
|
"learning_rate": 0.0001343807763401109, |
|
"loss": 0.1076, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.3308687615526802, |
|
"grad_norm": 0.9722292423248291, |
|
"learning_rate": 0.000133456561922366, |
|
"loss": 0.1285, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.3493530499075785, |
|
"grad_norm": 3.9030041694641113, |
|
"learning_rate": 0.00013253234750462106, |
|
"loss": 0.1367, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.3678373382624769, |
|
"grad_norm": 1.199768304824829, |
|
"learning_rate": 0.00013160813308687617, |
|
"loss": 0.088, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.3863216266173752, |
|
"grad_norm": 0.8339413404464722, |
|
"learning_rate": 0.00013068391866913125, |
|
"loss": 0.0481, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.4048059149722736, |
|
"grad_norm": 2.3673453330993652, |
|
"learning_rate": 0.00012975970425138634, |
|
"loss": 0.0698, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.423290203327172, |
|
"grad_norm": 0.042785417288541794, |
|
"learning_rate": 0.00012883548983364142, |
|
"loss": 0.0179, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.4417744916820703, |
|
"grad_norm": 2.720048189163208, |
|
"learning_rate": 0.0001279112754158965, |
|
"loss": 0.0996, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.4602587800369686, |
|
"grad_norm": 16.840740203857422, |
|
"learning_rate": 0.00012698706099815158, |
|
"loss": 0.0707, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.478743068391867, |
|
"grad_norm": 0.1579107642173767, |
|
"learning_rate": 0.00012606284658040666, |
|
"loss": 0.0805, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.478743068391867, |
|
"eval_accuracy": 0.9823182711198428, |
|
"eval_loss": 0.07117750495672226, |
|
"eval_runtime": 53.0346, |
|
"eval_samples_per_second": 28.793, |
|
"eval_steps_per_second": 3.601, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.4972273567467653, |
|
"grad_norm": 7.252885341644287, |
|
"learning_rate": 0.00012513863216266174, |
|
"loss": 0.0848, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.5157116451016637, |
|
"grad_norm": 0.25338369607925415, |
|
"learning_rate": 0.00012421441774491682, |
|
"loss": 0.0689, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.5341959334565618, |
|
"grad_norm": 3.66860032081604, |
|
"learning_rate": 0.0001232902033271719, |
|
"loss": 0.041, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.5526802218114604, |
|
"grad_norm": 9.176445960998535, |
|
"learning_rate": 0.000122365988909427, |
|
"loss": 0.111, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.5711645101663585, |
|
"grad_norm": 0.032652150839567184, |
|
"learning_rate": 0.00012144177449168208, |
|
"loss": 0.0519, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.589648798521257, |
|
"grad_norm": 0.054165273904800415, |
|
"learning_rate": 0.00012051756007393715, |
|
"loss": 0.0661, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.6081330868761552, |
|
"grad_norm": 0.10612482577562332, |
|
"learning_rate": 0.00011959334565619225, |
|
"loss": 0.0157, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.6266173752310538, |
|
"grad_norm": 0.7138892412185669, |
|
"learning_rate": 0.00011866913123844731, |
|
"loss": 0.1159, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.645101663585952, |
|
"grad_norm": 0.0576617456972599, |
|
"learning_rate": 0.00011774491682070241, |
|
"loss": 0.1059, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.6635859519408502, |
|
"grad_norm": 2.485743999481201, |
|
"learning_rate": 0.00011682070240295748, |
|
"loss": 0.0889, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.6635859519408502, |
|
"eval_accuracy": 0.9796987557301899, |
|
"eval_loss": 0.07181376963853836, |
|
"eval_runtime": 53.7952, |
|
"eval_samples_per_second": 28.385, |
|
"eval_steps_per_second": 3.551, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.6820702402957486, |
|
"grad_norm": 0.25389525294303894, |
|
"learning_rate": 0.00011589648798521257, |
|
"loss": 0.0478, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.700554528650647, |
|
"grad_norm": 0.040639039129018784, |
|
"learning_rate": 0.00011497227356746765, |
|
"loss": 0.0579, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.7190388170055453, |
|
"grad_norm": 0.04252118989825249, |
|
"learning_rate": 0.00011404805914972275, |
|
"loss": 0.0414, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.7375231053604436, |
|
"grad_norm": 0.03039310872554779, |
|
"learning_rate": 0.00011312384473197783, |
|
"loss": 0.1247, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.756007393715342, |
|
"grad_norm": 0.04092634469270706, |
|
"learning_rate": 0.00011219963031423291, |
|
"loss": 0.0485, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.7744916820702403, |
|
"grad_norm": 0.02784869633615017, |
|
"learning_rate": 0.000111275415896488, |
|
"loss": 0.044, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.7929759704251387, |
|
"grad_norm": 0.6377788186073303, |
|
"learning_rate": 0.00011035120147874307, |
|
"loss": 0.0833, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.8114602587800368, |
|
"grad_norm": 0.0410403273999691, |
|
"learning_rate": 0.00010942698706099817, |
|
"loss": 0.0079, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.8299445471349354, |
|
"grad_norm": 0.16617639362812042, |
|
"learning_rate": 0.00010850277264325324, |
|
"loss": 0.0562, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.8484288354898335, |
|
"grad_norm": 6.131214141845703, |
|
"learning_rate": 0.00010757855822550833, |
|
"loss": 0.0503, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.8484288354898335, |
|
"eval_accuracy": 0.9934512115258677, |
|
"eval_loss": 0.0250676441937685, |
|
"eval_runtime": 53.2731, |
|
"eval_samples_per_second": 28.664, |
|
"eval_steps_per_second": 3.585, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.866913123844732, |
|
"grad_norm": 0.07335863262414932, |
|
"learning_rate": 0.0001066543438077634, |
|
"loss": 0.0444, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.8853974121996302, |
|
"grad_norm": 0.034475117921829224, |
|
"learning_rate": 0.0001057301293900185, |
|
"loss": 0.0513, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.9038817005545288, |
|
"grad_norm": 0.035967420786619186, |
|
"learning_rate": 0.00010480591497227356, |
|
"loss": 0.0669, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.922365988909427, |
|
"grad_norm": 0.029034554958343506, |
|
"learning_rate": 0.00010388170055452866, |
|
"loss": 0.0278, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.9408502772643255, |
|
"grad_norm": 3.698307514190674, |
|
"learning_rate": 0.00010295748613678373, |
|
"loss": 0.0547, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.9593345656192236, |
|
"grad_norm": 0.040026549249887466, |
|
"learning_rate": 0.00010203327171903882, |
|
"loss": 0.0065, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.9778188539741222, |
|
"grad_norm": 3.3067240715026855, |
|
"learning_rate": 0.0001011090573012939, |
|
"loss": 0.0828, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.9963031423290203, |
|
"grad_norm": 0.05000556632876396, |
|
"learning_rate": 0.000100184842883549, |
|
"loss": 0.0632, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.014787430683919, |
|
"grad_norm": 0.04542790353298187, |
|
"learning_rate": 9.926062846580408e-05, |
|
"loss": 0.0682, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.033271719038817, |
|
"grad_norm": 0.030154038220643997, |
|
"learning_rate": 9.833641404805916e-05, |
|
"loss": 0.0225, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.033271719038817, |
|
"eval_accuracy": 0.9967256057629339, |
|
"eval_loss": 0.01773611083626747, |
|
"eval_runtime": 52.5689, |
|
"eval_samples_per_second": 29.048, |
|
"eval_steps_per_second": 3.633, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.0517560073937156, |
|
"grad_norm": 0.3824068307876587, |
|
"learning_rate": 9.741219963031424e-05, |
|
"loss": 0.0194, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.0702402957486137, |
|
"grad_norm": 0.020000776275992393, |
|
"learning_rate": 9.648798521256932e-05, |
|
"loss": 0.0259, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.088724584103512, |
|
"grad_norm": 3.488415241241455, |
|
"learning_rate": 9.55637707948244e-05, |
|
"loss": 0.0629, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.1072088724584104, |
|
"grad_norm": 10.373331069946289, |
|
"learning_rate": 9.463955637707949e-05, |
|
"loss": 0.015, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.1256931608133085, |
|
"grad_norm": 0.23100066184997559, |
|
"learning_rate": 9.371534195933457e-05, |
|
"loss": 0.0619, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.144177449168207, |
|
"grad_norm": 0.07692666351795197, |
|
"learning_rate": 9.279112754158965e-05, |
|
"loss": 0.06, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.162661737523105, |
|
"grad_norm": 0.057554759085178375, |
|
"learning_rate": 9.186691312384473e-05, |
|
"loss": 0.0079, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.1811460258780038, |
|
"grad_norm": 0.039722565561532974, |
|
"learning_rate": 9.094269870609981e-05, |
|
"loss": 0.0581, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.199630314232902, |
|
"grad_norm": 0.021510232239961624, |
|
"learning_rate": 9.001848428835489e-05, |
|
"loss": 0.0052, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.2181146025878005, |
|
"grad_norm": 0.019746674224734306, |
|
"learning_rate": 8.909426987060999e-05, |
|
"loss": 0.0049, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.2181146025878005, |
|
"eval_accuracy": 0.9921414538310412, |
|
"eval_loss": 0.024552814662456512, |
|
"eval_runtime": 52.686, |
|
"eval_samples_per_second": 28.983, |
|
"eval_steps_per_second": 3.625, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.2365988909426986, |
|
"grad_norm": 4.809552192687988, |
|
"learning_rate": 8.817005545286507e-05, |
|
"loss": 0.098, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.255083179297597, |
|
"grad_norm": 0.22049099206924438, |
|
"learning_rate": 8.724584103512015e-05, |
|
"loss": 0.1328, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.2735674676524953, |
|
"grad_norm": 0.02430686727166176, |
|
"learning_rate": 8.632162661737525e-05, |
|
"loss": 0.0332, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.292051756007394, |
|
"grad_norm": 0.16566839814186096, |
|
"learning_rate": 8.539741219963033e-05, |
|
"loss": 0.0242, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.310536044362292, |
|
"grad_norm": 0.07895852625370026, |
|
"learning_rate": 8.447319778188541e-05, |
|
"loss": 0.0394, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.3290203327171906, |
|
"grad_norm": 0.01941494271159172, |
|
"learning_rate": 8.354898336414049e-05, |
|
"loss": 0.0373, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.3475046210720887, |
|
"grad_norm": 0.018574291840195656, |
|
"learning_rate": 8.262476894639557e-05, |
|
"loss": 0.0582, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.3659889094269873, |
|
"grad_norm": 9.006904602050781, |
|
"learning_rate": 8.170055452865065e-05, |
|
"loss": 0.075, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.3844731977818854, |
|
"grad_norm": 0.5771515965461731, |
|
"learning_rate": 8.077634011090573e-05, |
|
"loss": 0.0217, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.402957486136784, |
|
"grad_norm": 0.01840708591043949, |
|
"learning_rate": 7.985212569316082e-05, |
|
"loss": 0.0152, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.402957486136784, |
|
"eval_accuracy": 0.9986902423051736, |
|
"eval_loss": 0.008291647769510746, |
|
"eval_runtime": 53.4499, |
|
"eval_samples_per_second": 28.569, |
|
"eval_steps_per_second": 3.573, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.421441774491682, |
|
"grad_norm": 0.017435792833566666, |
|
"learning_rate": 7.89279112754159e-05, |
|
"loss": 0.0448, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.43992606284658, |
|
"grad_norm": 0.7729086875915527, |
|
"learning_rate": 7.800369685767098e-05, |
|
"loss": 0.0444, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.4584103512014788, |
|
"grad_norm": 0.059264715760946274, |
|
"learning_rate": 7.707948243992606e-05, |
|
"loss": 0.0397, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.476894639556377, |
|
"grad_norm": 0.024057278409600258, |
|
"learning_rate": 7.615526802218114e-05, |
|
"loss": 0.028, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.4953789279112755, |
|
"grad_norm": 0.022951899096369743, |
|
"learning_rate": 7.523105360443624e-05, |
|
"loss": 0.0444, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.5138632162661736, |
|
"grad_norm": 0.021782563999295235, |
|
"learning_rate": 7.430683918669132e-05, |
|
"loss": 0.0385, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.532347504621072, |
|
"grad_norm": 0.1371038258075714, |
|
"learning_rate": 7.33826247689464e-05, |
|
"loss": 0.0188, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.5508317929759703, |
|
"grad_norm": 0.7299683690071106, |
|
"learning_rate": 7.245841035120148e-05, |
|
"loss": 0.0845, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.569316081330869, |
|
"grad_norm": 0.34656259417533875, |
|
"learning_rate": 7.153419593345656e-05, |
|
"loss": 0.0436, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.587800369685767, |
|
"grad_norm": 0.10165718197822571, |
|
"learning_rate": 7.060998151571166e-05, |
|
"loss": 0.08, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.587800369685767, |
|
"eval_accuracy": 0.9941060903732809, |
|
"eval_loss": 0.021378275007009506, |
|
"eval_runtime": 52.8132, |
|
"eval_samples_per_second": 28.913, |
|
"eval_steps_per_second": 3.617, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.6062846580406656, |
|
"grad_norm": 5.586907863616943, |
|
"learning_rate": 6.968576709796674e-05, |
|
"loss": 0.0295, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.6247689463955637, |
|
"grad_norm": 0.0221896730363369, |
|
"learning_rate": 6.876155268022182e-05, |
|
"loss": 0.0627, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.6432532347504623, |
|
"grad_norm": 0.30416977405548096, |
|
"learning_rate": 6.78373382624769e-05, |
|
"loss": 0.0035, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.6617375231053604, |
|
"grad_norm": 0.102454274892807, |
|
"learning_rate": 6.691312384473198e-05, |
|
"loss": 0.0641, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.6802218114602585, |
|
"grad_norm": 0.023131974041461945, |
|
"learning_rate": 6.598890942698706e-05, |
|
"loss": 0.0326, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.698706099815157, |
|
"grad_norm": 0.09067076444625854, |
|
"learning_rate": 6.506469500924215e-05, |
|
"loss": 0.017, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.7171903881700556, |
|
"grad_norm": 3.3906850814819336, |
|
"learning_rate": 6.414048059149723e-05, |
|
"loss": 0.029, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.7356746765249538, |
|
"grad_norm": 0.061337146908044815, |
|
"learning_rate": 6.321626617375231e-05, |
|
"loss": 0.0168, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.754158964879852, |
|
"grad_norm": 0.19621238112449646, |
|
"learning_rate": 6.229205175600739e-05, |
|
"loss": 0.006, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.7726432532347505, |
|
"grad_norm": 0.012029612436890602, |
|
"learning_rate": 6.136783733826249e-05, |
|
"loss": 0.0043, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.7726432532347505, |
|
"eval_accuracy": 0.9980353634577603, |
|
"eval_loss": 0.006946724373847246, |
|
"eval_runtime": 52.203, |
|
"eval_samples_per_second": 29.251, |
|
"eval_steps_per_second": 3.659, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.791127541589649, |
|
"grad_norm": 0.014309920370578766, |
|
"learning_rate": 6.044362292051756e-05, |
|
"loss": 0.0074, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.809611829944547, |
|
"grad_norm": 3.063054323196411, |
|
"learning_rate": 5.951940850277264e-05, |
|
"loss": 0.0045, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.8280961182994453, |
|
"grad_norm": 0.011617097072303295, |
|
"learning_rate": 5.859519408502773e-05, |
|
"loss": 0.0525, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.846580406654344, |
|
"grad_norm": 5.252607345581055, |
|
"learning_rate": 5.767097966728281e-05, |
|
"loss": 0.0104, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.865064695009242, |
|
"grad_norm": 0.014846362173557281, |
|
"learning_rate": 5.674676524953789e-05, |
|
"loss": 0.0265, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.8835489833641406, |
|
"grad_norm": 0.011737200431525707, |
|
"learning_rate": 5.5822550831792974e-05, |
|
"loss": 0.0543, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.9020332717190387, |
|
"grad_norm": 0.012772896327078342, |
|
"learning_rate": 5.4898336414048056e-05, |
|
"loss": 0.0018, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.9205175600739373, |
|
"grad_norm": 0.06962817162275314, |
|
"learning_rate": 5.397412199630314e-05, |
|
"loss": 0.0234, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.9390018484288354, |
|
"grad_norm": 0.019341696053743362, |
|
"learning_rate": 5.304990757855823e-05, |
|
"loss": 0.105, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.957486136783734, |
|
"grad_norm": 4.673314571380615, |
|
"learning_rate": 5.2125693160813314e-05, |
|
"loss": 0.0501, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.957486136783734, |
|
"eval_accuracy": 0.9967256057629339, |
|
"eval_loss": 0.015068226493895054, |
|
"eval_runtime": 51.6353, |
|
"eval_samples_per_second": 29.573, |
|
"eval_steps_per_second": 3.699, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.975970425138632, |
|
"grad_norm": 0.018514908850193024, |
|
"learning_rate": 5.1201478743068395e-05, |
|
"loss": 0.0312, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.9944547134935307, |
|
"grad_norm": 0.0645008459687233, |
|
"learning_rate": 5.027726432532348e-05, |
|
"loss": 0.0489, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.0129390018484288, |
|
"grad_norm": 0.017880817875266075, |
|
"learning_rate": 4.935304990757856e-05, |
|
"loss": 0.0366, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.0314232902033273, |
|
"grad_norm": 0.04122663289308548, |
|
"learning_rate": 4.8428835489833646e-05, |
|
"loss": 0.0539, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.0499075785582255, |
|
"grad_norm": 0.022179430350661278, |
|
"learning_rate": 4.750462107208873e-05, |
|
"loss": 0.0248, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.068391866913124, |
|
"grad_norm": 0.924117386341095, |
|
"learning_rate": 4.658040665434381e-05, |
|
"loss": 0.02, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.086876155268022, |
|
"grad_norm": 0.01614381931722164, |
|
"learning_rate": 4.565619223659889e-05, |
|
"loss": 0.023, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.1053604436229207, |
|
"grad_norm": 0.05051511153578758, |
|
"learning_rate": 4.473197781885398e-05, |
|
"loss": 0.0041, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.123844731977819, |
|
"grad_norm": 0.02787856012582779, |
|
"learning_rate": 4.380776340110906e-05, |
|
"loss": 0.0163, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.142329020332717, |
|
"grad_norm": 0.21667926013469696, |
|
"learning_rate": 4.288354898336414e-05, |
|
"loss": 0.0186, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.142329020332717, |
|
"eval_accuracy": 0.9973804846103471, |
|
"eval_loss": 0.007818276062607765, |
|
"eval_runtime": 52.8582, |
|
"eval_samples_per_second": 28.889, |
|
"eval_steps_per_second": 3.613, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.1608133086876156, |
|
"grad_norm": 0.02714550867676735, |
|
"learning_rate": 4.195933456561922e-05, |
|
"loss": 0.0178, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.1792975970425137, |
|
"grad_norm": 0.5191987156867981, |
|
"learning_rate": 4.1035120147874305e-05, |
|
"loss": 0.0582, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.1977818853974123, |
|
"grad_norm": 0.02666807919740677, |
|
"learning_rate": 4.011090573012939e-05, |
|
"loss": 0.007, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.2162661737523104, |
|
"grad_norm": 0.06601597368717194, |
|
"learning_rate": 3.9186691312384474e-05, |
|
"loss": 0.0477, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.234750462107209, |
|
"grad_norm": 0.0280216746032238, |
|
"learning_rate": 3.826247689463956e-05, |
|
"loss": 0.0048, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.253234750462107, |
|
"grad_norm": 4.720592021942139, |
|
"learning_rate": 3.7338262476894644e-05, |
|
"loss": 0.0186, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.2717190388170057, |
|
"grad_norm": 0.01574169471859932, |
|
"learning_rate": 3.6414048059149726e-05, |
|
"loss": 0.0017, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.290203327171904, |
|
"grad_norm": 0.02533087506890297, |
|
"learning_rate": 3.548983364140481e-05, |
|
"loss": 0.0025, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.3086876155268024, |
|
"grad_norm": 0.013142619282007217, |
|
"learning_rate": 3.456561922365989e-05, |
|
"loss": 0.0376, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.3271719038817005, |
|
"grad_norm": 0.07316397875547409, |
|
"learning_rate": 3.364140480591497e-05, |
|
"loss": 0.0033, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.3271719038817005, |
|
"eval_accuracy": 0.9960707269155207, |
|
"eval_loss": 0.013949541375041008, |
|
"eval_runtime": 53.0604, |
|
"eval_samples_per_second": 28.779, |
|
"eval_steps_per_second": 3.6, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.345656192236599, |
|
"grad_norm": 0.015296310186386108, |
|
"learning_rate": 3.271719038817006e-05, |
|
"loss": 0.0015, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.364140480591497, |
|
"grad_norm": 5.960048198699951, |
|
"learning_rate": 3.179297597042514e-05, |
|
"loss": 0.0222, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.3826247689463957, |
|
"grad_norm": 0.21616186201572418, |
|
"learning_rate": 3.086876155268023e-05, |
|
"loss": 0.0038, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.401109057301294, |
|
"grad_norm": 0.015051410533487797, |
|
"learning_rate": 2.994454713493531e-05, |
|
"loss": 0.0019, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.4195933456561924, |
|
"grad_norm": 13.381204605102539, |
|
"learning_rate": 2.902033271719039e-05, |
|
"loss": 0.0182, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.4380776340110906, |
|
"grad_norm": 0.1726062297821045, |
|
"learning_rate": 2.8096118299445472e-05, |
|
"loss": 0.0022, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.4565619223659887, |
|
"grad_norm": 0.01701999455690384, |
|
"learning_rate": 2.7171903881700557e-05, |
|
"loss": 0.0014, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.4750462107208873, |
|
"grad_norm": 0.013869056478142738, |
|
"learning_rate": 2.624768946395564e-05, |
|
"loss": 0.0013, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.4935304990757854, |
|
"grad_norm": 0.021621432155370712, |
|
"learning_rate": 2.532347504621072e-05, |
|
"loss": 0.0016, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.512014787430684, |
|
"grad_norm": 1.3106377124786377, |
|
"learning_rate": 2.4399260628465805e-05, |
|
"loss": 0.0023, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.512014787430684, |
|
"eval_accuracy": 0.9986902423051736, |
|
"eval_loss": 0.0075506423600018024, |
|
"eval_runtime": 50.8135, |
|
"eval_samples_per_second": 30.051, |
|
"eval_steps_per_second": 3.759, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.530499075785582, |
|
"grad_norm": 0.01985827274620533, |
|
"learning_rate": 2.347504621072089e-05, |
|
"loss": 0.0016, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.5489833641404807, |
|
"grad_norm": 0.013897390104830265, |
|
"learning_rate": 2.255083179297597e-05, |
|
"loss": 0.0308, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.567467652495379, |
|
"grad_norm": 0.009370139800012112, |
|
"learning_rate": 2.1626617375231053e-05, |
|
"loss": 0.0123, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.5859519408502774, |
|
"grad_norm": 0.019544150680303574, |
|
"learning_rate": 2.0702402957486137e-05, |
|
"loss": 0.0257, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.6044362292051755, |
|
"grad_norm": 0.018746808171272278, |
|
"learning_rate": 1.9778188539741222e-05, |
|
"loss": 0.03, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.622920517560074, |
|
"grad_norm": 0.009196238592267036, |
|
"learning_rate": 1.8853974121996304e-05, |
|
"loss": 0.0011, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.641404805914972, |
|
"grad_norm": 0.011442320421338081, |
|
"learning_rate": 1.7929759704251385e-05, |
|
"loss": 0.0012, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.6598890942698707, |
|
"grad_norm": 0.010710498318076134, |
|
"learning_rate": 1.700554528650647e-05, |
|
"loss": 0.0019, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.678373382624769, |
|
"grad_norm": 0.06102241575717926, |
|
"learning_rate": 1.6081330868761555e-05, |
|
"loss": 0.0012, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.6968576709796674, |
|
"grad_norm": 0.008612744510173798, |
|
"learning_rate": 1.5157116451016636e-05, |
|
"loss": 0.0054, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.6968576709796674, |
|
"eval_accuracy": 0.9993451211525868, |
|
"eval_loss": 0.0047513521276414394, |
|
"eval_runtime": 52.2618, |
|
"eval_samples_per_second": 29.218, |
|
"eval_steps_per_second": 3.655, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.7153419593345656, |
|
"grad_norm": 0.008234468288719654, |
|
"learning_rate": 1.423290203327172e-05, |
|
"loss": 0.043, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.733826247689464, |
|
"grad_norm": 0.008917649276554585, |
|
"learning_rate": 1.3308687615526803e-05, |
|
"loss": 0.0384, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.7523105360443623, |
|
"grad_norm": 0.00844865757972002, |
|
"learning_rate": 1.2384473197781886e-05, |
|
"loss": 0.0013, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.7707948243992604, |
|
"grad_norm": 0.008531128987669945, |
|
"learning_rate": 1.1460258780036969e-05, |
|
"loss": 0.0195, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.789279112754159, |
|
"grad_norm": 0.009270643815398216, |
|
"learning_rate": 1.0536044362292052e-05, |
|
"loss": 0.0392, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.8077634011090575, |
|
"grad_norm": 0.009245671331882477, |
|
"learning_rate": 9.611829944547135e-06, |
|
"loss": 0.0011, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.8262476894639557, |
|
"grad_norm": 0.01690092496573925, |
|
"learning_rate": 8.687615526802218e-06, |
|
"loss": 0.0016, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.844731977818854, |
|
"grad_norm": 0.015731679275631905, |
|
"learning_rate": 7.763401109057302e-06, |
|
"loss": 0.0317, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.8632162661737524, |
|
"grad_norm": 3.0953285694122314, |
|
"learning_rate": 6.931608133086876e-06, |
|
"loss": 0.0454, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.8817005545286505, |
|
"grad_norm": 6.279654502868652, |
|
"learning_rate": 6.0073937153419595e-06, |
|
"loss": 0.0168, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.8817005545286505, |
|
"eval_accuracy": 0.9986902423051736, |
|
"eval_loss": 0.006641203537583351, |
|
"eval_runtime": 52.9204, |
|
"eval_samples_per_second": 28.855, |
|
"eval_steps_per_second": 3.609, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.900184842883549, |
|
"grad_norm": 0.009602474048733711, |
|
"learning_rate": 5.083179297597043e-06, |
|
"loss": 0.0011, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.918669131238447, |
|
"grad_norm": 12.240010261535645, |
|
"learning_rate": 4.158964879852126e-06, |
|
"loss": 0.0236, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.9371534195933457, |
|
"grad_norm": 0.03988449275493622, |
|
"learning_rate": 3.234750462107209e-06, |
|
"loss": 0.0014, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.955637707948244, |
|
"grad_norm": 5.554378986358643, |
|
"learning_rate": 2.310536044362292e-06, |
|
"loss": 0.0041, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.9741219963031424, |
|
"grad_norm": 0.0083112558349967, |
|
"learning_rate": 1.3863216266173753e-06, |
|
"loss": 0.02, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.9926062846580406, |
|
"grad_norm": 2.2959258556365967, |
|
"learning_rate": 4.621072088724585e-07, |
|
"loss": 0.0053, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 2164, |
|
"total_flos": 2.6818427765818e+18, |
|
"train_loss": 0.0841421499820822, |
|
"train_runtime": 2597.595, |
|
"train_samples_per_second": 13.323, |
|
"train_steps_per_second": 0.833 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2164, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.6818427765818e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|