{ "best_metric": 0.0025608371943235397, "best_model_checkpoint": "finetuned-arsenic/checkpoint-1700", "epoch": 4.0, "eval_steps": 100, "global_step": 2164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018484288354898338, "grad_norm": 3.1622745990753174, "learning_rate": 0.0001990757855822551, "loss": 0.5015, "step": 10 }, { "epoch": 0.036968576709796676, "grad_norm": 3.4678640365600586, "learning_rate": 0.00019815157116451017, "loss": 0.4604, "step": 20 }, { "epoch": 0.05545286506469501, "grad_norm": 3.134012460708618, "learning_rate": 0.00019722735674676528, "loss": 0.2732, "step": 30 }, { "epoch": 0.07393715341959335, "grad_norm": 1.4681167602539062, "learning_rate": 0.00019630314232902034, "loss": 0.4454, "step": 40 }, { "epoch": 0.09242144177449169, "grad_norm": 2.2367947101593018, "learning_rate": 0.00019537892791127544, "loss": 0.3936, "step": 50 }, { "epoch": 0.11090573012939002, "grad_norm": 1.4795340299606323, "learning_rate": 0.0001944547134935305, "loss": 0.2588, "step": 60 }, { "epoch": 0.12939001848428835, "grad_norm": 1.862062692642212, "learning_rate": 0.0001935304990757856, "loss": 0.3119, "step": 70 }, { "epoch": 0.1478743068391867, "grad_norm": 2.609395980834961, "learning_rate": 0.00019260628465804066, "loss": 0.2267, "step": 80 }, { "epoch": 0.16635859519408502, "grad_norm": 1.176942229270935, "learning_rate": 0.00019168207024029577, "loss": 0.1786, "step": 90 }, { "epoch": 0.18484288354898337, "grad_norm": 3.820526599884033, "learning_rate": 0.00019075785582255082, "loss": 0.2214, "step": 100 }, { "epoch": 0.18484288354898337, "eval_accuracy": 0.9607072691552063, "eval_loss": 0.12430191040039062, "eval_runtime": 63.6085, "eval_samples_per_second": 24.006, "eval_steps_per_second": 3.003, "step": 100 }, { "epoch": 0.2033271719038817, "grad_norm": 1.79490065574646, "learning_rate": 0.00018983364140480593, "loss": 0.2468, "step": 110 }, { "epoch": 0.22181146025878004, "grad_norm": 0.4057539701461792, "learning_rate": 0.000188909426987061, "loss": 0.2852, "step": 120 }, { "epoch": 0.24029574861367836, "grad_norm": 0.1476714164018631, "learning_rate": 0.0001879852125693161, "loss": 0.1344, "step": 130 }, { "epoch": 0.2587800369685767, "grad_norm": 3.7497174739837646, "learning_rate": 0.00018706099815157118, "loss": 0.1937, "step": 140 }, { "epoch": 0.27726432532347506, "grad_norm": 1.4686886072158813, "learning_rate": 0.00018613678373382626, "loss": 0.14, "step": 150 }, { "epoch": 0.2957486136783734, "grad_norm": 2.834284782409668, "learning_rate": 0.00018521256931608134, "loss": 0.2202, "step": 160 }, { "epoch": 0.3142329020332717, "grad_norm": 3.5288290977478027, "learning_rate": 0.00018428835489833642, "loss": 0.1548, "step": 170 }, { "epoch": 0.33271719038817005, "grad_norm": 2.251044273376465, "learning_rate": 0.0001833641404805915, "loss": 0.2552, "step": 180 }, { "epoch": 0.3512014787430684, "grad_norm": 2.9413559436798096, "learning_rate": 0.00018243992606284658, "loss": 0.1838, "step": 190 }, { "epoch": 0.36968576709796674, "grad_norm": 1.2569198608398438, "learning_rate": 0.0001815157116451017, "loss": 0.1213, "step": 200 }, { "epoch": 0.36968576709796674, "eval_accuracy": 0.933857236411264, "eval_loss": 0.17631804943084717, "eval_runtime": 52.9442, "eval_samples_per_second": 28.842, "eval_steps_per_second": 3.608, "step": 200 }, { "epoch": 0.38817005545286504, "grad_norm": 6.763731002807617, "learning_rate": 0.00018059149722735675, "loss": 0.1987, "step": 210 }, { "epoch": 0.4066543438077634, "grad_norm": 1.2817621231079102, "learning_rate": 0.00017966728280961186, "loss": 0.2626, "step": 220 }, { "epoch": 0.42513863216266173, "grad_norm": 5.742928504943848, "learning_rate": 0.0001787430683918669, "loss": 0.1899, "step": 230 }, { "epoch": 0.4436229205175601, "grad_norm": 0.5674309134483337, "learning_rate": 0.00017781885397412202, "loss": 0.0699, "step": 240 }, { "epoch": 0.46210720887245843, "grad_norm": 5.822825908660889, "learning_rate": 0.00017689463955637707, "loss": 0.091, "step": 250 }, { "epoch": 0.4805914972273567, "grad_norm": 0.5449599027633667, "learning_rate": 0.00017597042513863218, "loss": 0.1584, "step": 260 }, { "epoch": 0.49907578558225507, "grad_norm": 0.09896630793809891, "learning_rate": 0.00017504621072088724, "loss": 0.2052, "step": 270 }, { "epoch": 0.5175600739371534, "grad_norm": 4.201374530792236, "learning_rate": 0.00017412199630314234, "loss": 0.1447, "step": 280 }, { "epoch": 0.5360443622920518, "grad_norm": 6.048889636993408, "learning_rate": 0.00017319778188539743, "loss": 0.1032, "step": 290 }, { "epoch": 0.5545286506469501, "grad_norm": 5.420243740081787, "learning_rate": 0.0001722735674676525, "loss": 0.1201, "step": 300 }, { "epoch": 0.5545286506469501, "eval_accuracy": 0.9607072691552063, "eval_loss": 0.10179698467254639, "eval_runtime": 50.5522, "eval_samples_per_second": 30.206, "eval_steps_per_second": 3.778, "step": 300 }, { "epoch": 0.5730129390018485, "grad_norm": 1.7872523069381714, "learning_rate": 0.0001713493530499076, "loss": 0.1725, "step": 310 }, { "epoch": 0.5914972273567468, "grad_norm": 3.7505619525909424, "learning_rate": 0.00017042513863216267, "loss": 0.0805, "step": 320 }, { "epoch": 0.609981515711645, "grad_norm": 3.3265647888183594, "learning_rate": 0.00016950092421441775, "loss": 0.0945, "step": 330 }, { "epoch": 0.6284658040665434, "grad_norm": 0.0822325125336647, "learning_rate": 0.00016857670979667283, "loss": 0.184, "step": 340 }, { "epoch": 0.6469500924214417, "grad_norm": 6.458297252655029, "learning_rate": 0.00016765249537892791, "loss": 0.2462, "step": 350 }, { "epoch": 0.6654343807763401, "grad_norm": 8.956634521484375, "learning_rate": 0.000166728280961183, "loss": 0.1549, "step": 360 }, { "epoch": 0.6839186691312384, "grad_norm": 8.221627235412598, "learning_rate": 0.00016580406654343808, "loss": 0.0573, "step": 370 }, { "epoch": 0.7024029574861368, "grad_norm": 1.3730543851852417, "learning_rate": 0.00016487985212569316, "loss": 0.1435, "step": 380 }, { "epoch": 0.7208872458410351, "grad_norm": 3.857621669769287, "learning_rate": 0.00016395563770794827, "loss": 0.1512, "step": 390 }, { "epoch": 0.7393715341959335, "grad_norm": 0.7978448867797852, "learning_rate": 0.00016303142329020332, "loss": 0.0991, "step": 400 }, { "epoch": 0.7393715341959335, "eval_accuracy": 0.9417157825802227, "eval_loss": 0.20708701014518738, "eval_runtime": 49.1709, "eval_samples_per_second": 31.055, "eval_steps_per_second": 3.884, "step": 400 }, { "epoch": 0.7578558225508318, "grad_norm": 0.13973641395568848, "learning_rate": 0.00016210720887245843, "loss": 0.2316, "step": 410 }, { "epoch": 0.7763401109057301, "grad_norm": 5.2767415046691895, "learning_rate": 0.00016118299445471348, "loss": 0.0945, "step": 420 }, { "epoch": 0.7948243992606284, "grad_norm": 1.6643435955047607, "learning_rate": 0.0001602587800369686, "loss": 0.1037, "step": 430 }, { "epoch": 0.8133086876155268, "grad_norm": 2.7701528072357178, "learning_rate": 0.00015933456561922367, "loss": 0.0687, "step": 440 }, { "epoch": 0.8317929759704251, "grad_norm": 0.07744336128234863, "learning_rate": 0.00015841035120147876, "loss": 0.1097, "step": 450 }, { "epoch": 0.8502772643253235, "grad_norm": 1.1288193464279175, "learning_rate": 0.00015748613678373384, "loss": 0.1874, "step": 460 }, { "epoch": 0.8687615526802218, "grad_norm": 7.898807525634766, "learning_rate": 0.00015656192236598892, "loss": 0.1038, "step": 470 }, { "epoch": 0.8872458410351202, "grad_norm": 2.908022403717041, "learning_rate": 0.000155637707948244, "loss": 0.0918, "step": 480 }, { "epoch": 0.9057301293900185, "grad_norm": 1.4139397144317627, "learning_rate": 0.00015471349353049908, "loss": 0.1258, "step": 490 }, { "epoch": 0.9242144177449169, "grad_norm": 0.09938838332891464, "learning_rate": 0.00015378927911275416, "loss": 0.1127, "step": 500 }, { "epoch": 0.9242144177449169, "eval_accuracy": 0.9666011787819253, "eval_loss": 0.08861085772514343, "eval_runtime": 48.3076, "eval_samples_per_second": 31.61, "eval_steps_per_second": 3.954, "step": 500 }, { "epoch": 0.9426987060998152, "grad_norm": 0.2065439671278, "learning_rate": 0.00015286506469500925, "loss": 0.0658, "step": 510 }, { "epoch": 0.9611829944547134, "grad_norm": 4.408117771148682, "learning_rate": 0.00015194085027726433, "loss": 0.0739, "step": 520 }, { "epoch": 0.9796672828096118, "grad_norm": 0.18956203758716583, "learning_rate": 0.0001510166358595194, "loss": 0.1474, "step": 530 }, { "epoch": 0.9981515711645101, "grad_norm": 2.8244571685791016, "learning_rate": 0.0001500924214417745, "loss": 0.0509, "step": 540 }, { "epoch": 1.0166358595194085, "grad_norm": 0.705922544002533, "learning_rate": 0.00014916820702402957, "loss": 0.0719, "step": 550 }, { "epoch": 1.0351201478743068, "grad_norm": 1.5707758665084839, "learning_rate": 0.00014824399260628468, "loss": 0.0954, "step": 560 }, { "epoch": 1.0536044362292052, "grad_norm": 1.2642130851745605, "learning_rate": 0.00014731977818853976, "loss": 0.0859, "step": 570 }, { "epoch": 1.0720887245841035, "grad_norm": 0.222818985581398, "learning_rate": 0.00014639556377079484, "loss": 0.0827, "step": 580 }, { "epoch": 1.0905730129390019, "grad_norm": 0.05685073137283325, "learning_rate": 0.00014547134935304992, "loss": 0.0628, "step": 590 }, { "epoch": 1.1090573012939002, "grad_norm": 0.06841599196195602, "learning_rate": 0.000144547134935305, "loss": 0.0314, "step": 600 }, { "epoch": 1.1090573012939002, "eval_accuracy": 0.9908316961362148, "eval_loss": 0.033296775072813034, "eval_runtime": 49.081, "eval_samples_per_second": 31.112, "eval_steps_per_second": 3.892, "step": 600 }, { "epoch": 1.1275415896487986, "grad_norm": 0.8085036873817444, "learning_rate": 0.0001436229205175601, "loss": 0.0295, "step": 610 }, { "epoch": 1.146025878003697, "grad_norm": 0.030383585020899773, "learning_rate": 0.00014269870609981517, "loss": 0.0168, "step": 620 }, { "epoch": 1.1645101663585953, "grad_norm": 2.179034948348999, "learning_rate": 0.00014177449168207025, "loss": 0.217, "step": 630 }, { "epoch": 1.1829944547134936, "grad_norm": 0.033746711909770966, "learning_rate": 0.00014085027726432533, "loss": 0.12, "step": 640 }, { "epoch": 1.201478743068392, "grad_norm": 0.06319836527109146, "learning_rate": 0.0001399260628465804, "loss": 0.01, "step": 650 }, { "epoch": 1.21996303142329, "grad_norm": 0.06434024125337601, "learning_rate": 0.0001390018484288355, "loss": 0.0755, "step": 660 }, { "epoch": 1.2384473197781884, "grad_norm": 3.2345378398895264, "learning_rate": 0.00013807763401109058, "loss": 0.1755, "step": 670 }, { "epoch": 1.2569316081330868, "grad_norm": 8.074490547180176, "learning_rate": 0.00013715341959334566, "loss": 0.1248, "step": 680 }, { "epoch": 1.2754158964879851, "grad_norm": 0.21135617792606354, "learning_rate": 0.00013622920517560074, "loss": 0.0662, "step": 690 }, { "epoch": 1.2939001848428835, "grad_norm": 0.0901883915066719, "learning_rate": 0.00013530499075785582, "loss": 0.0252, "step": 700 }, { "epoch": 1.2939001848428835, "eval_accuracy": 0.9973804846103471, "eval_loss": 0.010960910469293594, "eval_runtime": 49.6376, "eval_samples_per_second": 30.763, "eval_steps_per_second": 3.848, "step": 700 }, { "epoch": 1.3123844731977818, "grad_norm": 2.1113812923431396, "learning_rate": 0.0001343807763401109, "loss": 0.0685, "step": 710 }, { "epoch": 1.3308687615526802, "grad_norm": 0.062072403728961945, "learning_rate": 0.000133456561922366, "loss": 0.0747, "step": 720 }, { "epoch": 1.3493530499075785, "grad_norm": 4.824169158935547, "learning_rate": 0.00013253234750462106, "loss": 0.0791, "step": 730 }, { "epoch": 1.3678373382624769, "grad_norm": 0.12804123759269714, "learning_rate": 0.00013160813308687617, "loss": 0.045, "step": 740 }, { "epoch": 1.3863216266173752, "grad_norm": 0.3526214361190796, "learning_rate": 0.00013068391866913125, "loss": 0.0749, "step": 750 }, { "epoch": 1.4048059149722736, "grad_norm": 0.5252203345298767, "learning_rate": 0.00012975970425138634, "loss": 0.0459, "step": 760 }, { "epoch": 1.423290203327172, "grad_norm": 9.271531105041504, "learning_rate": 0.00012883548983364142, "loss": 0.1033, "step": 770 }, { "epoch": 1.4417744916820703, "grad_norm": 0.13968196511268616, "learning_rate": 0.0001279112754158965, "loss": 0.03, "step": 780 }, { "epoch": 1.4602587800369686, "grad_norm": 0.14128534495830536, "learning_rate": 0.00012698706099815158, "loss": 0.0402, "step": 790 }, { "epoch": 1.478743068391867, "grad_norm": 1.0647764205932617, "learning_rate": 0.00012606284658040666, "loss": 0.0582, "step": 800 }, { "epoch": 1.478743068391867, "eval_accuracy": 0.9986902423051736, "eval_loss": 0.010355145670473576, "eval_runtime": 48.0295, "eval_samples_per_second": 31.793, "eval_steps_per_second": 3.977, "step": 800 }, { "epoch": 1.4972273567467653, "grad_norm": 3.4088709354400635, "learning_rate": 0.00012513863216266174, "loss": 0.0495, "step": 810 }, { "epoch": 1.5157116451016637, "grad_norm": 0.2625955045223236, "learning_rate": 0.00012421441774491682, "loss": 0.0641, "step": 820 }, { "epoch": 1.5341959334565618, "grad_norm": 0.08498267084360123, "learning_rate": 0.0001232902033271719, "loss": 0.0331, "step": 830 }, { "epoch": 1.5526802218114604, "grad_norm": 10.900208473205566, "learning_rate": 0.000122365988909427, "loss": 0.0478, "step": 840 }, { "epoch": 1.5711645101663585, "grad_norm": 0.056279949843883514, "learning_rate": 0.00012144177449168208, "loss": 0.0754, "step": 850 }, { "epoch": 1.589648798521257, "grad_norm": 0.05696770176291466, "learning_rate": 0.00012051756007393715, "loss": 0.0906, "step": 860 }, { "epoch": 1.6081330868761552, "grad_norm": 0.022501414641737938, "learning_rate": 0.00011959334565619225, "loss": 0.0413, "step": 870 }, { "epoch": 1.6266173752310538, "grad_norm": 2.3988966941833496, "learning_rate": 0.00011866913123844731, "loss": 0.074, "step": 880 }, { "epoch": 1.645101663585952, "grad_norm": 0.03616836294531822, "learning_rate": 0.00011774491682070241, "loss": 0.0094, "step": 890 }, { "epoch": 1.6635859519408502, "grad_norm": 9.183988571166992, "learning_rate": 0.00011682070240295748, "loss": 0.0455, "step": 900 }, { "epoch": 1.6635859519408502, "eval_accuracy": 0.9954158480681073, "eval_loss": 0.01981273666024208, "eval_runtime": 48.1294, "eval_samples_per_second": 31.727, "eval_steps_per_second": 3.968, "step": 900 }, { "epoch": 1.6820702402957486, "grad_norm": 20.508174896240234, "learning_rate": 0.00011589648798521257, "loss": 0.1063, "step": 910 }, { "epoch": 1.700554528650647, "grad_norm": 3.3739655017852783, "learning_rate": 0.00011497227356746765, "loss": 0.0711, "step": 920 }, { "epoch": 1.7190388170055453, "grad_norm": 0.03310403227806091, "learning_rate": 0.00011404805914972275, "loss": 0.0216, "step": 930 }, { "epoch": 1.7375231053604436, "grad_norm": 0.027661019936203957, "learning_rate": 0.00011312384473197783, "loss": 0.05, "step": 940 }, { "epoch": 1.756007393715342, "grad_norm": 0.02968147210776806, "learning_rate": 0.00011219963031423291, "loss": 0.0368, "step": 950 }, { "epoch": 1.7744916820702403, "grad_norm": 0.01928904838860035, "learning_rate": 0.000111275415896488, "loss": 0.0083, "step": 960 }, { "epoch": 1.7929759704251387, "grad_norm": 0.048537448048591614, "learning_rate": 0.00011035120147874307, "loss": 0.0421, "step": 970 }, { "epoch": 1.8114602587800368, "grad_norm": 0.01963023841381073, "learning_rate": 0.00010942698706099817, "loss": 0.0059, "step": 980 }, { "epoch": 1.8299445471349354, "grad_norm": 0.035353146493434906, "learning_rate": 0.00010850277264325324, "loss": 0.0446, "step": 990 }, { "epoch": 1.8484288354898335, "grad_norm": 0.015135395340621471, "learning_rate": 0.00010757855822550833, "loss": 0.0569, "step": 1000 }, { "epoch": 1.8484288354898335, "eval_accuracy": 0.9960707269155207, "eval_loss": 0.017953284084796906, "eval_runtime": 49.2028, "eval_samples_per_second": 31.035, "eval_steps_per_second": 3.882, "step": 1000 }, { "epoch": 1.866913123844732, "grad_norm": 0.02287949249148369, "learning_rate": 0.0001066543438077634, "loss": 0.0033, "step": 1010 }, { "epoch": 1.8853974121996302, "grad_norm": 0.024308230727910995, "learning_rate": 0.0001057301293900185, "loss": 0.0028, "step": 1020 }, { "epoch": 1.9038817005545288, "grad_norm": 8.346418380737305, "learning_rate": 0.00010480591497227356, "loss": 0.0468, "step": 1030 }, { "epoch": 1.922365988909427, "grad_norm": 0.015379060991108418, "learning_rate": 0.00010388170055452866, "loss": 0.1142, "step": 1040 }, { "epoch": 1.9408502772643255, "grad_norm": 0.021259386092424393, "learning_rate": 0.00010295748613678373, "loss": 0.057, "step": 1050 }, { "epoch": 1.9593345656192236, "grad_norm": 4.173085689544678, "learning_rate": 0.00010203327171903882, "loss": 0.0333, "step": 1060 }, { "epoch": 1.9778188539741222, "grad_norm": 0.013770132325589657, "learning_rate": 0.0001011090573012939, "loss": 0.0248, "step": 1070 }, { "epoch": 1.9963031423290203, "grad_norm": 0.029461657628417015, "learning_rate": 0.000100184842883549, "loss": 0.1002, "step": 1080 }, { "epoch": 2.014787430683919, "grad_norm": 0.042121052742004395, "learning_rate": 9.926062846580408e-05, "loss": 0.0499, "step": 1090 }, { "epoch": 2.033271719038817, "grad_norm": 6.605316162109375, "learning_rate": 9.833641404805916e-05, "loss": 0.0627, "step": 1100 }, { "epoch": 2.033271719038817, "eval_accuracy": 0.9947609692206941, "eval_loss": 0.024407994002103806, "eval_runtime": 48.147, "eval_samples_per_second": 31.715, "eval_steps_per_second": 3.967, "step": 1100 }, { "epoch": 2.0517560073937156, "grad_norm": 0.03806009516119957, "learning_rate": 9.741219963031424e-05, "loss": 0.0162, "step": 1110 }, { "epoch": 2.0702402957486137, "grad_norm": 0.04799278452992439, "learning_rate": 9.648798521256932e-05, "loss": 0.0083, "step": 1120 }, { "epoch": 2.088724584103512, "grad_norm": 3.807424545288086, "learning_rate": 9.55637707948244e-05, "loss": 0.0349, "step": 1130 }, { "epoch": 2.1072088724584104, "grad_norm": 0.013640133664011955, "learning_rate": 9.463955637707949e-05, "loss": 0.0079, "step": 1140 }, { "epoch": 2.1256931608133085, "grad_norm": 0.9286332726478577, "learning_rate": 9.371534195933457e-05, "loss": 0.0675, "step": 1150 }, { "epoch": 2.144177449168207, "grad_norm": 7.584729194641113, "learning_rate": 9.279112754158965e-05, "loss": 0.0223, "step": 1160 }, { "epoch": 2.162661737523105, "grad_norm": 0.02116883546113968, "learning_rate": 9.186691312384473e-05, "loss": 0.0046, "step": 1170 }, { "epoch": 2.1811460258780038, "grad_norm": 0.014812180772423744, "learning_rate": 9.094269870609981e-05, "loss": 0.0194, "step": 1180 }, { "epoch": 2.199630314232902, "grad_norm": 0.011532890610396862, "learning_rate": 9.001848428835489e-05, "loss": 0.0236, "step": 1190 }, { "epoch": 2.2181146025878005, "grad_norm": 0.01616760343313217, "learning_rate": 8.909426987060999e-05, "loss": 0.0328, "step": 1200 }, { "epoch": 2.2181146025878005, "eval_accuracy": 0.9986902423051736, "eval_loss": 0.0054244487546384335, "eval_runtime": 49.4505, "eval_samples_per_second": 30.879, "eval_steps_per_second": 3.862, "step": 1200 }, { "epoch": 2.2365988909426986, "grad_norm": 0.04048744961619377, "learning_rate": 8.817005545286507e-05, "loss": 0.0329, "step": 1210 }, { "epoch": 2.255083179297597, "grad_norm": 0.0121224420145154, "learning_rate": 8.724584103512015e-05, "loss": 0.0229, "step": 1220 }, { "epoch": 2.2735674676524953, "grad_norm": 3.312351942062378, "learning_rate": 8.632162661737525e-05, "loss": 0.0337, "step": 1230 }, { "epoch": 2.292051756007394, "grad_norm": 0.04162617400288582, "learning_rate": 8.539741219963033e-05, "loss": 0.0562, "step": 1240 }, { "epoch": 2.310536044362292, "grad_norm": 0.0170124601572752, "learning_rate": 8.447319778188541e-05, "loss": 0.0048, "step": 1250 }, { "epoch": 2.3290203327171906, "grad_norm": 0.03438381850719452, "learning_rate": 8.354898336414049e-05, "loss": 0.0305, "step": 1260 }, { "epoch": 2.3475046210720887, "grad_norm": 4.124629974365234, "learning_rate": 8.262476894639557e-05, "loss": 0.0413, "step": 1270 }, { "epoch": 2.3659889094269873, "grad_norm": 0.009506890550255775, "learning_rate": 8.170055452865065e-05, "loss": 0.0775, "step": 1280 }, { "epoch": 2.3844731977818854, "grad_norm": 0.009906689636409283, "learning_rate": 8.077634011090573e-05, "loss": 0.0432, "step": 1290 }, { "epoch": 2.402957486136784, "grad_norm": 0.0475531667470932, "learning_rate": 7.985212569316082e-05, "loss": 0.0156, "step": 1300 }, { "epoch": 2.402957486136784, "eval_accuracy": 0.9947609692206941, "eval_loss": 0.0192621648311615, "eval_runtime": 48.3339, "eval_samples_per_second": 31.593, "eval_steps_per_second": 3.952, "step": 1300 }, { "epoch": 2.421441774491682, "grad_norm": 0.05756278708577156, "learning_rate": 7.89279112754159e-05, "loss": 0.0771, "step": 1310 }, { "epoch": 2.43992606284658, "grad_norm": 0.021766463294625282, "learning_rate": 7.800369685767098e-05, "loss": 0.0661, "step": 1320 }, { "epoch": 2.4584103512014788, "grad_norm": 0.02948659099638462, "learning_rate": 7.707948243992606e-05, "loss": 0.0166, "step": 1330 }, { "epoch": 2.476894639556377, "grad_norm": 9.3042573928833, "learning_rate": 7.615526802218114e-05, "loss": 0.0309, "step": 1340 }, { "epoch": 2.4953789279112755, "grad_norm": 0.023814814165234566, "learning_rate": 7.523105360443624e-05, "loss": 0.0783, "step": 1350 }, { "epoch": 2.5138632162661736, "grad_norm": 3.380216598510742, "learning_rate": 7.430683918669132e-05, "loss": 0.0556, "step": 1360 }, { "epoch": 2.532347504621072, "grad_norm": 0.0165497325360775, "learning_rate": 7.33826247689464e-05, "loss": 0.0221, "step": 1370 }, { "epoch": 2.5508317929759703, "grad_norm": 0.19721367955207825, "learning_rate": 7.245841035120148e-05, "loss": 0.0147, "step": 1380 }, { "epoch": 2.569316081330869, "grad_norm": 7.320788383483887, "learning_rate": 7.153419593345656e-05, "loss": 0.0251, "step": 1390 }, { "epoch": 2.587800369685767, "grad_norm": 0.009770031087100506, "learning_rate": 7.060998151571166e-05, "loss": 0.0016, "step": 1400 }, { "epoch": 2.587800369685767, "eval_accuracy": 0.9973804846103471, "eval_loss": 0.007392051629722118, "eval_runtime": 49.7571, "eval_samples_per_second": 30.689, "eval_steps_per_second": 3.839, "step": 1400 }, { "epoch": 2.6062846580406656, "grad_norm": 0.009464599192142487, "learning_rate": 6.968576709796674e-05, "loss": 0.0235, "step": 1410 }, { "epoch": 2.6247689463955637, "grad_norm": 3.2390127182006836, "learning_rate": 6.876155268022182e-05, "loss": 0.0419, "step": 1420 }, { "epoch": 2.6432532347504623, "grad_norm": 0.015608754009008408, "learning_rate": 6.78373382624769e-05, "loss": 0.0043, "step": 1430 }, { "epoch": 2.6617375231053604, "grad_norm": 0.35069820284843445, "learning_rate": 6.691312384473198e-05, "loss": 0.0079, "step": 1440 }, { "epoch": 2.6802218114602585, "grad_norm": 0.012105106376111507, "learning_rate": 6.598890942698706e-05, "loss": 0.0501, "step": 1450 }, { "epoch": 2.698706099815157, "grad_norm": 0.06180037185549736, "learning_rate": 6.506469500924215e-05, "loss": 0.0282, "step": 1460 }, { "epoch": 2.7171903881700556, "grad_norm": 0.008026616647839546, "learning_rate": 6.414048059149723e-05, "loss": 0.0033, "step": 1470 }, { "epoch": 2.7356746765249538, "grad_norm": 0.4511905610561371, "learning_rate": 6.321626617375231e-05, "loss": 0.0269, "step": 1480 }, { "epoch": 2.754158964879852, "grad_norm": 0.05588386207818985, "learning_rate": 6.229205175600739e-05, "loss": 0.0051, "step": 1490 }, { "epoch": 2.7726432532347505, "grad_norm": 0.025905737653374672, "learning_rate": 6.136783733826249e-05, "loss": 0.0032, "step": 1500 }, { "epoch": 2.7726432532347505, "eval_accuracy": 0.9980353634577603, "eval_loss": 0.004458332899957895, "eval_runtime": 48.7133, "eval_samples_per_second": 31.347, "eval_steps_per_second": 3.921, "step": 1500 }, { "epoch": 2.791127541589649, "grad_norm": 0.007365287281572819, "learning_rate": 6.044362292051756e-05, "loss": 0.0238, "step": 1510 }, { "epoch": 2.809611829944547, "grad_norm": 0.0069998023100197315, "learning_rate": 5.951940850277264e-05, "loss": 0.0382, "step": 1520 }, { "epoch": 2.8280961182994453, "grad_norm": 2.074650287628174, "learning_rate": 5.859519408502773e-05, "loss": 0.052, "step": 1530 }, { "epoch": 2.846580406654344, "grad_norm": 0.02161816880106926, "learning_rate": 5.767097966728281e-05, "loss": 0.0027, "step": 1540 }, { "epoch": 2.865064695009242, "grad_norm": 0.008178635500371456, "learning_rate": 5.674676524953789e-05, "loss": 0.0529, "step": 1550 }, { "epoch": 2.8835489833641406, "grad_norm": 0.010601267218589783, "learning_rate": 5.5822550831792974e-05, "loss": 0.0018, "step": 1560 }, { "epoch": 2.9020332717190387, "grad_norm": 0.009498382918536663, "learning_rate": 5.4898336414048056e-05, "loss": 0.0232, "step": 1570 }, { "epoch": 2.9205175600739373, "grad_norm": 0.012483976781368256, "learning_rate": 5.397412199630314e-05, "loss": 0.0016, "step": 1580 }, { "epoch": 2.9390018484288354, "grad_norm": 0.09681292623281479, "learning_rate": 5.304990757855823e-05, "loss": 0.0156, "step": 1590 }, { "epoch": 2.957486136783734, "grad_norm": 0.08324334770441055, "learning_rate": 5.2125693160813314e-05, "loss": 0.0233, "step": 1600 }, { "epoch": 2.957486136783734, "eval_accuracy": 0.9993451211525868, "eval_loss": 0.002875428879633546, "eval_runtime": 49.5419, "eval_samples_per_second": 30.822, "eval_steps_per_second": 3.855, "step": 1600 }, { "epoch": 2.975970425138632, "grad_norm": 0.00964986253529787, "learning_rate": 5.1201478743068395e-05, "loss": 0.0189, "step": 1610 }, { "epoch": 2.9944547134935307, "grad_norm": 0.023254286497831345, "learning_rate": 5.027726432532348e-05, "loss": 0.0016, "step": 1620 }, { "epoch": 3.0129390018484288, "grad_norm": 0.03303303197026253, "learning_rate": 4.935304990757856e-05, "loss": 0.0421, "step": 1630 }, { "epoch": 3.0314232902033273, "grad_norm": 0.05670865997672081, "learning_rate": 4.8428835489833646e-05, "loss": 0.0164, "step": 1640 }, { "epoch": 3.0499075785582255, "grad_norm": 0.6677367091178894, "learning_rate": 4.750462107208873e-05, "loss": 0.0021, "step": 1650 }, { "epoch": 3.068391866913124, "grad_norm": 0.07437073439359665, "learning_rate": 4.658040665434381e-05, "loss": 0.0311, "step": 1660 }, { "epoch": 3.086876155268022, "grad_norm": 0.011145181953907013, "learning_rate": 4.565619223659889e-05, "loss": 0.0021, "step": 1670 }, { "epoch": 3.1053604436229207, "grad_norm": 0.3542933464050293, "learning_rate": 4.473197781885398e-05, "loss": 0.0024, "step": 1680 }, { "epoch": 3.123844731977819, "grad_norm": 0.012382660992443562, "learning_rate": 4.380776340110906e-05, "loss": 0.0046, "step": 1690 }, { "epoch": 3.142329020332717, "grad_norm": 0.0069410833530128, "learning_rate": 4.288354898336414e-05, "loss": 0.0434, "step": 1700 }, { "epoch": 3.142329020332717, "eval_accuracy": 0.9993451211525868, "eval_loss": 0.0025608371943235397, "eval_runtime": 48.733, "eval_samples_per_second": 31.334, "eval_steps_per_second": 3.919, "step": 1700 }, { "epoch": 3.1608133086876156, "grad_norm": 0.3573768436908722, "learning_rate": 4.195933456561922e-05, "loss": 0.0014, "step": 1710 }, { "epoch": 3.1792975970425137, "grad_norm": 0.009119544178247452, "learning_rate": 4.1035120147874305e-05, "loss": 0.0009, "step": 1720 }, { "epoch": 3.1977818853974123, "grad_norm": 0.02414746955037117, "learning_rate": 4.011090573012939e-05, "loss": 0.0009, "step": 1730 }, { "epoch": 3.2162661737523104, "grad_norm": 0.006854603998363018, "learning_rate": 3.9186691312384474e-05, "loss": 0.0396, "step": 1740 }, { "epoch": 3.234750462107209, "grad_norm": 0.010423395782709122, "learning_rate": 3.826247689463956e-05, "loss": 0.0375, "step": 1750 }, { "epoch": 3.253234750462107, "grad_norm": 0.010691582225263119, "learning_rate": 3.7338262476894644e-05, "loss": 0.0121, "step": 1760 }, { "epoch": 3.2717190388170057, "grad_norm": 0.1874719113111496, "learning_rate": 3.6414048059149726e-05, "loss": 0.0019, "step": 1770 }, { "epoch": 3.290203327171904, "grad_norm": 0.14755026996135712, "learning_rate": 3.548983364140481e-05, "loss": 0.0115, "step": 1780 }, { "epoch": 3.3086876155268024, "grad_norm": 0.024332020431756973, "learning_rate": 3.456561922365989e-05, "loss": 0.0062, "step": 1790 }, { "epoch": 3.3271719038817005, "grad_norm": 0.007643954362720251, "learning_rate": 3.364140480591497e-05, "loss": 0.0079, "step": 1800 }, { "epoch": 3.3271719038817005, "eval_accuracy": 0.9980353634577603, "eval_loss": 0.009474328719079494, "eval_runtime": 48.3759, "eval_samples_per_second": 31.565, "eval_steps_per_second": 3.948, "step": 1800 }, { "epoch": 3.345656192236599, "grad_norm": 0.008659069426357746, "learning_rate": 3.271719038817006e-05, "loss": 0.0012, "step": 1810 }, { "epoch": 3.364140480591497, "grad_norm": 0.11470120400190353, "learning_rate": 3.179297597042514e-05, "loss": 0.0026, "step": 1820 }, { "epoch": 3.3826247689463957, "grad_norm": 9.137892723083496, "learning_rate": 3.086876155268023e-05, "loss": 0.0298, "step": 1830 }, { "epoch": 3.401109057301294, "grad_norm": 0.01612740568816662, "learning_rate": 2.994454713493531e-05, "loss": 0.0405, "step": 1840 }, { "epoch": 3.4195933456561924, "grad_norm": 0.01397150382399559, "learning_rate": 2.902033271719039e-05, "loss": 0.0018, "step": 1850 }, { "epoch": 3.4380776340110906, "grad_norm": 0.0072495522908866405, "learning_rate": 2.8096118299445472e-05, "loss": 0.0008, "step": 1860 }, { "epoch": 3.4565619223659887, "grad_norm": 0.009662143886089325, "learning_rate": 2.7171903881700557e-05, "loss": 0.0484, "step": 1870 }, { "epoch": 3.4750462107208873, "grad_norm": 0.005946171935647726, "learning_rate": 2.624768946395564e-05, "loss": 0.0014, "step": 1880 }, { "epoch": 3.4935304990757854, "grad_norm": 0.010195981711149216, "learning_rate": 2.532347504621072e-05, "loss": 0.0479, "step": 1890 }, { "epoch": 3.512014787430684, "grad_norm": 0.02414027974009514, "learning_rate": 2.4399260628465805e-05, "loss": 0.0175, "step": 1900 }, { "epoch": 3.512014787430684, "eval_accuracy": 0.9973804846103471, "eval_loss": 0.011128582060337067, "eval_runtime": 48.1238, "eval_samples_per_second": 31.731, "eval_steps_per_second": 3.969, "step": 1900 }, { "epoch": 3.530499075785582, "grad_norm": 0.028868084773421288, "learning_rate": 2.347504621072089e-05, "loss": 0.001, "step": 1910 }, { "epoch": 3.5489833641404807, "grad_norm": 0.005811081267893314, "learning_rate": 2.255083179297597e-05, "loss": 0.0204, "step": 1920 }, { "epoch": 3.567467652495379, "grad_norm": 0.6555341482162476, "learning_rate": 2.1626617375231053e-05, "loss": 0.04, "step": 1930 }, { "epoch": 3.5859519408502774, "grad_norm": 0.006032618228346109, "learning_rate": 2.0702402957486137e-05, "loss": 0.0096, "step": 1940 }, { "epoch": 3.6044362292051755, "grad_norm": 0.006055204197764397, "learning_rate": 1.9778188539741222e-05, "loss": 0.0075, "step": 1950 }, { "epoch": 3.622920517560074, "grad_norm": 0.010445200838148594, "learning_rate": 1.8853974121996304e-05, "loss": 0.0042, "step": 1960 }, { "epoch": 3.641404805914972, "grad_norm": 0.02863822504878044, "learning_rate": 1.7929759704251385e-05, "loss": 0.001, "step": 1970 }, { "epoch": 3.6598890942698707, "grad_norm": 0.007255155127495527, "learning_rate": 1.700554528650647e-05, "loss": 0.0015, "step": 1980 }, { "epoch": 3.678373382624769, "grad_norm": 0.012167246080935001, "learning_rate": 1.6081330868761555e-05, "loss": 0.0185, "step": 1990 }, { "epoch": 3.6968576709796674, "grad_norm": 0.008204938843846321, "learning_rate": 1.5157116451016636e-05, "loss": 0.0013, "step": 2000 }, { "epoch": 3.6968576709796674, "eval_accuracy": 0.9973804846103471, "eval_loss": 0.010935045778751373, "eval_runtime": 47.9657, "eval_samples_per_second": 31.835, "eval_steps_per_second": 3.982, "step": 2000 }, { "epoch": 3.7153419593345656, "grad_norm": 0.0491810217499733, "learning_rate": 1.423290203327172e-05, "loss": 0.0011, "step": 2010 }, { "epoch": 3.733826247689464, "grad_norm": 0.006103880237787962, "learning_rate": 1.3308687615526803e-05, "loss": 0.0008, "step": 2020 }, { "epoch": 3.7523105360443623, "grad_norm": 0.008025890216231346, "learning_rate": 1.2384473197781886e-05, "loss": 0.0102, "step": 2030 }, { "epoch": 3.7707948243992604, "grad_norm": 0.005999017972499132, "learning_rate": 1.1460258780036969e-05, "loss": 0.0007, "step": 2040 }, { "epoch": 3.789279112754159, "grad_norm": 0.030818577855825424, "learning_rate": 1.0536044362292052e-05, "loss": 0.0088, "step": 2050 }, { "epoch": 3.8077634011090575, "grad_norm": 0.005371921230107546, "learning_rate": 9.611829944547135e-06, "loss": 0.0007, "step": 2060 }, { "epoch": 3.8262476894639557, "grad_norm": 0.005443415604531765, "learning_rate": 8.687615526802218e-06, "loss": 0.0015, "step": 2070 }, { "epoch": 3.844731977818854, "grad_norm": 0.005951932165771723, "learning_rate": 7.763401109057302e-06, "loss": 0.0008, "step": 2080 }, { "epoch": 3.8632162661737524, "grad_norm": 0.006055652163922787, "learning_rate": 6.8391866913123855e-06, "loss": 0.0009, "step": 2090 }, { "epoch": 3.8817005545286505, "grad_norm": 0.005791305564343929, "learning_rate": 5.914972273567468e-06, "loss": 0.0008, "step": 2100 }, { "epoch": 3.8817005545286505, "eval_accuracy": 0.9986902423051736, "eval_loss": 0.005253881681710482, "eval_runtime": 48.3848, "eval_samples_per_second": 31.56, "eval_steps_per_second": 3.948, "step": 2100 }, { "epoch": 3.900184842883549, "grad_norm": 0.011120261624455452, "learning_rate": 4.990757855822551e-06, "loss": 0.0023, "step": 2110 }, { "epoch": 3.918669131238447, "grad_norm": 0.005558283068239689, "learning_rate": 4.066543438077634e-06, "loss": 0.0143, "step": 2120 }, { "epoch": 3.9371534195933457, "grad_norm": 0.005820298567414284, "learning_rate": 3.1423290203327177e-06, "loss": 0.0016, "step": 2130 }, { "epoch": 3.955637707948244, "grad_norm": 0.006517091300338507, "learning_rate": 2.2181146025878004e-06, "loss": 0.0015, "step": 2140 }, { "epoch": 3.9741219963031424, "grad_norm": 0.08097829669713974, "learning_rate": 1.2939001848428836e-06, "loss": 0.0379, "step": 2150 }, { "epoch": 3.9926062846580406, "grad_norm": 0.0057843709364533424, "learning_rate": 3.6968576709796674e-07, "loss": 0.0008, "step": 2160 }, { "epoch": 4.0, "step": 2164, "total_flos": 2.6818427765818e+18, "train_loss": 0.07006576062802418, "train_runtime": 2504.6639, "train_samples_per_second": 13.817, "train_steps_per_second": 0.864 } ], "logging_steps": 10, "max_steps": 2164, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6818427765818e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }