|
{ |
|
"best_metric": 0.0025608371943235397, |
|
"best_model_checkpoint": "finetuned-arsenic/checkpoint-1700", |
|
"epoch": 4.0, |
|
"eval_steps": 100, |
|
"global_step": 2164, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.018484288354898338, |
|
"grad_norm": 3.1622745990753174, |
|
"learning_rate": 0.0001990757855822551, |
|
"loss": 0.5015, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.036968576709796676, |
|
"grad_norm": 3.4678640365600586, |
|
"learning_rate": 0.00019815157116451017, |
|
"loss": 0.4604, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05545286506469501, |
|
"grad_norm": 3.134012460708618, |
|
"learning_rate": 0.00019722735674676528, |
|
"loss": 0.2732, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07393715341959335, |
|
"grad_norm": 1.4681167602539062, |
|
"learning_rate": 0.00019630314232902034, |
|
"loss": 0.4454, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09242144177449169, |
|
"grad_norm": 2.2367947101593018, |
|
"learning_rate": 0.00019537892791127544, |
|
"loss": 0.3936, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11090573012939002, |
|
"grad_norm": 1.4795340299606323, |
|
"learning_rate": 0.0001944547134935305, |
|
"loss": 0.2588, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12939001848428835, |
|
"grad_norm": 1.862062692642212, |
|
"learning_rate": 0.0001935304990757856, |
|
"loss": 0.3119, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1478743068391867, |
|
"grad_norm": 2.609395980834961, |
|
"learning_rate": 0.00019260628465804066, |
|
"loss": 0.2267, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16635859519408502, |
|
"grad_norm": 1.176942229270935, |
|
"learning_rate": 0.00019168207024029577, |
|
"loss": 0.1786, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18484288354898337, |
|
"grad_norm": 3.820526599884033, |
|
"learning_rate": 0.00019075785582255082, |
|
"loss": 0.2214, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18484288354898337, |
|
"eval_accuracy": 0.9607072691552063, |
|
"eval_loss": 0.12430191040039062, |
|
"eval_runtime": 63.6085, |
|
"eval_samples_per_second": 24.006, |
|
"eval_steps_per_second": 3.003, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2033271719038817, |
|
"grad_norm": 1.79490065574646, |
|
"learning_rate": 0.00018983364140480593, |
|
"loss": 0.2468, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.22181146025878004, |
|
"grad_norm": 0.4057539701461792, |
|
"learning_rate": 0.000188909426987061, |
|
"loss": 0.2852, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.24029574861367836, |
|
"grad_norm": 0.1476714164018631, |
|
"learning_rate": 0.0001879852125693161, |
|
"loss": 0.1344, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2587800369685767, |
|
"grad_norm": 3.7497174739837646, |
|
"learning_rate": 0.00018706099815157118, |
|
"loss": 0.1937, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.27726432532347506, |
|
"grad_norm": 1.4686886072158813, |
|
"learning_rate": 0.00018613678373382626, |
|
"loss": 0.14, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2957486136783734, |
|
"grad_norm": 2.834284782409668, |
|
"learning_rate": 0.00018521256931608134, |
|
"loss": 0.2202, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3142329020332717, |
|
"grad_norm": 3.5288290977478027, |
|
"learning_rate": 0.00018428835489833642, |
|
"loss": 0.1548, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.33271719038817005, |
|
"grad_norm": 2.251044273376465, |
|
"learning_rate": 0.0001833641404805915, |
|
"loss": 0.2552, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3512014787430684, |
|
"grad_norm": 2.9413559436798096, |
|
"learning_rate": 0.00018243992606284658, |
|
"loss": 0.1838, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.36968576709796674, |
|
"grad_norm": 1.2569198608398438, |
|
"learning_rate": 0.0001815157116451017, |
|
"loss": 0.1213, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.36968576709796674, |
|
"eval_accuracy": 0.933857236411264, |
|
"eval_loss": 0.17631804943084717, |
|
"eval_runtime": 52.9442, |
|
"eval_samples_per_second": 28.842, |
|
"eval_steps_per_second": 3.608, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.38817005545286504, |
|
"grad_norm": 6.763731002807617, |
|
"learning_rate": 0.00018059149722735675, |
|
"loss": 0.1987, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4066543438077634, |
|
"grad_norm": 1.2817621231079102, |
|
"learning_rate": 0.00017966728280961186, |
|
"loss": 0.2626, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.42513863216266173, |
|
"grad_norm": 5.742928504943848, |
|
"learning_rate": 0.0001787430683918669, |
|
"loss": 0.1899, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4436229205175601, |
|
"grad_norm": 0.5674309134483337, |
|
"learning_rate": 0.00017781885397412202, |
|
"loss": 0.0699, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.46210720887245843, |
|
"grad_norm": 5.822825908660889, |
|
"learning_rate": 0.00017689463955637707, |
|
"loss": 0.091, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4805914972273567, |
|
"grad_norm": 0.5449599027633667, |
|
"learning_rate": 0.00017597042513863218, |
|
"loss": 0.1584, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.49907578558225507, |
|
"grad_norm": 0.09896630793809891, |
|
"learning_rate": 0.00017504621072088724, |
|
"loss": 0.2052, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5175600739371534, |
|
"grad_norm": 4.201374530792236, |
|
"learning_rate": 0.00017412199630314234, |
|
"loss": 0.1447, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5360443622920518, |
|
"grad_norm": 6.048889636993408, |
|
"learning_rate": 0.00017319778188539743, |
|
"loss": 0.1032, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5545286506469501, |
|
"grad_norm": 5.420243740081787, |
|
"learning_rate": 0.0001722735674676525, |
|
"loss": 0.1201, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5545286506469501, |
|
"eval_accuracy": 0.9607072691552063, |
|
"eval_loss": 0.10179698467254639, |
|
"eval_runtime": 50.5522, |
|
"eval_samples_per_second": 30.206, |
|
"eval_steps_per_second": 3.778, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5730129390018485, |
|
"grad_norm": 1.7872523069381714, |
|
"learning_rate": 0.0001713493530499076, |
|
"loss": 0.1725, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5914972273567468, |
|
"grad_norm": 3.7505619525909424, |
|
"learning_rate": 0.00017042513863216267, |
|
"loss": 0.0805, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.609981515711645, |
|
"grad_norm": 3.3265647888183594, |
|
"learning_rate": 0.00016950092421441775, |
|
"loss": 0.0945, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6284658040665434, |
|
"grad_norm": 0.0822325125336647, |
|
"learning_rate": 0.00016857670979667283, |
|
"loss": 0.184, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6469500924214417, |
|
"grad_norm": 6.458297252655029, |
|
"learning_rate": 0.00016765249537892791, |
|
"loss": 0.2462, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6654343807763401, |
|
"grad_norm": 8.956634521484375, |
|
"learning_rate": 0.000166728280961183, |
|
"loss": 0.1549, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6839186691312384, |
|
"grad_norm": 8.221627235412598, |
|
"learning_rate": 0.00016580406654343808, |
|
"loss": 0.0573, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7024029574861368, |
|
"grad_norm": 1.3730543851852417, |
|
"learning_rate": 0.00016487985212569316, |
|
"loss": 0.1435, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7208872458410351, |
|
"grad_norm": 3.857621669769287, |
|
"learning_rate": 0.00016395563770794827, |
|
"loss": 0.1512, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7393715341959335, |
|
"grad_norm": 0.7978448867797852, |
|
"learning_rate": 0.00016303142329020332, |
|
"loss": 0.0991, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7393715341959335, |
|
"eval_accuracy": 0.9417157825802227, |
|
"eval_loss": 0.20708701014518738, |
|
"eval_runtime": 49.1709, |
|
"eval_samples_per_second": 31.055, |
|
"eval_steps_per_second": 3.884, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7578558225508318, |
|
"grad_norm": 0.13973641395568848, |
|
"learning_rate": 0.00016210720887245843, |
|
"loss": 0.2316, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7763401109057301, |
|
"grad_norm": 5.2767415046691895, |
|
"learning_rate": 0.00016118299445471348, |
|
"loss": 0.0945, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7948243992606284, |
|
"grad_norm": 1.6643435955047607, |
|
"learning_rate": 0.0001602587800369686, |
|
"loss": 0.1037, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8133086876155268, |
|
"grad_norm": 2.7701528072357178, |
|
"learning_rate": 0.00015933456561922367, |
|
"loss": 0.0687, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8317929759704251, |
|
"grad_norm": 0.07744336128234863, |
|
"learning_rate": 0.00015841035120147876, |
|
"loss": 0.1097, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8502772643253235, |
|
"grad_norm": 1.1288193464279175, |
|
"learning_rate": 0.00015748613678373384, |
|
"loss": 0.1874, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8687615526802218, |
|
"grad_norm": 7.898807525634766, |
|
"learning_rate": 0.00015656192236598892, |
|
"loss": 0.1038, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8872458410351202, |
|
"grad_norm": 2.908022403717041, |
|
"learning_rate": 0.000155637707948244, |
|
"loss": 0.0918, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9057301293900185, |
|
"grad_norm": 1.4139397144317627, |
|
"learning_rate": 0.00015471349353049908, |
|
"loss": 0.1258, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9242144177449169, |
|
"grad_norm": 0.09938838332891464, |
|
"learning_rate": 0.00015378927911275416, |
|
"loss": 0.1127, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9242144177449169, |
|
"eval_accuracy": 0.9666011787819253, |
|
"eval_loss": 0.08861085772514343, |
|
"eval_runtime": 48.3076, |
|
"eval_samples_per_second": 31.61, |
|
"eval_steps_per_second": 3.954, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9426987060998152, |
|
"grad_norm": 0.2065439671278, |
|
"learning_rate": 0.00015286506469500925, |
|
"loss": 0.0658, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9611829944547134, |
|
"grad_norm": 4.408117771148682, |
|
"learning_rate": 0.00015194085027726433, |
|
"loss": 0.0739, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9796672828096118, |
|
"grad_norm": 0.18956203758716583, |
|
"learning_rate": 0.0001510166358595194, |
|
"loss": 0.1474, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9981515711645101, |
|
"grad_norm": 2.8244571685791016, |
|
"learning_rate": 0.0001500924214417745, |
|
"loss": 0.0509, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.0166358595194085, |
|
"grad_norm": 0.705922544002533, |
|
"learning_rate": 0.00014916820702402957, |
|
"loss": 0.0719, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0351201478743068, |
|
"grad_norm": 1.5707758665084839, |
|
"learning_rate": 0.00014824399260628468, |
|
"loss": 0.0954, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0536044362292052, |
|
"grad_norm": 1.2642130851745605, |
|
"learning_rate": 0.00014731977818853976, |
|
"loss": 0.0859, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.0720887245841035, |
|
"grad_norm": 0.222818985581398, |
|
"learning_rate": 0.00014639556377079484, |
|
"loss": 0.0827, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.0905730129390019, |
|
"grad_norm": 0.05685073137283325, |
|
"learning_rate": 0.00014547134935304992, |
|
"loss": 0.0628, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.1090573012939002, |
|
"grad_norm": 0.06841599196195602, |
|
"learning_rate": 0.000144547134935305, |
|
"loss": 0.0314, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1090573012939002, |
|
"eval_accuracy": 0.9908316961362148, |
|
"eval_loss": 0.033296775072813034, |
|
"eval_runtime": 49.081, |
|
"eval_samples_per_second": 31.112, |
|
"eval_steps_per_second": 3.892, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1275415896487986, |
|
"grad_norm": 0.8085036873817444, |
|
"learning_rate": 0.0001436229205175601, |
|
"loss": 0.0295, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.146025878003697, |
|
"grad_norm": 0.030383585020899773, |
|
"learning_rate": 0.00014269870609981517, |
|
"loss": 0.0168, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.1645101663585953, |
|
"grad_norm": 2.179034948348999, |
|
"learning_rate": 0.00014177449168207025, |
|
"loss": 0.217, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.1829944547134936, |
|
"grad_norm": 0.033746711909770966, |
|
"learning_rate": 0.00014085027726432533, |
|
"loss": 0.12, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.201478743068392, |
|
"grad_norm": 0.06319836527109146, |
|
"learning_rate": 0.0001399260628465804, |
|
"loss": 0.01, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.21996303142329, |
|
"grad_norm": 0.06434024125337601, |
|
"learning_rate": 0.0001390018484288355, |
|
"loss": 0.0755, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.2384473197781884, |
|
"grad_norm": 3.2345378398895264, |
|
"learning_rate": 0.00013807763401109058, |
|
"loss": 0.1755, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.2569316081330868, |
|
"grad_norm": 8.074490547180176, |
|
"learning_rate": 0.00013715341959334566, |
|
"loss": 0.1248, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.2754158964879851, |
|
"grad_norm": 0.21135617792606354, |
|
"learning_rate": 0.00013622920517560074, |
|
"loss": 0.0662, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.2939001848428835, |
|
"grad_norm": 0.0901883915066719, |
|
"learning_rate": 0.00013530499075785582, |
|
"loss": 0.0252, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.2939001848428835, |
|
"eval_accuracy": 0.9973804846103471, |
|
"eval_loss": 0.010960910469293594, |
|
"eval_runtime": 49.6376, |
|
"eval_samples_per_second": 30.763, |
|
"eval_steps_per_second": 3.848, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.3123844731977818, |
|
"grad_norm": 2.1113812923431396, |
|
"learning_rate": 0.0001343807763401109, |
|
"loss": 0.0685, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.3308687615526802, |
|
"grad_norm": 0.062072403728961945, |
|
"learning_rate": 0.000133456561922366, |
|
"loss": 0.0747, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.3493530499075785, |
|
"grad_norm": 4.824169158935547, |
|
"learning_rate": 0.00013253234750462106, |
|
"loss": 0.0791, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.3678373382624769, |
|
"grad_norm": 0.12804123759269714, |
|
"learning_rate": 0.00013160813308687617, |
|
"loss": 0.045, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.3863216266173752, |
|
"grad_norm": 0.3526214361190796, |
|
"learning_rate": 0.00013068391866913125, |
|
"loss": 0.0749, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.4048059149722736, |
|
"grad_norm": 0.5252203345298767, |
|
"learning_rate": 0.00012975970425138634, |
|
"loss": 0.0459, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.423290203327172, |
|
"grad_norm": 9.271531105041504, |
|
"learning_rate": 0.00012883548983364142, |
|
"loss": 0.1033, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.4417744916820703, |
|
"grad_norm": 0.13968196511268616, |
|
"learning_rate": 0.0001279112754158965, |
|
"loss": 0.03, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.4602587800369686, |
|
"grad_norm": 0.14128534495830536, |
|
"learning_rate": 0.00012698706099815158, |
|
"loss": 0.0402, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.478743068391867, |
|
"grad_norm": 1.0647764205932617, |
|
"learning_rate": 0.00012606284658040666, |
|
"loss": 0.0582, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.478743068391867, |
|
"eval_accuracy": 0.9986902423051736, |
|
"eval_loss": 0.010355145670473576, |
|
"eval_runtime": 48.0295, |
|
"eval_samples_per_second": 31.793, |
|
"eval_steps_per_second": 3.977, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.4972273567467653, |
|
"grad_norm": 3.4088709354400635, |
|
"learning_rate": 0.00012513863216266174, |
|
"loss": 0.0495, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.5157116451016637, |
|
"grad_norm": 0.2625955045223236, |
|
"learning_rate": 0.00012421441774491682, |
|
"loss": 0.0641, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.5341959334565618, |
|
"grad_norm": 0.08498267084360123, |
|
"learning_rate": 0.0001232902033271719, |
|
"loss": 0.0331, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.5526802218114604, |
|
"grad_norm": 10.900208473205566, |
|
"learning_rate": 0.000122365988909427, |
|
"loss": 0.0478, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.5711645101663585, |
|
"grad_norm": 0.056279949843883514, |
|
"learning_rate": 0.00012144177449168208, |
|
"loss": 0.0754, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.589648798521257, |
|
"grad_norm": 0.05696770176291466, |
|
"learning_rate": 0.00012051756007393715, |
|
"loss": 0.0906, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.6081330868761552, |
|
"grad_norm": 0.022501414641737938, |
|
"learning_rate": 0.00011959334565619225, |
|
"loss": 0.0413, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.6266173752310538, |
|
"grad_norm": 2.3988966941833496, |
|
"learning_rate": 0.00011866913123844731, |
|
"loss": 0.074, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.645101663585952, |
|
"grad_norm": 0.03616836294531822, |
|
"learning_rate": 0.00011774491682070241, |
|
"loss": 0.0094, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.6635859519408502, |
|
"grad_norm": 9.183988571166992, |
|
"learning_rate": 0.00011682070240295748, |
|
"loss": 0.0455, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.6635859519408502, |
|
"eval_accuracy": 0.9954158480681073, |
|
"eval_loss": 0.01981273666024208, |
|
"eval_runtime": 48.1294, |
|
"eval_samples_per_second": 31.727, |
|
"eval_steps_per_second": 3.968, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.6820702402957486, |
|
"grad_norm": 20.508174896240234, |
|
"learning_rate": 0.00011589648798521257, |
|
"loss": 0.1063, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.700554528650647, |
|
"grad_norm": 3.3739655017852783, |
|
"learning_rate": 0.00011497227356746765, |
|
"loss": 0.0711, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.7190388170055453, |
|
"grad_norm": 0.03310403227806091, |
|
"learning_rate": 0.00011404805914972275, |
|
"loss": 0.0216, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.7375231053604436, |
|
"grad_norm": 0.027661019936203957, |
|
"learning_rate": 0.00011312384473197783, |
|
"loss": 0.05, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.756007393715342, |
|
"grad_norm": 0.02968147210776806, |
|
"learning_rate": 0.00011219963031423291, |
|
"loss": 0.0368, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.7744916820702403, |
|
"grad_norm": 0.01928904838860035, |
|
"learning_rate": 0.000111275415896488, |
|
"loss": 0.0083, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.7929759704251387, |
|
"grad_norm": 0.048537448048591614, |
|
"learning_rate": 0.00011035120147874307, |
|
"loss": 0.0421, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.8114602587800368, |
|
"grad_norm": 0.01963023841381073, |
|
"learning_rate": 0.00010942698706099817, |
|
"loss": 0.0059, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.8299445471349354, |
|
"grad_norm": 0.035353146493434906, |
|
"learning_rate": 0.00010850277264325324, |
|
"loss": 0.0446, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.8484288354898335, |
|
"grad_norm": 0.015135395340621471, |
|
"learning_rate": 0.00010757855822550833, |
|
"loss": 0.0569, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.8484288354898335, |
|
"eval_accuracy": 0.9960707269155207, |
|
"eval_loss": 0.017953284084796906, |
|
"eval_runtime": 49.2028, |
|
"eval_samples_per_second": 31.035, |
|
"eval_steps_per_second": 3.882, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.866913123844732, |
|
"grad_norm": 0.02287949249148369, |
|
"learning_rate": 0.0001066543438077634, |
|
"loss": 0.0033, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.8853974121996302, |
|
"grad_norm": 0.024308230727910995, |
|
"learning_rate": 0.0001057301293900185, |
|
"loss": 0.0028, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.9038817005545288, |
|
"grad_norm": 8.346418380737305, |
|
"learning_rate": 0.00010480591497227356, |
|
"loss": 0.0468, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.922365988909427, |
|
"grad_norm": 0.015379060991108418, |
|
"learning_rate": 0.00010388170055452866, |
|
"loss": 0.1142, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.9408502772643255, |
|
"grad_norm": 0.021259386092424393, |
|
"learning_rate": 0.00010295748613678373, |
|
"loss": 0.057, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.9593345656192236, |
|
"grad_norm": 4.173085689544678, |
|
"learning_rate": 0.00010203327171903882, |
|
"loss": 0.0333, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.9778188539741222, |
|
"grad_norm": 0.013770132325589657, |
|
"learning_rate": 0.0001011090573012939, |
|
"loss": 0.0248, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.9963031423290203, |
|
"grad_norm": 0.029461657628417015, |
|
"learning_rate": 0.000100184842883549, |
|
"loss": 0.1002, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.014787430683919, |
|
"grad_norm": 0.042121052742004395, |
|
"learning_rate": 9.926062846580408e-05, |
|
"loss": 0.0499, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.033271719038817, |
|
"grad_norm": 6.605316162109375, |
|
"learning_rate": 9.833641404805916e-05, |
|
"loss": 0.0627, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.033271719038817, |
|
"eval_accuracy": 0.9947609692206941, |
|
"eval_loss": 0.024407994002103806, |
|
"eval_runtime": 48.147, |
|
"eval_samples_per_second": 31.715, |
|
"eval_steps_per_second": 3.967, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.0517560073937156, |
|
"grad_norm": 0.03806009516119957, |
|
"learning_rate": 9.741219963031424e-05, |
|
"loss": 0.0162, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.0702402957486137, |
|
"grad_norm": 0.04799278452992439, |
|
"learning_rate": 9.648798521256932e-05, |
|
"loss": 0.0083, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.088724584103512, |
|
"grad_norm": 3.807424545288086, |
|
"learning_rate": 9.55637707948244e-05, |
|
"loss": 0.0349, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.1072088724584104, |
|
"grad_norm": 0.013640133664011955, |
|
"learning_rate": 9.463955637707949e-05, |
|
"loss": 0.0079, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.1256931608133085, |
|
"grad_norm": 0.9286332726478577, |
|
"learning_rate": 9.371534195933457e-05, |
|
"loss": 0.0675, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.144177449168207, |
|
"grad_norm": 7.584729194641113, |
|
"learning_rate": 9.279112754158965e-05, |
|
"loss": 0.0223, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.162661737523105, |
|
"grad_norm": 0.02116883546113968, |
|
"learning_rate": 9.186691312384473e-05, |
|
"loss": 0.0046, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.1811460258780038, |
|
"grad_norm": 0.014812180772423744, |
|
"learning_rate": 9.094269870609981e-05, |
|
"loss": 0.0194, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.199630314232902, |
|
"grad_norm": 0.011532890610396862, |
|
"learning_rate": 9.001848428835489e-05, |
|
"loss": 0.0236, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.2181146025878005, |
|
"grad_norm": 0.01616760343313217, |
|
"learning_rate": 8.909426987060999e-05, |
|
"loss": 0.0328, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.2181146025878005, |
|
"eval_accuracy": 0.9986902423051736, |
|
"eval_loss": 0.0054244487546384335, |
|
"eval_runtime": 49.4505, |
|
"eval_samples_per_second": 30.879, |
|
"eval_steps_per_second": 3.862, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.2365988909426986, |
|
"grad_norm": 0.04048744961619377, |
|
"learning_rate": 8.817005545286507e-05, |
|
"loss": 0.0329, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.255083179297597, |
|
"grad_norm": 0.0121224420145154, |
|
"learning_rate": 8.724584103512015e-05, |
|
"loss": 0.0229, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.2735674676524953, |
|
"grad_norm": 3.312351942062378, |
|
"learning_rate": 8.632162661737525e-05, |
|
"loss": 0.0337, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.292051756007394, |
|
"grad_norm": 0.04162617400288582, |
|
"learning_rate": 8.539741219963033e-05, |
|
"loss": 0.0562, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.310536044362292, |
|
"grad_norm": 0.0170124601572752, |
|
"learning_rate": 8.447319778188541e-05, |
|
"loss": 0.0048, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.3290203327171906, |
|
"grad_norm": 0.03438381850719452, |
|
"learning_rate": 8.354898336414049e-05, |
|
"loss": 0.0305, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.3475046210720887, |
|
"grad_norm": 4.124629974365234, |
|
"learning_rate": 8.262476894639557e-05, |
|
"loss": 0.0413, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.3659889094269873, |
|
"grad_norm": 0.009506890550255775, |
|
"learning_rate": 8.170055452865065e-05, |
|
"loss": 0.0775, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.3844731977818854, |
|
"grad_norm": 0.009906689636409283, |
|
"learning_rate": 8.077634011090573e-05, |
|
"loss": 0.0432, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.402957486136784, |
|
"grad_norm": 0.0475531667470932, |
|
"learning_rate": 7.985212569316082e-05, |
|
"loss": 0.0156, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.402957486136784, |
|
"eval_accuracy": 0.9947609692206941, |
|
"eval_loss": 0.0192621648311615, |
|
"eval_runtime": 48.3339, |
|
"eval_samples_per_second": 31.593, |
|
"eval_steps_per_second": 3.952, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.421441774491682, |
|
"grad_norm": 0.05756278708577156, |
|
"learning_rate": 7.89279112754159e-05, |
|
"loss": 0.0771, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.43992606284658, |
|
"grad_norm": 0.021766463294625282, |
|
"learning_rate": 7.800369685767098e-05, |
|
"loss": 0.0661, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.4584103512014788, |
|
"grad_norm": 0.02948659099638462, |
|
"learning_rate": 7.707948243992606e-05, |
|
"loss": 0.0166, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.476894639556377, |
|
"grad_norm": 9.3042573928833, |
|
"learning_rate": 7.615526802218114e-05, |
|
"loss": 0.0309, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.4953789279112755, |
|
"grad_norm": 0.023814814165234566, |
|
"learning_rate": 7.523105360443624e-05, |
|
"loss": 0.0783, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.5138632162661736, |
|
"grad_norm": 3.380216598510742, |
|
"learning_rate": 7.430683918669132e-05, |
|
"loss": 0.0556, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.532347504621072, |
|
"grad_norm": 0.0165497325360775, |
|
"learning_rate": 7.33826247689464e-05, |
|
"loss": 0.0221, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.5508317929759703, |
|
"grad_norm": 0.19721367955207825, |
|
"learning_rate": 7.245841035120148e-05, |
|
"loss": 0.0147, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.569316081330869, |
|
"grad_norm": 7.320788383483887, |
|
"learning_rate": 7.153419593345656e-05, |
|
"loss": 0.0251, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.587800369685767, |
|
"grad_norm": 0.009770031087100506, |
|
"learning_rate": 7.060998151571166e-05, |
|
"loss": 0.0016, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.587800369685767, |
|
"eval_accuracy": 0.9973804846103471, |
|
"eval_loss": 0.007392051629722118, |
|
"eval_runtime": 49.7571, |
|
"eval_samples_per_second": 30.689, |
|
"eval_steps_per_second": 3.839, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.6062846580406656, |
|
"grad_norm": 0.009464599192142487, |
|
"learning_rate": 6.968576709796674e-05, |
|
"loss": 0.0235, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.6247689463955637, |
|
"grad_norm": 3.2390127182006836, |
|
"learning_rate": 6.876155268022182e-05, |
|
"loss": 0.0419, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.6432532347504623, |
|
"grad_norm": 0.015608754009008408, |
|
"learning_rate": 6.78373382624769e-05, |
|
"loss": 0.0043, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.6617375231053604, |
|
"grad_norm": 0.35069820284843445, |
|
"learning_rate": 6.691312384473198e-05, |
|
"loss": 0.0079, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.6802218114602585, |
|
"grad_norm": 0.012105106376111507, |
|
"learning_rate": 6.598890942698706e-05, |
|
"loss": 0.0501, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.698706099815157, |
|
"grad_norm": 0.06180037185549736, |
|
"learning_rate": 6.506469500924215e-05, |
|
"loss": 0.0282, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.7171903881700556, |
|
"grad_norm": 0.008026616647839546, |
|
"learning_rate": 6.414048059149723e-05, |
|
"loss": 0.0033, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.7356746765249538, |
|
"grad_norm": 0.4511905610561371, |
|
"learning_rate": 6.321626617375231e-05, |
|
"loss": 0.0269, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.754158964879852, |
|
"grad_norm": 0.05588386207818985, |
|
"learning_rate": 6.229205175600739e-05, |
|
"loss": 0.0051, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.7726432532347505, |
|
"grad_norm": 0.025905737653374672, |
|
"learning_rate": 6.136783733826249e-05, |
|
"loss": 0.0032, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.7726432532347505, |
|
"eval_accuracy": 0.9980353634577603, |
|
"eval_loss": 0.004458332899957895, |
|
"eval_runtime": 48.7133, |
|
"eval_samples_per_second": 31.347, |
|
"eval_steps_per_second": 3.921, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.791127541589649, |
|
"grad_norm": 0.007365287281572819, |
|
"learning_rate": 6.044362292051756e-05, |
|
"loss": 0.0238, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.809611829944547, |
|
"grad_norm": 0.0069998023100197315, |
|
"learning_rate": 5.951940850277264e-05, |
|
"loss": 0.0382, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.8280961182994453, |
|
"grad_norm": 2.074650287628174, |
|
"learning_rate": 5.859519408502773e-05, |
|
"loss": 0.052, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.846580406654344, |
|
"grad_norm": 0.02161816880106926, |
|
"learning_rate": 5.767097966728281e-05, |
|
"loss": 0.0027, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.865064695009242, |
|
"grad_norm": 0.008178635500371456, |
|
"learning_rate": 5.674676524953789e-05, |
|
"loss": 0.0529, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.8835489833641406, |
|
"grad_norm": 0.010601267218589783, |
|
"learning_rate": 5.5822550831792974e-05, |
|
"loss": 0.0018, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.9020332717190387, |
|
"grad_norm": 0.009498382918536663, |
|
"learning_rate": 5.4898336414048056e-05, |
|
"loss": 0.0232, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.9205175600739373, |
|
"grad_norm": 0.012483976781368256, |
|
"learning_rate": 5.397412199630314e-05, |
|
"loss": 0.0016, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.9390018484288354, |
|
"grad_norm": 0.09681292623281479, |
|
"learning_rate": 5.304990757855823e-05, |
|
"loss": 0.0156, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.957486136783734, |
|
"grad_norm": 0.08324334770441055, |
|
"learning_rate": 5.2125693160813314e-05, |
|
"loss": 0.0233, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.957486136783734, |
|
"eval_accuracy": 0.9993451211525868, |
|
"eval_loss": 0.002875428879633546, |
|
"eval_runtime": 49.5419, |
|
"eval_samples_per_second": 30.822, |
|
"eval_steps_per_second": 3.855, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.975970425138632, |
|
"grad_norm": 0.00964986253529787, |
|
"learning_rate": 5.1201478743068395e-05, |
|
"loss": 0.0189, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.9944547134935307, |
|
"grad_norm": 0.023254286497831345, |
|
"learning_rate": 5.027726432532348e-05, |
|
"loss": 0.0016, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.0129390018484288, |
|
"grad_norm": 0.03303303197026253, |
|
"learning_rate": 4.935304990757856e-05, |
|
"loss": 0.0421, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.0314232902033273, |
|
"grad_norm": 0.05670865997672081, |
|
"learning_rate": 4.8428835489833646e-05, |
|
"loss": 0.0164, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.0499075785582255, |
|
"grad_norm": 0.6677367091178894, |
|
"learning_rate": 4.750462107208873e-05, |
|
"loss": 0.0021, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.068391866913124, |
|
"grad_norm": 0.07437073439359665, |
|
"learning_rate": 4.658040665434381e-05, |
|
"loss": 0.0311, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.086876155268022, |
|
"grad_norm": 0.011145181953907013, |
|
"learning_rate": 4.565619223659889e-05, |
|
"loss": 0.0021, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.1053604436229207, |
|
"grad_norm": 0.3542933464050293, |
|
"learning_rate": 4.473197781885398e-05, |
|
"loss": 0.0024, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.123844731977819, |
|
"grad_norm": 0.012382660992443562, |
|
"learning_rate": 4.380776340110906e-05, |
|
"loss": 0.0046, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.142329020332717, |
|
"grad_norm": 0.0069410833530128, |
|
"learning_rate": 4.288354898336414e-05, |
|
"loss": 0.0434, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.142329020332717, |
|
"eval_accuracy": 0.9993451211525868, |
|
"eval_loss": 0.0025608371943235397, |
|
"eval_runtime": 48.733, |
|
"eval_samples_per_second": 31.334, |
|
"eval_steps_per_second": 3.919, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.1608133086876156, |
|
"grad_norm": 0.3573768436908722, |
|
"learning_rate": 4.195933456561922e-05, |
|
"loss": 0.0014, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.1792975970425137, |
|
"grad_norm": 0.009119544178247452, |
|
"learning_rate": 4.1035120147874305e-05, |
|
"loss": 0.0009, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.1977818853974123, |
|
"grad_norm": 0.02414746955037117, |
|
"learning_rate": 4.011090573012939e-05, |
|
"loss": 0.0009, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.2162661737523104, |
|
"grad_norm": 0.006854603998363018, |
|
"learning_rate": 3.9186691312384474e-05, |
|
"loss": 0.0396, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.234750462107209, |
|
"grad_norm": 0.010423395782709122, |
|
"learning_rate": 3.826247689463956e-05, |
|
"loss": 0.0375, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.253234750462107, |
|
"grad_norm": 0.010691582225263119, |
|
"learning_rate": 3.7338262476894644e-05, |
|
"loss": 0.0121, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.2717190388170057, |
|
"grad_norm": 0.1874719113111496, |
|
"learning_rate": 3.6414048059149726e-05, |
|
"loss": 0.0019, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.290203327171904, |
|
"grad_norm": 0.14755026996135712, |
|
"learning_rate": 3.548983364140481e-05, |
|
"loss": 0.0115, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.3086876155268024, |
|
"grad_norm": 0.024332020431756973, |
|
"learning_rate": 3.456561922365989e-05, |
|
"loss": 0.0062, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.3271719038817005, |
|
"grad_norm": 0.007643954362720251, |
|
"learning_rate": 3.364140480591497e-05, |
|
"loss": 0.0079, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.3271719038817005, |
|
"eval_accuracy": 0.9980353634577603, |
|
"eval_loss": 0.009474328719079494, |
|
"eval_runtime": 48.3759, |
|
"eval_samples_per_second": 31.565, |
|
"eval_steps_per_second": 3.948, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.345656192236599, |
|
"grad_norm": 0.008659069426357746, |
|
"learning_rate": 3.271719038817006e-05, |
|
"loss": 0.0012, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.364140480591497, |
|
"grad_norm": 0.11470120400190353, |
|
"learning_rate": 3.179297597042514e-05, |
|
"loss": 0.0026, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.3826247689463957, |
|
"grad_norm": 9.137892723083496, |
|
"learning_rate": 3.086876155268023e-05, |
|
"loss": 0.0298, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.401109057301294, |
|
"grad_norm": 0.01612740568816662, |
|
"learning_rate": 2.994454713493531e-05, |
|
"loss": 0.0405, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.4195933456561924, |
|
"grad_norm": 0.01397150382399559, |
|
"learning_rate": 2.902033271719039e-05, |
|
"loss": 0.0018, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.4380776340110906, |
|
"grad_norm": 0.0072495522908866405, |
|
"learning_rate": 2.8096118299445472e-05, |
|
"loss": 0.0008, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.4565619223659887, |
|
"grad_norm": 0.009662143886089325, |
|
"learning_rate": 2.7171903881700557e-05, |
|
"loss": 0.0484, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.4750462107208873, |
|
"grad_norm": 0.005946171935647726, |
|
"learning_rate": 2.624768946395564e-05, |
|
"loss": 0.0014, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.4935304990757854, |
|
"grad_norm": 0.010195981711149216, |
|
"learning_rate": 2.532347504621072e-05, |
|
"loss": 0.0479, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.512014787430684, |
|
"grad_norm": 0.02414027974009514, |
|
"learning_rate": 2.4399260628465805e-05, |
|
"loss": 0.0175, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.512014787430684, |
|
"eval_accuracy": 0.9973804846103471, |
|
"eval_loss": 0.011128582060337067, |
|
"eval_runtime": 48.1238, |
|
"eval_samples_per_second": 31.731, |
|
"eval_steps_per_second": 3.969, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.530499075785582, |
|
"grad_norm": 0.028868084773421288, |
|
"learning_rate": 2.347504621072089e-05, |
|
"loss": 0.001, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.5489833641404807, |
|
"grad_norm": 0.005811081267893314, |
|
"learning_rate": 2.255083179297597e-05, |
|
"loss": 0.0204, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.567467652495379, |
|
"grad_norm": 0.6555341482162476, |
|
"learning_rate": 2.1626617375231053e-05, |
|
"loss": 0.04, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.5859519408502774, |
|
"grad_norm": 0.006032618228346109, |
|
"learning_rate": 2.0702402957486137e-05, |
|
"loss": 0.0096, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.6044362292051755, |
|
"grad_norm": 0.006055204197764397, |
|
"learning_rate": 1.9778188539741222e-05, |
|
"loss": 0.0075, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.622920517560074, |
|
"grad_norm": 0.010445200838148594, |
|
"learning_rate": 1.8853974121996304e-05, |
|
"loss": 0.0042, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.641404805914972, |
|
"grad_norm": 0.02863822504878044, |
|
"learning_rate": 1.7929759704251385e-05, |
|
"loss": 0.001, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.6598890942698707, |
|
"grad_norm": 0.007255155127495527, |
|
"learning_rate": 1.700554528650647e-05, |
|
"loss": 0.0015, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.678373382624769, |
|
"grad_norm": 0.012167246080935001, |
|
"learning_rate": 1.6081330868761555e-05, |
|
"loss": 0.0185, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.6968576709796674, |
|
"grad_norm": 0.008204938843846321, |
|
"learning_rate": 1.5157116451016636e-05, |
|
"loss": 0.0013, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.6968576709796674, |
|
"eval_accuracy": 0.9973804846103471, |
|
"eval_loss": 0.010935045778751373, |
|
"eval_runtime": 47.9657, |
|
"eval_samples_per_second": 31.835, |
|
"eval_steps_per_second": 3.982, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.7153419593345656, |
|
"grad_norm": 0.0491810217499733, |
|
"learning_rate": 1.423290203327172e-05, |
|
"loss": 0.0011, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.733826247689464, |
|
"grad_norm": 0.006103880237787962, |
|
"learning_rate": 1.3308687615526803e-05, |
|
"loss": 0.0008, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.7523105360443623, |
|
"grad_norm": 0.008025890216231346, |
|
"learning_rate": 1.2384473197781886e-05, |
|
"loss": 0.0102, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.7707948243992604, |
|
"grad_norm": 0.005999017972499132, |
|
"learning_rate": 1.1460258780036969e-05, |
|
"loss": 0.0007, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.789279112754159, |
|
"grad_norm": 0.030818577855825424, |
|
"learning_rate": 1.0536044362292052e-05, |
|
"loss": 0.0088, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.8077634011090575, |
|
"grad_norm": 0.005371921230107546, |
|
"learning_rate": 9.611829944547135e-06, |
|
"loss": 0.0007, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.8262476894639557, |
|
"grad_norm": 0.005443415604531765, |
|
"learning_rate": 8.687615526802218e-06, |
|
"loss": 0.0015, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.844731977818854, |
|
"grad_norm": 0.005951932165771723, |
|
"learning_rate": 7.763401109057302e-06, |
|
"loss": 0.0008, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.8632162661737524, |
|
"grad_norm": 0.006055652163922787, |
|
"learning_rate": 6.8391866913123855e-06, |
|
"loss": 0.0009, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.8817005545286505, |
|
"grad_norm": 0.005791305564343929, |
|
"learning_rate": 5.914972273567468e-06, |
|
"loss": 0.0008, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.8817005545286505, |
|
"eval_accuracy": 0.9986902423051736, |
|
"eval_loss": 0.005253881681710482, |
|
"eval_runtime": 48.3848, |
|
"eval_samples_per_second": 31.56, |
|
"eval_steps_per_second": 3.948, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.900184842883549, |
|
"grad_norm": 0.011120261624455452, |
|
"learning_rate": 4.990757855822551e-06, |
|
"loss": 0.0023, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.918669131238447, |
|
"grad_norm": 0.005558283068239689, |
|
"learning_rate": 4.066543438077634e-06, |
|
"loss": 0.0143, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.9371534195933457, |
|
"grad_norm": 0.005820298567414284, |
|
"learning_rate": 3.1423290203327177e-06, |
|
"loss": 0.0016, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.955637707948244, |
|
"grad_norm": 0.006517091300338507, |
|
"learning_rate": 2.2181146025878004e-06, |
|
"loss": 0.0015, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.9741219963031424, |
|
"grad_norm": 0.08097829669713974, |
|
"learning_rate": 1.2939001848428836e-06, |
|
"loss": 0.0379, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.9926062846580406, |
|
"grad_norm": 0.0057843709364533424, |
|
"learning_rate": 3.6968576709796674e-07, |
|
"loss": 0.0008, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 2164, |
|
"total_flos": 2.6818427765818e+18, |
|
"train_loss": 0.07006576062802418, |
|
"train_runtime": 2504.6639, |
|
"train_samples_per_second": 13.817, |
|
"train_steps_per_second": 0.864 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2164, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.6818427765818e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|