|
{ |
|
"best_metric": 0.10886295884847641, |
|
"best_model_checkpoint": "./vit-base-brain-tumor-detection/checkpoint-3700", |
|
"epoch": 20.0, |
|
"eval_steps": 100, |
|
"global_step": 6400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 1.5038443803787231, |
|
"learning_rate": 0.0001996875, |
|
"loss": 1.1434, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.8518216013908386, |
|
"learning_rate": 0.000199375, |
|
"loss": 0.9712, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 2.0169894695281982, |
|
"learning_rate": 0.00019906250000000002, |
|
"loss": 0.9002, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 1.4780454635620117, |
|
"learning_rate": 0.00019875, |
|
"loss": 0.9304, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 0.8681000471115112, |
|
"learning_rate": 0.00019843750000000002, |
|
"loss": 0.9191, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.9179439544677734, |
|
"learning_rate": 0.000198125, |
|
"loss": 0.929, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 1.7340352535247803, |
|
"learning_rate": 0.0001978125, |
|
"loss": 0.878, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.0093904733657837, |
|
"learning_rate": 0.00019750000000000003, |
|
"loss": 0.9233, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 2.5266435146331787, |
|
"learning_rate": 0.00019718750000000002, |
|
"loss": 0.9763, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 3.176175594329834, |
|
"learning_rate": 0.000196875, |
|
"loss": 0.8826, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"eval_accuracy": 0.575, |
|
"eval_loss": 0.9027458429336548, |
|
"eval_runtime": 6.7552, |
|
"eval_samples_per_second": 189.483, |
|
"eval_steps_per_second": 23.685, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 2.2321112155914307, |
|
"learning_rate": 0.00019656250000000001, |
|
"loss": 0.8653, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 2.9958410263061523, |
|
"learning_rate": 0.00019625, |
|
"loss": 0.8645, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 1.3575609922409058, |
|
"learning_rate": 0.0001959375, |
|
"loss": 0.8028, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 2.6939704418182373, |
|
"learning_rate": 0.00019562500000000003, |
|
"loss": 0.7853, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 3.4280309677124023, |
|
"learning_rate": 0.0001953125, |
|
"loss": 0.8574, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.5754916667938232, |
|
"learning_rate": 0.000195, |
|
"loss": 0.9127, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 1.0515691041946411, |
|
"learning_rate": 0.0001946875, |
|
"loss": 0.6715, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 4.097135066986084, |
|
"learning_rate": 0.00019437500000000002, |
|
"loss": 0.9268, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 1.1513463258743286, |
|
"learning_rate": 0.0001940625, |
|
"loss": 1.1014, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 2.027341604232788, |
|
"learning_rate": 0.00019375000000000002, |
|
"loss": 0.8908, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"eval_accuracy": 0.5984375, |
|
"eval_loss": 0.8483627438545227, |
|
"eval_runtime": 6.1939, |
|
"eval_samples_per_second": 206.656, |
|
"eval_steps_per_second": 25.832, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 3.054396629333496, |
|
"learning_rate": 0.0001934375, |
|
"loss": 1.0798, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 1.4708349704742432, |
|
"learning_rate": 0.000193125, |
|
"loss": 0.9002, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 3.5276074409484863, |
|
"learning_rate": 0.00019281250000000003, |
|
"loss": 0.7594, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.7735655307769775, |
|
"learning_rate": 0.00019250000000000002, |
|
"loss": 0.8413, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 2.533834457397461, |
|
"learning_rate": 0.0001921875, |
|
"loss": 0.6925, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 1.8991681337356567, |
|
"learning_rate": 0.00019187500000000002, |
|
"loss": 0.7569, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 1.8567323684692383, |
|
"learning_rate": 0.0001915625, |
|
"loss": 0.8479, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 2.904021739959717, |
|
"learning_rate": 0.00019125000000000001, |
|
"loss": 0.7737, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 2.1224987506866455, |
|
"learning_rate": 0.00019093750000000003, |
|
"loss": 0.7647, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 2.221484661102295, |
|
"learning_rate": 0.000190625, |
|
"loss": 0.8229, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"eval_accuracy": 0.66953125, |
|
"eval_loss": 0.7514046430587769, |
|
"eval_runtime": 6.6282, |
|
"eval_samples_per_second": 193.115, |
|
"eval_steps_per_second": 24.139, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 2.0415520668029785, |
|
"learning_rate": 0.0001903125, |
|
"loss": 0.8343, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.7857444286346436, |
|
"learning_rate": 0.00019, |
|
"loss": 0.7851, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.03125, |
|
"grad_norm": 3.682004451751709, |
|
"learning_rate": 0.00018968750000000002, |
|
"loss": 0.659, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 3.909653902053833, |
|
"learning_rate": 0.000189375, |
|
"loss": 0.5986, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.09375, |
|
"grad_norm": 2.665677785873413, |
|
"learning_rate": 0.00018906250000000002, |
|
"loss": 0.599, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 3.378941059112549, |
|
"learning_rate": 0.00018875, |
|
"loss": 0.6162, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.15625, |
|
"grad_norm": 2.2860324382781982, |
|
"learning_rate": 0.0001884375, |
|
"loss": 0.5133, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 4.41148567199707, |
|
"learning_rate": 0.000188125, |
|
"loss": 0.4787, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.21875, |
|
"grad_norm": 2.802339792251587, |
|
"learning_rate": 0.00018781250000000002, |
|
"loss": 0.8346, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.0579328536987305, |
|
"learning_rate": 0.0001875, |
|
"loss": 0.5299, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_accuracy": 0.71640625, |
|
"eval_loss": 0.6797709465026855, |
|
"eval_runtime": 6.1605, |
|
"eval_samples_per_second": 207.776, |
|
"eval_steps_per_second": 25.972, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.28125, |
|
"grad_norm": 4.516961574554443, |
|
"learning_rate": 0.00018718750000000002, |
|
"loss": 0.5706, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 4.825928688049316, |
|
"learning_rate": 0.000186875, |
|
"loss": 0.4748, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.34375, |
|
"grad_norm": 4.58062219619751, |
|
"learning_rate": 0.00018656250000000001, |
|
"loss": 0.589, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 3.5321483612060547, |
|
"learning_rate": 0.00018625, |
|
"loss": 0.4854, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.40625, |
|
"grad_norm": 2.8487465381622314, |
|
"learning_rate": 0.0001859375, |
|
"loss": 0.4736, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 3.3944597244262695, |
|
"learning_rate": 0.000185625, |
|
"loss": 0.5127, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.46875, |
|
"grad_norm": 3.1018853187561035, |
|
"learning_rate": 0.0001853125, |
|
"loss": 0.5043, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.7340843677520752, |
|
"learning_rate": 0.00018500000000000002, |
|
"loss": 0.4293, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.53125, |
|
"grad_norm": 5.125267505645752, |
|
"learning_rate": 0.0001846875, |
|
"loss": 0.4616, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 4.208590030670166, |
|
"learning_rate": 0.000184375, |
|
"loss": 0.5207, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"eval_accuracy": 0.7375, |
|
"eval_loss": 0.6465662717819214, |
|
"eval_runtime": 6.4735, |
|
"eval_samples_per_second": 197.729, |
|
"eval_steps_per_second": 24.716, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.59375, |
|
"grad_norm": 3.470857620239258, |
|
"learning_rate": 0.0001840625, |
|
"loss": 0.5758, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 2.353254556655884, |
|
"learning_rate": 0.00018375, |
|
"loss": 0.4954, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.65625, |
|
"grad_norm": 5.26539421081543, |
|
"learning_rate": 0.0001834375, |
|
"loss": 0.4687, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 4.430329322814941, |
|
"learning_rate": 0.00018312500000000002, |
|
"loss": 0.3183, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.71875, |
|
"grad_norm": 3.066425323486328, |
|
"learning_rate": 0.0001828125, |
|
"loss": 0.4647, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 3.959084987640381, |
|
"learning_rate": 0.0001825, |
|
"loss": 0.3383, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.78125, |
|
"grad_norm": 7.477952480316162, |
|
"learning_rate": 0.0001821875, |
|
"loss": 0.4317, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 3.7508246898651123, |
|
"learning_rate": 0.00018187500000000002, |
|
"loss": 0.621, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.84375, |
|
"grad_norm": 6.189945220947266, |
|
"learning_rate": 0.0001815625, |
|
"loss": 0.5086, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 6.064743995666504, |
|
"learning_rate": 0.00018125000000000001, |
|
"loss": 0.4967, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"eval_accuracy": 0.74609375, |
|
"eval_loss": 0.6302900314331055, |
|
"eval_runtime": 6.8591, |
|
"eval_samples_per_second": 186.614, |
|
"eval_steps_per_second": 23.327, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.90625, |
|
"grad_norm": 2.6553187370300293, |
|
"learning_rate": 0.0001809375, |
|
"loss": 0.4273, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 8.86166000366211, |
|
"learning_rate": 0.000180625, |
|
"loss": 0.4264, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.96875, |
|
"grad_norm": 5.616076946258545, |
|
"learning_rate": 0.00018031250000000003, |
|
"loss": 0.4907, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 4.036799907684326, |
|
"learning_rate": 0.00018, |
|
"loss": 0.4537, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.03125, |
|
"grad_norm": 4.771281719207764, |
|
"learning_rate": 0.0001796875, |
|
"loss": 0.2492, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 3.654841423034668, |
|
"learning_rate": 0.000179375, |
|
"loss": 0.2568, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.09375, |
|
"grad_norm": 3.043989896774292, |
|
"learning_rate": 0.0001790625, |
|
"loss": 0.2354, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 6.0935516357421875, |
|
"learning_rate": 0.00017875, |
|
"loss": 0.2106, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.15625, |
|
"grad_norm": 2.231640577316284, |
|
"learning_rate": 0.00017843750000000002, |
|
"loss": 0.2804, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 8.500765800476074, |
|
"learning_rate": 0.000178125, |
|
"loss": 0.3977, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"eval_accuracy": 0.771875, |
|
"eval_loss": 0.7239754796028137, |
|
"eval_runtime": 6.8507, |
|
"eval_samples_per_second": 186.843, |
|
"eval_steps_per_second": 23.355, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.21875, |
|
"grad_norm": 5.513547420501709, |
|
"learning_rate": 0.0001778125, |
|
"loss": 0.3942, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 6.819497585296631, |
|
"learning_rate": 0.0001775, |
|
"loss": 0.2489, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.28125, |
|
"grad_norm": 2.9887595176696777, |
|
"learning_rate": 0.00017718750000000002, |
|
"loss": 0.2054, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 2.461519479751587, |
|
"learning_rate": 0.000176875, |
|
"loss": 0.2252, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.34375, |
|
"grad_norm": 1.7205028533935547, |
|
"learning_rate": 0.00017656250000000002, |
|
"loss": 0.1975, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.9731912612915039, |
|
"learning_rate": 0.00017625, |
|
"loss": 0.2049, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.40625, |
|
"grad_norm": 4.950948715209961, |
|
"learning_rate": 0.0001759375, |
|
"loss": 0.283, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 3.5972580909729004, |
|
"learning_rate": 0.00017562500000000003, |
|
"loss": 0.2172, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.46875, |
|
"grad_norm": 8.976056098937988, |
|
"learning_rate": 0.0001753125, |
|
"loss": 0.2733, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.2202197313308716, |
|
"learning_rate": 0.000175, |
|
"loss": 0.2744, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_accuracy": 0.8734375, |
|
"eval_loss": 0.35440793633461, |
|
"eval_runtime": 6.2483, |
|
"eval_samples_per_second": 204.854, |
|
"eval_steps_per_second": 25.607, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.53125, |
|
"grad_norm": 7.248142719268799, |
|
"learning_rate": 0.0001746875, |
|
"loss": 0.2173, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 6.068428039550781, |
|
"learning_rate": 0.000174375, |
|
"loss": 0.2172, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.59375, |
|
"grad_norm": 5.297908306121826, |
|
"learning_rate": 0.0001740625, |
|
"loss": 0.3827, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 8.936563491821289, |
|
"learning_rate": 0.00017375000000000002, |
|
"loss": 0.2638, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.65625, |
|
"grad_norm": 5.005488395690918, |
|
"learning_rate": 0.0001734375, |
|
"loss": 0.281, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 0.3621249794960022, |
|
"learning_rate": 0.000173125, |
|
"loss": 0.1424, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.71875, |
|
"grad_norm": 2.321965217590332, |
|
"learning_rate": 0.0001728125, |
|
"loss": 0.2586, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 9.775487899780273, |
|
"learning_rate": 0.00017250000000000002, |
|
"loss": 0.185, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.78125, |
|
"grad_norm": 0.6502866744995117, |
|
"learning_rate": 0.0001721875, |
|
"loss": 0.1635, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 3.4629428386688232, |
|
"learning_rate": 0.00017187500000000002, |
|
"loss": 0.4271, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"eval_accuracy": 0.89375, |
|
"eval_loss": 0.30369627475738525, |
|
"eval_runtime": 6.7624, |
|
"eval_samples_per_second": 189.281, |
|
"eval_steps_per_second": 23.66, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.84375, |
|
"grad_norm": 1.4681894779205322, |
|
"learning_rate": 0.0001715625, |
|
"loss": 0.193, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 2.8025732040405273, |
|
"learning_rate": 0.00017125, |
|
"loss": 0.2532, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.90625, |
|
"grad_norm": 5.300601482391357, |
|
"learning_rate": 0.00017093750000000003, |
|
"loss": 0.178, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 0.9630404710769653, |
|
"learning_rate": 0.00017062500000000001, |
|
"loss": 0.2453, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.96875, |
|
"grad_norm": 3.942497968673706, |
|
"learning_rate": 0.0001703125, |
|
"loss": 0.2302, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 5.0087809562683105, |
|
"learning_rate": 0.00017, |
|
"loss": 0.2989, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.03125, |
|
"grad_norm": 6.783499717712402, |
|
"learning_rate": 0.0001696875, |
|
"loss": 0.3932, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 4.156474590301514, |
|
"learning_rate": 0.000169375, |
|
"loss": 0.1148, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.09375, |
|
"grad_norm": 0.22207698225975037, |
|
"learning_rate": 0.00016906250000000002, |
|
"loss": 0.0925, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 7.46227502822876, |
|
"learning_rate": 0.00016875, |
|
"loss": 0.2484, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"eval_accuracy": 0.86015625, |
|
"eval_loss": 0.4111490249633789, |
|
"eval_runtime": 6.2118, |
|
"eval_samples_per_second": 206.058, |
|
"eval_steps_per_second": 25.757, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.15625, |
|
"grad_norm": 1.8679814338684082, |
|
"learning_rate": 0.0001684375, |
|
"loss": 0.1024, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 1.4344357252120972, |
|
"learning_rate": 0.000168125, |
|
"loss": 0.1532, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.21875, |
|
"grad_norm": 5.7756218910217285, |
|
"learning_rate": 0.00016781250000000002, |
|
"loss": 0.1433, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 6.2325358390808105, |
|
"learning_rate": 0.0001675, |
|
"loss": 0.2056, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.28125, |
|
"grad_norm": 0.7062513828277588, |
|
"learning_rate": 0.00016718750000000002, |
|
"loss": 0.1883, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 0.5934551954269409, |
|
"learning_rate": 0.000166875, |
|
"loss": 0.1311, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.34375, |
|
"grad_norm": 2.90659236907959, |
|
"learning_rate": 0.0001665625, |
|
"loss": 0.0926, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.12364790588617325, |
|
"learning_rate": 0.00016625000000000003, |
|
"loss": 0.0998, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.40625, |
|
"grad_norm": 8.838610649108887, |
|
"learning_rate": 0.00016593750000000002, |
|
"loss": 0.1322, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 0.9064333438873291, |
|
"learning_rate": 0.000165625, |
|
"loss": 0.0797, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"eval_accuracy": 0.8953125, |
|
"eval_loss": 0.3781999349594116, |
|
"eval_runtime": 6.8762, |
|
"eval_samples_per_second": 186.149, |
|
"eval_steps_per_second": 23.269, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.46875, |
|
"grad_norm": 0.37031790614128113, |
|
"learning_rate": 0.0001653125, |
|
"loss": 0.1157, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 10.903687477111816, |
|
"learning_rate": 0.000165, |
|
"loss": 0.1616, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.53125, |
|
"grad_norm": 7.575557231903076, |
|
"learning_rate": 0.0001646875, |
|
"loss": 0.0864, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 0.7798458337783813, |
|
"learning_rate": 0.00016437500000000002, |
|
"loss": 0.1479, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.59375, |
|
"grad_norm": 0.30808526277542114, |
|
"learning_rate": 0.0001640625, |
|
"loss": 0.091, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 1.3779064416885376, |
|
"learning_rate": 0.00016375, |
|
"loss": 0.0918, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.65625, |
|
"grad_norm": 10.742758750915527, |
|
"learning_rate": 0.0001634375, |
|
"loss": 0.213, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"grad_norm": 4.325877666473389, |
|
"learning_rate": 0.00016312500000000002, |
|
"loss": 0.1672, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.71875, |
|
"grad_norm": 0.06019480153918266, |
|
"learning_rate": 0.0001628125, |
|
"loss": 0.1229, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.060391902923583984, |
|
"learning_rate": 0.00016250000000000002, |
|
"loss": 0.0662, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_accuracy": 0.9171875, |
|
"eval_loss": 0.309553325176239, |
|
"eval_runtime": 6.3504, |
|
"eval_samples_per_second": 201.561, |
|
"eval_steps_per_second": 25.195, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.78125, |
|
"grad_norm": 7.1998066902160645, |
|
"learning_rate": 0.0001621875, |
|
"loss": 0.2368, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"grad_norm": 2.0582878589630127, |
|
"learning_rate": 0.000161875, |
|
"loss": 0.1161, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.84375, |
|
"grad_norm": 0.4987049698829651, |
|
"learning_rate": 0.0001615625, |
|
"loss": 0.1798, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 6.303996562957764, |
|
"learning_rate": 0.00016125000000000002, |
|
"loss": 0.2193, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.90625, |
|
"grad_norm": 0.1131618395447731, |
|
"learning_rate": 0.0001609375, |
|
"loss": 0.1626, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"grad_norm": 8.112652778625488, |
|
"learning_rate": 0.00016062500000000001, |
|
"loss": 0.1033, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.96875, |
|
"grad_norm": 0.14582502841949463, |
|
"learning_rate": 0.0001603125, |
|
"loss": 0.0548, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 7.206060409545898, |
|
"learning_rate": 0.00016, |
|
"loss": 0.1758, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.03125, |
|
"grad_norm": 0.12302320450544357, |
|
"learning_rate": 0.0001596875, |
|
"loss": 0.0118, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"grad_norm": 1.9936612844467163, |
|
"learning_rate": 0.000159375, |
|
"loss": 0.0894, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.0625, |
|
"eval_accuracy": 0.92890625, |
|
"eval_loss": 0.281791090965271, |
|
"eval_runtime": 6.6912, |
|
"eval_samples_per_second": 191.295, |
|
"eval_steps_per_second": 23.912, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.09375, |
|
"grad_norm": 15.060647010803223, |
|
"learning_rate": 0.0001590625, |
|
"loss": 0.0328, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.125, |
|
"grad_norm": 0.06893154978752136, |
|
"learning_rate": 0.00015875, |
|
"loss": 0.0508, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.15625, |
|
"grad_norm": 0.38513150811195374, |
|
"learning_rate": 0.00015843750000000002, |
|
"loss": 0.054, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 4.1875, |
|
"grad_norm": 9.78172492980957, |
|
"learning_rate": 0.000158125, |
|
"loss": 0.0225, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.21875, |
|
"grad_norm": 0.9426405429840088, |
|
"learning_rate": 0.00015781250000000002, |
|
"loss": 0.0897, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.03851567581295967, |
|
"learning_rate": 0.0001575, |
|
"loss": 0.0723, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 4.28125, |
|
"grad_norm": 3.465240001678467, |
|
"learning_rate": 0.0001571875, |
|
"loss": 0.0281, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 4.3125, |
|
"grad_norm": 4.168702125549316, |
|
"learning_rate": 0.000156875, |
|
"loss": 0.0334, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.34375, |
|
"grad_norm": 0.022729417309165, |
|
"learning_rate": 0.00015656250000000002, |
|
"loss": 0.0786, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"grad_norm": 4.370222568511963, |
|
"learning_rate": 0.00015625, |
|
"loss": 0.1005, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.375, |
|
"eval_accuracy": 0.946875, |
|
"eval_loss": 0.21635571122169495, |
|
"eval_runtime": 6.8886, |
|
"eval_samples_per_second": 185.815, |
|
"eval_steps_per_second": 23.227, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.40625, |
|
"grad_norm": 2.6576523780822754, |
|
"learning_rate": 0.00015593750000000002, |
|
"loss": 0.0235, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 4.4375, |
|
"grad_norm": 9.146688461303711, |
|
"learning_rate": 0.000155625, |
|
"loss": 0.0842, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.46875, |
|
"grad_norm": 0.019813504070043564, |
|
"learning_rate": 0.00015531250000000001, |
|
"loss": 0.092, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.2571779191493988, |
|
"learning_rate": 0.000155, |
|
"loss": 0.0473, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.53125, |
|
"grad_norm": 5.699305057525635, |
|
"learning_rate": 0.0001546875, |
|
"loss": 0.1053, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 4.5625, |
|
"grad_norm": 0.04671861603856087, |
|
"learning_rate": 0.000154375, |
|
"loss": 0.0068, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.59375, |
|
"grad_norm": 0.1444374918937683, |
|
"learning_rate": 0.0001540625, |
|
"loss": 0.0946, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 4.625, |
|
"grad_norm": 0.06924466788768768, |
|
"learning_rate": 0.00015375000000000002, |
|
"loss": 0.0213, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.65625, |
|
"grad_norm": 0.03169933706521988, |
|
"learning_rate": 0.0001534375, |
|
"loss": 0.24, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"grad_norm": 4.107417106628418, |
|
"learning_rate": 0.000153125, |
|
"loss": 0.0997, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.6875, |
|
"eval_accuracy": 0.9109375, |
|
"eval_loss": 0.3378385901451111, |
|
"eval_runtime": 6.6178, |
|
"eval_samples_per_second": 193.418, |
|
"eval_steps_per_second": 24.177, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.71875, |
|
"grad_norm": 0.2504006028175354, |
|
"learning_rate": 0.0001528125, |
|
"loss": 0.0903, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 3.6016788482666016, |
|
"learning_rate": 0.0001525, |
|
"loss": 0.1676, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.78125, |
|
"grad_norm": 0.02008502557873726, |
|
"learning_rate": 0.0001521875, |
|
"loss": 0.0731, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 4.8125, |
|
"grad_norm": 0.019816860556602478, |
|
"learning_rate": 0.00015187500000000002, |
|
"loss": 0.0559, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.84375, |
|
"grad_norm": 1.7278194427490234, |
|
"learning_rate": 0.0001515625, |
|
"loss": 0.0306, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 4.875, |
|
"grad_norm": 0.19783662259578705, |
|
"learning_rate": 0.00015125, |
|
"loss": 0.1335, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.90625, |
|
"grad_norm": 11.390993118286133, |
|
"learning_rate": 0.0001509375, |
|
"loss": 0.1606, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 4.9375, |
|
"grad_norm": 4.290327072143555, |
|
"learning_rate": 0.00015062500000000002, |
|
"loss": 0.0807, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.96875, |
|
"grad_norm": 0.28723642230033875, |
|
"learning_rate": 0.0001503125, |
|
"loss": 0.1673, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.05478620529174805, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.0715, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.91328125, |
|
"eval_loss": 0.3626965880393982, |
|
"eval_runtime": 6.9646, |
|
"eval_samples_per_second": 183.788, |
|
"eval_steps_per_second": 22.973, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.03125, |
|
"grad_norm": 3.0136938095092773, |
|
"learning_rate": 0.0001496875, |
|
"loss": 0.1965, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 5.0625, |
|
"grad_norm": 0.11938930302858353, |
|
"learning_rate": 0.00014937499999999999, |
|
"loss": 0.0065, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 5.09375, |
|
"grad_norm": 0.04192354157567024, |
|
"learning_rate": 0.00014906250000000003, |
|
"loss": 0.0813, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 5.125, |
|
"grad_norm": 0.018585534766316414, |
|
"learning_rate": 0.00014875, |
|
"loss": 0.0973, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 5.15625, |
|
"grad_norm": 3.9032442569732666, |
|
"learning_rate": 0.0001484375, |
|
"loss": 0.0587, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 5.1875, |
|
"grad_norm": 0.04510480910539627, |
|
"learning_rate": 0.000148125, |
|
"loss": 0.1506, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 5.21875, |
|
"grad_norm": 0.2749982178211212, |
|
"learning_rate": 0.0001478125, |
|
"loss": 0.1367, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 6.217327117919922, |
|
"learning_rate": 0.0001475, |
|
"loss": 0.0734, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.28125, |
|
"grad_norm": 0.11703751981258392, |
|
"learning_rate": 0.00014718750000000002, |
|
"loss": 0.0961, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"grad_norm": 1.1948308944702148, |
|
"learning_rate": 0.000146875, |
|
"loss": 0.0567, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.3125, |
|
"eval_accuracy": 0.9234375, |
|
"eval_loss": 0.30611464381217957, |
|
"eval_runtime": 7.2211, |
|
"eval_samples_per_second": 177.258, |
|
"eval_steps_per_second": 22.157, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.34375, |
|
"grad_norm": 0.025335168465971947, |
|
"learning_rate": 0.0001465625, |
|
"loss": 0.0511, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 5.375, |
|
"grad_norm": 0.033230509608983994, |
|
"learning_rate": 0.00014625, |
|
"loss": 0.0857, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 5.40625, |
|
"grad_norm": 0.027418823912739754, |
|
"learning_rate": 0.00014593750000000002, |
|
"loss": 0.0054, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 5.4375, |
|
"grad_norm": 7.358951568603516, |
|
"learning_rate": 0.000145625, |
|
"loss": 0.0582, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.46875, |
|
"grad_norm": 0.06502091139554977, |
|
"learning_rate": 0.00014531250000000002, |
|
"loss": 0.11, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 3.4256505966186523, |
|
"learning_rate": 0.000145, |
|
"loss": 0.1004, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 5.53125, |
|
"grad_norm": 0.023563764989376068, |
|
"learning_rate": 0.0001446875, |
|
"loss": 0.0066, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 5.5625, |
|
"grad_norm": 0.049036819487810135, |
|
"learning_rate": 0.00014437500000000003, |
|
"loss": 0.0271, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 5.59375, |
|
"grad_norm": 0.02042466588318348, |
|
"learning_rate": 0.0001440625, |
|
"loss": 0.0132, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"grad_norm": 0.05669878423213959, |
|
"learning_rate": 0.00014375, |
|
"loss": 0.0558, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.625, |
|
"eval_accuracy": 0.94609375, |
|
"eval_loss": 0.23927736282348633, |
|
"eval_runtime": 6.5481, |
|
"eval_samples_per_second": 195.476, |
|
"eval_steps_per_second": 24.434, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.65625, |
|
"grad_norm": 0.9565289616584778, |
|
"learning_rate": 0.0001434375, |
|
"loss": 0.0366, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 5.6875, |
|
"grad_norm": 0.024000531062483788, |
|
"learning_rate": 0.000143125, |
|
"loss": 0.14, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 5.71875, |
|
"grad_norm": 6.16231107711792, |
|
"learning_rate": 0.0001428125, |
|
"loss": 0.1586, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 10.281587600708008, |
|
"learning_rate": 0.00014250000000000002, |
|
"loss": 0.1454, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 5.78125, |
|
"grad_norm": 8.064495086669922, |
|
"learning_rate": 0.0001421875, |
|
"loss": 0.1354, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 5.8125, |
|
"grad_norm": 0.27762091159820557, |
|
"learning_rate": 0.000141875, |
|
"loss": 0.0356, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 5.84375, |
|
"grad_norm": 7.199831962585449, |
|
"learning_rate": 0.0001415625, |
|
"loss": 0.0485, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 5.875, |
|
"grad_norm": 0.013454968109726906, |
|
"learning_rate": 0.00014125000000000002, |
|
"loss": 0.0317, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 5.90625, |
|
"grad_norm": 0.8754172921180725, |
|
"learning_rate": 0.0001409375, |
|
"loss": 0.0198, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"grad_norm": 0.06731715798377991, |
|
"learning_rate": 0.00014062500000000002, |
|
"loss": 0.0061, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 5.9375, |
|
"eval_accuracy": 0.95859375, |
|
"eval_loss": 0.17380020022392273, |
|
"eval_runtime": 6.5259, |
|
"eval_samples_per_second": 196.142, |
|
"eval_steps_per_second": 24.518, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 5.96875, |
|
"grad_norm": 1.3012233972549438, |
|
"learning_rate": 0.0001403125, |
|
"loss": 0.1023, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.058877017349004745, |
|
"learning_rate": 0.00014, |
|
"loss": 0.0555, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 6.03125, |
|
"grad_norm": 1.3218984603881836, |
|
"learning_rate": 0.00013968750000000003, |
|
"loss": 0.1671, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 6.0625, |
|
"grad_norm": 0.7441987991333008, |
|
"learning_rate": 0.000139375, |
|
"loss": 0.046, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 6.09375, |
|
"grad_norm": 0.3219761550426483, |
|
"learning_rate": 0.0001390625, |
|
"loss": 0.0741, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 6.125, |
|
"grad_norm": 0.3803882598876953, |
|
"learning_rate": 0.00013875, |
|
"loss": 0.1102, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 6.15625, |
|
"grad_norm": 0.034619808197021484, |
|
"learning_rate": 0.0001384375, |
|
"loss": 0.0844, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 6.1875, |
|
"grad_norm": 0.03565617650747299, |
|
"learning_rate": 0.000138125, |
|
"loss": 0.0162, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 6.21875, |
|
"grad_norm": 0.05813159421086311, |
|
"learning_rate": 0.00013781250000000002, |
|
"loss": 0.0303, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.16888560354709625, |
|
"learning_rate": 0.0001375, |
|
"loss": 0.0449, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_accuracy": 0.94921875, |
|
"eval_loss": 0.20937061309814453, |
|
"eval_runtime": 6.6824, |
|
"eval_samples_per_second": 191.549, |
|
"eval_steps_per_second": 23.944, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.28125, |
|
"grad_norm": 0.13594718277454376, |
|
"learning_rate": 0.0001371875, |
|
"loss": 0.0127, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 6.3125, |
|
"grad_norm": 5.2705183029174805, |
|
"learning_rate": 0.000136875, |
|
"loss": 0.032, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 6.34375, |
|
"grad_norm": 11.641499519348145, |
|
"learning_rate": 0.00013656250000000002, |
|
"loss": 0.063, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 6.375, |
|
"grad_norm": 0.017323823645710945, |
|
"learning_rate": 0.00013625, |
|
"loss": 0.0106, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 6.40625, |
|
"grad_norm": 0.029373859986662865, |
|
"learning_rate": 0.00013593750000000002, |
|
"loss": 0.0032, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 6.4375, |
|
"grad_norm": 0.3746764063835144, |
|
"learning_rate": 0.000135625, |
|
"loss": 0.0606, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 6.46875, |
|
"grad_norm": 0.1948755830526352, |
|
"learning_rate": 0.0001353125, |
|
"loss": 0.0763, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.017781907692551613, |
|
"learning_rate": 0.00013500000000000003, |
|
"loss": 0.1066, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 6.53125, |
|
"grad_norm": 7.0899577140808105, |
|
"learning_rate": 0.00013468750000000001, |
|
"loss": 0.0805, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"grad_norm": 0.2712390720844269, |
|
"learning_rate": 0.000134375, |
|
"loss": 0.0073, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.5625, |
|
"eval_accuracy": 0.95390625, |
|
"eval_loss": 0.18335095047950745, |
|
"eval_runtime": 7.1604, |
|
"eval_samples_per_second": 178.761, |
|
"eval_steps_per_second": 22.345, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.59375, |
|
"grad_norm": 18.94267463684082, |
|
"learning_rate": 0.0001340625, |
|
"loss": 0.0978, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 6.625, |
|
"grad_norm": 12.507984161376953, |
|
"learning_rate": 0.00013375, |
|
"loss": 0.1564, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 6.65625, |
|
"grad_norm": 0.11404982954263687, |
|
"learning_rate": 0.0001334375, |
|
"loss": 0.2087, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 6.6875, |
|
"grad_norm": 5.735910415649414, |
|
"learning_rate": 0.00013312500000000002, |
|
"loss": 0.1009, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 6.71875, |
|
"grad_norm": 0.24378864467144012, |
|
"learning_rate": 0.0001328125, |
|
"loss": 0.117, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 1.378218412399292, |
|
"learning_rate": 0.0001325, |
|
"loss": 0.0559, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 6.78125, |
|
"grad_norm": 0.11509452760219574, |
|
"learning_rate": 0.0001321875, |
|
"loss": 0.1116, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 6.8125, |
|
"grad_norm": 0.024781059473752975, |
|
"learning_rate": 0.00013187500000000002, |
|
"loss": 0.0162, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 6.84375, |
|
"grad_norm": 0.013253854587674141, |
|
"learning_rate": 0.0001315625, |
|
"loss": 0.0124, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"grad_norm": 0.009435000829398632, |
|
"learning_rate": 0.00013125000000000002, |
|
"loss": 0.0425, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 6.875, |
|
"eval_accuracy": 0.9265625, |
|
"eval_loss": 0.2847265601158142, |
|
"eval_runtime": 6.4944, |
|
"eval_samples_per_second": 197.092, |
|
"eval_steps_per_second": 24.637, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 6.90625, |
|
"grad_norm": 0.009621995501220226, |
|
"learning_rate": 0.0001309375, |
|
"loss": 0.0092, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 6.9375, |
|
"grad_norm": 13.595647811889648, |
|
"learning_rate": 0.000130625, |
|
"loss": 0.033, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 6.96875, |
|
"grad_norm": 0.046547386795282364, |
|
"learning_rate": 0.0001303125, |
|
"loss": 0.0697, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.018974941223859787, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 0.0325, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 7.03125, |
|
"grad_norm": 0.011768043972551823, |
|
"learning_rate": 0.0001296875, |
|
"loss": 0.1392, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 7.0625, |
|
"grad_norm": 0.10145868360996246, |
|
"learning_rate": 0.00012937500000000001, |
|
"loss": 0.0034, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 7.09375, |
|
"grad_norm": 2.6255838871002197, |
|
"learning_rate": 0.0001290625, |
|
"loss": 0.0205, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 7.125, |
|
"grad_norm": 7.713711261749268, |
|
"learning_rate": 0.00012875, |
|
"loss": 0.085, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 7.15625, |
|
"grad_norm": 0.015717368572950363, |
|
"learning_rate": 0.0001284375, |
|
"loss": 0.073, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"grad_norm": 0.023385796695947647, |
|
"learning_rate": 0.000128125, |
|
"loss": 0.0397, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 7.1875, |
|
"eval_accuracy": 0.9125, |
|
"eval_loss": 0.40313416719436646, |
|
"eval_runtime": 7.0181, |
|
"eval_samples_per_second": 182.385, |
|
"eval_steps_per_second": 22.798, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 7.21875, |
|
"grad_norm": 0.05634606257081032, |
|
"learning_rate": 0.0001278125, |
|
"loss": 0.0661, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.0715768113732338, |
|
"learning_rate": 0.0001275, |
|
"loss": 0.0194, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 7.28125, |
|
"grad_norm": 0.03949156031012535, |
|
"learning_rate": 0.00012718750000000002, |
|
"loss": 0.0039, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 7.3125, |
|
"grad_norm": 0.011410553939640522, |
|
"learning_rate": 0.000126875, |
|
"loss": 0.0305, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 7.34375, |
|
"grad_norm": 0.00789843499660492, |
|
"learning_rate": 0.0001265625, |
|
"loss": 0.0022, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 7.375, |
|
"grad_norm": 0.6067203283309937, |
|
"learning_rate": 0.00012625, |
|
"loss": 0.0022, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 7.40625, |
|
"grad_norm": 3.3476336002349854, |
|
"learning_rate": 0.0001259375, |
|
"loss": 0.0642, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 7.4375, |
|
"grad_norm": 0.017286688089370728, |
|
"learning_rate": 0.000125625, |
|
"loss": 0.0081, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 7.46875, |
|
"grad_norm": 0.007009466178715229, |
|
"learning_rate": 0.00012531250000000002, |
|
"loss": 0.0308, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 5.178075313568115, |
|
"learning_rate": 0.000125, |
|
"loss": 0.0284, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"eval_accuracy": 0.940625, |
|
"eval_loss": 0.29945191740989685, |
|
"eval_runtime": 6.4921, |
|
"eval_samples_per_second": 197.161, |
|
"eval_steps_per_second": 24.645, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.53125, |
|
"grad_norm": 0.03435547277331352, |
|
"learning_rate": 0.0001246875, |
|
"loss": 0.0786, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 7.5625, |
|
"grad_norm": 0.2063717395067215, |
|
"learning_rate": 0.000124375, |
|
"loss": 0.0575, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 7.59375, |
|
"grad_norm": 0.021375322714447975, |
|
"learning_rate": 0.00012406250000000001, |
|
"loss": 0.0017, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 7.625, |
|
"grad_norm": 0.010769976302981377, |
|
"learning_rate": 0.00012375, |
|
"loss": 0.0437, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 7.65625, |
|
"grad_norm": 0.01713966764509678, |
|
"learning_rate": 0.0001234375, |
|
"loss": 0.0018, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 7.6875, |
|
"grad_norm": 0.007944900542497635, |
|
"learning_rate": 0.000123125, |
|
"loss": 0.011, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 7.71875, |
|
"grad_norm": 0.041198715567588806, |
|
"learning_rate": 0.0001228125, |
|
"loss": 0.0019, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.009142044000327587, |
|
"learning_rate": 0.00012250000000000002, |
|
"loss": 0.0287, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 7.78125, |
|
"grad_norm": 0.00957415159791708, |
|
"learning_rate": 0.0001221875, |
|
"loss": 0.0015, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"grad_norm": 0.018495427444577217, |
|
"learning_rate": 0.00012187500000000001, |
|
"loss": 0.0158, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 7.8125, |
|
"eval_accuracy": 0.96640625, |
|
"eval_loss": 0.19092732667922974, |
|
"eval_runtime": 6.9695, |
|
"eval_samples_per_second": 183.658, |
|
"eval_steps_per_second": 22.957, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 7.84375, |
|
"grad_norm": 0.005418677814304829, |
|
"learning_rate": 0.00012156250000000001, |
|
"loss": 0.0196, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 7.875, |
|
"grad_norm": 0.30978670716285706, |
|
"learning_rate": 0.00012124999999999999, |
|
"loss": 0.0327, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 7.90625, |
|
"grad_norm": 0.006560006178915501, |
|
"learning_rate": 0.00012093750000000002, |
|
"loss": 0.0079, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 7.9375, |
|
"grad_norm": 0.006091834977269173, |
|
"learning_rate": 0.000120625, |
|
"loss": 0.0086, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 7.96875, |
|
"grad_norm": 4.461112022399902, |
|
"learning_rate": 0.0001203125, |
|
"loss": 0.0603, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 18.875268936157227, |
|
"learning_rate": 0.00012, |
|
"loss": 0.0695, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 8.03125, |
|
"grad_norm": 0.006029163487255573, |
|
"learning_rate": 0.0001196875, |
|
"loss": 0.0015, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 8.0625, |
|
"grad_norm": 0.00949636660516262, |
|
"learning_rate": 0.00011937500000000001, |
|
"loss": 0.0029, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 8.09375, |
|
"grad_norm": 0.011131886392831802, |
|
"learning_rate": 0.00011906250000000001, |
|
"loss": 0.0037, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"grad_norm": 0.43485090136528015, |
|
"learning_rate": 0.00011875, |
|
"loss": 0.006, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 8.125, |
|
"eval_accuracy": 0.9296875, |
|
"eval_loss": 0.35239508748054504, |
|
"eval_runtime": 6.7581, |
|
"eval_samples_per_second": 189.401, |
|
"eval_steps_per_second": 23.675, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 8.15625, |
|
"grad_norm": 0.004519434180110693, |
|
"learning_rate": 0.0001184375, |
|
"loss": 0.0774, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 8.1875, |
|
"grad_norm": 6.793065547943115, |
|
"learning_rate": 0.000118125, |
|
"loss": 0.0311, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 8.21875, |
|
"grad_norm": 0.0822053775191307, |
|
"learning_rate": 0.00011781250000000001, |
|
"loss": 0.0164, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.0055194334127008915, |
|
"learning_rate": 0.00011750000000000001, |
|
"loss": 0.0019, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 8.28125, |
|
"grad_norm": 0.016379250213503838, |
|
"learning_rate": 0.00011718750000000001, |
|
"loss": 0.0206, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 8.3125, |
|
"grad_norm": 0.00409812992438674, |
|
"learning_rate": 0.000116875, |
|
"loss": 0.0017, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 8.34375, |
|
"grad_norm": 0.004736763890832663, |
|
"learning_rate": 0.0001165625, |
|
"loss": 0.0173, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 8.375, |
|
"grad_norm": 0.006192313041538, |
|
"learning_rate": 0.00011625000000000002, |
|
"loss": 0.0059, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 8.40625, |
|
"grad_norm": 0.019787801429629326, |
|
"learning_rate": 0.0001159375, |
|
"loss": 0.0559, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"grad_norm": 0.945681095123291, |
|
"learning_rate": 0.000115625, |
|
"loss": 0.0017, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.4375, |
|
"eval_accuracy": 0.96171875, |
|
"eval_loss": 0.19076545536518097, |
|
"eval_runtime": 6.3936, |
|
"eval_samples_per_second": 200.199, |
|
"eval_steps_per_second": 25.025, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.46875, |
|
"grad_norm": 0.3741857409477234, |
|
"learning_rate": 0.0001153125, |
|
"loss": 0.001, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.0038401270285248756, |
|
"learning_rate": 0.00011499999999999999, |
|
"loss": 0.0011, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 8.53125, |
|
"grad_norm": 0.005083560012280941, |
|
"learning_rate": 0.00011468750000000002, |
|
"loss": 0.0008, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 8.5625, |
|
"grad_norm": 0.0035322927869856358, |
|
"learning_rate": 0.00011437500000000002, |
|
"loss": 0.0183, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 8.59375, |
|
"grad_norm": 0.005947966128587723, |
|
"learning_rate": 0.0001140625, |
|
"loss": 0.0011, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 8.625, |
|
"grad_norm": 5.917221546173096, |
|
"learning_rate": 0.00011375, |
|
"loss": 0.0021, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 8.65625, |
|
"grad_norm": 0.0036874141078442335, |
|
"learning_rate": 0.0001134375, |
|
"loss": 0.0186, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 8.6875, |
|
"grad_norm": 0.005372929852455854, |
|
"learning_rate": 0.00011312500000000001, |
|
"loss": 0.0241, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 8.71875, |
|
"grad_norm": 0.004914106801152229, |
|
"learning_rate": 0.00011281250000000001, |
|
"loss": 0.0421, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.02046949602663517, |
|
"learning_rate": 0.00011250000000000001, |
|
"loss": 0.0026, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"eval_accuracy": 0.9625, |
|
"eval_loss": 0.17868757247924805, |
|
"eval_runtime": 6.3446, |
|
"eval_samples_per_second": 201.746, |
|
"eval_steps_per_second": 25.218, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 8.78125, |
|
"grad_norm": 0.023118741810321808, |
|
"learning_rate": 0.0001121875, |
|
"loss": 0.0016, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 8.8125, |
|
"grad_norm": 16.51854133605957, |
|
"learning_rate": 0.000111875, |
|
"loss": 0.0685, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 8.84375, |
|
"grad_norm": 0.011586804874241352, |
|
"learning_rate": 0.00011156250000000001, |
|
"loss": 0.0323, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 8.875, |
|
"grad_norm": 0.018752580508589745, |
|
"learning_rate": 0.00011125000000000001, |
|
"loss": 0.0012, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 8.90625, |
|
"grad_norm": 0.014984137378633022, |
|
"learning_rate": 0.0001109375, |
|
"loss": 0.0288, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 8.9375, |
|
"grad_norm": 12.684847831726074, |
|
"learning_rate": 0.00011065625, |
|
"loss": 0.1146, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 8.96875, |
|
"grad_norm": 0.09825449436903, |
|
"learning_rate": 0.00011034375000000001, |
|
"loss": 0.0147, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.011579761281609535, |
|
"learning_rate": 0.00011003125000000001, |
|
"loss": 0.0012, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 9.03125, |
|
"grad_norm": 0.009486192837357521, |
|
"learning_rate": 0.00010971875000000001, |
|
"loss": 0.0257, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"grad_norm": 0.005146791692823172, |
|
"learning_rate": 0.00010940624999999999, |
|
"loss": 0.001, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 9.0625, |
|
"eval_accuracy": 0.96875, |
|
"eval_loss": 0.1328631341457367, |
|
"eval_runtime": 6.918, |
|
"eval_samples_per_second": 185.024, |
|
"eval_steps_per_second": 23.128, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 9.09375, |
|
"grad_norm": 0.005191614385694265, |
|
"learning_rate": 0.00010909374999999999, |
|
"loss": 0.0013, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 9.125, |
|
"grad_norm": 0.024972369894385338, |
|
"learning_rate": 0.00010878125000000002, |
|
"loss": 0.0356, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 9.15625, |
|
"grad_norm": 0.00825554970651865, |
|
"learning_rate": 0.00010846875, |
|
"loss": 0.0009, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 9.1875, |
|
"grad_norm": 0.004267066717147827, |
|
"learning_rate": 0.00010815625, |
|
"loss": 0.0395, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 9.21875, |
|
"grad_norm": 0.7409490346908569, |
|
"learning_rate": 0.00010784375, |
|
"loss": 0.0037, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.006523266900330782, |
|
"learning_rate": 0.00010753124999999999, |
|
"loss": 0.0205, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 9.28125, |
|
"grad_norm": 0.00338526233099401, |
|
"learning_rate": 0.00010721875000000001, |
|
"loss": 0.0185, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 9.3125, |
|
"grad_norm": 0.00550084188580513, |
|
"learning_rate": 0.00010690625000000001, |
|
"loss": 0.0135, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 9.34375, |
|
"grad_norm": 0.0031597930938005447, |
|
"learning_rate": 0.00010659375, |
|
"loss": 0.0012, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"grad_norm": 0.004799798596650362, |
|
"learning_rate": 0.00010628125, |
|
"loss": 0.0497, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 9.375, |
|
"eval_accuracy": 0.959375, |
|
"eval_loss": 0.1878364086151123, |
|
"eval_runtime": 6.9355, |
|
"eval_samples_per_second": 184.558, |
|
"eval_steps_per_second": 23.07, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 9.40625, |
|
"grad_norm": 0.03460833802819252, |
|
"learning_rate": 0.00010596875, |
|
"loss": 0.002, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 9.4375, |
|
"grad_norm": 2.7818896770477295, |
|
"learning_rate": 0.00010565625000000001, |
|
"loss": 0.0333, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 9.46875, |
|
"grad_norm": 0.00796230137348175, |
|
"learning_rate": 0.00010534375000000001, |
|
"loss": 0.041, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.0041164797730743885, |
|
"learning_rate": 0.00010503125000000001, |
|
"loss": 0.071, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 9.53125, |
|
"grad_norm": 0.004923259373754263, |
|
"learning_rate": 0.00010471875, |
|
"loss": 0.0016, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 9.5625, |
|
"grad_norm": 0.03983521834015846, |
|
"learning_rate": 0.00010440625, |
|
"loss": 0.0245, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 9.59375, |
|
"grad_norm": 0.005190184339880943, |
|
"learning_rate": 0.00010409375, |
|
"loss": 0.0134, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 9.625, |
|
"grad_norm": 0.05828193947672844, |
|
"learning_rate": 0.00010378125, |
|
"loss": 0.0599, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 9.65625, |
|
"grad_norm": 0.0039481050334870815, |
|
"learning_rate": 0.00010346875, |
|
"loss": 0.0581, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"grad_norm": 0.007265688385814428, |
|
"learning_rate": 0.00010315625, |
|
"loss": 0.09, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 9.6875, |
|
"eval_accuracy": 0.96484375, |
|
"eval_loss": 0.1753551959991455, |
|
"eval_runtime": 6.4623, |
|
"eval_samples_per_second": 198.073, |
|
"eval_steps_per_second": 24.759, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 9.71875, |
|
"grad_norm": 0.009442336857318878, |
|
"learning_rate": 0.00010284374999999999, |
|
"loss": 0.0009, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 0.0029557254165410995, |
|
"learning_rate": 0.00010253125000000002, |
|
"loss": 0.0011, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 9.78125, |
|
"grad_norm": 0.6488510370254517, |
|
"learning_rate": 0.00010221875, |
|
"loss": 0.0207, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 9.8125, |
|
"grad_norm": 0.008899745531380177, |
|
"learning_rate": 0.00010190625, |
|
"loss": 0.0106, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 9.84375, |
|
"grad_norm": 0.002760515082627535, |
|
"learning_rate": 0.00010159375, |
|
"loss": 0.0172, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 9.875, |
|
"grad_norm": 0.9725448489189148, |
|
"learning_rate": 0.00010128125, |
|
"loss": 0.0065, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 9.90625, |
|
"grad_norm": 0.002948402427136898, |
|
"learning_rate": 0.00010096875000000001, |
|
"loss": 0.0008, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 9.9375, |
|
"grad_norm": 0.0027872335631400347, |
|
"learning_rate": 0.00010065625000000001, |
|
"loss": 0.0023, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 9.96875, |
|
"grad_norm": 0.00609046733006835, |
|
"learning_rate": 0.00010034375000000001, |
|
"loss": 0.0385, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.025308266282081604, |
|
"learning_rate": 0.00010003125, |
|
"loss": 0.0046, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9671875, |
|
"eval_loss": 0.15844720602035522, |
|
"eval_runtime": 6.5917, |
|
"eval_samples_per_second": 194.184, |
|
"eval_steps_per_second": 24.273, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 10.03125, |
|
"grad_norm": 0.0036033540964126587, |
|
"learning_rate": 9.971875000000001e-05, |
|
"loss": 0.0007, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 10.0625, |
|
"grad_norm": 0.005487027112394571, |
|
"learning_rate": 9.940625000000001e-05, |
|
"loss": 0.0893, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 10.09375, |
|
"grad_norm": 0.017159543931484222, |
|
"learning_rate": 9.909375e-05, |
|
"loss": 0.003, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 10.125, |
|
"grad_norm": 0.006799894850701094, |
|
"learning_rate": 9.878125e-05, |
|
"loss": 0.0026, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 10.15625, |
|
"grad_norm": 5.967584609985352, |
|
"learning_rate": 9.846875e-05, |
|
"loss": 0.035, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 10.1875, |
|
"grad_norm": 0.027164561673998833, |
|
"learning_rate": 9.815625e-05, |
|
"loss": 0.0187, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 10.21875, |
|
"grad_norm": 0.010049635544419289, |
|
"learning_rate": 9.784375e-05, |
|
"loss": 0.0012, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 10.25, |
|
"grad_norm": 0.006957762409001589, |
|
"learning_rate": 9.753125e-05, |
|
"loss": 0.0009, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 10.28125, |
|
"grad_norm": 0.009408445097506046, |
|
"learning_rate": 9.721875e-05, |
|
"loss": 0.0016, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 10.3125, |
|
"grad_norm": 0.004588890355080366, |
|
"learning_rate": 9.690625000000001e-05, |
|
"loss": 0.0006, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 10.3125, |
|
"eval_accuracy": 0.96484375, |
|
"eval_loss": 0.20075881481170654, |
|
"eval_runtime": 6.9603, |
|
"eval_samples_per_second": 183.901, |
|
"eval_steps_per_second": 22.988, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 10.34375, |
|
"grad_norm": 0.0026115712244063616, |
|
"learning_rate": 9.659375e-05, |
|
"loss": 0.0008, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 10.375, |
|
"grad_norm": 0.010643471032381058, |
|
"learning_rate": 9.628125e-05, |
|
"loss": 0.0424, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 10.40625, |
|
"grad_norm": 23.894906997680664, |
|
"learning_rate": 9.596875000000001e-05, |
|
"loss": 0.0637, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 10.4375, |
|
"grad_norm": 0.00423228507861495, |
|
"learning_rate": 9.565625e-05, |
|
"loss": 0.0012, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 10.46875, |
|
"grad_norm": 0.0030155859421938658, |
|
"learning_rate": 9.534375000000001e-05, |
|
"loss": 0.0186, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 10.5, |
|
"grad_norm": 0.003306414932012558, |
|
"learning_rate": 9.503125000000001e-05, |
|
"loss": 0.0363, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 10.53125, |
|
"grad_norm": 0.0035443324595689774, |
|
"learning_rate": 9.471875e-05, |
|
"loss": 0.0269, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 10.5625, |
|
"grad_norm": 0.00464298902079463, |
|
"learning_rate": 9.440625000000001e-05, |
|
"loss": 0.0006, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 10.59375, |
|
"grad_norm": 0.003652532584965229, |
|
"learning_rate": 9.409375000000001e-05, |
|
"loss": 0.0006, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 10.625, |
|
"grad_norm": 0.0041074915789067745, |
|
"learning_rate": 9.378125e-05, |
|
"loss": 0.0008, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 10.625, |
|
"eval_accuracy": 0.975, |
|
"eval_loss": 0.12715043127536774, |
|
"eval_runtime": 6.4535, |
|
"eval_samples_per_second": 198.341, |
|
"eval_steps_per_second": 24.793, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 10.65625, |
|
"grad_norm": 0.003479533363133669, |
|
"learning_rate": 9.346875e-05, |
|
"loss": 0.0022, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 10.6875, |
|
"grad_norm": 0.002636971650645137, |
|
"learning_rate": 9.315625e-05, |
|
"loss": 0.0264, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 10.71875, |
|
"grad_norm": 0.0028728495817631483, |
|
"learning_rate": 9.284375e-05, |
|
"loss": 0.0121, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"grad_norm": 0.0024031987413764, |
|
"learning_rate": 9.253125e-05, |
|
"loss": 0.0147, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 10.78125, |
|
"grad_norm": 11.363987922668457, |
|
"learning_rate": 9.221875000000002e-05, |
|
"loss": 0.0628, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 10.8125, |
|
"grad_norm": 0.01323883980512619, |
|
"learning_rate": 9.190625e-05, |
|
"loss": 0.0016, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 10.84375, |
|
"grad_norm": 0.005075179971754551, |
|
"learning_rate": 9.159375e-05, |
|
"loss": 0.0298, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 10.875, |
|
"grad_norm": 0.03815023973584175, |
|
"learning_rate": 9.128125000000001e-05, |
|
"loss": 0.0006, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 10.90625, |
|
"grad_norm": 0.028709406033158302, |
|
"learning_rate": 9.096875e-05, |
|
"loss": 0.0165, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 10.9375, |
|
"grad_norm": 0.002887293929234147, |
|
"learning_rate": 9.065625000000001e-05, |
|
"loss": 0.028, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 10.9375, |
|
"eval_accuracy": 0.9765625, |
|
"eval_loss": 0.14528806507587433, |
|
"eval_runtime": 6.3552, |
|
"eval_samples_per_second": 201.41, |
|
"eval_steps_per_second": 25.176, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 10.96875, |
|
"grad_norm": 0.0032660234719514847, |
|
"learning_rate": 9.034375000000001e-05, |
|
"loss": 0.0008, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.003932199906557798, |
|
"learning_rate": 9.003125e-05, |
|
"loss": 0.0327, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 11.03125, |
|
"grad_norm": 0.0021477933041751385, |
|
"learning_rate": 8.971875000000001e-05, |
|
"loss": 0.0005, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 11.0625, |
|
"grad_norm": 0.002401479985564947, |
|
"learning_rate": 8.940625000000001e-05, |
|
"loss": 0.0306, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 11.09375, |
|
"grad_norm": 8.878641128540039, |
|
"learning_rate": 8.909375000000001e-05, |
|
"loss": 0.0523, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 11.125, |
|
"grad_norm": 0.0029539538081735373, |
|
"learning_rate": 8.878125e-05, |
|
"loss": 0.0006, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 11.15625, |
|
"grad_norm": 0.004306386224925518, |
|
"learning_rate": 8.846875e-05, |
|
"loss": 0.001, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 11.1875, |
|
"grad_norm": 0.013514366932213306, |
|
"learning_rate": 8.815625e-05, |
|
"loss": 0.0005, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 11.21875, |
|
"grad_norm": 0.013187670148909092, |
|
"learning_rate": 8.784375e-05, |
|
"loss": 0.0005, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 11.25, |
|
"grad_norm": 0.002293806755915284, |
|
"learning_rate": 8.753125e-05, |
|
"loss": 0.0005, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 11.25, |
|
"eval_accuracy": 0.975, |
|
"eval_loss": 0.1256314218044281, |
|
"eval_runtime": 6.8185, |
|
"eval_samples_per_second": 187.725, |
|
"eval_steps_per_second": 23.466, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 11.28125, |
|
"grad_norm": 0.0030747319106012583, |
|
"learning_rate": 8.721875e-05, |
|
"loss": 0.0213, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 11.3125, |
|
"grad_norm": 0.002191092586144805, |
|
"learning_rate": 8.690625e-05, |
|
"loss": 0.0014, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 11.34375, |
|
"grad_norm": 15.502256393432617, |
|
"learning_rate": 8.659375e-05, |
|
"loss": 0.0188, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 11.375, |
|
"grad_norm": 0.002841190667822957, |
|
"learning_rate": 8.628125e-05, |
|
"loss": 0.0006, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 11.40625, |
|
"grad_norm": 0.009304801933467388, |
|
"learning_rate": 8.596875000000001e-05, |
|
"loss": 0.0005, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 11.4375, |
|
"grad_norm": 0.002438412746414542, |
|
"learning_rate": 8.565625e-05, |
|
"loss": 0.0004, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 11.46875, |
|
"grad_norm": 0.0025553186424076557, |
|
"learning_rate": 8.534375e-05, |
|
"loss": 0.0005, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 11.5, |
|
"grad_norm": 0.007872486487030983, |
|
"learning_rate": 8.503125000000001e-05, |
|
"loss": 0.0004, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 11.53125, |
|
"grad_norm": 0.003905347315594554, |
|
"learning_rate": 8.471875e-05, |
|
"loss": 0.0004, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 11.5625, |
|
"grad_norm": 0.0032572925556451082, |
|
"learning_rate": 8.440625000000001e-05, |
|
"loss": 0.0005, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 11.5625, |
|
"eval_accuracy": 0.97890625, |
|
"eval_loss": 0.10886295884847641, |
|
"eval_runtime": 6.8932, |
|
"eval_samples_per_second": 185.689, |
|
"eval_steps_per_second": 23.211, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 11.59375, |
|
"grad_norm": 0.005474507808685303, |
|
"learning_rate": 8.409375000000001e-05, |
|
"loss": 0.0004, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 11.625, |
|
"grad_norm": 0.003425732720643282, |
|
"learning_rate": 8.378125e-05, |
|
"loss": 0.0005, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 11.65625, |
|
"grad_norm": 0.010524573735892773, |
|
"learning_rate": 8.346875e-05, |
|
"loss": 0.0004, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 11.6875, |
|
"grad_norm": 0.002859619678929448, |
|
"learning_rate": 8.315625e-05, |
|
"loss": 0.0004, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 11.71875, |
|
"grad_norm": 0.0019814125262200832, |
|
"learning_rate": 8.284375e-05, |
|
"loss": 0.0003, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 11.75, |
|
"grad_norm": 0.0019714718218892813, |
|
"learning_rate": 8.253125e-05, |
|
"loss": 0.0004, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 11.78125, |
|
"grad_norm": 0.0027500391006469727, |
|
"learning_rate": 8.221875e-05, |
|
"loss": 0.0004, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 11.8125, |
|
"grad_norm": 0.001707090763375163, |
|
"learning_rate": 8.190625e-05, |
|
"loss": 0.0004, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 11.84375, |
|
"grad_norm": 0.0019228786695748568, |
|
"learning_rate": 8.159375e-05, |
|
"loss": 0.0004, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 11.875, |
|
"grad_norm": 0.0019343816675245762, |
|
"learning_rate": 8.128125000000001e-05, |
|
"loss": 0.0004, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 11.875, |
|
"eval_accuracy": 0.978125, |
|
"eval_loss": 0.1097874864935875, |
|
"eval_runtime": 6.3764, |
|
"eval_samples_per_second": 200.739, |
|
"eval_steps_per_second": 25.092, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 11.90625, |
|
"grad_norm": 0.001760054030455649, |
|
"learning_rate": 8.096875e-05, |
|
"loss": 0.0003, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 11.9375, |
|
"grad_norm": 0.005902810953557491, |
|
"learning_rate": 8.065625e-05, |
|
"loss": 0.0024, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 11.96875, |
|
"grad_norm": 0.0037092138081789017, |
|
"learning_rate": 8.034375000000001e-05, |
|
"loss": 0.0464, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.016846604645252228, |
|
"learning_rate": 8.003125e-05, |
|
"loss": 0.0023, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 12.03125, |
|
"grad_norm": 0.0020532661583274603, |
|
"learning_rate": 7.971875000000001e-05, |
|
"loss": 0.0004, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 12.0625, |
|
"grad_norm": 0.005589602515101433, |
|
"learning_rate": 7.940625000000001e-05, |
|
"loss": 0.0003, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 12.09375, |
|
"grad_norm": 0.008196796290576458, |
|
"learning_rate": 7.909375e-05, |
|
"loss": 0.0025, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 12.125, |
|
"grad_norm": 0.7199010848999023, |
|
"learning_rate": 7.878125000000001e-05, |
|
"loss": 0.0313, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 12.15625, |
|
"grad_norm": 0.0015094159170985222, |
|
"learning_rate": 7.846875e-05, |
|
"loss": 0.0007, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 12.1875, |
|
"grad_norm": 0.019164152443408966, |
|
"learning_rate": 7.815625e-05, |
|
"loss": 0.0003, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 12.1875, |
|
"eval_accuracy": 0.9625, |
|
"eval_loss": 0.17790503799915314, |
|
"eval_runtime": 6.2858, |
|
"eval_samples_per_second": 203.634, |
|
"eval_steps_per_second": 25.454, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 12.21875, |
|
"grad_norm": 0.002245397539809346, |
|
"learning_rate": 7.784375e-05, |
|
"loss": 0.0009, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 12.25, |
|
"grad_norm": 0.0015030098147690296, |
|
"learning_rate": 7.753125e-05, |
|
"loss": 0.0003, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 12.28125, |
|
"grad_norm": 0.11134529858827591, |
|
"learning_rate": 7.721875e-05, |
|
"loss": 0.0005, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 12.3125, |
|
"grad_norm": 0.002319012302905321, |
|
"learning_rate": 7.690625e-05, |
|
"loss": 0.0332, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 12.34375, |
|
"grad_norm": 0.002300011459738016, |
|
"learning_rate": 7.659375000000002e-05, |
|
"loss": 0.0005, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 12.375, |
|
"grad_norm": 0.004005583468824625, |
|
"learning_rate": 7.628125e-05, |
|
"loss": 0.0006, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 12.40625, |
|
"grad_norm": 0.0016996270278468728, |
|
"learning_rate": 7.596875e-05, |
|
"loss": 0.0052, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 12.4375, |
|
"grad_norm": 0.0016149221919476986, |
|
"learning_rate": 7.565625000000001e-05, |
|
"loss": 0.0003, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 12.46875, |
|
"grad_norm": 0.004814577754586935, |
|
"learning_rate": 7.534375e-05, |
|
"loss": 0.0041, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"grad_norm": 0.010618665255606174, |
|
"learning_rate": 7.503125000000001e-05, |
|
"loss": 0.0163, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"eval_accuracy": 0.95390625, |
|
"eval_loss": 0.25004318356513977, |
|
"eval_runtime": 6.3222, |
|
"eval_samples_per_second": 202.461, |
|
"eval_steps_per_second": 25.308, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 12.53125, |
|
"grad_norm": 0.010156257078051567, |
|
"learning_rate": 7.471875000000001e-05, |
|
"loss": 0.0349, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 12.5625, |
|
"grad_norm": 0.015218588523566723, |
|
"learning_rate": 7.440625e-05, |
|
"loss": 0.1117, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 12.59375, |
|
"grad_norm": 0.00266676745377481, |
|
"learning_rate": 7.409375000000001e-05, |
|
"loss": 0.0004, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 12.625, |
|
"grad_norm": 0.001848011976107955, |
|
"learning_rate": 7.378125000000001e-05, |
|
"loss": 0.001, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 12.65625, |
|
"grad_norm": 0.014798123389482498, |
|
"learning_rate": 7.346875000000001e-05, |
|
"loss": 0.0005, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 12.6875, |
|
"grad_norm": 0.0023186809848994017, |
|
"learning_rate": 7.315625e-05, |
|
"loss": 0.0371, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 12.71875, |
|
"grad_norm": 0.001521400292403996, |
|
"learning_rate": 7.284375e-05, |
|
"loss": 0.0036, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"grad_norm": 0.0016448964597657323, |
|
"learning_rate": 7.253125e-05, |
|
"loss": 0.0003, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 12.78125, |
|
"grad_norm": 0.003130651544779539, |
|
"learning_rate": 7.221875e-05, |
|
"loss": 0.0004, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 12.8125, |
|
"grad_norm": 0.001429844181984663, |
|
"learning_rate": 7.190625e-05, |
|
"loss": 0.0003, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 12.8125, |
|
"eval_accuracy": 0.9734375, |
|
"eval_loss": 0.1555672138929367, |
|
"eval_runtime": 6.821, |
|
"eval_samples_per_second": 187.655, |
|
"eval_steps_per_second": 23.457, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 12.84375, |
|
"grad_norm": 0.0016585810808464885, |
|
"learning_rate": 7.159375e-05, |
|
"loss": 0.0097, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 12.875, |
|
"grad_norm": 0.0025756254326552153, |
|
"learning_rate": 7.128125e-05, |
|
"loss": 0.0003, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 12.90625, |
|
"grad_norm": 0.0014616530388593674, |
|
"learning_rate": 7.096875e-05, |
|
"loss": 0.0011, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 12.9375, |
|
"grad_norm": 0.001520119491033256, |
|
"learning_rate": 7.065625e-05, |
|
"loss": 0.0004, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 12.96875, |
|
"grad_norm": 0.0017162116710096598, |
|
"learning_rate": 7.034375000000001e-05, |
|
"loss": 0.0381, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 0.0027610217221081257, |
|
"learning_rate": 7.003125e-05, |
|
"loss": 0.0286, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 13.03125, |
|
"grad_norm": 0.0013741106959059834, |
|
"learning_rate": 6.971875e-05, |
|
"loss": 0.0099, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 13.0625, |
|
"grad_norm": 0.08995310217142105, |
|
"learning_rate": 6.940625000000001e-05, |
|
"loss": 0.0005, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 13.09375, |
|
"grad_norm": 0.002924903528764844, |
|
"learning_rate": 6.909375e-05, |
|
"loss": 0.0003, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 13.125, |
|
"grad_norm": 0.013759560883045197, |
|
"learning_rate": 6.878125000000001e-05, |
|
"loss": 0.0003, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 13.125, |
|
"eval_accuracy": 0.97421875, |
|
"eval_loss": 0.12048967182636261, |
|
"eval_runtime": 7.0166, |
|
"eval_samples_per_second": 182.425, |
|
"eval_steps_per_second": 22.803, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 13.15625, |
|
"grad_norm": 1.506090760231018, |
|
"learning_rate": 6.846875000000001e-05, |
|
"loss": 0.001, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 13.1875, |
|
"grad_norm": 0.001895511755719781, |
|
"learning_rate": 6.815624999999999e-05, |
|
"loss": 0.0003, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 13.21875, |
|
"grad_norm": 0.006083915941417217, |
|
"learning_rate": 6.784375e-05, |
|
"loss": 0.0004, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 13.25, |
|
"grad_norm": 19.973133087158203, |
|
"learning_rate": 6.753125e-05, |
|
"loss": 0.0203, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 13.28125, |
|
"grad_norm": 0.0028971272986382246, |
|
"learning_rate": 6.721875e-05, |
|
"loss": 0.0003, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 13.3125, |
|
"grad_norm": 0.022618748247623444, |
|
"learning_rate": 6.690625e-05, |
|
"loss": 0.0003, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 13.34375, |
|
"grad_norm": 0.001784573425538838, |
|
"learning_rate": 6.659375e-05, |
|
"loss": 0.0004, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 13.375, |
|
"grad_norm": 0.004388020373880863, |
|
"learning_rate": 6.628125e-05, |
|
"loss": 0.0003, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 13.40625, |
|
"grad_norm": 0.0013094112509861588, |
|
"learning_rate": 6.596875e-05, |
|
"loss": 0.0003, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 13.4375, |
|
"grad_norm": 0.001405878458172083, |
|
"learning_rate": 6.565625000000001e-05, |
|
"loss": 0.0002, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 13.4375, |
|
"eval_accuracy": 0.971875, |
|
"eval_loss": 0.15426388382911682, |
|
"eval_runtime": 6.7768, |
|
"eval_samples_per_second": 188.88, |
|
"eval_steps_per_second": 23.61, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 13.46875, |
|
"grad_norm": 0.001250360975973308, |
|
"learning_rate": 6.534375e-05, |
|
"loss": 0.0003, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"grad_norm": 0.0014933838974684477, |
|
"learning_rate": 6.503125e-05, |
|
"loss": 0.0003, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 13.53125, |
|
"grad_norm": 0.0013753235107287765, |
|
"learning_rate": 6.471875000000001e-05, |
|
"loss": 0.0003, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 13.5625, |
|
"grad_norm": 0.0015354104107245803, |
|
"learning_rate": 6.440625e-05, |
|
"loss": 0.0004, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 13.59375, |
|
"grad_norm": 0.0018886132165789604, |
|
"learning_rate": 6.409375000000001e-05, |
|
"loss": 0.0145, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 13.625, |
|
"grad_norm": 0.0012300664093345404, |
|
"learning_rate": 6.378125000000001e-05, |
|
"loss": 0.0003, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 13.65625, |
|
"grad_norm": 0.0015221609501168132, |
|
"learning_rate": 6.346875e-05, |
|
"loss": 0.0248, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 13.6875, |
|
"grad_norm": 0.0017438657814636827, |
|
"learning_rate": 6.315625000000001e-05, |
|
"loss": 0.0003, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 13.71875, |
|
"grad_norm": 0.0020069123711436987, |
|
"learning_rate": 6.284375e-05, |
|
"loss": 0.0002, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 13.75, |
|
"grad_norm": 0.0013785591581836343, |
|
"learning_rate": 6.253125e-05, |
|
"loss": 0.0002, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 13.75, |
|
"eval_accuracy": 0.975, |
|
"eval_loss": 0.15483084321022034, |
|
"eval_runtime": 6.2696, |
|
"eval_samples_per_second": 204.159, |
|
"eval_steps_per_second": 25.52, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 13.78125, |
|
"grad_norm": 1.8585741519927979, |
|
"learning_rate": 6.221875e-05, |
|
"loss": 0.0007, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 13.8125, |
|
"grad_norm": 0.0011989879421889782, |
|
"learning_rate": 6.190625e-05, |
|
"loss": 0.0003, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 13.84375, |
|
"grad_norm": 10.47256851196289, |
|
"learning_rate": 6.159375e-05, |
|
"loss": 0.0275, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 13.875, |
|
"grad_norm": 1.398241639137268, |
|
"learning_rate": 6.128125e-05, |
|
"loss": 0.0011, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 13.90625, |
|
"grad_norm": 0.0019299676641821861, |
|
"learning_rate": 6.096875000000001e-05, |
|
"loss": 0.0002, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 13.9375, |
|
"grad_norm": 0.0016532372683286667, |
|
"learning_rate": 6.065625e-05, |
|
"loss": 0.0003, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 13.96875, |
|
"grad_norm": 0.002245826181024313, |
|
"learning_rate": 6.034375e-05, |
|
"loss": 0.0259, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.002995037008076906, |
|
"learning_rate": 6.0031250000000006e-05, |
|
"loss": 0.0003, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 14.03125, |
|
"grad_norm": 0.0015550617827102542, |
|
"learning_rate": 5.971875e-05, |
|
"loss": 0.0002, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 14.0625, |
|
"grad_norm": 0.0018337038345634937, |
|
"learning_rate": 5.940625000000001e-05, |
|
"loss": 0.0003, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 14.0625, |
|
"eval_accuracy": 0.975, |
|
"eval_loss": 0.14965741336345673, |
|
"eval_runtime": 6.9272, |
|
"eval_samples_per_second": 184.78, |
|
"eval_steps_per_second": 23.097, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 14.09375, |
|
"grad_norm": 0.0013705631718039513, |
|
"learning_rate": 5.909375e-05, |
|
"loss": 0.0002, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 14.125, |
|
"grad_norm": 0.0015554011333733797, |
|
"learning_rate": 5.8781249999999996e-05, |
|
"loss": 0.0003, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 14.15625, |
|
"grad_norm": 0.0012009346392005682, |
|
"learning_rate": 5.846875000000001e-05, |
|
"loss": 0.0002, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 14.1875, |
|
"grad_norm": 0.0011792039731517434, |
|
"learning_rate": 5.815625e-05, |
|
"loss": 0.0002, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 14.21875, |
|
"grad_norm": 9.245038032531738, |
|
"learning_rate": 5.784375000000001e-05, |
|
"loss": 0.0038, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 0.0013690210180357099, |
|
"learning_rate": 5.7531250000000006e-05, |
|
"loss": 0.0003, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 14.28125, |
|
"grad_norm": 0.0013083881931379437, |
|
"learning_rate": 5.721875e-05, |
|
"loss": 0.0012, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 14.3125, |
|
"grad_norm": 34.58968734741211, |
|
"learning_rate": 5.6906250000000004e-05, |
|
"loss": 0.02, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 14.34375, |
|
"grad_norm": 0.0016999391373246908, |
|
"learning_rate": 5.6593750000000003e-05, |
|
"loss": 0.0002, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 14.375, |
|
"grad_norm": 0.0012809026520699263, |
|
"learning_rate": 5.628125000000001e-05, |
|
"loss": 0.0002, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 14.375, |
|
"eval_accuracy": 0.9640625, |
|
"eval_loss": 0.23174042999744415, |
|
"eval_runtime": 6.8846, |
|
"eval_samples_per_second": 185.923, |
|
"eval_steps_per_second": 23.24, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 14.40625, |
|
"grad_norm": 0.001300526550039649, |
|
"learning_rate": 5.596875e-05, |
|
"loss": 0.0006, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 14.4375, |
|
"grad_norm": 0.0010748986387625337, |
|
"learning_rate": 5.565625e-05, |
|
"loss": 0.0002, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 14.46875, |
|
"grad_norm": 0.0012471479130908847, |
|
"learning_rate": 5.534375000000001e-05, |
|
"loss": 0.0012, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 14.5, |
|
"grad_norm": 0.003839879296720028, |
|
"learning_rate": 5.503125e-05, |
|
"loss": 0.0002, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 14.53125, |
|
"grad_norm": 0.0013317788252606988, |
|
"learning_rate": 5.4718750000000005e-05, |
|
"loss": 0.0003, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 14.5625, |
|
"grad_norm": 0.001501008402556181, |
|
"learning_rate": 5.4406250000000004e-05, |
|
"loss": 0.0193, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 14.59375, |
|
"grad_norm": 0.0018197267781943083, |
|
"learning_rate": 5.409375e-05, |
|
"loss": 0.0035, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 14.625, |
|
"grad_norm": 0.0025837391149252653, |
|
"learning_rate": 5.378125e-05, |
|
"loss": 0.0005, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 14.65625, |
|
"grad_norm": 0.0016741983126848936, |
|
"learning_rate": 5.346875e-05, |
|
"loss": 0.0004, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 14.6875, |
|
"grad_norm": 0.036828938871622086, |
|
"learning_rate": 5.315625000000001e-05, |
|
"loss": 0.0003, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 14.6875, |
|
"eval_accuracy": 0.978125, |
|
"eval_loss": 0.14183716475963593, |
|
"eval_runtime": 6.7893, |
|
"eval_samples_per_second": 188.532, |
|
"eval_steps_per_second": 23.566, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 14.71875, |
|
"grad_norm": 0.0009561299229972064, |
|
"learning_rate": 5.284375e-05, |
|
"loss": 0.0002, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 14.75, |
|
"grad_norm": 0.001365609117783606, |
|
"learning_rate": 5.253125e-05, |
|
"loss": 0.0002, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 14.78125, |
|
"grad_norm": 0.0010248490143567324, |
|
"learning_rate": 5.2218750000000006e-05, |
|
"loss": 0.0002, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 14.8125, |
|
"grad_norm": 0.001031559775583446, |
|
"learning_rate": 5.190625e-05, |
|
"loss": 0.0002, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 14.84375, |
|
"grad_norm": 0.001064546755515039, |
|
"learning_rate": 5.159375000000001e-05, |
|
"loss": 0.0002, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 14.875, |
|
"grad_norm": 0.0011686970246955752, |
|
"learning_rate": 5.128125e-05, |
|
"loss": 0.0002, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 14.90625, |
|
"grad_norm": 0.0009815421653911471, |
|
"learning_rate": 5.0968749999999995e-05, |
|
"loss": 0.0002, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 14.9375, |
|
"grad_norm": 0.05007686838507652, |
|
"learning_rate": 5.065625000000001e-05, |
|
"loss": 0.0004, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 14.96875, |
|
"grad_norm": 0.001136138685978949, |
|
"learning_rate": 5.034375e-05, |
|
"loss": 0.0308, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.0011941984994336963, |
|
"learning_rate": 5.0031250000000007e-05, |
|
"loss": 0.0002, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.9734375, |
|
"eval_loss": 0.15367402136325836, |
|
"eval_runtime": 6.8694, |
|
"eval_samples_per_second": 186.333, |
|
"eval_steps_per_second": 23.292, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 15.03125, |
|
"grad_norm": 0.0012780999531969428, |
|
"learning_rate": 4.9718750000000006e-05, |
|
"loss": 0.0003, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 15.0625, |
|
"grad_norm": 0.0019433508859947324, |
|
"learning_rate": 4.9406250000000005e-05, |
|
"loss": 0.0002, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 15.09375, |
|
"grad_norm": 0.0011221400927752256, |
|
"learning_rate": 4.9093750000000004e-05, |
|
"loss": 0.0002, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 15.125, |
|
"grad_norm": 0.0010336939012631774, |
|
"learning_rate": 4.878125e-05, |
|
"loss": 0.0002, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 15.15625, |
|
"grad_norm": 0.11716494709253311, |
|
"learning_rate": 4.846875e-05, |
|
"loss": 0.0002, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 15.1875, |
|
"grad_norm": 0.0021654649171978235, |
|
"learning_rate": 4.815625e-05, |
|
"loss": 0.0002, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 15.21875, |
|
"grad_norm": 0.0010919362539425492, |
|
"learning_rate": 4.784375e-05, |
|
"loss": 0.0002, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 15.25, |
|
"grad_norm": 0.00111663737334311, |
|
"learning_rate": 4.753125000000001e-05, |
|
"loss": 0.0002, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 15.28125, |
|
"grad_norm": 0.0014220753218978643, |
|
"learning_rate": 4.721875e-05, |
|
"loss": 0.0002, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 15.3125, |
|
"grad_norm": 0.001022492302581668, |
|
"learning_rate": 4.690625e-05, |
|
"loss": 0.0002, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 15.3125, |
|
"eval_accuracy": 0.978125, |
|
"eval_loss": 0.14259929955005646, |
|
"eval_runtime": 6.2156, |
|
"eval_samples_per_second": 205.935, |
|
"eval_steps_per_second": 25.742, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 15.34375, |
|
"grad_norm": 0.001057266490533948, |
|
"learning_rate": 4.6593750000000004e-05, |
|
"loss": 0.0002, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 15.375, |
|
"grad_norm": 0.001144145498983562, |
|
"learning_rate": 4.6281250000000003e-05, |
|
"loss": 0.0002, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 15.40625, |
|
"grad_norm": 0.0009692934690974653, |
|
"learning_rate": 4.596875e-05, |
|
"loss": 0.0002, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 15.4375, |
|
"grad_norm": 0.000945160398259759, |
|
"learning_rate": 4.565625e-05, |
|
"loss": 0.0002, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 15.46875, |
|
"grad_norm": 0.0009887829655781388, |
|
"learning_rate": 4.534375e-05, |
|
"loss": 0.0002, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 15.5, |
|
"grad_norm": 0.002527805743739009, |
|
"learning_rate": 4.503125e-05, |
|
"loss": 0.0002, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 15.53125, |
|
"grad_norm": 0.0008647911017760634, |
|
"learning_rate": 4.4718750000000006e-05, |
|
"loss": 0.0002, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 15.5625, |
|
"grad_norm": 0.0011423344258219004, |
|
"learning_rate": 4.4406250000000005e-05, |
|
"loss": 0.0002, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 15.59375, |
|
"grad_norm": 0.0022281960118561983, |
|
"learning_rate": 4.409375e-05, |
|
"loss": 0.0002, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 15.625, |
|
"grad_norm": 0.0011202392634004354, |
|
"learning_rate": 4.3781250000000004e-05, |
|
"loss": 0.0002, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 15.625, |
|
"eval_accuracy": 0.98203125, |
|
"eval_loss": 0.12530331313610077, |
|
"eval_runtime": 6.8051, |
|
"eval_samples_per_second": 188.096, |
|
"eval_steps_per_second": 23.512, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 15.65625, |
|
"grad_norm": 0.0008281685295514762, |
|
"learning_rate": 4.346875e-05, |
|
"loss": 0.0002, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 15.6875, |
|
"grad_norm": 0.001133206533268094, |
|
"learning_rate": 4.315625e-05, |
|
"loss": 0.0002, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 15.71875, |
|
"grad_norm": 0.000926899432670325, |
|
"learning_rate": 4.284375000000001e-05, |
|
"loss": 0.0002, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 15.75, |
|
"grad_norm": 0.0009977484587579966, |
|
"learning_rate": 4.253125e-05, |
|
"loss": 0.0006, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 15.78125, |
|
"grad_norm": 0.0011439846130087972, |
|
"learning_rate": 4.221875e-05, |
|
"loss": 0.0428, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 15.8125, |
|
"grad_norm": 0.0012359012616798282, |
|
"learning_rate": 4.1906250000000006e-05, |
|
"loss": 0.0002, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 15.84375, |
|
"grad_norm": 0.0009622338111512363, |
|
"learning_rate": 4.1593750000000005e-05, |
|
"loss": 0.0002, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 15.875, |
|
"grad_norm": 0.0010638950625434518, |
|
"learning_rate": 4.1281250000000004e-05, |
|
"loss": 0.0002, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 15.90625, |
|
"grad_norm": 0.0011404824908822775, |
|
"learning_rate": 4.096875e-05, |
|
"loss": 0.0002, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 15.9375, |
|
"grad_norm": 0.0012403588043525815, |
|
"learning_rate": 4.065625e-05, |
|
"loss": 0.0002, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 15.9375, |
|
"eval_accuracy": 0.98359375, |
|
"eval_loss": 0.1128150224685669, |
|
"eval_runtime": 6.2783, |
|
"eval_samples_per_second": 203.878, |
|
"eval_steps_per_second": 25.485, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 15.96875, |
|
"grad_norm": 0.001123543013818562, |
|
"learning_rate": 4.034375e-05, |
|
"loss": 0.0002, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.001036747358739376, |
|
"learning_rate": 4.003125e-05, |
|
"loss": 0.0002, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 16.03125, |
|
"grad_norm": 0.0013284431770443916, |
|
"learning_rate": 3.9718750000000007e-05, |
|
"loss": 0.0002, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 16.0625, |
|
"grad_norm": 0.0008586233016103506, |
|
"learning_rate": 3.940625e-05, |
|
"loss": 0.0002, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 16.09375, |
|
"grad_norm": 0.0012248932616785169, |
|
"learning_rate": 3.909375e-05, |
|
"loss": 0.0002, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 16.125, |
|
"grad_norm": 0.006662206724286079, |
|
"learning_rate": 3.8781250000000004e-05, |
|
"loss": 0.0002, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 16.15625, |
|
"grad_norm": 0.0011745645897462964, |
|
"learning_rate": 3.846875e-05, |
|
"loss": 0.0002, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 16.1875, |
|
"grad_norm": 0.0010919731575995684, |
|
"learning_rate": 3.815625e-05, |
|
"loss": 0.0002, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 16.21875, |
|
"grad_norm": 0.0010604038834571838, |
|
"learning_rate": 3.784375e-05, |
|
"loss": 0.0002, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 16.25, |
|
"grad_norm": 0.0011869663139805198, |
|
"learning_rate": 3.753125e-05, |
|
"loss": 0.0002, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 16.25, |
|
"eval_accuracy": 0.98046875, |
|
"eval_loss": 0.1246190294623375, |
|
"eval_runtime": 6.8551, |
|
"eval_samples_per_second": 186.722, |
|
"eval_steps_per_second": 23.34, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 16.28125, |
|
"grad_norm": 0.0010713781230151653, |
|
"learning_rate": 3.721875e-05, |
|
"loss": 0.0002, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 16.3125, |
|
"grad_norm": 0.0007901139324530959, |
|
"learning_rate": 3.6906250000000006e-05, |
|
"loss": 0.0002, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 16.34375, |
|
"grad_norm": 0.001095872139558196, |
|
"learning_rate": 3.6593750000000005e-05, |
|
"loss": 0.0002, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 16.375, |
|
"grad_norm": 0.0010994484182447195, |
|
"learning_rate": 3.628125e-05, |
|
"loss": 0.0002, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 16.40625, |
|
"grad_norm": 0.0017003176035359502, |
|
"learning_rate": 3.5968750000000004e-05, |
|
"loss": 0.0002, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 16.4375, |
|
"grad_norm": 0.0007925952086225152, |
|
"learning_rate": 3.565625e-05, |
|
"loss": 0.0002, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 16.46875, |
|
"grad_norm": 0.0008510766783729196, |
|
"learning_rate": 3.534375e-05, |
|
"loss": 0.0002, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 16.5, |
|
"grad_norm": 0.18106159567832947, |
|
"learning_rate": 3.503125e-05, |
|
"loss": 0.0002, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 16.53125, |
|
"grad_norm": 0.0008954983204603195, |
|
"learning_rate": 3.471875e-05, |
|
"loss": 0.0002, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 16.5625, |
|
"grad_norm": 0.0011244250927120447, |
|
"learning_rate": 3.440625e-05, |
|
"loss": 0.0002, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 16.5625, |
|
"eval_accuracy": 0.9828125, |
|
"eval_loss": 0.11365531384944916, |
|
"eval_runtime": 6.8269, |
|
"eval_samples_per_second": 187.495, |
|
"eval_steps_per_second": 23.437, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 16.59375, |
|
"grad_norm": 0.0009858476696535945, |
|
"learning_rate": 3.4093750000000005e-05, |
|
"loss": 0.0002, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 16.625, |
|
"grad_norm": 0.0012397761456668377, |
|
"learning_rate": 3.3781250000000005e-05, |
|
"loss": 0.0002, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 16.65625, |
|
"grad_norm": 0.0008871417958289385, |
|
"learning_rate": 3.3468750000000004e-05, |
|
"loss": 0.0002, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 16.6875, |
|
"grad_norm": 0.0007771385135129094, |
|
"learning_rate": 3.315625e-05, |
|
"loss": 0.0002, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 16.71875, |
|
"grad_norm": 0.0007905489183031023, |
|
"learning_rate": 3.284375e-05, |
|
"loss": 0.0002, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 16.75, |
|
"grad_norm": 0.0009036400006152689, |
|
"learning_rate": 3.253125e-05, |
|
"loss": 0.0001, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 16.78125, |
|
"grad_norm": 0.0007690058555454016, |
|
"learning_rate": 3.221875e-05, |
|
"loss": 0.0002, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 16.8125, |
|
"grad_norm": 0.0011280475882813334, |
|
"learning_rate": 3.1906250000000006e-05, |
|
"loss": 0.0002, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 16.84375, |
|
"grad_norm": 0.000766371435020119, |
|
"learning_rate": 3.159375e-05, |
|
"loss": 0.0002, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 16.875, |
|
"grad_norm": 0.005487964954227209, |
|
"learning_rate": 3.128125e-05, |
|
"loss": 0.0001, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 16.875, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11014194786548615, |
|
"eval_runtime": 6.2479, |
|
"eval_samples_per_second": 204.867, |
|
"eval_steps_per_second": 25.608, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 16.90625, |
|
"grad_norm": 0.0009019103599712253, |
|
"learning_rate": 3.0968750000000004e-05, |
|
"loss": 0.0002, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 16.9375, |
|
"grad_norm": 0.0008425983251072466, |
|
"learning_rate": 3.065625e-05, |
|
"loss": 0.0001, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 16.96875, |
|
"grad_norm": 0.000717274087946862, |
|
"learning_rate": 3.0343750000000006e-05, |
|
"loss": 0.0002, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.0007335466798394918, |
|
"learning_rate": 3.0031249999999998e-05, |
|
"loss": 0.0001, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 17.03125, |
|
"grad_norm": 0.0015981667675077915, |
|
"learning_rate": 2.971875e-05, |
|
"loss": 0.0001, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 17.0625, |
|
"grad_norm": 0.000765695353038609, |
|
"learning_rate": 2.9406250000000003e-05, |
|
"loss": 0.0002, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 17.09375, |
|
"grad_norm": 0.0017417181516066194, |
|
"learning_rate": 2.9093750000000002e-05, |
|
"loss": 0.0002, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 17.125, |
|
"grad_norm": 0.0010076581966131926, |
|
"learning_rate": 2.8781250000000005e-05, |
|
"loss": 0.0001, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 17.15625, |
|
"grad_norm": 0.0008663997869007289, |
|
"learning_rate": 2.846875e-05, |
|
"loss": 0.0002, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 17.1875, |
|
"grad_norm": 0.0007040807977318764, |
|
"learning_rate": 2.815625e-05, |
|
"loss": 0.0001, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 17.1875, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11123112589120865, |
|
"eval_runtime": 7.2338, |
|
"eval_samples_per_second": 176.947, |
|
"eval_steps_per_second": 22.118, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 17.21875, |
|
"grad_norm": 0.0008027940057218075, |
|
"learning_rate": 2.7843750000000003e-05, |
|
"loss": 0.0001, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 17.25, |
|
"grad_norm": 0.0009467356721870601, |
|
"learning_rate": 2.7531250000000002e-05, |
|
"loss": 0.0002, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 17.28125, |
|
"grad_norm": 0.0007668856414966285, |
|
"learning_rate": 2.7218750000000004e-05, |
|
"loss": 0.0002, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 17.3125, |
|
"grad_norm": 0.0011194231919944286, |
|
"learning_rate": 2.690625e-05, |
|
"loss": 0.0002, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 17.34375, |
|
"grad_norm": 0.0010725741740316153, |
|
"learning_rate": 2.659375e-05, |
|
"loss": 0.0002, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 17.375, |
|
"grad_norm": 0.0013631158508360386, |
|
"learning_rate": 2.6281250000000002e-05, |
|
"loss": 0.0002, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 17.40625, |
|
"grad_norm": 0.0008963380823843181, |
|
"learning_rate": 2.5968750000000004e-05, |
|
"loss": 0.0001, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 17.4375, |
|
"grad_norm": 0.0008231993415392935, |
|
"learning_rate": 2.5656250000000004e-05, |
|
"loss": 0.0001, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 17.46875, |
|
"grad_norm": 0.0007719449349679053, |
|
"learning_rate": 2.534375e-05, |
|
"loss": 0.0002, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 17.5, |
|
"grad_norm": 0.0008315558661706746, |
|
"learning_rate": 2.5031250000000002e-05, |
|
"loss": 0.0001, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 17.5, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11211228370666504, |
|
"eval_runtime": 6.9482, |
|
"eval_samples_per_second": 184.221, |
|
"eval_steps_per_second": 23.028, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 17.53125, |
|
"grad_norm": 0.0007284819148480892, |
|
"learning_rate": 2.471875e-05, |
|
"loss": 0.0001, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 17.5625, |
|
"grad_norm": 0.0006902694585733116, |
|
"learning_rate": 2.440625e-05, |
|
"loss": 0.0001, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 17.59375, |
|
"grad_norm": 0.0007927055121399462, |
|
"learning_rate": 2.409375e-05, |
|
"loss": 0.0001, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 17.625, |
|
"grad_norm": 0.0008816330228000879, |
|
"learning_rate": 2.3781250000000002e-05, |
|
"loss": 0.0001, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 17.65625, |
|
"grad_norm": 0.00072521495167166, |
|
"learning_rate": 2.346875e-05, |
|
"loss": 0.0001, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 17.6875, |
|
"grad_norm": 0.0009029438951984048, |
|
"learning_rate": 2.315625e-05, |
|
"loss": 0.0001, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 17.71875, |
|
"grad_norm": 0.0007223158027045429, |
|
"learning_rate": 2.284375e-05, |
|
"loss": 0.0001, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"grad_norm": 0.0007876714807935059, |
|
"learning_rate": 2.2531250000000002e-05, |
|
"loss": 0.0001, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 17.78125, |
|
"grad_norm": 0.0010673877550289035, |
|
"learning_rate": 2.221875e-05, |
|
"loss": 0.0001, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 17.8125, |
|
"grad_norm": 0.001492728479206562, |
|
"learning_rate": 2.190625e-05, |
|
"loss": 0.0001, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 17.8125, |
|
"eval_accuracy": 0.98359375, |
|
"eval_loss": 0.11293692886829376, |
|
"eval_runtime": 6.3413, |
|
"eval_samples_per_second": 201.851, |
|
"eval_steps_per_second": 25.231, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 17.84375, |
|
"grad_norm": 0.0008830283186398447, |
|
"learning_rate": 2.1593750000000003e-05, |
|
"loss": 0.0001, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 17.875, |
|
"grad_norm": 0.0006816980894654989, |
|
"learning_rate": 2.128125e-05, |
|
"loss": 0.0001, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 17.90625, |
|
"grad_norm": 0.0008512111380696297, |
|
"learning_rate": 2.096875e-05, |
|
"loss": 0.0001, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 17.9375, |
|
"grad_norm": 0.0008296071318909526, |
|
"learning_rate": 2.065625e-05, |
|
"loss": 0.0001, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 17.96875, |
|
"grad_norm": 0.0007265993626788259, |
|
"learning_rate": 2.034375e-05, |
|
"loss": 0.0001, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.001100354827940464, |
|
"learning_rate": 2.0031250000000002e-05, |
|
"loss": 0.0001, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 18.03125, |
|
"grad_norm": 0.0008238213486038148, |
|
"learning_rate": 1.9718749999999998e-05, |
|
"loss": 0.0001, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 18.0625, |
|
"grad_norm": 0.0010971089359372854, |
|
"learning_rate": 1.940625e-05, |
|
"loss": 0.0002, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 18.09375, |
|
"grad_norm": 0.0007329233339987695, |
|
"learning_rate": 1.9093750000000003e-05, |
|
"loss": 0.0002, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 18.125, |
|
"grad_norm": 0.0008615129627287388, |
|
"learning_rate": 1.878125e-05, |
|
"loss": 0.0001, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 18.125, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11349210888147354, |
|
"eval_runtime": 6.4474, |
|
"eval_samples_per_second": 198.53, |
|
"eval_steps_per_second": 24.816, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 18.15625, |
|
"grad_norm": 0.0007490671123377979, |
|
"learning_rate": 1.846875e-05, |
|
"loss": 0.0001, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 18.1875, |
|
"grad_norm": 0.0011704419739544392, |
|
"learning_rate": 1.815625e-05, |
|
"loss": 0.0001, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 18.21875, |
|
"grad_norm": 0.0007434096769429743, |
|
"learning_rate": 1.784375e-05, |
|
"loss": 0.0002, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 18.25, |
|
"grad_norm": 0.0007305287872441113, |
|
"learning_rate": 1.7531250000000003e-05, |
|
"loss": 0.0001, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 18.28125, |
|
"grad_norm": 0.0011461430694907904, |
|
"learning_rate": 1.7218750000000002e-05, |
|
"loss": 0.0001, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 18.3125, |
|
"grad_norm": 0.001021283445879817, |
|
"learning_rate": 1.690625e-05, |
|
"loss": 0.0001, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 18.34375, |
|
"grad_norm": 0.0006852642400190234, |
|
"learning_rate": 1.659375e-05, |
|
"loss": 0.0001, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 18.375, |
|
"grad_norm": 0.0007609901949763298, |
|
"learning_rate": 1.628125e-05, |
|
"loss": 0.0001, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 18.40625, |
|
"grad_norm": 0.0038062427192926407, |
|
"learning_rate": 1.5968750000000002e-05, |
|
"loss": 0.0001, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 18.4375, |
|
"grad_norm": 0.0007008612737990916, |
|
"learning_rate": 1.565625e-05, |
|
"loss": 0.0001, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 18.4375, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11404496431350708, |
|
"eval_runtime": 6.2975, |
|
"eval_samples_per_second": 203.256, |
|
"eval_steps_per_second": 25.407, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 18.46875, |
|
"grad_norm": 0.001011393149383366, |
|
"learning_rate": 1.534375e-05, |
|
"loss": 0.0001, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"grad_norm": 0.0007079701754264534, |
|
"learning_rate": 1.503125e-05, |
|
"loss": 0.0001, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 18.53125, |
|
"grad_norm": 0.001830734545364976, |
|
"learning_rate": 1.471875e-05, |
|
"loss": 0.0001, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 18.5625, |
|
"grad_norm": 0.0007542877574451268, |
|
"learning_rate": 1.4406250000000001e-05, |
|
"loss": 0.0001, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 18.59375, |
|
"grad_norm": 0.0006455339025706053, |
|
"learning_rate": 1.409375e-05, |
|
"loss": 0.0001, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 18.625, |
|
"grad_norm": 0.0009898885618895292, |
|
"learning_rate": 1.3781250000000001e-05, |
|
"loss": 0.0001, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 18.65625, |
|
"grad_norm": 0.0009699007496237755, |
|
"learning_rate": 1.3468749999999999e-05, |
|
"loss": 0.0001, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 18.6875, |
|
"grad_norm": 0.0007313139503821731, |
|
"learning_rate": 1.3156250000000001e-05, |
|
"loss": 0.0001, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 18.71875, |
|
"grad_norm": 0.0010632864432409406, |
|
"learning_rate": 1.2843750000000002e-05, |
|
"loss": 0.0001, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 18.75, |
|
"grad_norm": 0.000874130695592612, |
|
"learning_rate": 1.253125e-05, |
|
"loss": 0.0001, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 18.75, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11457158625125885, |
|
"eval_runtime": 6.7696, |
|
"eval_samples_per_second": 189.082, |
|
"eval_steps_per_second": 23.635, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 18.78125, |
|
"grad_norm": 0.0007607897859998047, |
|
"learning_rate": 1.221875e-05, |
|
"loss": 0.0001, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 18.8125, |
|
"grad_norm": 0.0006213558372110128, |
|
"learning_rate": 1.1906250000000001e-05, |
|
"loss": 0.0001, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 18.84375, |
|
"grad_norm": 0.0009066364727914333, |
|
"learning_rate": 1.159375e-05, |
|
"loss": 0.0001, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 18.875, |
|
"grad_norm": 0.0008698303136043251, |
|
"learning_rate": 1.128125e-05, |
|
"loss": 0.0001, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 18.90625, |
|
"grad_norm": 0.0008234538836404681, |
|
"learning_rate": 1.096875e-05, |
|
"loss": 0.0001, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 18.9375, |
|
"grad_norm": 0.0009270088630728424, |
|
"learning_rate": 1.0656250000000002e-05, |
|
"loss": 0.0001, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 18.96875, |
|
"grad_norm": 0.0007937824120745063, |
|
"learning_rate": 1.034375e-05, |
|
"loss": 0.0001, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.0009614995797164738, |
|
"learning_rate": 1.003125e-05, |
|
"loss": 0.0001, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 19.03125, |
|
"grad_norm": 0.0006743675330653787, |
|
"learning_rate": 9.71875e-06, |
|
"loss": 0.0001, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 19.0625, |
|
"grad_norm": 0.0007780918967910111, |
|
"learning_rate": 9.40625e-06, |
|
"loss": 0.0001, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 19.0625, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11495751142501831, |
|
"eval_runtime": 6.295, |
|
"eval_samples_per_second": 203.337, |
|
"eval_steps_per_second": 25.417, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 19.09375, |
|
"grad_norm": 0.0010844800854101777, |
|
"learning_rate": 9.09375e-06, |
|
"loss": 0.0001, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 19.125, |
|
"grad_norm": 0.0007763968897052109, |
|
"learning_rate": 8.78125e-06, |
|
"loss": 0.0001, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 19.15625, |
|
"grad_norm": 0.0006313940975815058, |
|
"learning_rate": 8.468750000000001e-06, |
|
"loss": 0.0001, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 19.1875, |
|
"grad_norm": 0.0010176225332543254, |
|
"learning_rate": 8.15625e-06, |
|
"loss": 0.0001, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 19.21875, |
|
"grad_norm": 0.0006113633280619979, |
|
"learning_rate": 7.84375e-06, |
|
"loss": 0.0001, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"grad_norm": 0.0009647855767980218, |
|
"learning_rate": 7.531250000000001e-06, |
|
"loss": 0.0001, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 19.28125, |
|
"grad_norm": 0.0008165627950802445, |
|
"learning_rate": 7.21875e-06, |
|
"loss": 0.0001, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 19.3125, |
|
"grad_norm": 0.000816655985545367, |
|
"learning_rate": 6.90625e-06, |
|
"loss": 0.0001, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 19.34375, |
|
"grad_norm": 0.0008548317127861083, |
|
"learning_rate": 6.59375e-06, |
|
"loss": 0.0001, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 19.375, |
|
"grad_norm": 0.0006300067761912942, |
|
"learning_rate": 6.281249999999999e-06, |
|
"loss": 0.0001, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 19.375, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11527726799249649, |
|
"eval_runtime": 6.2032, |
|
"eval_samples_per_second": 206.344, |
|
"eval_steps_per_second": 25.793, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 19.40625, |
|
"grad_norm": 0.0009761661058291793, |
|
"learning_rate": 5.96875e-06, |
|
"loss": 0.0001, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 19.4375, |
|
"grad_norm": 0.0007255289820022881, |
|
"learning_rate": 5.65625e-06, |
|
"loss": 0.0001, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 19.46875, |
|
"grad_norm": 0.0006587054231204093, |
|
"learning_rate": 5.34375e-06, |
|
"loss": 0.0001, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 19.5, |
|
"grad_norm": 0.0009364929865114391, |
|
"learning_rate": 5.03125e-06, |
|
"loss": 0.0001, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 19.53125, |
|
"grad_norm": 0.0006768327439203858, |
|
"learning_rate": 4.71875e-06, |
|
"loss": 0.0001, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 19.5625, |
|
"grad_norm": 0.0015788966557011008, |
|
"learning_rate": 4.40625e-06, |
|
"loss": 0.0001, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 19.59375, |
|
"grad_norm": 0.0006980461766943336, |
|
"learning_rate": 4.09375e-06, |
|
"loss": 0.0001, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 19.625, |
|
"grad_norm": 0.0008529416518285871, |
|
"learning_rate": 3.78125e-06, |
|
"loss": 0.0001, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 19.65625, |
|
"grad_norm": 0.0008394841570407152, |
|
"learning_rate": 3.4687500000000005e-06, |
|
"loss": 0.0001, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 19.6875, |
|
"grad_norm": 0.0008695558062754571, |
|
"learning_rate": 3.15625e-06, |
|
"loss": 0.0001, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 19.6875, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11545456945896149, |
|
"eval_runtime": 6.236, |
|
"eval_samples_per_second": 205.259, |
|
"eval_steps_per_second": 25.657, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 19.71875, |
|
"grad_norm": 0.0012889541685581207, |
|
"learning_rate": 2.84375e-06, |
|
"loss": 0.0001, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 19.75, |
|
"grad_norm": 0.0006377049721777439, |
|
"learning_rate": 2.53125e-06, |
|
"loss": 0.0001, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 19.78125, |
|
"grad_norm": 0.0007706707692705095, |
|
"learning_rate": 2.21875e-06, |
|
"loss": 0.0001, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 19.8125, |
|
"grad_norm": 0.0007259553531184793, |
|
"learning_rate": 1.90625e-06, |
|
"loss": 0.0001, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 19.84375, |
|
"grad_norm": 0.0009117970475926995, |
|
"learning_rate": 1.5937500000000002e-06, |
|
"loss": 0.0001, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 19.875, |
|
"grad_norm": 0.0008638539584353566, |
|
"learning_rate": 1.28125e-06, |
|
"loss": 0.0001, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 19.90625, |
|
"grad_norm": 0.0006554033607244492, |
|
"learning_rate": 9.6875e-07, |
|
"loss": 0.0001, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 19.9375, |
|
"grad_norm": 0.00083553371950984, |
|
"learning_rate": 6.5625e-07, |
|
"loss": 0.0001, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 19.96875, |
|
"grad_norm": 0.0006699099321849644, |
|
"learning_rate": 3.4375000000000004e-07, |
|
"loss": 0.0001, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.000951381167396903, |
|
"learning_rate": 3.1250000000000005e-08, |
|
"loss": 0.0001, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.11548350006341934, |
|
"eval_runtime": 6.596, |
|
"eval_samples_per_second": 194.058, |
|
"eval_steps_per_second": 24.257, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 6400, |
|
"total_flos": 7.935321977546342e+18, |
|
"train_loss": 0.10612523018964566, |
|
"train_runtime": 3045.4016, |
|
"train_samples_per_second": 33.624, |
|
"train_steps_per_second": 2.102 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 6400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.935321977546342e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|