{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.985096870342772,
  "eval_steps": 500,
  "global_step": 3350,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.029806259314456036,
      "grad_norm": 4.8203125,
      "learning_rate": 0.00019999560279257314,
      "loss": 3.0078,
      "step": 10
    },
    {
      "epoch": 0.05961251862891207,
      "grad_norm": 3.638671875,
      "learning_rate": 0.0001999824115570012,
      "loss": 3.2729,
      "step": 20
    },
    {
      "epoch": 0.08941877794336811,
      "grad_norm": 2.904296875,
      "learning_rate": 0.00019996042745337617,
      "loss": 2.9446,
      "step": 30
    },
    {
      "epoch": 0.11922503725782414,
      "grad_norm": 3.033203125,
      "learning_rate": 0.0001999296524150713,
      "loss": 2.8675,
      "step": 40
    },
    {
      "epoch": 0.14903129657228018,
      "grad_norm": 3.251953125,
      "learning_rate": 0.00019989008914857116,
      "loss": 2.7271,
      "step": 50
    },
    {
      "epoch": 0.17883755588673622,
      "grad_norm": 2.93359375,
      "learning_rate": 0.00019984174113323353,
      "loss": 2.6584,
      "step": 60
    },
    {
      "epoch": 0.20864381520119224,
      "grad_norm": 5.30078125,
      "learning_rate": 0.00019978461262098343,
      "loss": 2.6879,
      "step": 70
    },
    {
      "epoch": 0.23845007451564829,
      "grad_norm": 2.580078125,
      "learning_rate": 0.00019971870863593925,
      "loss": 2.708,
      "step": 80
    },
    {
      "epoch": 0.26825633383010433,
      "grad_norm": 4.54296875,
      "learning_rate": 0.00019964403497397084,
      "loss": 2.6389,
      "step": 90
    },
    {
      "epoch": 0.29806259314456035,
      "grad_norm": 3.798828125,
      "learning_rate": 0.00019956059820218982,
      "loss": 2.6708,
      "step": 100
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 2.888671875,
      "learning_rate": 0.00019946840565837203,
      "loss": 2.5963,
      "step": 110
    },
    {
      "epoch": 0.35767511177347244,
      "grad_norm": 4.51171875,
      "learning_rate": 0.00019936746545031223,
      "loss": 2.6241,
      "step": 120
    },
    {
      "epoch": 0.38748137108792846,
      "grad_norm": 4.5625,
      "learning_rate": 0.0001992577864551111,
      "loss": 2.6388,
      "step": 130
    },
    {
      "epoch": 0.4172876304023845,
      "grad_norm": 3.748046875,
      "learning_rate": 0.0001991393783183945,
      "loss": 2.5508,
      "step": 140
    },
    {
      "epoch": 0.44709388971684055,
      "grad_norm": 3.177734375,
      "learning_rate": 0.0001990122514534651,
      "loss": 2.6173,
      "step": 150
    },
    {
      "epoch": 0.47690014903129657,
      "grad_norm": 3.462890625,
      "learning_rate": 0.00019887641704038688,
      "loss": 2.5658,
      "step": 160
    },
    {
      "epoch": 0.5067064083457526,
      "grad_norm": 3.923828125,
      "learning_rate": 0.00019873188702500163,
      "loss": 2.5239,
      "step": 170
    },
    {
      "epoch": 0.5365126676602087,
      "grad_norm": 2.8984375,
      "learning_rate": 0.00019857867411787847,
      "loss": 2.6104,
      "step": 180
    },
    {
      "epoch": 0.5663189269746647,
      "grad_norm": 4.87109375,
      "learning_rate": 0.00019841679179319606,
      "loss": 2.5866,
      "step": 190
    },
    {
      "epoch": 0.5961251862891207,
      "grad_norm": 2.427734375,
      "learning_rate": 0.0001982462542875576,
      "loss": 2.5702,
      "step": 200
    },
    {
      "epoch": 0.6259314456035767,
      "grad_norm": 3.05078125,
      "learning_rate": 0.00019806707659873887,
      "loss": 2.5746,
      "step": 210
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 3.21875,
      "learning_rate": 0.0001978792744843691,
      "loss": 2.4715,
      "step": 220
    },
    {
      "epoch": 0.6855439642324889,
      "grad_norm": 4.15234375,
      "learning_rate": 0.00019768286446054532,
      "loss": 2.53,
      "step": 230
    },
    {
      "epoch": 0.7153502235469449,
      "grad_norm": 2.3203125,
      "learning_rate": 0.0001974778638003799,
      "loss": 2.4326,
      "step": 240
    },
    {
      "epoch": 0.7451564828614009,
      "grad_norm": 3.6015625,
      "learning_rate": 0.0001972642905324813,
      "loss": 2.4958,
      "step": 250
    },
    {
      "epoch": 0.7749627421758569,
      "grad_norm": 2.88671875,
      "learning_rate": 0.00019704216343936873,
      "loss": 2.5034,
      "step": 260
    },
    {
      "epoch": 0.8047690014903129,
      "grad_norm": 3.33984375,
      "learning_rate": 0.00019681150205582025,
      "loss": 2.5304,
      "step": 270
    },
    {
      "epoch": 0.834575260804769,
      "grad_norm": 4.29296875,
      "learning_rate": 0.00019657232666715486,
      "loss": 2.5214,
      "step": 280
    },
    {
      "epoch": 0.8643815201192251,
      "grad_norm": 3.17578125,
      "learning_rate": 0.00019632465830744846,
      "loss": 2.476,
      "step": 290
    },
    {
      "epoch": 0.8941877794336811,
      "grad_norm": 11.734375,
      "learning_rate": 0.000196068518757684,
      "loss": 2.5233,
      "step": 300
    },
    {
      "epoch": 0.9239940387481371,
      "grad_norm": 2.697265625,
      "learning_rate": 0.00019580393054383622,
      "loss": 2.4108,
      "step": 310
    },
    {
      "epoch": 0.9538002980625931,
      "grad_norm": 2.669921875,
      "learning_rate": 0.00019553091693489018,
      "loss": 2.5019,
      "step": 320
    },
    {
      "epoch": 0.9836065573770492,
      "grad_norm": 3.626953125,
      "learning_rate": 0.00019524950194079534,
      "loss": 2.4359,
      "step": 330
    },
    {
      "epoch": 1.0134128166915053,
      "grad_norm": 2.32421875,
      "learning_rate": 0.00019495971031035367,
      "loss": 2.3362,
      "step": 340
    },
    {
      "epoch": 1.0432190760059612,
      "grad_norm": 3.6015625,
      "learning_rate": 0.00019466156752904343,
      "loss": 2.2718,
      "step": 350
    },
    {
      "epoch": 1.0730253353204173,
      "grad_norm": 3.55078125,
      "learning_rate": 0.00019435509981677762,
      "loss": 2.2036,
      "step": 360
    },
    {
      "epoch": 1.1028315946348732,
      "grad_norm": 3.361328125,
      "learning_rate": 0.00019404033412559826,
      "loss": 2.1379,
      "step": 370
    },
    {
      "epoch": 1.1326378539493294,
      "grad_norm": 2.87890625,
      "learning_rate": 0.00019371729813730606,
      "loss": 2.2055,
      "step": 380
    },
    {
      "epoch": 1.1624441132637853,
      "grad_norm": 4.19921875,
      "learning_rate": 0.00019338602026102594,
      "loss": 2.223,
      "step": 390
    },
    {
      "epoch": 1.1922503725782414,
      "grad_norm": 7.328125,
      "learning_rate": 0.0001930465296307087,
      "loss": 2.1372,
      "step": 400
    },
    {
      "epoch": 1.2220566318926975,
      "grad_norm": 3.384765625,
      "learning_rate": 0.00019269885610256865,
      "loss": 2.1608,
      "step": 410
    },
    {
      "epoch": 1.2518628912071534,
      "grad_norm": 3.04296875,
      "learning_rate": 0.00019234303025245835,
      "loss": 2.1672,
      "step": 420
    },
    {
      "epoch": 1.2816691505216096,
      "grad_norm": 4.5078125,
      "learning_rate": 0.0001919790833731791,
      "loss": 2.1802,
      "step": 430
    },
    {
      "epoch": 1.3114754098360657,
      "grad_norm": 5.1171875,
      "learning_rate": 0.00019160704747172934,
      "loss": 2.2136,
      "step": 440
    },
    {
      "epoch": 1.3412816691505216,
      "grad_norm": 5.46875,
      "learning_rate": 0.00019122695526648968,
      "loss": 2.2758,
      "step": 450
    },
    {
      "epoch": 1.3710879284649775,
      "grad_norm": 3.951171875,
      "learning_rate": 0.00019083884018434547,
      "loss": 2.2679,
      "step": 460
    },
    {
      "epoch": 1.4008941877794336,
      "grad_norm": 2.701171875,
      "learning_rate": 0.00019044273635774705,
      "loss": 2.2514,
      "step": 470
    },
    {
      "epoch": 1.4307004470938898,
      "grad_norm": 2.97265625,
      "learning_rate": 0.00019003867862170832,
      "loss": 2.1568,
      "step": 480
    },
    {
      "epoch": 1.4605067064083457,
      "grad_norm": 3.541015625,
      "learning_rate": 0.00018962670251074275,
      "loss": 2.1824,
      "step": 490
    },
    {
      "epoch": 1.4903129657228018,
      "grad_norm": 3.330078125,
      "learning_rate": 0.00018920684425573865,
      "loss": 2.1767,
      "step": 500
    },
    {
      "epoch": 1.520119225037258,
      "grad_norm": 3.2265625,
      "learning_rate": 0.0001887791407807728,
      "loss": 2.2287,
      "step": 510
    },
    {
      "epoch": 1.5499254843517138,
      "grad_norm": 4.98828125,
      "learning_rate": 0.00018834362969986308,
      "loss": 2.2313,
      "step": 520
    },
    {
      "epoch": 1.5797317436661698,
      "grad_norm": 3.599609375,
      "learning_rate": 0.00018790034931366072,
      "loss": 2.0869,
      "step": 530
    },
    {
      "epoch": 1.6095380029806259,
      "grad_norm": 4.42578125,
      "learning_rate": 0.00018744933860608183,
      "loss": 2.1447,
      "step": 540
    },
    {
      "epoch": 1.639344262295082,
      "grad_norm": 8.7578125,
      "learning_rate": 0.00018699063724087904,
      "loss": 2.2302,
      "step": 550
    },
    {
      "epoch": 1.669150521609538,
      "grad_norm": 3.234375,
      "learning_rate": 0.0001865242855581534,
      "loss": 2.0925,
      "step": 560
    },
    {
      "epoch": 1.698956780923994,
      "grad_norm": 3.29296875,
      "learning_rate": 0.00018605032457080653,
      "loss": 2.2165,
      "step": 570
    },
    {
      "epoch": 1.7287630402384502,
      "grad_norm": 3.107421875,
      "learning_rate": 0.00018556879596093393,
      "loss": 2.1134,
      "step": 580
    },
    {
      "epoch": 1.758569299552906,
      "grad_norm": 3.1796875,
      "learning_rate": 0.00018507974207615917,
      "loss": 2.1852,
      "step": 590
    },
    {
      "epoch": 1.788375558867362,
      "grad_norm": 3.337890625,
      "learning_rate": 0.00018458320592590975,
      "loss": 2.1492,
      "step": 600
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 3.28515625,
      "learning_rate": 0.00018407923117763462,
      "loss": 2.1912,
      "step": 610
    },
    {
      "epoch": 1.8479880774962743,
      "grad_norm": 3.3046875,
      "learning_rate": 0.00018356786215296386,
      "loss": 2.1557,
      "step": 620
    },
    {
      "epoch": 1.8777943368107302,
      "grad_norm": 2.759765625,
      "learning_rate": 0.000183049143823811,
      "loss": 2.1776,
      "step": 630
    },
    {
      "epoch": 1.9076005961251863,
      "grad_norm": 19.671875,
      "learning_rate": 0.00018252312180841776,
      "loss": 2.0987,
      "step": 640
    },
    {
      "epoch": 1.9374068554396424,
      "grad_norm": 2.984375,
      "learning_rate": 0.00018198984236734246,
      "loss": 2.1745,
      "step": 650
    },
    {
      "epoch": 1.9672131147540983,
      "grad_norm": 4.73046875,
      "learning_rate": 0.00018144935239939144,
      "loss": 2.192,
      "step": 660
    },
    {
      "epoch": 1.9970193740685542,
      "grad_norm": 2.73828125,
      "learning_rate": 0.00018090169943749476,
      "loss": 2.2213,
      "step": 670
    },
    {
      "epoch": 2.0268256333830106,
      "grad_norm": 4.38671875,
      "learning_rate": 0.00018034693164452578,
      "loss": 1.8964,
      "step": 680
    },
    {
      "epoch": 2.0566318926974665,
      "grad_norm": 2.90234375,
      "learning_rate": 0.0001797850978090658,
      "loss": 1.821,
      "step": 690
    },
    {
      "epoch": 2.0864381520119224,
      "grad_norm": 3.3515625,
      "learning_rate": 0.00017921624734111292,
      "loss": 1.8791,
      "step": 700
    },
    {
      "epoch": 2.1162444113263787,
      "grad_norm": 3.490234375,
      "learning_rate": 0.0001786404302677374,
      "loss": 1.8005,
      "step": 710
    },
    {
      "epoch": 2.1460506706408347,
      "grad_norm": 4.43359375,
      "learning_rate": 0.0001780576972286813,
      "loss": 1.9255,
      "step": 720
    },
    {
      "epoch": 2.1758569299552906,
      "grad_norm": 5.66015625,
      "learning_rate": 0.0001774680994719057,
      "loss": 1.9036,
      "step": 730
    },
    {
      "epoch": 2.2056631892697465,
      "grad_norm": 4.51171875,
      "learning_rate": 0.00017687168884908316,
      "loss": 1.9144,
      "step": 740
    },
    {
      "epoch": 2.235469448584203,
      "grad_norm": 5.9609375,
      "learning_rate": 0.0001762685178110382,
      "loss": 1.9545,
      "step": 750
    },
    {
      "epoch": 2.2652757078986587,
      "grad_norm": 35.65625,
      "learning_rate": 0.00017565863940313415,
      "loss": 2.097,
      "step": 760
    },
    {
      "epoch": 2.2950819672131146,
      "grad_norm": 13.2890625,
      "learning_rate": 0.00017504210726060828,
      "loss": 2.4501,
      "step": 770
    },
    {
      "epoch": 2.3248882265275705,
      "grad_norm": 8.046875,
      "learning_rate": 0.00017441897560385491,
      "loss": 2.852,
      "step": 780
    },
    {
      "epoch": 2.354694485842027,
      "grad_norm": 13.984375,
      "learning_rate": 0.00017378929923365704,
      "loss": 3.0607,
      "step": 790
    },
    {
      "epoch": 2.384500745156483,
      "grad_norm": 134.375,
      "learning_rate": 0.0001731531335263669,
      "loss": 3.6671,
      "step": 800
    },
    {
      "epoch": 2.4143070044709387,
      "grad_norm": 15.6171875,
      "learning_rate": 0.00017251053442903595,
      "loss": 3.3738,
      "step": 810
    },
    {
      "epoch": 2.444113263785395,
      "grad_norm": 11.6796875,
      "learning_rate": 0.00017186155845449466,
      "loss": 3.5971,
      "step": 820
    },
    {
      "epoch": 2.473919523099851,
      "grad_norm": 8.5546875,
      "learning_rate": 0.0001712062626763825,
      "loss": 3.9785,
      "step": 830
    },
    {
      "epoch": 2.503725782414307,
      "grad_norm": 7.640625,
      "learning_rate": 0.00017054470472412873,
      "loss": 3.5969,
      "step": 840
    },
    {
      "epoch": 2.533532041728763,
      "grad_norm": 9.0234375,
      "learning_rate": 0.00016987694277788417,
      "loss": 3.6101,
      "step": 850
    },
    {
      "epoch": 2.563338301043219,
      "grad_norm": 15.671875,
      "learning_rate": 0.0001692030355634046,
      "loss": 3.9166,
      "step": 860
    },
    {
      "epoch": 2.593144560357675,
      "grad_norm": 70.6875,
      "learning_rate": 0.00016852304234688626,
      "loss": 4.0454,
      "step": 870
    },
    {
      "epoch": 2.6229508196721314,
      "grad_norm": 15.1953125,
      "learning_rate": 0.0001678370229297535,
      "loss": 4.0769,
      "step": 880
    },
    {
      "epoch": 2.6527570789865873,
      "grad_norm": NaN,
      "learning_rate": 0.00016714503764339987,
      "loss": 678.3638,
      "step": 890
    },
    {
      "epoch": 2.682563338301043,
      "grad_norm": NaN,
      "learning_rate": 0.00016644714734388217,
      "loss": 0.0,
      "step": 900
    },
    {
      "epoch": 2.712369597615499,
      "grad_norm": NaN,
      "learning_rate": 0.0001657434134065686,
      "loss": 0.0,
      "step": 910
    },
    {
      "epoch": 2.742175856929955,
      "grad_norm": NaN,
      "learning_rate": 0.00016503389772074104,
      "loss": 0.0,
      "step": 920
    },
    {
      "epoch": 2.7719821162444114,
      "grad_norm": NaN,
      "learning_rate": 0.00016431866268415237,
      "loss": 0.0,
      "step": 930
    },
    {
      "epoch": 2.8017883755588673,
      "grad_norm": NaN,
      "learning_rate": 0.00016359777119753885,
      "loss": 0.0,
      "step": 940
    },
    {
      "epoch": 2.8315946348733236,
      "grad_norm": NaN,
      "learning_rate": 0.0001628712866590885,
      "loss": 0.0,
      "step": 950
    },
    {
      "epoch": 2.8614008941877795,
      "grad_norm": NaN,
      "learning_rate": 0.00016213927295886547,
      "loss": 0.0,
      "step": 960
    },
    {
      "epoch": 2.8912071535022354,
      "grad_norm": NaN,
      "learning_rate": 0.00016140179447319132,
      "loss": 0.0,
      "step": 970
    },
    {
      "epoch": 2.9210134128166914,
      "grad_norm": NaN,
      "learning_rate": 0.00016065891605898357,
      "loss": 0.0,
      "step": 980
    },
    {
      "epoch": 2.9508196721311473,
      "grad_norm": NaN,
      "learning_rate": 0.00015991070304805183,
      "loss": 0.0,
      "step": 990
    },
    {
      "epoch": 2.9806259314456036,
      "grad_norm": NaN,
      "learning_rate": 0.00015915722124135227,
      "loss": 0.0,
      "step": 1000
    },
    {
      "epoch": 3.0104321907600595,
      "grad_norm": NaN,
      "learning_rate": 0.00015839853690320074,
      "loss": 0.0,
      "step": 1010
    },
    {
      "epoch": 3.0402384500745154,
      "grad_norm": NaN,
      "learning_rate": 0.00015763471675544547,
      "loss": 0.0,
      "step": 1020
    },
    {
      "epoch": 3.070044709388972,
      "grad_norm": NaN,
      "learning_rate": 0.00015686582797159893,
      "loss": 0.0,
      "step": 1030
    },
    {
      "epoch": 3.0998509687034277,
      "grad_norm": NaN,
      "learning_rate": 0.00015609193817093058,
      "loss": 0.0,
      "step": 1040
    },
    {
      "epoch": 3.1296572280178836,
      "grad_norm": NaN,
      "learning_rate": 0.00015531311541251995,
      "loss": 0.0,
      "step": 1050
    },
    {
      "epoch": 3.15946348733234,
      "grad_norm": NaN,
      "learning_rate": 0.00015452942818927143,
      "loss": 0.0,
      "step": 1060
    },
    {
      "epoch": 3.189269746646796,
      "grad_norm": NaN,
      "learning_rate": 0.00015374094542189054,
      "loss": 0.0,
      "step": 1070
    },
    {
      "epoch": 3.2190760059612518,
      "grad_norm": NaN,
      "learning_rate": 0.000152947736452823,
      "loss": 0.0,
      "step": 1080
    },
    {
      "epoch": 3.248882265275708,
      "grad_norm": NaN,
      "learning_rate": 0.0001521498710401561,
      "loss": 0.0,
      "step": 1090
    },
    {
      "epoch": 3.278688524590164,
      "grad_norm": NaN,
      "learning_rate": 0.0001513474193514842,
      "loss": 0.0,
      "step": 1100
    },
    {
      "epoch": 3.30849478390462,
      "grad_norm": NaN,
      "learning_rate": 0.0001505404519577379,
      "loss": 0.0,
      "step": 1110
    },
    {
      "epoch": 3.338301043219076,
      "grad_norm": NaN,
      "learning_rate": 0.00014972903982697744,
      "loss": 0.0,
      "step": 1120
    },
    {
      "epoch": 3.368107302533532,
      "grad_norm": NaN,
      "learning_rate": 0.00014891325431815183,
      "loss": 0.0,
      "step": 1130
    },
    {
      "epoch": 3.397913561847988,
      "grad_norm": NaN,
      "learning_rate": 0.000148093167174823,
      "loss": 0.0,
      "step": 1140
    },
    {
      "epoch": 3.427719821162444,
      "grad_norm": NaN,
      "learning_rate": 0.00014726885051885653,
      "loss": 0.0,
      "step": 1150
    },
    {
      "epoch": 3.4575260804769004,
      "grad_norm": NaN,
      "learning_rate": 0.00014644037684407882,
      "loss": 0.0,
      "step": 1160
    },
    {
      "epoch": 3.4873323397913563,
      "grad_norm": NaN,
      "learning_rate": 0.00014560781900990185,
      "loss": 0.0,
      "step": 1170
    },
    {
      "epoch": 3.517138599105812,
      "grad_norm": NaN,
      "learning_rate": 0.00014477125023491536,
      "loss": 0.0,
      "step": 1180
    },
    {
      "epoch": 3.546944858420268,
      "grad_norm": NaN,
      "learning_rate": 0.00014393074409044802,
      "loss": 0.0,
      "step": 1190
    },
    {
      "epoch": 3.5767511177347244,
      "grad_norm": NaN,
      "learning_rate": 0.00014308637449409706,
      "loss": 0.0,
      "step": 1200
    },
    {
      "epoch": 3.6065573770491803,
      "grad_norm": NaN,
      "learning_rate": 0.00014223821570322762,
      "loss": 0.0,
      "step": 1210
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": NaN,
      "learning_rate": 0.0001413863423084424,
      "loss": 0.0,
      "step": 1220
    },
    {
      "epoch": 3.6661698956780926,
      "grad_norm": NaN,
      "learning_rate": 0.00014053082922702183,
      "loss": 0.0,
      "step": 1230
    },
    {
      "epoch": 3.6959761549925485,
      "grad_norm": NaN,
      "learning_rate": 0.00013967175169633537,
      "loss": 0.0,
      "step": 1240
    },
    {
      "epoch": 3.7257824143070044,
      "grad_norm": NaN,
      "learning_rate": 0.00013880918526722497,
      "loss": 0.0,
      "step": 1250
    },
    {
      "epoch": 3.7555886736214603,
      "grad_norm": NaN,
      "learning_rate": 0.00013794320579736083,
      "loss": 0.0,
      "step": 1260
    },
    {
      "epoch": 3.7853949329359167,
      "grad_norm": NaN,
      "learning_rate": 0.00013707388944457006,
      "loss": 0.0,
      "step": 1270
    },
    {
      "epoch": 3.8152011922503726,
      "grad_norm": NaN,
      "learning_rate": 0.00013620131266013912,
      "loss": 0.0,
      "step": 1280
    },
    {
      "epoch": 3.8450074515648285,
      "grad_norm": NaN,
      "learning_rate": 0.00013532555218209036,
      "loss": 0.0,
      "step": 1290
    },
    {
      "epoch": 3.874813710879285,
      "grad_norm": NaN,
      "learning_rate": 0.0001344466850284333,
      "loss": 0.0,
      "step": 1300
    },
    {
      "epoch": 3.9046199701937407,
      "grad_norm": NaN,
      "learning_rate": 0.0001335647884903915,
      "loss": 0.0,
      "step": 1310
    },
    {
      "epoch": 3.9344262295081966,
      "grad_norm": NaN,
      "learning_rate": 0.00013267994012560504,
      "loss": 0.0,
      "step": 1320
    },
    {
      "epoch": 3.9642324888226526,
      "grad_norm": NaN,
      "learning_rate": 0.00013179221775131005,
      "loss": 0.0,
      "step": 1330
    },
    {
      "epoch": 3.994038748137109,
      "grad_norm": NaN,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.0,
      "step": 1340
    },
    {
      "epoch": 4.023845007451565,
      "grad_norm": NaN,
      "learning_rate": 0.0001300084635000341,
      "loss": 0.0,
      "step": 1350
    },
    {
      "epoch": 4.053651266766021,
      "grad_norm": NaN,
      "learning_rate": 0.000129112588493802,
      "loss": 0.0,
      "step": 1360
    },
    {
      "epoch": 4.083457526080477,
      "grad_norm": NaN,
      "learning_rate": 0.0001282141532057631,
      "loss": 0.0,
      "step": 1370
    },
    {
      "epoch": 4.113263785394933,
      "grad_norm": NaN,
      "learning_rate": 0.0001273132366480438,
      "loss": 0.0,
      "step": 1380
    },
    {
      "epoch": 4.143070044709389,
      "grad_norm": NaN,
      "learning_rate": 0.00012640991805098367,
      "loss": 0.0,
      "step": 1390
    },
    {
      "epoch": 4.172876304023845,
      "grad_norm": NaN,
      "learning_rate": 0.00012550427685616765,
      "loss": 0.0,
      "step": 1400
    },
    {
      "epoch": 4.202682563338301,
      "grad_norm": NaN,
      "learning_rate": 0.00012459639270943944,
      "loss": 0.0,
      "step": 1410
    },
    {
      "epoch": 4.2324888226527575,
      "grad_norm": NaN,
      "learning_rate": 0.00012368634545389733,
      "loss": 0.0,
      "step": 1420
    },
    {
      "epoch": 4.262295081967213,
      "grad_norm": NaN,
      "learning_rate": 0.00012277421512287226,
      "loss": 0.0,
      "step": 1430
    },
    {
      "epoch": 4.292101341281669,
      "grad_norm": NaN,
      "learning_rate": 0.00012186008193288962,
      "loss": 0.0,
      "step": 1440
    },
    {
      "epoch": 4.321907600596125,
      "grad_norm": NaN,
      "learning_rate": 0.00012094402627661447,
      "loss": 0.0,
      "step": 1450
    },
    {
      "epoch": 4.351713859910581,
      "grad_norm": NaN,
      "learning_rate": 0.00012002612871578143,
      "loss": 0.0,
      "step": 1460
    },
    {
      "epoch": 4.381520119225037,
      "grad_norm": NaN,
      "learning_rate": 0.00011910646997411001,
      "loss": 0.0,
      "step": 1470
    },
    {
      "epoch": 4.411326378539493,
      "grad_norm": NaN,
      "learning_rate": 0.00011818513093020513,
      "loss": 0.0,
      "step": 1480
    },
    {
      "epoch": 4.44113263785395,
      "grad_norm": NaN,
      "learning_rate": 0.0001172621926104446,
      "loss": 0.0,
      "step": 1490
    },
    {
      "epoch": 4.470938897168406,
      "grad_norm": NaN,
      "learning_rate": 0.00011633773618185302,
      "loss": 0.0,
      "step": 1500
    },
    {
      "epoch": 4.5007451564828616,
      "grad_norm": NaN,
      "learning_rate": 0.00011541184294496392,
      "loss": 0.0,
      "step": 1510
    },
    {
      "epoch": 4.5305514157973175,
      "grad_norm": NaN,
      "learning_rate": 0.00011448459432666961,
      "loss": 0.0,
      "step": 1520
    },
    {
      "epoch": 4.560357675111773,
      "grad_norm": NaN,
      "learning_rate": 0.00011355607187306037,
      "loss": 0.0,
      "step": 1530
    },
    {
      "epoch": 4.590163934426229,
      "grad_norm": NaN,
      "learning_rate": 0.00011262635724225272,
      "loss": 0.0,
      "step": 1540
    },
    {
      "epoch": 4.619970193740685,
      "grad_norm": NaN,
      "learning_rate": 0.00011169553219720828,
      "loss": 0.0,
      "step": 1550
    },
    {
      "epoch": 4.649776453055141,
      "grad_norm": NaN,
      "learning_rate": 0.00011076367859854304,
      "loss": 0.0,
      "step": 1560
    },
    {
      "epoch": 4.679582712369598,
      "grad_norm": NaN,
      "learning_rate": 0.00010983087839732833,
      "loss": 0.0,
      "step": 1570
    },
    {
      "epoch": 4.709388971684054,
      "grad_norm": NaN,
      "learning_rate": 0.00010889721362788361,
      "loss": 0.0,
      "step": 1580
    },
    {
      "epoch": 4.73919523099851,
      "grad_norm": NaN,
      "learning_rate": 0.000107962766400562,
      "loss": 0.0,
      "step": 1590
    },
    {
      "epoch": 4.769001490312966,
      "grad_norm": NaN,
      "learning_rate": 0.0001070276188945293,
      "loss": 0.0,
      "step": 1600
    },
    {
      "epoch": 4.7988077496274215,
      "grad_norm": NaN,
      "learning_rate": 0.00010609185335053669,
      "loss": 0.0,
      "step": 1610
    },
    {
      "epoch": 4.828614008941877,
      "grad_norm": NaN,
      "learning_rate": 0.00010515555206368815,
      "loss": 0.0,
      "step": 1620
    },
    {
      "epoch": 4.858420268256334,
      "grad_norm": NaN,
      "learning_rate": 0.00010421879737620312,
      "loss": 0.0,
      "step": 1630
    },
    {
      "epoch": 4.88822652757079,
      "grad_norm": NaN,
      "learning_rate": 0.000103281671670175,
      "loss": 0.0,
      "step": 1640
    },
    {
      "epoch": 4.918032786885246,
      "grad_norm": NaN,
      "learning_rate": 0.00010234425736032607,
      "loss": 0.0,
      "step": 1650
    },
    {
      "epoch": 4.947839046199702,
      "grad_norm": NaN,
      "learning_rate": 0.0001014066368867596,
      "loss": 0.0,
      "step": 1660
    },
    {
      "epoch": 4.977645305514158,
      "grad_norm": NaN,
      "learning_rate": 0.00010046889270770987,
      "loss": 0.0,
      "step": 1670
    },
    {
      "epoch": 5.007451564828614,
      "grad_norm": NaN,
      "learning_rate": 9.953110729229017e-05,
      "loss": 0.0,
      "step": 1680
    },
    {
      "epoch": 5.03725782414307,
      "grad_norm": NaN,
      "learning_rate": 9.859336311324041e-05,
      "loss": 0.0,
      "step": 1690
    },
    {
      "epoch": 5.0670640834575265,
      "grad_norm": NaN,
      "learning_rate": 9.765574263967396e-05,
      "loss": 0.0,
      "step": 1700
    },
    {
      "epoch": 5.096870342771982,
      "grad_norm": NaN,
      "learning_rate": 9.671832832982502e-05,
      "loss": 0.0,
      "step": 1710
    },
    {
      "epoch": 5.126676602086438,
      "grad_norm": NaN,
      "learning_rate": 9.57812026237969e-05,
      "loss": 0.0,
      "step": 1720
    },
    {
      "epoch": 5.156482861400894,
      "grad_norm": NaN,
      "learning_rate": 9.484444793631186e-05,
      "loss": 0.0,
      "step": 1730
    },
    {
      "epoch": 5.18628912071535,
      "grad_norm": NaN,
      "learning_rate": 9.390814664946331e-05,
      "loss": 0.0,
      "step": 1740
    },
    {
      "epoch": 5.216095380029806,
      "grad_norm": NaN,
      "learning_rate": 9.297238110547074e-05,
      "loss": 0.0,
      "step": 1750
    },
    {
      "epoch": 5.245901639344262,
      "grad_norm": NaN,
      "learning_rate": 9.203723359943802e-05,
      "loss": 0.0,
      "step": 1760
    },
    {
      "epoch": 5.275707898658719,
      "grad_norm": NaN,
      "learning_rate": 9.110278637211643e-05,
      "loss": 0.0,
      "step": 1770
    },
    {
      "epoch": 5.305514157973175,
      "grad_norm": NaN,
      "learning_rate": 9.016912160267168e-05,
      "loss": 0.0,
      "step": 1780
    },
    {
      "epoch": 5.3353204172876305,
      "grad_norm": NaN,
      "learning_rate": 8.923632140145701e-05,
      "loss": 0.0,
      "step": 1790
    },
    {
      "epoch": 5.365126676602086,
      "grad_norm": NaN,
      "learning_rate": 8.830446780279176e-05,
      "loss": 0.0,
      "step": 1800
    },
    {
      "epoch": 5.394932935916542,
      "grad_norm": NaN,
      "learning_rate": 8.73736427577473e-05,
      "loss": 0.0,
      "step": 1810
    },
    {
      "epoch": 5.424739195230998,
      "grad_norm": NaN,
      "learning_rate": 8.644392812693968e-05,
      "loss": 0.0,
      "step": 1820
    },
    {
      "epoch": 5.454545454545454,
      "grad_norm": NaN,
      "learning_rate": 8.55154056733304e-05,
      "loss": 0.0,
      "step": 1830
    },
    {
      "epoch": 5.484351713859911,
      "grad_norm": NaN,
      "learning_rate": 8.458815705503611e-05,
      "loss": 0.0,
      "step": 1840
    },
    {
      "epoch": 5.514157973174367,
      "grad_norm": NaN,
      "learning_rate": 8.366226381814697e-05,
      "loss": 0.0,
      "step": 1850
    },
    {
      "epoch": 5.543964232488823,
      "grad_norm": NaN,
      "learning_rate": 8.273780738955544e-05,
      "loss": 0.0,
      "step": 1860
    },
    {
      "epoch": 5.573770491803279,
      "grad_norm": NaN,
      "learning_rate": 8.181486906979487e-05,
      "loss": 0.0,
      "step": 1870
    },
    {
      "epoch": 5.603576751117735,
      "grad_norm": NaN,
      "learning_rate": 8.089353002589001e-05,
      "loss": 0.0,
      "step": 1880
    },
    {
      "epoch": 5.6333830104321905,
      "grad_norm": NaN,
      "learning_rate": 7.997387128421858e-05,
      "loss": 0.0,
      "step": 1890
    },
    {
      "epoch": 5.663189269746647,
      "grad_norm": NaN,
      "learning_rate": 7.905597372338558e-05,
      "loss": 0.0,
      "step": 1900
    },
    {
      "epoch": 5.692995529061103,
      "grad_norm": NaN,
      "learning_rate": 7.813991806711039e-05,
      "loss": 0.0,
      "step": 1910
    },
    {
      "epoch": 5.722801788375559,
      "grad_norm": NaN,
      "learning_rate": 7.722578487712776e-05,
      "loss": 0.0,
      "step": 1920
    },
    {
      "epoch": 5.752608047690015,
      "grad_norm": NaN,
      "learning_rate": 7.631365454610273e-05,
      "loss": 0.0,
      "step": 1930
    },
    {
      "epoch": 5.782414307004471,
      "grad_norm": NaN,
      "learning_rate": 7.540360729056058e-05,
      "loss": 0.0,
      "step": 1940
    },
    {
      "epoch": 5.812220566318927,
      "grad_norm": NaN,
      "learning_rate": 7.449572314383237e-05,
      "loss": 0.0,
      "step": 1950
    },
    {
      "epoch": 5.842026825633383,
      "grad_norm": NaN,
      "learning_rate": 7.359008194901632e-05,
      "loss": 0.0,
      "step": 1960
    },
    {
      "epoch": 5.8718330849478395,
      "grad_norm": NaN,
      "learning_rate": 7.268676335195623e-05,
      "loss": 0.0,
      "step": 1970
    },
    {
      "epoch": 5.901639344262295,
      "grad_norm": NaN,
      "learning_rate": 7.178584679423695e-05,
      "loss": 0.0,
      "step": 1980
    },
    {
      "epoch": 5.931445603576751,
      "grad_norm": NaN,
      "learning_rate": 7.088741150619803e-05,
      "loss": 0.0,
      "step": 1990
    },
    {
      "epoch": 5.961251862891207,
      "grad_norm": NaN,
      "learning_rate": 6.999153649996595e-05,
      "loss": 0.0,
      "step": 2000
    },
    {
      "epoch": 5.991058122205663,
      "grad_norm": NaN,
      "learning_rate": 6.909830056250527e-05,
      "loss": 0.0,
      "step": 2010
    },
    {
      "epoch": 6.020864381520119,
      "grad_norm": NaN,
      "learning_rate": 6.820778224868998e-05,
      "loss": 0.0,
      "step": 2020
    },
    {
      "epoch": 6.050670640834575,
      "grad_norm": NaN,
      "learning_rate": 6.732005987439494e-05,
      "loss": 0.0,
      "step": 2030
    },
    {
      "epoch": 6.080476900149031,
      "grad_norm": NaN,
      "learning_rate": 6.643521150960854e-05,
      "loss": 0.0,
      "step": 2040
    },
    {
      "epoch": 6.110283159463488,
      "grad_norm": NaN,
      "learning_rate": 6.555331497156672e-05,
      "loss": 0.0,
      "step": 2050
    },
    {
      "epoch": 6.140089418777944,
      "grad_norm": NaN,
      "learning_rate": 6.467444781790966e-05,
      "loss": 0.0,
      "step": 2060
    },
    {
      "epoch": 6.1698956780923995,
      "grad_norm": NaN,
      "learning_rate": 6.379868733986089e-05,
      "loss": 0.0,
      "step": 2070
    },
    {
      "epoch": 6.199701937406855,
      "grad_norm": NaN,
      "learning_rate": 6.292611055542998e-05,
      "loss": 0.0,
      "step": 2080
    },
    {
      "epoch": 6.229508196721311,
      "grad_norm": NaN,
      "learning_rate": 6.205679420263916e-05,
      "loss": 0.0,
      "step": 2090
    },
    {
      "epoch": 6.259314456035767,
      "grad_norm": NaN,
      "learning_rate": 6.119081473277501e-05,
      "loss": 0.0,
      "step": 2100
    },
    {
      "epoch": 6.289120715350224,
      "grad_norm": NaN,
      "learning_rate": 6.032824830366466e-05,
      "loss": 0.0,
      "step": 2110
    },
    {
      "epoch": 6.31892697466468,
      "grad_norm": NaN,
      "learning_rate": 5.9469170772978186e-05,
      "loss": 0.0,
      "step": 2120
    },
    {
      "epoch": 6.348733233979136,
      "grad_norm": NaN,
      "learning_rate": 5.86136576915576e-05,
      "loss": 0.0,
      "step": 2130
    },
    {
      "epoch": 6.378539493293592,
      "grad_norm": NaN,
      "learning_rate": 5.7761784296772395e-05,
      "loss": 0.0,
      "step": 2140
    },
    {
      "epoch": 6.408345752608048,
      "grad_norm": NaN,
      "learning_rate": 5.691362550590297e-05,
      "loss": 0.0,
      "step": 2150
    },
    {
      "epoch": 6.4381520119225035,
      "grad_norm": NaN,
      "learning_rate": 5.606925590955199e-05,
      "loss": 0.0,
      "step": 2160
    },
    {
      "epoch": 6.467958271236959,
      "grad_norm": NaN,
      "learning_rate": 5.522874976508463e-05,
      "loss": 0.0,
      "step": 2170
    },
    {
      "epoch": 6.497764530551416,
      "grad_norm": NaN,
      "learning_rate": 5.439218099009822e-05,
      "loss": 0.0,
      "step": 2180
    },
    {
      "epoch": 6.527570789865872,
      "grad_norm": NaN,
      "learning_rate": 5.355962315592118e-05,
      "loss": 0.0,
      "step": 2190
    },
    {
      "epoch": 6.557377049180328,
      "grad_norm": NaN,
      "learning_rate": 5.273114948114346e-05,
      "loss": 0.0,
      "step": 2200
    },
    {
      "epoch": 6.587183308494784,
      "grad_norm": NaN,
      "learning_rate": 5.190683282517701e-05,
      "loss": 0.0,
      "step": 2210
    },
    {
      "epoch": 6.61698956780924,
      "grad_norm": NaN,
      "learning_rate": 5.108674568184822e-05,
      "loss": 0.0,
      "step": 2220
    },
    {
      "epoch": 6.646795827123696,
      "grad_norm": NaN,
      "learning_rate": 5.0270960173022604e-05,
      "loss": 0.0,
      "step": 2230
    },
    {
      "epoch": 6.676602086438152,
      "grad_norm": NaN,
      "learning_rate": 4.945954804226214e-05,
      "loss": 0.0,
      "step": 2240
    },
    {
      "epoch": 6.7064083457526085,
      "grad_norm": NaN,
      "learning_rate": 4.865258064851579e-05,
      "loss": 0.0,
      "step": 2250
    },
    {
      "epoch": 6.736214605067064,
      "grad_norm": NaN,
      "learning_rate": 4.785012895984397e-05,
      "loss": 0.0,
      "step": 2260
    },
    {
      "epoch": 6.76602086438152,
      "grad_norm": NaN,
      "learning_rate": 4.705226354717703e-05,
      "loss": 0.0,
      "step": 2270
    },
    {
      "epoch": 6.795827123695976,
      "grad_norm": NaN,
      "learning_rate": 4.6259054578109426e-05,
      "loss": 0.0,
      "step": 2280
    },
    {
      "epoch": 6.825633383010432,
      "grad_norm": NaN,
      "learning_rate": 4.547057181072861e-05,
      "loss": 0.0,
      "step": 2290
    },
    {
      "epoch": 6.855439642324888,
      "grad_norm": NaN,
      "learning_rate": 4.468688458748006e-05,
      "loss": 0.0,
      "step": 2300
    },
    {
      "epoch": 6.885245901639344,
      "grad_norm": NaN,
      "learning_rate": 4.390806182906946e-05,
      "loss": 0.0,
      "step": 2310
    },
    {
      "epoch": 6.915052160953801,
      "grad_norm": NaN,
      "learning_rate": 4.313417202840106e-05,
      "loss": 0.0,
      "step": 2320
    },
    {
      "epoch": 6.944858420268257,
      "grad_norm": NaN,
      "learning_rate": 4.2365283244554545e-05,
      "loss": 0.0,
      "step": 2330
    },
    {
      "epoch": 6.9746646795827125,
      "grad_norm": NaN,
      "learning_rate": 4.1601463096799274e-05,
      "loss": 0.0,
      "step": 2340
    },
    {
      "epoch": 7.004470938897168,
      "grad_norm": NaN,
      "learning_rate": 4.084277875864776e-05,
      "loss": 0.0,
      "step": 2350
    },
    {
      "epoch": 7.034277198211624,
      "grad_norm": NaN,
      "learning_rate": 4.008929695194819e-05,
      "loss": 0.0,
      "step": 2360
    },
    {
      "epoch": 7.06408345752608,
      "grad_norm": NaN,
      "learning_rate": 3.9341083941016445e-05,
      "loss": 0.0,
      "step": 2370
    },
    {
      "epoch": 7.093889716840536,
      "grad_norm": NaN,
      "learning_rate": 3.859820552680867e-05,
      "loss": 0.0,
      "step": 2380
    },
    {
      "epoch": 7.123695976154993,
      "grad_norm": NaN,
      "learning_rate": 3.786072704113456e-05,
      "loss": 0.0,
      "step": 2390
    },
    {
      "epoch": 7.153502235469449,
      "grad_norm": NaN,
      "learning_rate": 3.7128713340911535e-05,
      "loss": 0.0,
      "step": 2400
    },
    {
      "epoch": 7.183308494783905,
      "grad_norm": NaN,
      "learning_rate": 3.640222880246117e-05,
      "loss": 0.0,
      "step": 2410
    },
    {
      "epoch": 7.213114754098361,
      "grad_norm": NaN,
      "learning_rate": 3.568133731584767e-05,
      "loss": 0.0,
      "step": 2420
    },
    {
      "epoch": 7.242921013412817,
      "grad_norm": NaN,
      "learning_rate": 3.496610227925896e-05,
      "loss": 0.0,
      "step": 2430
    },
    {
      "epoch": 7.2727272727272725,
      "grad_norm": NaN,
      "learning_rate": 3.4256586593431407e-05,
      "loss": 0.0,
      "step": 2440
    },
    {
      "epoch": 7.302533532041728,
      "grad_norm": NaN,
      "learning_rate": 3.355285265611784e-05,
      "loss": 0.0,
      "step": 2450
    },
    {
      "epoch": 7.332339791356185,
      "grad_norm": NaN,
      "learning_rate": 3.2854962356600124e-05,
      "loss": 0.0,
      "step": 2460
    },
    {
      "epoch": 7.362146050670641,
      "grad_norm": NaN,
      "learning_rate": 3.216297707024655e-05,
      "loss": 0.0,
      "step": 2470
    },
    {
      "epoch": 7.391952309985097,
      "grad_norm": NaN,
      "learning_rate": 3.147695765311377e-05,
      "loss": 0.0,
      "step": 2480
    },
    {
      "epoch": 7.421758569299553,
      "grad_norm": NaN,
      "learning_rate": 3.079696443659538e-05,
      "loss": 0.0,
      "step": 2490
    },
    {
      "epoch": 7.451564828614009,
      "grad_norm": NaN,
      "learning_rate": 3.0123057222115836e-05,
      "loss": 0.0,
      "step": 2500
    },
    {
      "epoch": 7.481371087928465,
      "grad_norm": NaN,
      "learning_rate": 2.9455295275871298e-05,
      "loss": 0.0,
      "step": 2510
    },
    {
      "epoch": 7.511177347242921,
      "grad_norm": NaN,
      "learning_rate": 2.8793737323617553e-05,
      "loss": 0.0,
      "step": 2520
    },
    {
      "epoch": 7.540983606557377,
      "grad_norm": NaN,
      "learning_rate": 2.8138441545505366e-05,
      "loss": 0.0,
      "step": 2530
    },
    {
      "epoch": 7.570789865871833,
      "grad_norm": NaN,
      "learning_rate": 2.748946557096407e-05,
      "loss": 0.0,
      "step": 2540
    },
    {
      "epoch": 7.600596125186289,
      "grad_norm": NaN,
      "learning_rate": 2.6846866473633125e-05,
      "loss": 0.0,
      "step": 2550
    },
    {
      "epoch": 7.630402384500745,
      "grad_norm": NaN,
      "learning_rate": 2.6210700766342965e-05,
      "loss": 0.0,
      "step": 2560
    },
    {
      "epoch": 7.660208643815201,
      "grad_norm": NaN,
      "learning_rate": 2.5581024396145116e-05,
      "loss": 0.0,
      "step": 2570
    },
    {
      "epoch": 7.690014903129657,
      "grad_norm": NaN,
      "learning_rate": 2.4957892739391765e-05,
      "loss": 0.0,
      "step": 2580
    },
    {
      "epoch": 7.719821162444113,
      "grad_norm": NaN,
      "learning_rate": 2.434136059686587e-05,
      "loss": 0.0,
      "step": 2590
    },
    {
      "epoch": 7.74962742175857,
      "grad_norm": NaN,
      "learning_rate": 2.3731482188961818e-05,
      "loss": 0.0,
      "step": 2600
    },
    {
      "epoch": 7.779433681073026,
      "grad_norm": NaN,
      "learning_rate": 2.3128311150916826e-05,
      "loss": 0.0,
      "step": 2610
    },
    {
      "epoch": 7.8092399403874815,
      "grad_norm": NaN,
      "learning_rate": 2.253190052809434e-05,
      "loss": 0.0,
      "step": 2620
    },
    {
      "epoch": 7.839046199701937,
      "grad_norm": NaN,
      "learning_rate": 2.1942302771318712e-05,
      "loss": 0.0,
      "step": 2630
    },
    {
      "epoch": 7.868852459016393,
      "grad_norm": NaN,
      "learning_rate": 2.135956973226262e-05,
      "loss": 0.0,
      "step": 2640
    },
    {
      "epoch": 7.898658718330849,
      "grad_norm": NaN,
      "learning_rate": 2.0783752658887066e-05,
      "loss": 0.0,
      "step": 2650
    },
    {
      "epoch": 7.928464977645305,
      "grad_norm": NaN,
      "learning_rate": 2.021490219093426e-05,
      "loss": 0.0,
      "step": 2660
    },
    {
      "epoch": 7.958271236959762,
      "grad_norm": NaN,
      "learning_rate": 1.9653068355474214e-05,
      "loss": 0.0,
      "step": 2670
    },
    {
      "epoch": 7.988077496274218,
      "grad_norm": NaN,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 0.0,
      "step": 2680
    },
    {
      "epoch": 8.017883755588674,
      "grad_norm": NaN,
      "learning_rate": 1.8550647600608573e-05,
      "loss": 0.0,
      "step": 2690
    },
    {
      "epoch": 8.04769001490313,
      "grad_norm": NaN,
      "learning_rate": 1.8010157632657543e-05,
      "loss": 0.0,
      "step": 2700
    },
    {
      "epoch": 8.077496274217586,
      "grad_norm": NaN,
      "learning_rate": 1.7476878191582246e-05,
      "loss": 0.0,
      "step": 2710
    },
    {
      "epoch": 8.107302533532042,
      "grad_norm": NaN,
      "learning_rate": 1.6950856176189033e-05,
      "loss": 0.0,
      "step": 2720
    },
    {
      "epoch": 8.137108792846497,
      "grad_norm": NaN,
      "learning_rate": 1.6432137847036145e-05,
      "loss": 0.0,
      "step": 2730
    },
    {
      "epoch": 8.166915052160954,
      "grad_norm": NaN,
      "learning_rate": 1.5920768822365418e-05,
      "loss": 0.0,
      "step": 2740
    },
    {
      "epoch": 8.19672131147541,
      "grad_norm": NaN,
      "learning_rate": 1.5416794074090258e-05,
      "loss": 0.0,
      "step": 2750
    },
    {
      "epoch": 8.226527570789866,
      "grad_norm": NaN,
      "learning_rate": 1.4920257923840864e-05,
      "loss": 0.0,
      "step": 2760
    },
    {
      "epoch": 8.256333830104323,
      "grad_norm": NaN,
      "learning_rate": 1.4431204039066082e-05,
      "loss": 0.0,
      "step": 2770
    },
    {
      "epoch": 8.286140089418778,
      "grad_norm": NaN,
      "learning_rate": 1.3949675429193466e-05,
      "loss": 0.0,
      "step": 2780
    },
    {
      "epoch": 8.315946348733235,
      "grad_norm": NaN,
      "learning_rate": 1.347571444184661e-05,
      "loss": 0.0,
      "step": 2790
    },
    {
      "epoch": 8.34575260804769,
      "grad_norm": NaN,
      "learning_rate": 1.300936275912098e-05,
      "loss": 0.0,
      "step": 2800
    },
    {
      "epoch": 8.375558867362146,
      "grad_norm": NaN,
      "learning_rate": 1.2550661393918217e-05,
      "loss": 0.0,
      "step": 2810
    },
    {
      "epoch": 8.405365126676601,
      "grad_norm": NaN,
      "learning_rate": 1.2099650686339303e-05,
      "loss": 0.0,
      "step": 2820
    },
    {
      "epoch": 8.435171385991058,
      "grad_norm": NaN,
      "learning_rate": 1.1656370300136943e-05,
      "loss": 0.0,
      "step": 2830
    },
    {
      "epoch": 8.464977645305515,
      "grad_norm": NaN,
      "learning_rate": 1.1220859219227232e-05,
      "loss": 0.0,
      "step": 2840
    },
    {
      "epoch": 8.49478390461997,
      "grad_norm": NaN,
      "learning_rate": 1.0793155744261351e-05,
      "loss": 0.0,
      "step": 2850
    },
    {
      "epoch": 8.524590163934427,
      "grad_norm": NaN,
      "learning_rate": 1.0373297489257272e-05,
      "loss": 0.0,
      "step": 2860
    },
    {
      "epoch": 8.554396423248882,
      "grad_norm": NaN,
      "learning_rate": 9.96132137829171e-06,
      "loss": 0.0,
      "step": 2870
    },
    {
      "epoch": 8.584202682563339,
      "grad_norm": NaN,
      "learning_rate": 9.557263642252945e-06,
      "loss": 0.0,
      "step": 2880
    },
    {
      "epoch": 8.614008941877794,
      "grad_norm": NaN,
      "learning_rate": 9.161159815654574e-06,
      "loss": 0.0,
      "step": 2890
    },
    {
      "epoch": 8.64381520119225,
      "grad_norm": NaN,
      "learning_rate": 8.773044733510338e-06,
      "loss": 0.0,
      "step": 2900
    },
    {
      "epoch": 8.673621460506707,
      "grad_norm": NaN,
      "learning_rate": 8.392952528270659e-06,
      "loss": 0.0,
      "step": 2910
    },
    {
      "epoch": 8.703427719821162,
      "grad_norm": NaN,
      "learning_rate": 8.020916626820919e-06,
      "loss": 0.0,
      "step": 2920
    },
    {
      "epoch": 8.733233979135619,
      "grad_norm": NaN,
      "learning_rate": 7.656969747541665e-06,
      "loss": 0.0,
      "step": 2930
    },
    {
      "epoch": 8.763040238450074,
      "grad_norm": NaN,
      "learning_rate": 7.301143897431339e-06,
      "loss": 0.0,
      "step": 2940
    },
    {
      "epoch": 8.79284649776453,
      "grad_norm": NaN,
      "learning_rate": 6.953470369291348e-06,
      "loss": 0.0,
      "step": 2950
    },
    {
      "epoch": 8.822652757078986,
      "grad_norm": NaN,
      "learning_rate": 6.613979738974074e-06,
      "loss": 0.0,
      "step": 2960
    },
    {
      "epoch": 8.852459016393443,
      "grad_norm": NaN,
      "learning_rate": 6.2827018626939624e-06,
      "loss": 0.0,
      "step": 2970
    },
    {
      "epoch": 8.8822652757079,
      "grad_norm": NaN,
      "learning_rate": 5.959665874401765e-06,
      "loss": 0.0,
      "step": 2980
    },
    {
      "epoch": 8.912071535022354,
      "grad_norm": NaN,
      "learning_rate": 5.6449001832223905e-06,
      "loss": 0.0,
      "step": 2990
    },
    {
      "epoch": 8.941877794336811,
      "grad_norm": NaN,
      "learning_rate": 5.338432470956589e-06,
      "loss": 0.0,
      "step": 3000
    },
    {
      "epoch": 8.971684053651266,
      "grad_norm": NaN,
      "learning_rate": 5.040289689646338e-06,
      "loss": 0.0,
      "step": 3010
    },
    {
      "epoch": 9.001490312965723,
      "grad_norm": NaN,
      "learning_rate": 4.750498059204677e-06,
      "loss": 0.0,
      "step": 3020
    },
    {
      "epoch": 9.031296572280178,
      "grad_norm": NaN,
      "learning_rate": 4.469083065109825e-06,
      "loss": 0.0,
      "step": 3030
    },
    {
      "epoch": 9.061102831594635,
      "grad_norm": NaN,
      "learning_rate": 4.196069456163787e-06,
      "loss": 0.0,
      "step": 3040
    },
    {
      "epoch": 9.090909090909092,
      "grad_norm": NaN,
      "learning_rate": 3.931481242315993e-06,
      "loss": 0.0,
      "step": 3050
    },
    {
      "epoch": 9.120715350223547,
      "grad_norm": NaN,
      "learning_rate": 3.6753416925515593e-06,
      "loss": 0.0,
      "step": 3060
    },
    {
      "epoch": 9.150521609538004,
      "grad_norm": NaN,
      "learning_rate": 3.427673332845138e-06,
      "loss": 0.0,
      "step": 3070
    },
    {
      "epoch": 9.180327868852459,
      "grad_norm": NaN,
      "learning_rate": 3.188497944179758e-06,
      "loss": 0.0,
      "step": 3080
    },
    {
      "epoch": 9.210134128166915,
      "grad_norm": NaN,
      "learning_rate": 2.9578365606312665e-06,
      "loss": 0.0,
      "step": 3090
    },
    {
      "epoch": 9.23994038748137,
      "grad_norm": NaN,
      "learning_rate": 2.735709467518699e-06,
      "loss": 0.0,
      "step": 3100
    },
    {
      "epoch": 9.269746646795827,
      "grad_norm": NaN,
      "learning_rate": 2.5221361996200955e-06,
      "loss": 0.0,
      "step": 3110
    },
    {
      "epoch": 9.299552906110284,
      "grad_norm": NaN,
      "learning_rate": 2.3171355394546624e-06,
      "loss": 0.0,
      "step": 3120
    },
    {
      "epoch": 9.329359165424739,
      "grad_norm": NaN,
      "learning_rate": 2.120725515630906e-06,
      "loss": 0.0,
      "step": 3130
    },
    {
      "epoch": 9.359165424739196,
      "grad_norm": NaN,
      "learning_rate": 1.932923401261133e-06,
      "loss": 0.0,
      "step": 3140
    },
    {
      "epoch": 9.38897168405365,
      "grad_norm": NaN,
      "learning_rate": 1.7537457124423895e-06,
      "loss": 0.0,
      "step": 3150
    },
    {
      "epoch": 9.418777943368108,
      "grad_norm": NaN,
      "learning_rate": 1.5832082068039544e-06,
      "loss": 0.0,
      "step": 3160
    },
    {
      "epoch": 9.448584202682563,
      "grad_norm": NaN,
      "learning_rate": 1.4213258821215381e-06,
      "loss": 0.0,
      "step": 3170
    },
    {
      "epoch": 9.47839046199702,
      "grad_norm": NaN,
      "learning_rate": 1.268112974998381e-06,
      "loss": 0.0,
      "step": 3180
    },
    {
      "epoch": 9.508196721311476,
      "grad_norm": NaN,
      "learning_rate": 1.1235829596131232e-06,
      "loss": 0.0,
      "step": 3190
    },
    {
      "epoch": 9.538002980625931,
      "grad_norm": NaN,
      "learning_rate": 9.877485465349058e-07,
      "loss": 0.0,
      "step": 3200
    },
    {
      "epoch": 9.567809239940388,
      "grad_norm": NaN,
      "learning_rate": 8.606216816055334e-07,
      "loss": 0.0,
      "step": 3210
    },
    {
      "epoch": 9.597615499254843,
      "grad_norm": NaN,
      "learning_rate": 7.422135448889033e-07,
      "loss": 0.0,
      "step": 3220
    },
    {
      "epoch": 9.6274217585693,
      "grad_norm": NaN,
      "learning_rate": 6.325345496877688e-07,
      "loss": 0.0,
      "step": 3230
    },
    {
      "epoch": 9.657228017883755,
      "grad_norm": NaN,
      "learning_rate": 5.315943416279834e-07,
      "loss": 0.0,
      "step": 3240
    },
    {
      "epoch": 9.687034277198212,
      "grad_norm": NaN,
      "learning_rate": 4.3940179781019055e-07,
      "loss": 0.0,
      "step": 3250
    },
    {
      "epoch": 9.716840536512668,
      "grad_norm": NaN,
      "learning_rate": 3.5596502602917027e-07,
      "loss": 0.0,
      "step": 3260
    },
    {
      "epoch": 9.746646795827123,
      "grad_norm": NaN,
      "learning_rate": 2.8129136406075394e-07,
      "loss": 0.0,
      "step": 3270
    },
    {
      "epoch": 9.77645305514158,
      "grad_norm": NaN,
      "learning_rate": 2.15387379016585e-07,
      "loss": 0.0,
      "step": 3280
    },
    {
      "epoch": 9.806259314456035,
      "grad_norm": NaN,
      "learning_rate": 1.5825886676649192e-07,
      "loss": 0.0,
      "step": 3290
    },
    {
      "epoch": 9.836065573770492,
      "grad_norm": NaN,
      "learning_rate": 1.0991085142886271e-07,
      "loss": 0.0,
      "step": 3300
    },
    {
      "epoch": 9.865871833084947,
      "grad_norm": NaN,
      "learning_rate": 7.034758492872052e-08,
      "loss": 0.0,
      "step": 3310
    },
    {
      "epoch": 9.895678092399404,
      "grad_norm": NaN,
      "learning_rate": 3.9572546623856125e-08,
      "loss": 0.0,
      "step": 3320
    },
    {
      "epoch": 9.92548435171386,
      "grad_norm": NaN,
      "learning_rate": 1.7588442998817122e-08,
      "loss": 0.0,
      "step": 3330
    },
    {
      "epoch": 9.955290611028316,
      "grad_norm": NaN,
      "learning_rate": 4.397207426865002e-09,
      "loss": 0.0,
      "step": 3340
    },
    {
      "epoch": 9.985096870342772,
      "grad_norm": NaN,
      "learning_rate": 0.0,
      "loss": 0.0,
      "step": 3350
    },
    {
      "epoch": 9.985096870342772,
      "step": 3350,
      "total_flos": 1.647823703506944e+17,
      "train_loss": 2.6826694021651996,
      "train_runtime": 3317.5003,
      "train_samples_per_second": 4.045,
      "train_steps_per_second": 1.01
    }
  ],
  "logging_steps": 10,
  "max_steps": 3350,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 1.647823703506944e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}