diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6743 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999843448311851, + "eval_steps": 500, + "global_step": 4790, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 5.78125, + "learning_rate": 4.175365344467641e-07, + "loss": 1.6189, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 3.46875, + "learning_rate": 2.0876826722338207e-06, + "loss": 1.7423, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 9.9375, + "learning_rate": 4.175365344467641e-06, + "loss": 1.6192, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 13.375, + "learning_rate": 6.2630480167014616e-06, + "loss": 1.557, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 9.4375, + "learning_rate": 8.350730688935283e-06, + "loss": 1.5582, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 10.625, + "learning_rate": 1.0438413361169103e-05, + "loss": 1.6962, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 4.28125, + "learning_rate": 1.2526096033402923e-05, + "loss": 1.424, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 5.75, + "learning_rate": 1.4613778705636743e-05, + "loss": 1.4941, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 8.3125, + "learning_rate": 1.6701461377870565e-05, + "loss": 1.4198, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 40.75, + "learning_rate": 1.8789144050104384e-05, + "loss": 1.3663, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 1.5234375, + "learning_rate": 2.0876826722338206e-05, + "loss": 1.2479, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 1.3828125, + "learning_rate": 2.2964509394572024e-05, + "loss": 1.5104, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 1.3984375, + "learning_rate": 2.5052192066805846e-05, + "loss": 1.1935, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 5.46875, + "learning_rate": 2.7139874739039668e-05, + "loss": 1.304, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.5546875, + "learning_rate": 2.9227557411273487e-05, + "loss": 1.3205, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 1.4140625, + "learning_rate": 3.131524008350731e-05, + "loss": 1.1825, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 2.359375, + "learning_rate": 3.340292275574113e-05, + "loss": 1.17, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 0.828125, + "learning_rate": 3.5490605427974946e-05, + "loss": 1.1457, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 0.9296875, + "learning_rate": 3.757828810020877e-05, + "loss": 1.1369, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 1.171875, + "learning_rate": 3.966597077244259e-05, + "loss": 1.1408, + "step": 95 + }, + { + "epoch": 0.02, + "grad_norm": 0.63671875, + "learning_rate": 4.175365344467641e-05, + "loss": 1.1776, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 0.91796875, + "learning_rate": 4.3841336116910233e-05, + "loss": 1.0779, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 0.6484375, + "learning_rate": 4.592901878914405e-05, + "loss": 1.1247, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 1.0078125, + "learning_rate": 4.801670146137787e-05, + "loss": 1.2182, + "step": 115 + }, + { + "epoch": 0.03, + "grad_norm": 1.3125, + "learning_rate": 5.010438413361169e-05, + "loss": 1.2062, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 1.75, + "learning_rate": 5.219206680584552e-05, + "loss": 1.1983, + "step": 125 + }, + { + "epoch": 0.03, + "grad_norm": 0.5859375, + "learning_rate": 5.4279749478079336e-05, + "loss": 1.2013, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 1.390625, + "learning_rate": 5.636743215031316e-05, + "loss": 1.158, + "step": 135 + }, + { + "epoch": 0.03, + "grad_norm": 0.6484375, + "learning_rate": 5.8455114822546973e-05, + "loss": 1.1559, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 0.4765625, + "learning_rate": 6.05427974947808e-05, + "loss": 1.1545, + "step": 145 + }, + { + "epoch": 0.03, + "grad_norm": 0.578125, + "learning_rate": 6.263048016701462e-05, + "loss": 1.1553, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 0.484375, + "learning_rate": 6.471816283924845e-05, + "loss": 1.0636, + "step": 155 + }, + { + "epoch": 0.03, + "grad_norm": 0.6015625, + "learning_rate": 6.680584551148226e-05, + "loss": 1.1026, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 0.82421875, + "learning_rate": 6.889352818371608e-05, + "loss": 1.1697, + "step": 165 + }, + { + "epoch": 0.04, + "grad_norm": 1.0078125, + "learning_rate": 7.098121085594989e-05, + "loss": 1.1397, + "step": 170 + }, + { + "epoch": 0.04, + "grad_norm": 0.52734375, + "learning_rate": 7.306889352818372e-05, + "loss": 1.1051, + "step": 175 + }, + { + "epoch": 0.04, + "grad_norm": 0.53515625, + "learning_rate": 7.515657620041754e-05, + "loss": 1.0573, + "step": 180 + }, + { + "epoch": 0.04, + "grad_norm": 0.41015625, + "learning_rate": 7.724425887265136e-05, + "loss": 1.1934, + "step": 185 + }, + { + "epoch": 0.04, + "grad_norm": 0.5625, + "learning_rate": 7.933194154488518e-05, + "loss": 1.1273, + "step": 190 + }, + { + "epoch": 0.04, + "grad_norm": 0.50390625, + "learning_rate": 8.141962421711901e-05, + "loss": 1.1812, + "step": 195 + }, + { + "epoch": 0.04, + "grad_norm": 0.482421875, + "learning_rate": 8.350730688935282e-05, + "loss": 1.1121, + "step": 200 + }, + { + "epoch": 0.04, + "grad_norm": 0.453125, + "learning_rate": 8.559498956158665e-05, + "loss": 1.1803, + "step": 205 + }, + { + "epoch": 0.04, + "grad_norm": 0.50390625, + "learning_rate": 8.768267223382047e-05, + "loss": 1.139, + "step": 210 + }, + { + "epoch": 0.04, + "grad_norm": 0.3984375, + "learning_rate": 8.977035490605428e-05, + "loss": 1.1549, + "step": 215 + }, + { + "epoch": 0.05, + "grad_norm": 0.58203125, + "learning_rate": 9.18580375782881e-05, + "loss": 1.1616, + "step": 220 + }, + { + "epoch": 0.05, + "grad_norm": 0.8203125, + "learning_rate": 9.394572025052193e-05, + "loss": 1.1901, + "step": 225 + }, + { + "epoch": 0.05, + "grad_norm": 3.53125, + "learning_rate": 9.603340292275574e-05, + "loss": 1.121, + "step": 230 + }, + { + "epoch": 0.05, + "grad_norm": 0.51953125, + "learning_rate": 9.812108559498957e-05, + "loss": 1.0502, + "step": 235 + }, + { + "epoch": 0.05, + "grad_norm": 0.37109375, + "learning_rate": 0.00010020876826722338, + "loss": 1.0768, + "step": 240 + }, + { + "epoch": 0.05, + "grad_norm": 0.470703125, + "learning_rate": 0.00010229645093945721, + "loss": 1.2241, + "step": 245 + }, + { + "epoch": 0.05, + "grad_norm": 0.3671875, + "learning_rate": 0.00010438413361169104, + "loss": 1.1032, + "step": 250 + }, + { + "epoch": 0.05, + "grad_norm": 0.5234375, + "learning_rate": 0.00010647181628392484, + "loss": 1.0418, + "step": 255 + }, + { + "epoch": 0.05, + "grad_norm": 0.427734375, + "learning_rate": 0.00010855949895615867, + "loss": 1.1505, + "step": 260 + }, + { + "epoch": 0.06, + "grad_norm": 0.5546875, + "learning_rate": 0.00011064718162839249, + "loss": 1.0452, + "step": 265 + }, + { + "epoch": 0.06, + "grad_norm": 0.380859375, + "learning_rate": 0.00011273486430062632, + "loss": 1.062, + "step": 270 + }, + { + "epoch": 0.06, + "grad_norm": 0.375, + "learning_rate": 0.00011482254697286012, + "loss": 1.1468, + "step": 275 + }, + { + "epoch": 0.06, + "grad_norm": 0.390625, + "learning_rate": 0.00011691022964509395, + "loss": 1.1486, + "step": 280 + }, + { + "epoch": 0.06, + "grad_norm": 0.47265625, + "learning_rate": 0.00011899791231732778, + "loss": 1.1573, + "step": 285 + }, + { + "epoch": 0.06, + "grad_norm": 1.171875, + "learning_rate": 0.0001210855949895616, + "loss": 1.0818, + "step": 290 + }, + { + "epoch": 0.06, + "grad_norm": 0.416015625, + "learning_rate": 0.0001231732776617954, + "loss": 1.1409, + "step": 295 + }, + { + "epoch": 0.06, + "grad_norm": 0.466796875, + "learning_rate": 0.00012526096033402923, + "loss": 1.2038, + "step": 300 + }, + { + "epoch": 0.06, + "grad_norm": 0.419921875, + "learning_rate": 0.00012734864300626306, + "loss": 1.1181, + "step": 305 + }, + { + "epoch": 0.06, + "grad_norm": 0.380859375, + "learning_rate": 0.0001294363256784969, + "loss": 1.1275, + "step": 310 + }, + { + "epoch": 0.07, + "grad_norm": 0.3828125, + "learning_rate": 0.0001315240083507307, + "loss": 1.1049, + "step": 315 + }, + { + "epoch": 0.07, + "grad_norm": 0.41796875, + "learning_rate": 0.00013361169102296452, + "loss": 1.0732, + "step": 320 + }, + { + "epoch": 0.07, + "grad_norm": 0.79296875, + "learning_rate": 0.00013569937369519835, + "loss": 1.1349, + "step": 325 + }, + { + "epoch": 0.07, + "grad_norm": 0.353515625, + "learning_rate": 0.00013778705636743215, + "loss": 1.1851, + "step": 330 + }, + { + "epoch": 0.07, + "grad_norm": 0.376953125, + "learning_rate": 0.00013987473903966598, + "loss": 1.1427, + "step": 335 + }, + { + "epoch": 0.07, + "grad_norm": 0.369140625, + "learning_rate": 0.00014196242171189978, + "loss": 1.1225, + "step": 340 + }, + { + "epoch": 0.07, + "grad_norm": 0.333984375, + "learning_rate": 0.0001440501043841336, + "loss": 1.0863, + "step": 345 + }, + { + "epoch": 0.07, + "grad_norm": 0.416015625, + "learning_rate": 0.00014613778705636744, + "loss": 1.1447, + "step": 350 + }, + { + "epoch": 0.07, + "grad_norm": 0.58984375, + "learning_rate": 0.00014822546972860124, + "loss": 1.0539, + "step": 355 + }, + { + "epoch": 0.08, + "grad_norm": 0.37109375, + "learning_rate": 0.00015031315240083507, + "loss": 1.1486, + "step": 360 + }, + { + "epoch": 0.08, + "grad_norm": 0.3515625, + "learning_rate": 0.0001524008350730689, + "loss": 1.0433, + "step": 365 + }, + { + "epoch": 0.08, + "grad_norm": 0.380859375, + "learning_rate": 0.00015448851774530273, + "loss": 1.0943, + "step": 370 + }, + { + "epoch": 0.08, + "grad_norm": 0.4140625, + "learning_rate": 0.00015657620041753653, + "loss": 1.0619, + "step": 375 + }, + { + "epoch": 0.08, + "grad_norm": 0.416015625, + "learning_rate": 0.00015866388308977036, + "loss": 1.0754, + "step": 380 + }, + { + "epoch": 0.08, + "grad_norm": 0.453125, + "learning_rate": 0.0001607515657620042, + "loss": 1.1213, + "step": 385 + }, + { + "epoch": 0.08, + "grad_norm": 0.50390625, + "learning_rate": 0.00016283924843423802, + "loss": 1.2721, + "step": 390 + }, + { + "epoch": 0.08, + "grad_norm": 0.44921875, + "learning_rate": 0.00016492693110647182, + "loss": 1.0433, + "step": 395 + }, + { + "epoch": 0.08, + "grad_norm": 0.341796875, + "learning_rate": 0.00016701461377870565, + "loss": 1.0319, + "step": 400 + }, + { + "epoch": 0.08, + "grad_norm": 0.48828125, + "learning_rate": 0.00016910229645093947, + "loss": 1.1656, + "step": 405 + }, + { + "epoch": 0.09, + "grad_norm": 0.376953125, + "learning_rate": 0.0001711899791231733, + "loss": 1.0521, + "step": 410 + }, + { + "epoch": 0.09, + "grad_norm": 0.373046875, + "learning_rate": 0.0001732776617954071, + "loss": 1.19, + "step": 415 + }, + { + "epoch": 0.09, + "grad_norm": 0.361328125, + "learning_rate": 0.00017536534446764093, + "loss": 1.1473, + "step": 420 + }, + { + "epoch": 0.09, + "grad_norm": 0.462890625, + "learning_rate": 0.00017745302713987476, + "loss": 1.0851, + "step": 425 + }, + { + "epoch": 0.09, + "grad_norm": 0.341796875, + "learning_rate": 0.00017954070981210856, + "loss": 1.0106, + "step": 430 + }, + { + "epoch": 0.09, + "grad_norm": 0.416015625, + "learning_rate": 0.0001816283924843424, + "loss": 1.0673, + "step": 435 + }, + { + "epoch": 0.09, + "grad_norm": 0.546875, + "learning_rate": 0.0001837160751565762, + "loss": 1.0706, + "step": 440 + }, + { + "epoch": 0.09, + "grad_norm": 0.484375, + "learning_rate": 0.00018580375782881002, + "loss": 1.1294, + "step": 445 + }, + { + "epoch": 0.09, + "grad_norm": 1.1953125, + "learning_rate": 0.00018789144050104385, + "loss": 1.2233, + "step": 450 + }, + { + "epoch": 0.09, + "grad_norm": 0.439453125, + "learning_rate": 0.00018997912317327765, + "loss": 1.0567, + "step": 455 + }, + { + "epoch": 0.1, + "grad_norm": 0.39453125, + "learning_rate": 0.00019206680584551148, + "loss": 1.1792, + "step": 460 + }, + { + "epoch": 0.1, + "grad_norm": 0.392578125, + "learning_rate": 0.0001941544885177453, + "loss": 1.2399, + "step": 465 + }, + { + "epoch": 0.1, + "grad_norm": 0.455078125, + "learning_rate": 0.00019624217118997914, + "loss": 1.0533, + "step": 470 + }, + { + "epoch": 0.1, + "grad_norm": 0.6796875, + "learning_rate": 0.00019832985386221294, + "loss": 1.1026, + "step": 475 + }, + { + "epoch": 0.1, + "grad_norm": 0.455078125, + "learning_rate": 0.00019999997344699456, + "loss": 1.3194, + "step": 480 + }, + { + "epoch": 0.1, + "grad_norm": 0.451171875, + "learning_rate": 0.000199999044093284, + "loss": 1.2249, + "step": 485 + }, + { + "epoch": 0.1, + "grad_norm": 0.369140625, + "learning_rate": 0.0001999967871034016, + "loss": 1.0606, + "step": 490 + }, + { + "epoch": 0.1, + "grad_norm": 0.65625, + "learning_rate": 0.00019999320250731225, + "loss": 1.1662, + "step": 495 + }, + { + "epoch": 0.1, + "grad_norm": 0.38671875, + "learning_rate": 0.00019998829035260677, + "loss": 1.0885, + "step": 500 + }, + { + "epoch": 0.11, + "grad_norm": 0.375, + "learning_rate": 0.0001999820507045013, + "loss": 1.2287, + "step": 505 + }, + { + "epoch": 0.11, + "grad_norm": 0.859375, + "learning_rate": 0.00019997448364583652, + "loss": 1.2669, + "step": 510 + }, + { + "epoch": 0.11, + "grad_norm": 0.5859375, + "learning_rate": 0.00019996558927707635, + "loss": 1.0815, + "step": 515 + }, + { + "epoch": 0.11, + "grad_norm": 1.0390625, + "learning_rate": 0.00019995536771630682, + "loss": 1.2549, + "step": 520 + }, + { + "epoch": 0.11, + "grad_norm": 0.375, + "learning_rate": 0.00019994381909923433, + "loss": 1.0642, + "step": 525 + }, + { + "epoch": 0.11, + "grad_norm": 0.43359375, + "learning_rate": 0.00019993094357918396, + "loss": 1.2543, + "step": 530 + }, + { + "epoch": 0.11, + "grad_norm": 0.494140625, + "learning_rate": 0.00019991674132709743, + "loss": 1.1474, + "step": 535 + }, + { + "epoch": 0.11, + "grad_norm": 0.365234375, + "learning_rate": 0.00019990121253153075, + "loss": 1.0659, + "step": 540 + }, + { + "epoch": 0.11, + "grad_norm": 0.373046875, + "learning_rate": 0.00019988435739865182, + "loss": 1.1007, + "step": 545 + }, + { + "epoch": 0.11, + "grad_norm": 0.369140625, + "learning_rate": 0.00019986617615223767, + "loss": 1.1559, + "step": 550 + }, + { + "epoch": 0.12, + "grad_norm": 0.546875, + "learning_rate": 0.0001998466690336713, + "loss": 0.9967, + "step": 555 + }, + { + "epoch": 0.12, + "grad_norm": 0.49609375, + "learning_rate": 0.0001998258363019388, + "loss": 1.1432, + "step": 560 + }, + { + "epoch": 0.12, + "grad_norm": 0.38671875, + "learning_rate": 0.00019980367823362573, + "loss": 1.0002, + "step": 565 + }, + { + "epoch": 0.12, + "grad_norm": 8.625, + "learning_rate": 0.00019978019512291337, + "loss": 1.0953, + "step": 570 + }, + { + "epoch": 0.12, + "grad_norm": 0.40625, + "learning_rate": 0.00019975538728157503, + "loss": 1.1463, + "step": 575 + }, + { + "epoch": 0.12, + "grad_norm": 0.384765625, + "learning_rate": 0.0001997292550389717, + "loss": 1.0971, + "step": 580 + }, + { + "epoch": 0.12, + "grad_norm": 0.74609375, + "learning_rate": 0.0001997017987420478, + "loss": 1.0907, + "step": 585 + }, + { + "epoch": 0.12, + "grad_norm": 0.40625, + "learning_rate": 0.00019967301875532657, + "loss": 1.1168, + "step": 590 + }, + { + "epoch": 0.12, + "grad_norm": 1.0390625, + "learning_rate": 0.00019964291546090517, + "loss": 0.9757, + "step": 595 + }, + { + "epoch": 0.13, + "grad_norm": 0.451171875, + "learning_rate": 0.0001996114892584496, + "loss": 1.067, + "step": 600 + }, + { + "epoch": 0.13, + "grad_norm": 0.3828125, + "learning_rate": 0.00019957874056518955, + "loss": 1.1446, + "step": 605 + }, + { + "epoch": 0.13, + "grad_norm": 0.5859375, + "learning_rate": 0.00019954466981591263, + "loss": 1.1734, + "step": 610 + }, + { + "epoch": 0.13, + "grad_norm": 0.466796875, + "learning_rate": 0.00019950927746295875, + "loss": 1.175, + "step": 615 + }, + { + "epoch": 0.13, + "grad_norm": 0.482421875, + "learning_rate": 0.0001994725639762141, + "loss": 1.1536, + "step": 620 + }, + { + "epoch": 0.13, + "grad_norm": 0.6171875, + "learning_rate": 0.0001994345298431049, + "loss": 1.1167, + "step": 625 + }, + { + "epoch": 0.13, + "grad_norm": 0.462890625, + "learning_rate": 0.00019939517556859083, + "loss": 1.1631, + "step": 630 + }, + { + "epoch": 0.13, + "grad_norm": 0.416015625, + "learning_rate": 0.00019935450167515848, + "loss": 1.2179, + "step": 635 + }, + { + "epoch": 0.13, + "grad_norm": 0.458984375, + "learning_rate": 0.00019931250870281435, + "loss": 1.183, + "step": 640 + }, + { + "epoch": 0.13, + "grad_norm": 0.5, + "learning_rate": 0.00019926919720907762, + "loss": 1.1522, + "step": 645 + }, + { + "epoch": 0.14, + "grad_norm": 0.404296875, + "learning_rate": 0.00019922456776897294, + "loss": 1.23, + "step": 650 + }, + { + "epoch": 0.14, + "grad_norm": 0.34765625, + "learning_rate": 0.00019917862097502246, + "loss": 1.1084, + "step": 655 + }, + { + "epoch": 0.14, + "grad_norm": 0.396484375, + "learning_rate": 0.0001991313574372383, + "loss": 1.0821, + "step": 660 + }, + { + "epoch": 0.14, + "grad_norm": 0.462890625, + "learning_rate": 0.0001990827777831143, + "loss": 1.1474, + "step": 665 + }, + { + "epoch": 0.14, + "grad_norm": 0.453125, + "learning_rate": 0.0001990328826576177, + "loss": 1.1337, + "step": 670 + }, + { + "epoch": 0.14, + "grad_norm": 0.48828125, + "learning_rate": 0.00019898167272318053, + "loss": 1.0593, + "step": 675 + }, + { + "epoch": 0.14, + "grad_norm": 0.41015625, + "learning_rate": 0.00019892914865969093, + "loss": 1.2849, + "step": 680 + }, + { + "epoch": 0.14, + "grad_norm": 0.87109375, + "learning_rate": 0.00019887531116448399, + "loss": 1.133, + "step": 685 + }, + { + "epoch": 0.14, + "grad_norm": 0.3671875, + "learning_rate": 0.00019882016095233262, + "loss": 1.1837, + "step": 690 + }, + { + "epoch": 0.15, + "grad_norm": 0.41015625, + "learning_rate": 0.00019876369875543803, + "loss": 1.1552, + "step": 695 + }, + { + "epoch": 0.15, + "grad_norm": 0.396484375, + "learning_rate": 0.00019870592532341989, + "loss": 1.1478, + "step": 700 + }, + { + "epoch": 0.15, + "grad_norm": 0.46484375, + "learning_rate": 0.0001986468414233065, + "loss": 1.2432, + "step": 705 + }, + { + "epoch": 0.15, + "grad_norm": 0.3984375, + "learning_rate": 0.00019858644783952466, + "loss": 1.136, + "step": 710 + }, + { + "epoch": 0.15, + "grad_norm": 3.15625, + "learning_rate": 0.000198524745373889, + "loss": 1.2197, + "step": 715 + }, + { + "epoch": 0.15, + "grad_norm": 0.6484375, + "learning_rate": 0.00019846173484559167, + "loss": 1.1506, + "step": 720 + }, + { + "epoch": 0.15, + "grad_norm": 0.51171875, + "learning_rate": 0.0001983974170911912, + "loss": 0.9831, + "step": 725 + }, + { + "epoch": 0.15, + "grad_norm": 0.37890625, + "learning_rate": 0.00019833179296460156, + "loss": 1.0984, + "step": 730 + }, + { + "epoch": 0.15, + "grad_norm": 0.353515625, + "learning_rate": 0.0001982648633370807, + "loss": 1.0827, + "step": 735 + }, + { + "epoch": 0.15, + "grad_norm": 1.2890625, + "learning_rate": 0.00019819662909721905, + "loss": 1.1374, + "step": 740 + }, + { + "epoch": 0.16, + "grad_norm": 0.38671875, + "learning_rate": 0.00019812709115092774, + "loss": 1.0391, + "step": 745 + }, + { + "epoch": 0.16, + "grad_norm": 0.35546875, + "learning_rate": 0.0001980562504214265, + "loss": 1.0482, + "step": 750 + }, + { + "epoch": 0.16, + "grad_norm": 0.435546875, + "learning_rate": 0.0001979841078492315, + "loss": 1.0314, + "step": 755 + }, + { + "epoch": 0.16, + "grad_norm": 0.44140625, + "learning_rate": 0.0001979106643921427, + "loss": 0.9726, + "step": 760 + }, + { + "epoch": 0.16, + "grad_norm": 28.875, + "learning_rate": 0.00019783592102523144, + "loss": 1.2156, + "step": 765 + }, + { + "epoch": 0.16, + "grad_norm": 0.4375, + "learning_rate": 0.00019775987874082714, + "loss": 1.1393, + "step": 770 + }, + { + "epoch": 0.16, + "grad_norm": 0.54296875, + "learning_rate": 0.0001976825385485043, + "loss": 0.9954, + "step": 775 + }, + { + "epoch": 0.16, + "grad_norm": 0.35546875, + "learning_rate": 0.00019760390147506907, + "loss": 1.1198, + "step": 780 + }, + { + "epoch": 0.16, + "grad_norm": 0.44921875, + "learning_rate": 0.00019752396856454575, + "loss": 1.0508, + "step": 785 + }, + { + "epoch": 0.16, + "grad_norm": 0.39453125, + "learning_rate": 0.00019744274087816262, + "loss": 1.0594, + "step": 790 + }, + { + "epoch": 0.17, + "grad_norm": 0.359375, + "learning_rate": 0.00019736021949433825, + "loss": 1.0685, + "step": 795 + }, + { + "epoch": 0.17, + "grad_norm": 0.7734375, + "learning_rate": 0.00019727640550866677, + "loss": 1.18, + "step": 800 + }, + { + "epoch": 0.17, + "grad_norm": 0.392578125, + "learning_rate": 0.00019719130003390364, + "loss": 1.1706, + "step": 805 + }, + { + "epoch": 0.17, + "grad_norm": 0.51171875, + "learning_rate": 0.0001971049041999507, + "loss": 1.2276, + "step": 810 + }, + { + "epoch": 0.17, + "grad_norm": 0.4140625, + "learning_rate": 0.00019701721915384125, + "loss": 1.2431, + "step": 815 + }, + { + "epoch": 0.17, + "grad_norm": 0.345703125, + "learning_rate": 0.00019692824605972486, + "loss": 1.0623, + "step": 820 + }, + { + "epoch": 0.17, + "grad_norm": 0.353515625, + "learning_rate": 0.00019683798609885174, + "loss": 1.062, + "step": 825 + }, + { + "epoch": 0.17, + "grad_norm": 0.78515625, + "learning_rate": 0.00019674644046955732, + "loss": 1.1808, + "step": 830 + }, + { + "epoch": 0.17, + "grad_norm": 0.36328125, + "learning_rate": 0.00019665361038724607, + "loss": 1.1256, + "step": 835 + }, + { + "epoch": 0.18, + "grad_norm": 0.39453125, + "learning_rate": 0.00019655949708437544, + "loss": 1.0927, + "step": 840 + }, + { + "epoch": 0.18, + "grad_norm": 0.40234375, + "learning_rate": 0.00019646410181043973, + "loss": 1.0819, + "step": 845 + }, + { + "epoch": 0.18, + "grad_norm": 0.34765625, + "learning_rate": 0.00019636742583195317, + "loss": 1.1013, + "step": 850 + }, + { + "epoch": 0.18, + "grad_norm": 0.4765625, + "learning_rate": 0.00019626947043243327, + "loss": 1.1833, + "step": 855 + }, + { + "epoch": 0.18, + "grad_norm": 0.6015625, + "learning_rate": 0.00019617023691238382, + "loss": 1.1131, + "step": 860 + }, + { + "epoch": 0.18, + "grad_norm": 0.388671875, + "learning_rate": 0.0001960697265892775, + "loss": 1.1258, + "step": 865 + }, + { + "epoch": 0.18, + "grad_norm": 0.384765625, + "learning_rate": 0.00019596794079753847, + "loss": 1.2949, + "step": 870 + }, + { + "epoch": 0.18, + "grad_norm": 0.384765625, + "learning_rate": 0.00019586488088852463, + "loss": 1.1795, + "step": 875 + }, + { + "epoch": 0.18, + "grad_norm": 0.640625, + "learning_rate": 0.00019576054823050972, + "loss": 1.1488, + "step": 880 + }, + { + "epoch": 0.18, + "grad_norm": 0.388671875, + "learning_rate": 0.00019565494420866502, + "loss": 1.1299, + "step": 885 + }, + { + "epoch": 0.19, + "grad_norm": 0.5, + "learning_rate": 0.00019554807022504123, + "loss": 1.1733, + "step": 890 + }, + { + "epoch": 0.19, + "grad_norm": 0.375, + "learning_rate": 0.0001954399276985495, + "loss": 1.0494, + "step": 895 + }, + { + "epoch": 0.19, + "grad_norm": 0.38671875, + "learning_rate": 0.00019533051806494285, + "loss": 1.1806, + "step": 900 + }, + { + "epoch": 0.19, + "grad_norm": 0.458984375, + "learning_rate": 0.00019521984277679708, + "loss": 1.2164, + "step": 905 + }, + { + "epoch": 0.19, + "grad_norm": 0.400390625, + "learning_rate": 0.00019510790330349136, + "loss": 1.0622, + "step": 910 + }, + { + "epoch": 0.19, + "grad_norm": 0.40625, + "learning_rate": 0.00019499470113118887, + "loss": 1.112, + "step": 915 + }, + { + "epoch": 0.19, + "grad_norm": 0.39453125, + "learning_rate": 0.00019488023776281695, + "loss": 1.1172, + "step": 920 + }, + { + "epoch": 0.19, + "grad_norm": 0.396484375, + "learning_rate": 0.00019476451471804715, + "loss": 1.1562, + "step": 925 + }, + { + "epoch": 0.19, + "grad_norm": 0.333984375, + "learning_rate": 0.00019464753353327525, + "loss": 1.1697, + "step": 930 + }, + { + "epoch": 0.2, + "grad_norm": 0.353515625, + "learning_rate": 0.00019452929576160057, + "loss": 1.0866, + "step": 935 + }, + { + "epoch": 0.2, + "grad_norm": 0.341796875, + "learning_rate": 0.00019440980297280556, + "loss": 1.1256, + "step": 940 + }, + { + "epoch": 0.2, + "grad_norm": 0.57421875, + "learning_rate": 0.00019428905675333486, + "loss": 1.0887, + "step": 945 + }, + { + "epoch": 0.2, + "grad_norm": 0.333984375, + "learning_rate": 0.00019416705870627427, + "loss": 1.0343, + "step": 950 + }, + { + "epoch": 0.2, + "grad_norm": 0.33203125, + "learning_rate": 0.0001940438104513295, + "loss": 1.1126, + "step": 955 + }, + { + "epoch": 0.2, + "grad_norm": 0.39453125, + "learning_rate": 0.00019391931362480456, + "loss": 1.114, + "step": 960 + }, + { + "epoch": 0.2, + "grad_norm": 0.419921875, + "learning_rate": 0.0001937935698795802, + "loss": 1.0697, + "step": 965 + }, + { + "epoch": 0.2, + "grad_norm": 0.62109375, + "learning_rate": 0.00019366658088509184, + "loss": 1.1501, + "step": 970 + }, + { + "epoch": 0.2, + "grad_norm": 0.427734375, + "learning_rate": 0.00019353834832730736, + "loss": 1.104, + "step": 975 + }, + { + "epoch": 0.2, + "grad_norm": 0.40234375, + "learning_rate": 0.00019340887390870487, + "loss": 1.1771, + "step": 980 + }, + { + "epoch": 0.21, + "grad_norm": 0.431640625, + "learning_rate": 0.00019327815934825005, + "loss": 1.0209, + "step": 985 + }, + { + "epoch": 0.21, + "grad_norm": 0.44140625, + "learning_rate": 0.00019314620638137325, + "loss": 1.0394, + "step": 990 + }, + { + "epoch": 0.21, + "grad_norm": 0.390625, + "learning_rate": 0.00019301301675994653, + "loss": 1.1564, + "step": 995 + }, + { + "epoch": 0.21, + "grad_norm": 0.373046875, + "learning_rate": 0.00019287859225226042, + "loss": 1.165, + "step": 1000 + }, + { + "epoch": 0.21, + "grad_norm": 0.365234375, + "learning_rate": 0.00019274293464300032, + "loss": 1.0667, + "step": 1005 + }, + { + "epoch": 0.21, + "grad_norm": 0.357421875, + "learning_rate": 0.000192606045733223, + "loss": 1.0807, + "step": 1010 + }, + { + "epoch": 0.21, + "grad_norm": 0.375, + "learning_rate": 0.00019246792734033244, + "loss": 1.1203, + "step": 1015 + }, + { + "epoch": 0.21, + "grad_norm": 0.353515625, + "learning_rate": 0.00019232858129805593, + "loss": 1.0797, + "step": 1020 + }, + { + "epoch": 0.21, + "grad_norm": 0.36328125, + "learning_rate": 0.00019218800945641963, + "loss": 1.1469, + "step": 1025 + }, + { + "epoch": 0.21, + "grad_norm": 0.341796875, + "learning_rate": 0.00019204621368172403, + "loss": 0.9757, + "step": 1030 + }, + { + "epoch": 0.22, + "grad_norm": 0.337890625, + "learning_rate": 0.00019190319585651907, + "loss": 1.056, + "step": 1035 + }, + { + "epoch": 0.22, + "grad_norm": 0.34375, + "learning_rate": 0.00019175895787957933, + "loss": 1.1211, + "step": 1040 + }, + { + "epoch": 0.22, + "grad_norm": 0.349609375, + "learning_rate": 0.0001916135016658786, + "loss": 1.1362, + "step": 1045 + }, + { + "epoch": 0.22, + "grad_norm": 0.361328125, + "learning_rate": 0.00019146682914656472, + "loss": 1.0445, + "step": 1050 + }, + { + "epoch": 0.22, + "grad_norm": 0.384765625, + "learning_rate": 0.0001913189422689337, + "loss": 1.1363, + "step": 1055 + }, + { + "epoch": 0.22, + "grad_norm": 0.384765625, + "learning_rate": 0.00019116984299640395, + "loss": 1.0689, + "step": 1060 + }, + { + "epoch": 0.22, + "grad_norm": 0.41015625, + "learning_rate": 0.00019101953330849031, + "loss": 1.1139, + "step": 1065 + }, + { + "epoch": 0.22, + "grad_norm": 0.51171875, + "learning_rate": 0.00019086801520077766, + "loss": 1.0314, + "step": 1070 + }, + { + "epoch": 0.22, + "grad_norm": 0.376953125, + "learning_rate": 0.00019071529068489436, + "loss": 1.0727, + "step": 1075 + }, + { + "epoch": 0.23, + "grad_norm": 0.384765625, + "learning_rate": 0.0001905613617884858, + "loss": 1.1662, + "step": 1080 + }, + { + "epoch": 0.23, + "grad_norm": 0.408203125, + "learning_rate": 0.00019040623055518712, + "loss": 1.2001, + "step": 1085 + }, + { + "epoch": 0.23, + "grad_norm": 0.376953125, + "learning_rate": 0.00019024989904459647, + "loss": 1.1899, + "step": 1090 + }, + { + "epoch": 0.23, + "grad_norm": 0.361328125, + "learning_rate": 0.00019009236933224734, + "loss": 1.1244, + "step": 1095 + }, + { + "epoch": 0.23, + "grad_norm": 0.412109375, + "learning_rate": 0.00018993364350958114, + "loss": 1.0512, + "step": 1100 + }, + { + "epoch": 0.23, + "grad_norm": 0.34375, + "learning_rate": 0.00018977372368391947, + "loss": 1.1879, + "step": 1105 + }, + { + "epoch": 0.23, + "grad_norm": 0.451171875, + "learning_rate": 0.0001896126119784361, + "loss": 1.1454, + "step": 1110 + }, + { + "epoch": 0.23, + "grad_norm": 0.69921875, + "learning_rate": 0.0001894503105321287, + "loss": 1.1091, + "step": 1115 + }, + { + "epoch": 0.23, + "grad_norm": 0.50390625, + "learning_rate": 0.00018928682149979063, + "loss": 1.1875, + "step": 1120 + }, + { + "epoch": 0.23, + "grad_norm": 0.3828125, + "learning_rate": 0.00018912214705198212, + "loss": 1.1465, + "step": 1125 + }, + { + "epoch": 0.24, + "grad_norm": 0.458984375, + "learning_rate": 0.00018895628937500166, + "loss": 1.0818, + "step": 1130 + }, + { + "epoch": 0.24, + "grad_norm": 0.40625, + "learning_rate": 0.0001887892506708568, + "loss": 1.1909, + "step": 1135 + }, + { + "epoch": 0.24, + "grad_norm": 0.4296875, + "learning_rate": 0.00018862103315723494, + "loss": 1.08, + "step": 1140 + }, + { + "epoch": 0.24, + "grad_norm": 0.443359375, + "learning_rate": 0.00018845163906747406, + "loss": 1.0751, + "step": 1145 + }, + { + "epoch": 0.24, + "grad_norm": 0.54296875, + "learning_rate": 0.0001882810706505328, + "loss": 1.1537, + "step": 1150 + }, + { + "epoch": 0.24, + "grad_norm": 0.349609375, + "learning_rate": 0.00018810933017096088, + "loss": 1.0934, + "step": 1155 + }, + { + "epoch": 0.24, + "grad_norm": 0.373046875, + "learning_rate": 0.00018793641990886876, + "loss": 1.029, + "step": 1160 + }, + { + "epoch": 0.24, + "grad_norm": 0.515625, + "learning_rate": 0.00018776234215989762, + "loss": 1.0162, + "step": 1165 + }, + { + "epoch": 0.24, + "grad_norm": 0.453125, + "learning_rate": 0.00018758709923518863, + "loss": 1.0271, + "step": 1170 + }, + { + "epoch": 0.25, + "grad_norm": 0.373046875, + "learning_rate": 0.0001874106934613526, + "loss": 1.1106, + "step": 1175 + }, + { + "epoch": 0.25, + "grad_norm": 0.373046875, + "learning_rate": 0.00018723312718043873, + "loss": 1.1233, + "step": 1180 + }, + { + "epoch": 0.25, + "grad_norm": 0.42578125, + "learning_rate": 0.00018705440274990371, + "loss": 1.0942, + "step": 1185 + }, + { + "epoch": 0.25, + "grad_norm": 0.38671875, + "learning_rate": 0.00018687452254258045, + "loss": 1.1519, + "step": 1190 + }, + { + "epoch": 0.25, + "grad_norm": 0.51171875, + "learning_rate": 0.00018669348894664642, + "loss": 1.1205, + "step": 1195 + }, + { + "epoch": 0.25, + "grad_norm": 0.39453125, + "learning_rate": 0.00018651130436559215, + "loss": 1.179, + "step": 1200 + }, + { + "epoch": 0.25, + "grad_norm": 0.376953125, + "learning_rate": 0.00018632797121818914, + "loss": 1.0658, + "step": 1205 + }, + { + "epoch": 0.25, + "grad_norm": 0.455078125, + "learning_rate": 0.00018614349193845785, + "loss": 1.048, + "step": 1210 + }, + { + "epoch": 0.25, + "grad_norm": 0.55859375, + "learning_rate": 0.00018595786897563529, + "loss": 1.2007, + "step": 1215 + }, + { + "epoch": 0.25, + "grad_norm": 0.89453125, + "learning_rate": 0.0001857711047941427, + "loss": 1.2782, + "step": 1220 + }, + { + "epoch": 0.26, + "grad_norm": 0.447265625, + "learning_rate": 0.00018558320187355253, + "loss": 1.0843, + "step": 1225 + }, + { + "epoch": 0.26, + "grad_norm": 0.365234375, + "learning_rate": 0.0001853941627085558, + "loss": 1.1152, + "step": 1230 + }, + { + "epoch": 0.26, + "grad_norm": 0.396484375, + "learning_rate": 0.00018520398980892888, + "loss": 1.1012, + "step": 1235 + }, + { + "epoch": 0.26, + "grad_norm": 0.39453125, + "learning_rate": 0.00018501268569950003, + "loss": 1.1181, + "step": 1240 + }, + { + "epoch": 0.26, + "grad_norm": 3.109375, + "learning_rate": 0.00018482025292011612, + "loss": 1.1313, + "step": 1245 + }, + { + "epoch": 0.26, + "grad_norm": 0.4140625, + "learning_rate": 0.0001846266940256088, + "loss": 1.0561, + "step": 1250 + }, + { + "epoch": 0.26, + "grad_norm": 0.4453125, + "learning_rate": 0.00018443201158576048, + "loss": 1.1315, + "step": 1255 + }, + { + "epoch": 0.26, + "grad_norm": 1.9453125, + "learning_rate": 0.00018423620818527035, + "loss": 1.0607, + "step": 1260 + }, + { + "epoch": 0.26, + "grad_norm": 0.380859375, + "learning_rate": 0.00018403928642372005, + "loss": 1.1792, + "step": 1265 + }, + { + "epoch": 0.27, + "grad_norm": 0.447265625, + "learning_rate": 0.0001838412489155391, + "loss": 0.975, + "step": 1270 + }, + { + "epoch": 0.27, + "grad_norm": 0.4140625, + "learning_rate": 0.00018364209828997027, + "loss": 1.1655, + "step": 1275 + }, + { + "epoch": 0.27, + "grad_norm": 0.41796875, + "learning_rate": 0.00018344183719103452, + "loss": 1.1981, + "step": 1280 + }, + { + "epoch": 0.27, + "grad_norm": 0.609375, + "learning_rate": 0.00018324046827749606, + "loss": 1.1521, + "step": 1285 + }, + { + "epoch": 0.27, + "grad_norm": 0.349609375, + "learning_rate": 0.000183037994222827, + "loss": 1.1028, + "step": 1290 + }, + { + "epoch": 0.27, + "grad_norm": 0.42578125, + "learning_rate": 0.00018283441771517182, + "loss": 1.1165, + "step": 1295 + }, + { + "epoch": 0.27, + "grad_norm": 0.4765625, + "learning_rate": 0.00018262974145731172, + "loss": 1.0332, + "step": 1300 + }, + { + "epoch": 0.27, + "grad_norm": 0.40234375, + "learning_rate": 0.0001824239681666287, + "loss": 1.1238, + "step": 1305 + }, + { + "epoch": 0.27, + "grad_norm": 0.392578125, + "learning_rate": 0.00018221710057506947, + "loss": 1.077, + "step": 1310 + }, + { + "epoch": 0.27, + "grad_norm": 0.419921875, + "learning_rate": 0.00018200914142910935, + "loss": 1.0885, + "step": 1315 + }, + { + "epoch": 0.28, + "grad_norm": 0.396484375, + "learning_rate": 0.00018180009348971548, + "loss": 1.1532, + "step": 1320 + }, + { + "epoch": 0.28, + "grad_norm": 0.45703125, + "learning_rate": 0.00018158995953231055, + "loss": 1.159, + "step": 1325 + }, + { + "epoch": 0.28, + "grad_norm": 0.384765625, + "learning_rate": 0.00018137874234673557, + "loss": 1.1303, + "step": 1330 + }, + { + "epoch": 0.28, + "grad_norm": 0.36328125, + "learning_rate": 0.00018116644473721314, + "loss": 0.9563, + "step": 1335 + }, + { + "epoch": 0.28, + "grad_norm": 0.408203125, + "learning_rate": 0.00018095306952231006, + "loss": 1.1481, + "step": 1340 + }, + { + "epoch": 0.28, + "grad_norm": 0.48828125, + "learning_rate": 0.00018073861953489992, + "loss": 1.1995, + "step": 1345 + }, + { + "epoch": 0.28, + "grad_norm": 0.40234375, + "learning_rate": 0.00018052309762212556, + "loss": 1.1704, + "step": 1350 + }, + { + "epoch": 0.28, + "grad_norm": 0.3515625, + "learning_rate": 0.00018030650664536122, + "loss": 1.0302, + "step": 1355 + }, + { + "epoch": 0.28, + "grad_norm": 0.376953125, + "learning_rate": 0.0001800888494801745, + "loss": 1.199, + "step": 1360 + }, + { + "epoch": 0.28, + "grad_norm": 0.384765625, + "learning_rate": 0.00017987012901628832, + "loss": 1.0257, + "step": 1365 + }, + { + "epoch": 0.29, + "grad_norm": 0.40625, + "learning_rate": 0.0001796503481575424, + "loss": 1.2543, + "step": 1370 + }, + { + "epoch": 0.29, + "grad_norm": 0.357421875, + "learning_rate": 0.00017942950982185484, + "loss": 1.0754, + "step": 1375 + }, + { + "epoch": 0.29, + "grad_norm": 0.412109375, + "learning_rate": 0.00017920761694118327, + "loss": 1.1201, + "step": 1380 + }, + { + "epoch": 0.29, + "grad_norm": 0.404296875, + "learning_rate": 0.000178984672461486, + "loss": 1.0574, + "step": 1385 + }, + { + "epoch": 0.29, + "grad_norm": 0.3515625, + "learning_rate": 0.0001787606793426829, + "loss": 1.1474, + "step": 1390 + }, + { + "epoch": 0.29, + "grad_norm": 0.96484375, + "learning_rate": 0.00017853564055861603, + "loss": 1.2119, + "step": 1395 + }, + { + "epoch": 0.29, + "grad_norm": 0.416015625, + "learning_rate": 0.0001783095590970103, + "loss": 1.1593, + "step": 1400 + }, + { + "epoch": 0.29, + "grad_norm": 0.365234375, + "learning_rate": 0.0001780824379594336, + "loss": 1.1803, + "step": 1405 + }, + { + "epoch": 0.29, + "grad_norm": 0.33984375, + "learning_rate": 0.00017785428016125717, + "loss": 1.0862, + "step": 1410 + }, + { + "epoch": 0.3, + "grad_norm": 0.3984375, + "learning_rate": 0.00017762508873161542, + "loss": 1.1009, + "step": 1415 + }, + { + "epoch": 0.3, + "grad_norm": 0.412109375, + "learning_rate": 0.00017739486671336575, + "loss": 1.1385, + "step": 1420 + }, + { + "epoch": 0.3, + "grad_norm": 0.484375, + "learning_rate": 0.0001771636171630482, + "loss": 1.1652, + "step": 1425 + }, + { + "epoch": 0.3, + "grad_norm": 0.365234375, + "learning_rate": 0.00017693134315084475, + "loss": 1.1657, + "step": 1430 + }, + { + "epoch": 0.3, + "grad_norm": 0.408203125, + "learning_rate": 0.0001766980477605387, + "loss": 1.0517, + "step": 1435 + }, + { + "epoch": 0.3, + "grad_norm": 0.408203125, + "learning_rate": 0.00017646373408947364, + "loss": 1.0722, + "step": 1440 + }, + { + "epoch": 0.3, + "grad_norm": 0.400390625, + "learning_rate": 0.00017622840524851235, + "loss": 1.1638, + "step": 1445 + }, + { + "epoch": 0.3, + "grad_norm": 0.365234375, + "learning_rate": 0.00017599206436199556, + "loss": 1.1001, + "step": 1450 + }, + { + "epoch": 0.3, + "grad_norm": 0.6171875, + "learning_rate": 0.0001757547145677003, + "loss": 1.0481, + "step": 1455 + }, + { + "epoch": 0.3, + "grad_norm": 0.9453125, + "learning_rate": 0.00017551635901679849, + "loss": 1.1408, + "step": 1460 + }, + { + "epoch": 0.31, + "grad_norm": 0.953125, + "learning_rate": 0.00017527700087381482, + "loss": 1.0477, + "step": 1465 + }, + { + "epoch": 0.31, + "grad_norm": 0.5625, + "learning_rate": 0.00017503664331658498, + "loss": 1.0721, + "step": 1470 + }, + { + "epoch": 0.31, + "grad_norm": 0.322265625, + "learning_rate": 0.00017479528953621336, + "loss": 1.1062, + "step": 1475 + }, + { + "epoch": 0.31, + "grad_norm": 0.39453125, + "learning_rate": 0.00017455294273703066, + "loss": 1.1635, + "step": 1480 + }, + { + "epoch": 0.31, + "grad_norm": 0.365234375, + "learning_rate": 0.00017430960613655142, + "loss": 0.9949, + "step": 1485 + }, + { + "epoch": 0.31, + "grad_norm": 0.40625, + "learning_rate": 0.0001740652829654312, + "loss": 1.156, + "step": 1490 + }, + { + "epoch": 0.31, + "grad_norm": 0.40625, + "learning_rate": 0.00017381997646742387, + "loss": 1.1216, + "step": 1495 + }, + { + "epoch": 0.31, + "grad_norm": 0.58203125, + "learning_rate": 0.00017357368989933834, + "loss": 1.1143, + "step": 1500 + }, + { + "epoch": 0.31, + "grad_norm": 0.396484375, + "learning_rate": 0.00017332642653099543, + "loss": 1.1209, + "step": 1505 + }, + { + "epoch": 0.32, + "grad_norm": 0.54296875, + "learning_rate": 0.00017307818964518444, + "loss": 1.0504, + "step": 1510 + }, + { + "epoch": 0.32, + "grad_norm": 0.392578125, + "learning_rate": 0.00017282898253761952, + "loss": 1.1089, + "step": 1515 + }, + { + "epoch": 0.32, + "grad_norm": 1.5546875, + "learning_rate": 0.00017257880851689604, + "loss": 1.0741, + "step": 1520 + }, + { + "epoch": 0.32, + "grad_norm": 0.357421875, + "learning_rate": 0.00017232767090444656, + "loss": 1.0457, + "step": 1525 + }, + { + "epoch": 0.32, + "grad_norm": 0.45703125, + "learning_rate": 0.00017207557303449672, + "loss": 1.1921, + "step": 1530 + }, + { + "epoch": 0.32, + "grad_norm": 0.34765625, + "learning_rate": 0.000171822518254021, + "loss": 1.1792, + "step": 1535 + }, + { + "epoch": 0.32, + "grad_norm": 1.0078125, + "learning_rate": 0.00017156850992269837, + "loss": 1.081, + "step": 1540 + }, + { + "epoch": 0.32, + "grad_norm": 0.369140625, + "learning_rate": 0.00017131355141286747, + "loss": 1.0944, + "step": 1545 + }, + { + "epoch": 0.32, + "grad_norm": 0.380859375, + "learning_rate": 0.00017105764610948212, + "loss": 1.0123, + "step": 1550 + }, + { + "epoch": 0.32, + "grad_norm": 0.484375, + "learning_rate": 0.00017080079741006618, + "loss": 0.9975, + "step": 1555 + }, + { + "epoch": 0.33, + "grad_norm": 0.39453125, + "learning_rate": 0.00017054300872466842, + "loss": 1.0664, + "step": 1560 + }, + { + "epoch": 0.33, + "grad_norm": 0.498046875, + "learning_rate": 0.00017028428347581746, + "loss": 1.108, + "step": 1565 + }, + { + "epoch": 0.33, + "grad_norm": 0.4140625, + "learning_rate": 0.00017002462509847606, + "loss": 1.0253, + "step": 1570 + }, + { + "epoch": 0.33, + "grad_norm": 0.376953125, + "learning_rate": 0.00016976403703999578, + "loss": 1.0861, + "step": 1575 + }, + { + "epoch": 0.33, + "grad_norm": 0.404296875, + "learning_rate": 0.00016950252276007102, + "loss": 1.065, + "step": 1580 + }, + { + "epoch": 0.33, + "grad_norm": 0.4375, + "learning_rate": 0.00016924008573069305, + "loss": 1.0213, + "step": 1585 + }, + { + "epoch": 0.33, + "grad_norm": 0.42578125, + "learning_rate": 0.00016897672943610417, + "loss": 1.1514, + "step": 1590 + }, + { + "epoch": 0.33, + "grad_norm": 0.322265625, + "learning_rate": 0.00016871245737275125, + "loss": 1.1526, + "step": 1595 + }, + { + "epoch": 0.33, + "grad_norm": 0.416015625, + "learning_rate": 0.00016844727304923931, + "loss": 1.1442, + "step": 1600 + }, + { + "epoch": 0.34, + "grad_norm": 0.392578125, + "learning_rate": 0.00016818117998628499, + "loss": 1.0245, + "step": 1605 + }, + { + "epoch": 0.34, + "grad_norm": 0.37109375, + "learning_rate": 0.00016791418171666983, + "loss": 1.0738, + "step": 1610 + }, + { + "epoch": 0.34, + "grad_norm": 0.369140625, + "learning_rate": 0.00016764628178519335, + "loss": 1.0517, + "step": 1615 + }, + { + "epoch": 0.34, + "grad_norm": 0.376953125, + "learning_rate": 0.0001673774837486259, + "loss": 1.2543, + "step": 1620 + }, + { + "epoch": 0.34, + "grad_norm": 0.404296875, + "learning_rate": 0.00016710779117566158, + "loss": 1.1085, + "step": 1625 + }, + { + "epoch": 0.34, + "grad_norm": 0.396484375, + "learning_rate": 0.00016683720764687076, + "loss": 1.1164, + "step": 1630 + }, + { + "epoch": 0.34, + "grad_norm": 0.36328125, + "learning_rate": 0.00016656573675465257, + "loss": 1.1121, + "step": 1635 + }, + { + "epoch": 0.34, + "grad_norm": 0.39453125, + "learning_rate": 0.00016629338210318723, + "loss": 1.1551, + "step": 1640 + }, + { + "epoch": 0.34, + "grad_norm": 0.40234375, + "learning_rate": 0.00016602014730838818, + "loss": 1.149, + "step": 1645 + }, + { + "epoch": 0.34, + "grad_norm": 0.390625, + "learning_rate": 0.00016574603599785412, + "loss": 1.0301, + "step": 1650 + }, + { + "epoch": 0.35, + "grad_norm": 0.357421875, + "learning_rate": 0.00016547105181082062, + "loss": 1.0695, + "step": 1655 + }, + { + "epoch": 0.35, + "grad_norm": 1.5546875, + "learning_rate": 0.00016519519839811224, + "loss": 1.0497, + "step": 1660 + }, + { + "epoch": 0.35, + "grad_norm": 0.75, + "learning_rate": 0.0001649184794220936, + "loss": 1.1484, + "step": 1665 + }, + { + "epoch": 0.35, + "grad_norm": 0.38671875, + "learning_rate": 0.00016464089855662111, + "loss": 1.056, + "step": 1670 + }, + { + "epoch": 0.35, + "grad_norm": 0.390625, + "learning_rate": 0.0001643624594869939, + "loss": 1.1911, + "step": 1675 + }, + { + "epoch": 0.35, + "grad_norm": 0.466796875, + "learning_rate": 0.00016408316590990513, + "loss": 1.0945, + "step": 1680 + }, + { + "epoch": 0.35, + "grad_norm": 0.40234375, + "learning_rate": 0.00016380302153339282, + "loss": 1.1104, + "step": 1685 + }, + { + "epoch": 0.35, + "grad_norm": 0.435546875, + "learning_rate": 0.0001635220300767906, + "loss": 1.0678, + "step": 1690 + }, + { + "epoch": 0.35, + "grad_norm": 0.59765625, + "learning_rate": 0.0001632401952706783, + "loss": 1.0883, + "step": 1695 + }, + { + "epoch": 0.35, + "grad_norm": 0.392578125, + "learning_rate": 0.00016295752085683258, + "loss": 1.221, + "step": 1700 + }, + { + "epoch": 0.36, + "grad_norm": 0.388671875, + "learning_rate": 0.00016267401058817709, + "loss": 1.1249, + "step": 1705 + }, + { + "epoch": 0.36, + "grad_norm": 0.474609375, + "learning_rate": 0.0001623896682287326, + "loss": 1.0804, + "step": 1710 + }, + { + "epoch": 0.36, + "grad_norm": 0.375, + "learning_rate": 0.0001621044975535673, + "loss": 1.0464, + "step": 1715 + }, + { + "epoch": 0.36, + "grad_norm": 0.41796875, + "learning_rate": 0.00016181850234874636, + "loss": 1.1579, + "step": 1720 + }, + { + "epoch": 0.36, + "grad_norm": 0.345703125, + "learning_rate": 0.00016153168641128192, + "loss": 1.0265, + "step": 1725 + }, + { + "epoch": 0.36, + "grad_norm": 0.33984375, + "learning_rate": 0.00016124405354908246, + "loss": 0.9793, + "step": 1730 + }, + { + "epoch": 0.36, + "grad_norm": 0.41015625, + "learning_rate": 0.00016095560758090245, + "loss": 1.1421, + "step": 1735 + }, + { + "epoch": 0.36, + "grad_norm": 0.439453125, + "learning_rate": 0.00016066635233629145, + "loss": 1.0974, + "step": 1740 + }, + { + "epoch": 0.36, + "grad_norm": 0.37109375, + "learning_rate": 0.0001603762916555435, + "loss": 1.0533, + "step": 1745 + }, + { + "epoch": 0.37, + "grad_norm": 0.380859375, + "learning_rate": 0.00016008542938964588, + "loss": 1.1346, + "step": 1750 + }, + { + "epoch": 0.37, + "grad_norm": 0.34375, + "learning_rate": 0.00015979376940022813, + "loss": 1.1334, + "step": 1755 + }, + { + "epoch": 0.37, + "grad_norm": 0.3515625, + "learning_rate": 0.00015950131555951086, + "loss": 1.0466, + "step": 1760 + }, + { + "epoch": 0.37, + "grad_norm": 0.357421875, + "learning_rate": 0.0001592080717502541, + "loss": 1.0547, + "step": 1765 + }, + { + "epoch": 0.37, + "grad_norm": 0.36328125, + "learning_rate": 0.000158914041865706, + "loss": 1.1026, + "step": 1770 + }, + { + "epoch": 0.37, + "grad_norm": 0.35546875, + "learning_rate": 0.00015861922980955093, + "loss": 0.9938, + "step": 1775 + }, + { + "epoch": 0.37, + "grad_norm": 0.314453125, + "learning_rate": 0.0001583236394958578, + "loss": 1.0709, + "step": 1780 + }, + { + "epoch": 0.37, + "grad_norm": 0.38671875, + "learning_rate": 0.0001580272748490281, + "loss": 1.2601, + "step": 1785 + }, + { + "epoch": 0.37, + "grad_norm": 0.396484375, + "learning_rate": 0.00015773013980374364, + "loss": 1.021, + "step": 1790 + }, + { + "epoch": 0.37, + "grad_norm": 0.404296875, + "learning_rate": 0.0001574322383049145, + "loss": 0.9888, + "step": 1795 + }, + { + "epoch": 0.38, + "grad_norm": 0.396484375, + "learning_rate": 0.0001571335743076265, + "loss": 1.0506, + "step": 1800 + }, + { + "epoch": 0.38, + "grad_norm": 0.388671875, + "learning_rate": 0.00015683415177708887, + "loss": 1.0916, + "step": 1805 + }, + { + "epoch": 0.38, + "grad_norm": 0.39453125, + "learning_rate": 0.0001565339746885814, + "loss": 1.068, + "step": 1810 + }, + { + "epoch": 0.38, + "grad_norm": 0.59375, + "learning_rate": 0.00015623304702740178, + "loss": 1.1657, + "step": 1815 + }, + { + "epoch": 0.38, + "grad_norm": 0.357421875, + "learning_rate": 0.00015593137278881273, + "loss": 0.9727, + "step": 1820 + }, + { + "epoch": 0.38, + "grad_norm": 0.40625, + "learning_rate": 0.0001556289559779888, + "loss": 1.1467, + "step": 1825 + }, + { + "epoch": 0.38, + "grad_norm": 0.37109375, + "learning_rate": 0.0001553258006099633, + "loss": 1.1254, + "step": 1830 + }, + { + "epoch": 0.38, + "grad_norm": 0.435546875, + "learning_rate": 0.00015502191070957514, + "loss": 1.0301, + "step": 1835 + }, + { + "epoch": 0.38, + "grad_norm": 0.42578125, + "learning_rate": 0.00015471729031141503, + "loss": 1.0504, + "step": 1840 + }, + { + "epoch": 0.39, + "grad_norm": 0.369140625, + "learning_rate": 0.0001544119434597723, + "loss": 1.1046, + "step": 1845 + }, + { + "epoch": 0.39, + "grad_norm": 0.55078125, + "learning_rate": 0.00015410587420858088, + "loss": 1.065, + "step": 1850 + }, + { + "epoch": 0.39, + "grad_norm": 0.373046875, + "learning_rate": 0.00015379908662136574, + "loss": 1.0126, + "step": 1855 + }, + { + "epoch": 0.39, + "grad_norm": 0.37109375, + "learning_rate": 0.00015349158477118884, + "loss": 1.0121, + "step": 1860 + }, + { + "epoch": 0.39, + "grad_norm": 0.419921875, + "learning_rate": 0.00015318337274059491, + "loss": 1.1957, + "step": 1865 + }, + { + "epoch": 0.39, + "grad_norm": 0.55078125, + "learning_rate": 0.0001528744546215575, + "loss": 1.0178, + "step": 1870 + }, + { + "epoch": 0.39, + "grad_norm": 0.396484375, + "learning_rate": 0.0001525648345154245, + "loss": 1.1288, + "step": 1875 + }, + { + "epoch": 0.39, + "grad_norm": 0.48046875, + "learning_rate": 0.00015225451653286376, + "loss": 1.2488, + "step": 1880 + }, + { + "epoch": 0.39, + "grad_norm": 0.357421875, + "learning_rate": 0.00015194350479380843, + "loss": 1.0424, + "step": 1885 + }, + { + "epoch": 0.39, + "grad_norm": 0.369140625, + "learning_rate": 0.00015163180342740232, + "loss": 1.1758, + "step": 1890 + }, + { + "epoch": 0.4, + "grad_norm": 0.36328125, + "learning_rate": 0.00015131941657194506, + "loss": 1.0522, + "step": 1895 + }, + { + "epoch": 0.4, + "grad_norm": 0.396484375, + "learning_rate": 0.00015100634837483728, + "loss": 1.0631, + "step": 1900 + }, + { + "epoch": 0.4, + "grad_norm": 2.515625, + "learning_rate": 0.0001506926029925252, + "loss": 1.0577, + "step": 1905 + }, + { + "epoch": 0.4, + "grad_norm": 0.400390625, + "learning_rate": 0.00015037818459044588, + "loss": 1.0577, + "step": 1910 + }, + { + "epoch": 0.4, + "grad_norm": 0.384765625, + "learning_rate": 0.0001500630973429717, + "loss": 1.185, + "step": 1915 + }, + { + "epoch": 0.4, + "grad_norm": 0.349609375, + "learning_rate": 0.00014974734543335487, + "loss": 1.0734, + "step": 1920 + }, + { + "epoch": 0.4, + "grad_norm": 0.353515625, + "learning_rate": 0.00014943093305367204, + "loss": 1.0677, + "step": 1925 + }, + { + "epoch": 0.4, + "grad_norm": 0.458984375, + "learning_rate": 0.00014911386440476858, + "loss": 1.1153, + "step": 1930 + }, + { + "epoch": 0.4, + "grad_norm": 0.3671875, + "learning_rate": 0.0001487961436962028, + "loss": 1.115, + "step": 1935 + }, + { + "epoch": 0.4, + "grad_norm": 0.423828125, + "learning_rate": 0.00014847777514619007, + "loss": 1.181, + "step": 1940 + }, + { + "epoch": 0.41, + "grad_norm": 0.3828125, + "learning_rate": 0.00014815876298154683, + "loss": 1.0703, + "step": 1945 + }, + { + "epoch": 0.41, + "grad_norm": 0.380859375, + "learning_rate": 0.00014783911143763443, + "loss": 1.0116, + "step": 1950 + }, + { + "epoch": 0.41, + "grad_norm": 0.357421875, + "learning_rate": 0.00014751882475830301, + "loss": 1.0731, + "step": 1955 + }, + { + "epoch": 0.41, + "grad_norm": 0.376953125, + "learning_rate": 0.00014719790719583498, + "loss": 0.9598, + "step": 1960 + }, + { + "epoch": 0.41, + "grad_norm": 0.431640625, + "learning_rate": 0.00014687636301088873, + "loss": 1.0058, + "step": 1965 + }, + { + "epoch": 0.41, + "grad_norm": 0.390625, + "learning_rate": 0.00014655419647244196, + "loss": 1.1702, + "step": 1970 + }, + { + "epoch": 0.41, + "grad_norm": 0.392578125, + "learning_rate": 0.00014623141185773507, + "loss": 1.1904, + "step": 1975 + }, + { + "epoch": 0.41, + "grad_norm": 0.359375, + "learning_rate": 0.0001459080134522143, + "loss": 1.0873, + "step": 1980 + }, + { + "epoch": 0.41, + "grad_norm": 0.373046875, + "learning_rate": 0.00014558400554947495, + "loss": 1.1216, + "step": 1985 + }, + { + "epoch": 0.42, + "grad_norm": 0.349609375, + "learning_rate": 0.00014525939245120422, + "loss": 0.9945, + "step": 1990 + }, + { + "epoch": 0.42, + "grad_norm": 0.40625, + "learning_rate": 0.00014493417846712424, + "loss": 1.1011, + "step": 1995 + }, + { + "epoch": 0.42, + "grad_norm": 0.373046875, + "learning_rate": 0.00014460836791493477, + "loss": 1.0888, + "step": 2000 + }, + { + "epoch": 0.42, + "grad_norm": 0.390625, + "learning_rate": 0.00014428196512025586, + "loss": 1.0757, + "step": 2005 + }, + { + "epoch": 0.42, + "grad_norm": 0.412109375, + "learning_rate": 0.0001439549744165706, + "loss": 1.069, + "step": 2010 + }, + { + "epoch": 0.42, + "grad_norm": 0.359375, + "learning_rate": 0.00014362740014516723, + "loss": 1.1268, + "step": 2015 + }, + { + "epoch": 0.42, + "grad_norm": 0.375, + "learning_rate": 0.0001432992466550819, + "loss": 1.0224, + "step": 2020 + }, + { + "epoch": 0.42, + "grad_norm": 0.353515625, + "learning_rate": 0.0001429705183030407, + "loss": 1.0092, + "step": 2025 + }, + { + "epoch": 0.42, + "grad_norm": 0.376953125, + "learning_rate": 0.00014264121945340187, + "loss": 1.102, + "step": 2030 + }, + { + "epoch": 0.42, + "grad_norm": 0.353515625, + "learning_rate": 0.00014231135447809778, + "loss": 1.0176, + "step": 2035 + }, + { + "epoch": 0.43, + "grad_norm": 0.380859375, + "learning_rate": 0.00014198092775657704, + "loss": 1.1124, + "step": 2040 + }, + { + "epoch": 0.43, + "grad_norm": 0.578125, + "learning_rate": 0.0001416499436757463, + "loss": 1.2026, + "step": 2045 + }, + { + "epoch": 0.43, + "grad_norm": 0.359375, + "learning_rate": 0.00014131840662991196, + "loss": 1.1109, + "step": 2050 + }, + { + "epoch": 0.43, + "grad_norm": 0.421875, + "learning_rate": 0.0001409863210207218, + "loss": 1.1017, + "step": 2055 + }, + { + "epoch": 0.43, + "grad_norm": 0.4140625, + "learning_rate": 0.0001406536912571066, + "loss": 0.9984, + "step": 2060 + }, + { + "epoch": 0.43, + "grad_norm": 0.4375, + "learning_rate": 0.0001403205217552217, + "loss": 1.0476, + "step": 2065 + }, + { + "epoch": 0.43, + "grad_norm": 0.404296875, + "learning_rate": 0.00013998681693838818, + "loss": 1.17, + "step": 2070 + }, + { + "epoch": 0.43, + "grad_norm": 0.328125, + "learning_rate": 0.0001396525812370342, + "loss": 1.0703, + "step": 2075 + }, + { + "epoch": 0.43, + "grad_norm": 0.39453125, + "learning_rate": 0.00013931781908863635, + "loss": 1.0546, + "step": 2080 + }, + { + "epoch": 0.44, + "grad_norm": 0.365234375, + "learning_rate": 0.00013898253493766042, + "loss": 1.0501, + "step": 2085 + }, + { + "epoch": 0.44, + "grad_norm": 0.369140625, + "learning_rate": 0.00013864673323550264, + "loss": 0.9337, + "step": 2090 + }, + { + "epoch": 0.44, + "grad_norm": 0.474609375, + "learning_rate": 0.00013831041844043047, + "loss": 0.9993, + "step": 2095 + }, + { + "epoch": 0.44, + "grad_norm": 0.3828125, + "learning_rate": 0.00013797359501752348, + "loss": 1.0086, + "step": 2100 + }, + { + "epoch": 0.44, + "grad_norm": 0.380859375, + "learning_rate": 0.00013763626743861404, + "loss": 1.0966, + "step": 2105 + }, + { + "epoch": 0.44, + "grad_norm": 0.36328125, + "learning_rate": 0.00013729844018222788, + "loss": 1.1107, + "step": 2110 + }, + { + "epoch": 0.44, + "grad_norm": 0.353515625, + "learning_rate": 0.00013696011773352474, + "loss": 1.0831, + "step": 2115 + }, + { + "epoch": 0.44, + "grad_norm": 0.50390625, + "learning_rate": 0.00013662130458423876, + "loss": 1.0856, + "step": 2120 + }, + { + "epoch": 0.44, + "grad_norm": 0.3984375, + "learning_rate": 0.00013628200523261888, + "loss": 1.0221, + "step": 2125 + }, + { + "epoch": 0.44, + "grad_norm": 0.40234375, + "learning_rate": 0.00013594222418336906, + "loss": 1.118, + "step": 2130 + }, + { + "epoch": 0.45, + "grad_norm": 0.81640625, + "learning_rate": 0.00013560196594758855, + "loss": 1.1019, + "step": 2135 + }, + { + "epoch": 0.45, + "grad_norm": 0.408203125, + "learning_rate": 0.00013526123504271196, + "loss": 1.1237, + "step": 2140 + }, + { + "epoch": 0.45, + "grad_norm": 0.337890625, + "learning_rate": 0.00013492003599244922, + "loss": 1.005, + "step": 2145 + }, + { + "epoch": 0.45, + "grad_norm": 0.4375, + "learning_rate": 0.0001345783733267257, + "loss": 1.0955, + "step": 2150 + }, + { + "epoch": 0.45, + "grad_norm": 0.33203125, + "learning_rate": 0.0001342362515816219, + "loss": 1.1209, + "step": 2155 + }, + { + "epoch": 0.45, + "grad_norm": 0.365234375, + "learning_rate": 0.00013389367529931324, + "loss": 0.9948, + "step": 2160 + }, + { + "epoch": 0.45, + "grad_norm": 0.361328125, + "learning_rate": 0.00013355064902800988, + "loss": 1.0733, + "step": 2165 + }, + { + "epoch": 0.45, + "grad_norm": 0.39453125, + "learning_rate": 0.00013320717732189614, + "loss": 1.119, + "step": 2170 + }, + { + "epoch": 0.45, + "grad_norm": 0.35546875, + "learning_rate": 0.0001328632647410703, + "loss": 0.9886, + "step": 2175 + }, + { + "epoch": 0.46, + "grad_norm": 0.37890625, + "learning_rate": 0.0001325189158514838, + "loss": 1.1871, + "step": 2180 + }, + { + "epoch": 0.46, + "grad_norm": 0.353515625, + "learning_rate": 0.0001321741352248807, + "loss": 1.0246, + "step": 2185 + }, + { + "epoch": 0.46, + "grad_norm": 0.392578125, + "learning_rate": 0.00013182892743873715, + "loss": 1.1359, + "step": 2190 + }, + { + "epoch": 0.46, + "grad_norm": 0.361328125, + "learning_rate": 0.00013148329707620042, + "loss": 1.0841, + "step": 2195 + }, + { + "epoch": 0.46, + "grad_norm": 0.41015625, + "learning_rate": 0.00013113724872602798, + "loss": 1.0725, + "step": 2200 + }, + { + "epoch": 0.46, + "grad_norm": 0.37890625, + "learning_rate": 0.00013079078698252687, + "loss": 1.0312, + "step": 2205 + }, + { + "epoch": 0.46, + "grad_norm": 0.3671875, + "learning_rate": 0.00013044391644549252, + "loss": 1.0211, + "step": 2210 + }, + { + "epoch": 0.46, + "grad_norm": 0.60546875, + "learning_rate": 0.00013009664172014765, + "loss": 1.0574, + "step": 2215 + }, + { + "epoch": 0.46, + "grad_norm": 0.37890625, + "learning_rate": 0.00012974896741708115, + "loss": 1.0335, + "step": 2220 + }, + { + "epoch": 0.46, + "grad_norm": 0.439453125, + "learning_rate": 0.00012940089815218704, + "loss": 1.1947, + "step": 2225 + }, + { + "epoch": 0.47, + "grad_norm": 0.640625, + "learning_rate": 0.00012905243854660286, + "loss": 1.0073, + "step": 2230 + }, + { + "epoch": 0.47, + "grad_norm": 0.333984375, + "learning_rate": 0.00012870359322664872, + "loss": 1.0853, + "step": 2235 + }, + { + "epoch": 0.47, + "grad_norm": 0.34375, + "learning_rate": 0.00012835436682376544, + "loss": 1.0972, + "step": 2240 + }, + { + "epoch": 0.47, + "grad_norm": 0.423828125, + "learning_rate": 0.0001280047639744534, + "loss": 1.0882, + "step": 2245 + }, + { + "epoch": 0.47, + "grad_norm": 0.40234375, + "learning_rate": 0.00012765478932021094, + "loss": 1.0716, + "step": 2250 + }, + { + "epoch": 0.47, + "grad_norm": 0.373046875, + "learning_rate": 0.00012730444750747247, + "loss": 1.1108, + "step": 2255 + }, + { + "epoch": 0.47, + "grad_norm": 0.4453125, + "learning_rate": 0.00012695374318754716, + "loss": 1.0067, + "step": 2260 + }, + { + "epoch": 0.47, + "grad_norm": 0.37109375, + "learning_rate": 0.00012660268101655686, + "loss": 1.0883, + "step": 2265 + }, + { + "epoch": 0.47, + "grad_norm": 0.353515625, + "learning_rate": 0.00012625126565537456, + "loss": 1.1754, + "step": 2270 + }, + { + "epoch": 0.47, + "grad_norm": 0.3515625, + "learning_rate": 0.00012589950176956227, + "loss": 1.1861, + "step": 2275 + }, + { + "epoch": 0.48, + "grad_norm": 0.60546875, + "learning_rate": 0.00012554739402930923, + "loss": 1.0978, + "step": 2280 + }, + { + "epoch": 0.48, + "grad_norm": 0.3828125, + "learning_rate": 0.00012519494710936982, + "loss": 0.9778, + "step": 2285 + }, + { + "epoch": 0.48, + "grad_norm": 0.46875, + "learning_rate": 0.00012484216568900162, + "loss": 1.0441, + "step": 2290 + }, + { + "epoch": 0.48, + "grad_norm": 0.365234375, + "learning_rate": 0.0001244890544519031, + "loss": 1.0973, + "step": 2295 + }, + { + "epoch": 0.48, + "grad_norm": 0.337890625, + "learning_rate": 0.00012413561808615159, + "loss": 1.1415, + "step": 2300 + }, + { + "epoch": 0.48, + "grad_norm": 0.376953125, + "learning_rate": 0.000123781861284141, + "loss": 1.1161, + "step": 2305 + }, + { + "epoch": 0.48, + "grad_norm": 0.36328125, + "learning_rate": 0.0001234277887425195, + "loss": 1.0397, + "step": 2310 + }, + { + "epoch": 0.48, + "grad_norm": 0.365234375, + "learning_rate": 0.00012307340516212712, + "loss": 1.1378, + "step": 2315 + }, + { + "epoch": 0.48, + "grad_norm": 0.353515625, + "learning_rate": 0.00012271871524793348, + "loss": 1.0375, + "step": 2320 + }, + { + "epoch": 0.49, + "grad_norm": 0.3671875, + "learning_rate": 0.0001223637237089752, + "loss": 1.1031, + "step": 2325 + }, + { + "epoch": 0.49, + "grad_norm": 0.3671875, + "learning_rate": 0.0001220084352582933, + "loss": 1.0545, + "step": 2330 + }, + { + "epoch": 0.49, + "grad_norm": 0.578125, + "learning_rate": 0.000121652854612871, + "loss": 1.0811, + "step": 2335 + }, + { + "epoch": 0.49, + "grad_norm": 0.482421875, + "learning_rate": 0.0001212969864935706, + "loss": 0.9833, + "step": 2340 + }, + { + "epoch": 0.49, + "grad_norm": 0.353515625, + "learning_rate": 0.00012094083562507119, + "loss": 1.0778, + "step": 2345 + }, + { + "epoch": 0.49, + "grad_norm": 0.37890625, + "learning_rate": 0.00012058440673580572, + "loss": 0.9276, + "step": 2350 + }, + { + "epoch": 0.49, + "grad_norm": 0.408203125, + "learning_rate": 0.00012022770455789824, + "loss": 1.0733, + "step": 2355 + }, + { + "epoch": 0.49, + "grad_norm": 0.92578125, + "learning_rate": 0.00011987073382710123, + "loss": 1.0961, + "step": 2360 + }, + { + "epoch": 0.49, + "grad_norm": 0.388671875, + "learning_rate": 0.00011951349928273253, + "loss": 1.094, + "step": 2365 + }, + { + "epoch": 0.49, + "grad_norm": 0.40625, + "learning_rate": 0.00011915600566761243, + "loss": 1.1453, + "step": 2370 + }, + { + "epoch": 0.5, + "grad_norm": 0.55859375, + "learning_rate": 0.00011879825772800088, + "loss": 1.1801, + "step": 2375 + }, + { + "epoch": 0.5, + "grad_norm": 0.48046875, + "learning_rate": 0.0001184402602135344, + "loss": 1.024, + "step": 2380 + }, + { + "epoch": 0.5, + "grad_norm": 0.369140625, + "learning_rate": 0.00011808201787716284, + "loss": 1.1035, + "step": 2385 + }, + { + "epoch": 0.5, + "grad_norm": 0.37890625, + "learning_rate": 0.00011772353547508656, + "loss": 1.0209, + "step": 2390 + }, + { + "epoch": 0.5, + "grad_norm": 0.87890625, + "learning_rate": 0.00011736481776669306, + "loss": 0.9905, + "step": 2395 + }, + { + "epoch": 0.5, + "grad_norm": 0.390625, + "learning_rate": 0.00011700586951449393, + "loss": 1.1018, + "step": 2400 + }, + { + "epoch": 0.5, + "grad_norm": 0.42578125, + "learning_rate": 0.00011664669548406153, + "loss": 1.0384, + "step": 2405 + }, + { + "epoch": 0.5, + "grad_norm": 0.369140625, + "learning_rate": 0.00011628730044396583, + "loss": 1.0272, + "step": 2410 + }, + { + "epoch": 0.5, + "grad_norm": 0.36328125, + "learning_rate": 0.00011592768916571095, + "loss": 0.9559, + "step": 2415 + }, + { + "epoch": 0.51, + "grad_norm": 0.3671875, + "learning_rate": 0.00011556786642367201, + "loss": 1.0277, + "step": 2420 + }, + { + "epoch": 0.51, + "grad_norm": 0.376953125, + "learning_rate": 0.00011520783699503148, + "loss": 1.0187, + "step": 2425 + }, + { + "epoch": 0.51, + "grad_norm": 0.37890625, + "learning_rate": 0.00011484760565971602, + "loss": 0.995, + "step": 2430 + }, + { + "epoch": 0.51, + "grad_norm": 0.333984375, + "learning_rate": 0.00011448717720033283, + "loss": 1.0452, + "step": 2435 + }, + { + "epoch": 0.51, + "grad_norm": 0.3828125, + "learning_rate": 0.0001141265564021063, + "loss": 1.0964, + "step": 2440 + }, + { + "epoch": 0.51, + "grad_norm": 0.380859375, + "learning_rate": 0.0001137657480528143, + "loss": 1.2053, + "step": 2445 + }, + { + "epoch": 0.51, + "grad_norm": 0.6875, + "learning_rate": 0.00011340475694272478, + "loss": 1.0627, + "step": 2450 + }, + { + "epoch": 0.51, + "grad_norm": 0.455078125, + "learning_rate": 0.00011304358786453218, + "loss": 1.0991, + "step": 2455 + }, + { + "epoch": 0.51, + "grad_norm": 0.3515625, + "learning_rate": 0.00011268224561329354, + "loss": 1.1173, + "step": 2460 + }, + { + "epoch": 0.51, + "grad_norm": 0.3828125, + "learning_rate": 0.00011232073498636526, + "loss": 1.1316, + "step": 2465 + }, + { + "epoch": 0.52, + "grad_norm": 0.39453125, + "learning_rate": 0.000111959060783339, + "loss": 1.0399, + "step": 2470 + }, + { + "epoch": 0.52, + "grad_norm": 0.3828125, + "learning_rate": 0.00011159722780597826, + "loss": 1.1867, + "step": 2475 + }, + { + "epoch": 0.52, + "grad_norm": 0.365234375, + "learning_rate": 0.0001112352408581544, + "loss": 1.039, + "step": 2480 + }, + { + "epoch": 0.52, + "grad_norm": 0.341796875, + "learning_rate": 0.00011087310474578305, + "loss": 1.0116, + "step": 2485 + }, + { + "epoch": 0.52, + "grad_norm": 0.400390625, + "learning_rate": 0.00011051082427676017, + "loss": 1.0806, + "step": 2490 + }, + { + "epoch": 0.52, + "grad_norm": 0.365234375, + "learning_rate": 0.00011014840426089829, + "loss": 1.0751, + "step": 2495 + }, + { + "epoch": 0.52, + "grad_norm": 0.408203125, + "learning_rate": 0.00010978584950986264, + "loss": 1.1224, + "step": 2500 + }, + { + "epoch": 0.52, + "grad_norm": 0.353515625, + "learning_rate": 0.00010942316483710722, + "loss": 1.07, + "step": 2505 + }, + { + "epoch": 0.52, + "grad_norm": 0.390625, + "learning_rate": 0.00010906035505781101, + "loss": 1.059, + "step": 2510 + }, + { + "epoch": 0.52, + "grad_norm": 0.376953125, + "learning_rate": 0.00010869742498881388, + "loss": 1.2009, + "step": 2515 + }, + { + "epoch": 0.53, + "grad_norm": 0.40625, + "learning_rate": 0.00010833437944855278, + "loss": 1.0943, + "step": 2520 + }, + { + "epoch": 0.53, + "grad_norm": 0.369140625, + "learning_rate": 0.00010797122325699769, + "loss": 1.059, + "step": 2525 + }, + { + "epoch": 0.53, + "grad_norm": 0.359375, + "learning_rate": 0.0001076079612355877, + "loss": 1.1079, + "step": 2530 + }, + { + "epoch": 0.53, + "grad_norm": 0.359375, + "learning_rate": 0.00010724459820716683, + "loss": 0.9847, + "step": 2535 + }, + { + "epoch": 0.53, + "grad_norm": 0.3515625, + "learning_rate": 0.00010688113899592025, + "loss": 1.0511, + "step": 2540 + }, + { + "epoch": 0.53, + "grad_norm": 0.396484375, + "learning_rate": 0.00010651758842731003, + "loss": 1.1623, + "step": 2545 + }, + { + "epoch": 0.53, + "grad_norm": 0.3671875, + "learning_rate": 0.00010615395132801118, + "loss": 1.026, + "step": 2550 + }, + { + "epoch": 0.53, + "grad_norm": 0.3671875, + "learning_rate": 0.00010579023252584748, + "loss": 1.0156, + "step": 2555 + }, + { + "epoch": 0.53, + "grad_norm": 0.314453125, + "learning_rate": 0.00010542643684972748, + "loss": 1.112, + "step": 2560 + }, + { + "epoch": 0.54, + "grad_norm": 0.373046875, + "learning_rate": 0.00010506256912958037, + "loss": 1.0403, + "step": 2565 + }, + { + "epoch": 0.54, + "grad_norm": 0.357421875, + "learning_rate": 0.00010469863419629176, + "loss": 1.0756, + "step": 2570 + }, + { + "epoch": 0.54, + "grad_norm": 0.365234375, + "learning_rate": 0.00010433463688163969, + "loss": 1.0707, + "step": 2575 + }, + { + "epoch": 0.54, + "grad_norm": 0.359375, + "learning_rate": 0.0001039705820182303, + "loss": 1.0884, + "step": 2580 + }, + { + "epoch": 0.54, + "grad_norm": 0.373046875, + "learning_rate": 0.00010360647443943392, + "loss": 1.1053, + "step": 2585 + }, + { + "epoch": 0.54, + "grad_norm": 0.37890625, + "learning_rate": 0.00010324231897932068, + "loss": 1.034, + "step": 2590 + }, + { + "epoch": 0.54, + "grad_norm": 0.451171875, + "learning_rate": 0.00010287812047259632, + "loss": 1.1258, + "step": 2595 + }, + { + "epoch": 0.54, + "grad_norm": 1.4921875, + "learning_rate": 0.00010251388375453827, + "loss": 1.0579, + "step": 2600 + }, + { + "epoch": 0.54, + "grad_norm": 0.3828125, + "learning_rate": 0.00010214961366093114, + "loss": 1.1272, + "step": 2605 + }, + { + "epoch": 0.54, + "grad_norm": 0.341796875, + "learning_rate": 0.00010178531502800266, + "loss": 1.0785, + "step": 2610 + }, + { + "epoch": 0.55, + "grad_norm": 0.79296875, + "learning_rate": 0.00010142099269235952, + "loss": 1.0422, + "step": 2615 + }, + { + "epoch": 0.55, + "grad_norm": 0.39453125, + "learning_rate": 0.00010105665149092305, + "loss": 1.1162, + "step": 2620 + }, + { + "epoch": 0.55, + "grad_norm": 0.349609375, + "learning_rate": 0.00010069229626086509, + "loss": 0.9534, + "step": 2625 + }, + { + "epoch": 0.55, + "grad_norm": 0.3359375, + "learning_rate": 0.0001003279318395437, + "loss": 1.0155, + "step": 2630 + }, + { + "epoch": 0.55, + "grad_norm": 0.56640625, + "learning_rate": 9.996356306443895e-05, + "loss": 1.1337, + "step": 2635 + }, + { + "epoch": 0.55, + "grad_norm": 0.34765625, + "learning_rate": 9.959919477308878e-05, + "loss": 1.0822, + "step": 2640 + }, + { + "epoch": 0.55, + "grad_norm": 0.5234375, + "learning_rate": 9.923483180302463e-05, + "loss": 1.1467, + "step": 2645 + }, + { + "epoch": 0.55, + "grad_norm": 0.392578125, + "learning_rate": 9.887047899170735e-05, + "loss": 1.0377, + "step": 2650 + }, + { + "epoch": 0.55, + "grad_norm": 0.349609375, + "learning_rate": 9.850614117646289e-05, + "loss": 1.0582, + "step": 2655 + }, + { + "epoch": 0.56, + "grad_norm": 0.37890625, + "learning_rate": 9.814182319441814e-05, + "loss": 1.1596, + "step": 2660 + }, + { + "epoch": 0.56, + "grad_norm": 0.3828125, + "learning_rate": 9.777752988243659e-05, + "loss": 1.1117, + "step": 2665 + }, + { + "epoch": 0.56, + "grad_norm": 0.490234375, + "learning_rate": 9.74132660770543e-05, + "loss": 1.0621, + "step": 2670 + }, + { + "epoch": 0.56, + "grad_norm": 0.39453125, + "learning_rate": 9.704903661441553e-05, + "loss": 1.0325, + "step": 2675 + }, + { + "epoch": 0.56, + "grad_norm": 0.37890625, + "learning_rate": 9.668484633020859e-05, + "loss": 1.1046, + "step": 2680 + }, + { + "epoch": 0.56, + "grad_norm": 0.484375, + "learning_rate": 9.632070005960164e-05, + "loss": 1.1527, + "step": 2685 + }, + { + "epoch": 0.56, + "grad_norm": 0.369140625, + "learning_rate": 9.595660263717852e-05, + "loss": 1.0448, + "step": 2690 + }, + { + "epoch": 0.56, + "grad_norm": 0.384765625, + "learning_rate": 9.559255889687454e-05, + "loss": 1.0799, + "step": 2695 + }, + { + "epoch": 0.56, + "grad_norm": 0.392578125, + "learning_rate": 9.522857367191221e-05, + "loss": 1.0383, + "step": 2700 + }, + { + "epoch": 0.56, + "grad_norm": 0.322265625, + "learning_rate": 9.486465179473729e-05, + "loss": 0.9467, + "step": 2705 + }, + { + "epoch": 0.57, + "grad_norm": 0.3671875, + "learning_rate": 9.450079809695447e-05, + "loss": 1.1495, + "step": 2710 + }, + { + "epoch": 0.57, + "grad_norm": 0.37890625, + "learning_rate": 9.413701740926313e-05, + "loss": 1.0475, + "step": 2715 + }, + { + "epoch": 0.57, + "grad_norm": 0.58984375, + "learning_rate": 9.377331456139352e-05, + "loss": 0.9938, + "step": 2720 + }, + { + "epoch": 0.57, + "grad_norm": 0.34765625, + "learning_rate": 9.340969438204232e-05, + "loss": 1.0686, + "step": 2725 + }, + { + "epoch": 0.57, + "grad_norm": 0.36328125, + "learning_rate": 9.304616169880871e-05, + "loss": 1.0011, + "step": 2730 + }, + { + "epoch": 0.57, + "grad_norm": 1.1640625, + "learning_rate": 9.268272133813025e-05, + "loss": 1.1045, + "step": 2735 + }, + { + "epoch": 0.57, + "grad_norm": 0.3671875, + "learning_rate": 9.231937812521875e-05, + "loss": 1.0832, + "step": 2740 + }, + { + "epoch": 0.57, + "grad_norm": 0.37109375, + "learning_rate": 9.195613688399623e-05, + "loss": 1.0394, + "step": 2745 + }, + { + "epoch": 0.57, + "grad_norm": 0.392578125, + "learning_rate": 9.159300243703091e-05, + "loss": 1.0736, + "step": 2750 + }, + { + "epoch": 0.58, + "grad_norm": 0.4140625, + "learning_rate": 9.122997960547316e-05, + "loss": 1.1489, + "step": 2755 + }, + { + "epoch": 0.58, + "grad_norm": 0.38671875, + "learning_rate": 9.086707320899144e-05, + "loss": 1.1488, + "step": 2760 + }, + { + "epoch": 0.58, + "grad_norm": 0.373046875, + "learning_rate": 9.050428806570841e-05, + "loss": 0.9796, + "step": 2765 + }, + { + "epoch": 0.58, + "grad_norm": 0.349609375, + "learning_rate": 9.014162899213689e-05, + "loss": 1.0584, + "step": 2770 + }, + { + "epoch": 0.58, + "grad_norm": 0.333984375, + "learning_rate": 8.977910080311601e-05, + "loss": 1.0405, + "step": 2775 + }, + { + "epoch": 0.58, + "grad_norm": 0.431640625, + "learning_rate": 8.941670831174706e-05, + "loss": 1.1109, + "step": 2780 + }, + { + "epoch": 0.58, + "grad_norm": 0.40234375, + "learning_rate": 8.905445632932989e-05, + "loss": 1.1511, + "step": 2785 + }, + { + "epoch": 0.58, + "grad_norm": 0.3359375, + "learning_rate": 8.869234966529883e-05, + "loss": 0.9418, + "step": 2790 + }, + { + "epoch": 0.58, + "grad_norm": 0.365234375, + "learning_rate": 8.833039312715884e-05, + "loss": 1.0695, + "step": 2795 + }, + { + "epoch": 0.58, + "grad_norm": 0.357421875, + "learning_rate": 8.79685915204218e-05, + "loss": 1.0157, + "step": 2800 + }, + { + "epoch": 0.59, + "grad_norm": 0.38671875, + "learning_rate": 8.760694964854265e-05, + "loss": 1.1373, + "step": 2805 + }, + { + "epoch": 0.59, + "grad_norm": 0.3359375, + "learning_rate": 8.724547231285558e-05, + "loss": 0.9823, + "step": 2810 + }, + { + "epoch": 0.59, + "grad_norm": 0.349609375, + "learning_rate": 8.688416431251032e-05, + "loss": 1.0418, + "step": 2815 + }, + { + "epoch": 0.59, + "grad_norm": 0.37890625, + "learning_rate": 8.652303044440841e-05, + "loss": 0.9956, + "step": 2820 + }, + { + "epoch": 0.59, + "grad_norm": 0.365234375, + "learning_rate": 8.61620755031396e-05, + "loss": 1.1068, + "step": 2825 + }, + { + "epoch": 0.59, + "grad_norm": 0.361328125, + "learning_rate": 8.580130428091802e-05, + "loss": 1.0246, + "step": 2830 + }, + { + "epoch": 0.59, + "grad_norm": 0.310546875, + "learning_rate": 8.54407215675187e-05, + "loss": 1.0299, + "step": 2835 + }, + { + "epoch": 0.59, + "grad_norm": 0.4453125, + "learning_rate": 8.508033215021395e-05, + "loss": 1.1331, + "step": 2840 + }, + { + "epoch": 0.59, + "grad_norm": 0.375, + "learning_rate": 8.472014081370977e-05, + "loss": 0.9622, + "step": 2845 + }, + { + "epoch": 0.59, + "grad_norm": 0.353515625, + "learning_rate": 8.436015234008233e-05, + "loss": 1.0842, + "step": 2850 + }, + { + "epoch": 0.6, + "grad_norm": 0.388671875, + "learning_rate": 8.400037150871452e-05, + "loss": 1.0176, + "step": 2855 + }, + { + "epoch": 0.6, + "grad_norm": 0.384765625, + "learning_rate": 8.364080309623244e-05, + "loss": 1.04, + "step": 2860 + }, + { + "epoch": 0.6, + "grad_norm": 0.412109375, + "learning_rate": 8.328145187644206e-05, + "loss": 1.052, + "step": 2865 + }, + { + "epoch": 0.6, + "grad_norm": 0.365234375, + "learning_rate": 8.292232262026574e-05, + "loss": 1.1369, + "step": 2870 + }, + { + "epoch": 0.6, + "grad_norm": 0.54296875, + "learning_rate": 8.256342009567896e-05, + "loss": 1.0515, + "step": 2875 + }, + { + "epoch": 0.6, + "grad_norm": 0.359375, + "learning_rate": 8.220474906764704e-05, + "loss": 1.0096, + "step": 2880 + }, + { + "epoch": 0.6, + "grad_norm": 2.859375, + "learning_rate": 8.184631429806175e-05, + "loss": 1.0463, + "step": 2885 + }, + { + "epoch": 0.6, + "grad_norm": 0.314453125, + "learning_rate": 8.148812054567834e-05, + "loss": 1.0897, + "step": 2890 + }, + { + "epoch": 0.6, + "grad_norm": 0.37890625, + "learning_rate": 8.113017256605196e-05, + "loss": 1.1268, + "step": 2895 + }, + { + "epoch": 0.61, + "grad_norm": 0.7578125, + "learning_rate": 8.077247511147495e-05, + "loss": 1.1213, + "step": 2900 + }, + { + "epoch": 0.61, + "grad_norm": 0.375, + "learning_rate": 8.04150329309135e-05, + "loss": 1.006, + "step": 2905 + }, + { + "epoch": 0.61, + "grad_norm": 0.353515625, + "learning_rate": 8.005785076994466e-05, + "loss": 1.0393, + "step": 2910 + }, + { + "epoch": 0.61, + "grad_norm": 0.375, + "learning_rate": 7.97009333706933e-05, + "loss": 1.0877, + "step": 2915 + }, + { + "epoch": 0.61, + "grad_norm": 0.369140625, + "learning_rate": 7.934428547176925e-05, + "loss": 0.9819, + "step": 2920 + }, + { + "epoch": 0.61, + "grad_norm": 0.375, + "learning_rate": 7.898791180820428e-05, + "loss": 1.149, + "step": 2925 + }, + { + "epoch": 0.61, + "grad_norm": 0.34765625, + "learning_rate": 7.863181711138925e-05, + "loss": 1.2709, + "step": 2930 + }, + { + "epoch": 0.61, + "grad_norm": 0.345703125, + "learning_rate": 7.827600610901138e-05, + "loss": 1.0535, + "step": 2935 + }, + { + "epoch": 0.61, + "grad_norm": 0.396484375, + "learning_rate": 7.792048352499142e-05, + "loss": 1.0346, + "step": 2940 + }, + { + "epoch": 0.61, + "grad_norm": 0.421875, + "learning_rate": 7.756525407942085e-05, + "loss": 1.0115, + "step": 2945 + }, + { + "epoch": 0.62, + "grad_norm": 0.390625, + "learning_rate": 7.72103224884994e-05, + "loss": 1.0973, + "step": 2950 + }, + { + "epoch": 0.62, + "grad_norm": 0.357421875, + "learning_rate": 7.685569346447232e-05, + "loss": 1.0076, + "step": 2955 + }, + { + "epoch": 0.62, + "grad_norm": 0.423828125, + "learning_rate": 7.650137171556781e-05, + "loss": 1.084, + "step": 2960 + }, + { + "epoch": 0.62, + "grad_norm": 0.359375, + "learning_rate": 7.614736194593453e-05, + "loss": 1.0667, + "step": 2965 + }, + { + "epoch": 0.62, + "grad_norm": 0.41015625, + "learning_rate": 7.579366885557914e-05, + "loss": 1.1245, + "step": 2970 + }, + { + "epoch": 0.62, + "grad_norm": 0.369140625, + "learning_rate": 7.544029714030394e-05, + "loss": 0.9752, + "step": 2975 + }, + { + "epoch": 0.62, + "grad_norm": 0.455078125, + "learning_rate": 7.508725149164455e-05, + "loss": 1.0817, + "step": 2980 + }, + { + "epoch": 0.62, + "grad_norm": 0.359375, + "learning_rate": 7.473453659680743e-05, + "loss": 0.9972, + "step": 2985 + }, + { + "epoch": 0.62, + "grad_norm": 0.361328125, + "learning_rate": 7.43821571386079e-05, + "loss": 0.9939, + "step": 2990 + }, + { + "epoch": 0.63, + "grad_norm": 0.36328125, + "learning_rate": 7.403011779540785e-05, + "loss": 1.1253, + "step": 2995 + }, + { + "epoch": 0.63, + "grad_norm": 0.7109375, + "learning_rate": 7.367842324105358e-05, + "loss": 1.1767, + "step": 3000 + }, + { + "epoch": 0.63, + "grad_norm": 0.330078125, + "learning_rate": 7.332707814481387e-05, + "loss": 0.96, + "step": 3005 + }, + { + "epoch": 0.63, + "grad_norm": 0.39453125, + "learning_rate": 7.297608717131788e-05, + "loss": 0.9991, + "step": 3010 + }, + { + "epoch": 0.63, + "grad_norm": 0.341796875, + "learning_rate": 7.262545498049332e-05, + "loss": 1.1189, + "step": 3015 + }, + { + "epoch": 0.63, + "grad_norm": 0.3359375, + "learning_rate": 7.227518622750443e-05, + "loss": 1.081, + "step": 3020 + }, + { + "epoch": 0.63, + "grad_norm": 0.333984375, + "learning_rate": 7.192528556269034e-05, + "loss": 1.1145, + "step": 3025 + }, + { + "epoch": 0.63, + "grad_norm": 0.4140625, + "learning_rate": 7.157575763150327e-05, + "loss": 1.1179, + "step": 3030 + }, + { + "epoch": 0.63, + "grad_norm": 0.390625, + "learning_rate": 7.12266070744468e-05, + "loss": 1.0421, + "step": 3035 + }, + { + "epoch": 0.63, + "grad_norm": 0.3828125, + "learning_rate": 7.087783852701432e-05, + "loss": 1.2192, + "step": 3040 + }, + { + "epoch": 0.64, + "grad_norm": 0.35546875, + "learning_rate": 7.052945661962753e-05, + "loss": 1.0846, + "step": 3045 + }, + { + "epoch": 0.64, + "grad_norm": 0.34375, + "learning_rate": 7.018146597757484e-05, + "loss": 1.0074, + "step": 3050 + }, + { + "epoch": 0.64, + "grad_norm": 0.373046875, + "learning_rate": 6.983387122095004e-05, + "loss": 1.1082, + "step": 3055 + }, + { + "epoch": 0.64, + "grad_norm": 0.37109375, + "learning_rate": 6.948667696459097e-05, + "loss": 1.1076, + "step": 3060 + }, + { + "epoch": 0.64, + "grad_norm": 0.380859375, + "learning_rate": 6.913988781801826e-05, + "loss": 1.0275, + "step": 3065 + }, + { + "epoch": 0.64, + "grad_norm": 0.55859375, + "learning_rate": 6.879350838537403e-05, + "loss": 1.204, + "step": 3070 + }, + { + "epoch": 0.64, + "grad_norm": 0.36328125, + "learning_rate": 6.84475432653609e-05, + "loss": 1.1089, + "step": 3075 + }, + { + "epoch": 0.64, + "grad_norm": 0.373046875, + "learning_rate": 6.810199705118081e-05, + "loss": 1.0234, + "step": 3080 + }, + { + "epoch": 0.64, + "grad_norm": 0.373046875, + "learning_rate": 6.775687433047417e-05, + "loss": 1.17, + "step": 3085 + }, + { + "epoch": 0.64, + "grad_norm": 0.353515625, + "learning_rate": 6.74121796852589e-05, + "loss": 1.1946, + "step": 3090 + }, + { + "epoch": 0.65, + "grad_norm": 0.400390625, + "learning_rate": 6.706791769186946e-05, + "loss": 0.9443, + "step": 3095 + }, + { + "epoch": 0.65, + "grad_norm": 0.400390625, + "learning_rate": 6.672409292089635e-05, + "loss": 1.0418, + "step": 3100 + }, + { + "epoch": 0.65, + "grad_norm": 0.419921875, + "learning_rate": 6.638070993712521e-05, + "loss": 1.1374, + "step": 3105 + }, + { + "epoch": 0.65, + "grad_norm": 0.359375, + "learning_rate": 6.603777329947635e-05, + "loss": 1.1197, + "step": 3110 + }, + { + "epoch": 0.65, + "grad_norm": 0.365234375, + "learning_rate": 6.569528756094409e-05, + "loss": 1.01, + "step": 3115 + }, + { + "epoch": 0.65, + "grad_norm": 0.35546875, + "learning_rate": 6.535325726853647e-05, + "loss": 1.0671, + "step": 3120 + }, + { + "epoch": 0.65, + "grad_norm": 0.35546875, + "learning_rate": 6.501168696321475e-05, + "loss": 0.9577, + "step": 3125 + }, + { + "epoch": 0.65, + "grad_norm": 0.3671875, + "learning_rate": 6.467058117983329e-05, + "loss": 1.1782, + "step": 3130 + }, + { + "epoch": 0.65, + "grad_norm": 0.34765625, + "learning_rate": 6.432994444707906e-05, + "loss": 1.0968, + "step": 3135 + }, + { + "epoch": 0.66, + "grad_norm": 0.341796875, + "learning_rate": 6.398978128741176e-05, + "loss": 0.987, + "step": 3140 + }, + { + "epoch": 0.66, + "grad_norm": 0.359375, + "learning_rate": 6.36500962170038e-05, + "loss": 1.0588, + "step": 3145 + }, + { + "epoch": 0.66, + "grad_norm": 0.365234375, + "learning_rate": 6.331089374568007e-05, + "loss": 1.0049, + "step": 3150 + }, + { + "epoch": 0.66, + "grad_norm": 0.353515625, + "learning_rate": 6.297217837685833e-05, + "loss": 1.0989, + "step": 3155 + }, + { + "epoch": 0.66, + "grad_norm": 0.353515625, + "learning_rate": 6.263395460748938e-05, + "loss": 1.1308, + "step": 3160 + }, + { + "epoch": 0.66, + "grad_norm": 0.361328125, + "learning_rate": 6.229622692799724e-05, + "loss": 0.9604, + "step": 3165 + }, + { + "epoch": 0.66, + "grad_norm": 0.3515625, + "learning_rate": 6.195899982221953e-05, + "loss": 1.0006, + "step": 3170 + }, + { + "epoch": 0.66, + "grad_norm": 0.3359375, + "learning_rate": 6.162227776734817e-05, + "loss": 0.9775, + "step": 3175 + }, + { + "epoch": 0.66, + "grad_norm": 0.357421875, + "learning_rate": 6.128606523386967e-05, + "loss": 0.978, + "step": 3180 + }, + { + "epoch": 0.66, + "grad_norm": 0.3828125, + "learning_rate": 6.0950366685505864e-05, + "loss": 0.992, + "step": 3185 + }, + { + "epoch": 0.67, + "grad_norm": 0.349609375, + "learning_rate": 6.0615186579154756e-05, + "loss": 1.103, + "step": 3190 + }, + { + "epoch": 0.67, + "grad_norm": 0.384765625, + "learning_rate": 6.028052936483122e-05, + "loss": 1.0557, + "step": 3195 + }, + { + "epoch": 0.67, + "grad_norm": 0.396484375, + "learning_rate": 5.994639948560794e-05, + "loss": 1.0671, + "step": 3200 + }, + { + "epoch": 0.67, + "grad_norm": 0.357421875, + "learning_rate": 5.961280137755646e-05, + "loss": 0.9779, + "step": 3205 + }, + { + "epoch": 0.67, + "grad_norm": 0.36328125, + "learning_rate": 5.927973946968827e-05, + "loss": 1.1016, + "step": 3210 + }, + { + "epoch": 0.67, + "grad_norm": 0.453125, + "learning_rate": 5.8947218183896016e-05, + "loss": 1.0919, + "step": 3215 + }, + { + "epoch": 0.67, + "grad_norm": 0.345703125, + "learning_rate": 5.861524193489474e-05, + "loss": 1.0643, + "step": 3220 + }, + { + "epoch": 0.67, + "grad_norm": 0.353515625, + "learning_rate": 5.828381513016336e-05, + "loss": 1.1629, + "step": 3225 + }, + { + "epoch": 0.67, + "grad_norm": 0.37890625, + "learning_rate": 5.795294216988606e-05, + "loss": 1.0624, + "step": 3230 + }, + { + "epoch": 0.68, + "grad_norm": 0.32421875, + "learning_rate": 5.762262744689395e-05, + "loss": 0.9258, + "step": 3235 + }, + { + "epoch": 0.68, + "grad_norm": 0.3203125, + "learning_rate": 5.72928753466067e-05, + "loss": 1.0168, + "step": 3240 + }, + { + "epoch": 0.68, + "grad_norm": 0.357421875, + "learning_rate": 5.6963690246974354e-05, + "loss": 1.1368, + "step": 3245 + }, + { + "epoch": 0.68, + "grad_norm": 0.357421875, + "learning_rate": 5.663507651841906e-05, + "loss": 1.0114, + "step": 3250 + }, + { + "epoch": 0.68, + "grad_norm": 0.37109375, + "learning_rate": 5.630703852377732e-05, + "loss": 1.117, + "step": 3255 + }, + { + "epoch": 0.68, + "grad_norm": 0.3359375, + "learning_rate": 5.5979580618241786e-05, + "loss": 1.0523, + "step": 3260 + }, + { + "epoch": 0.68, + "grad_norm": 0.376953125, + "learning_rate": 5.565270714930364e-05, + "loss": 1.1847, + "step": 3265 + }, + { + "epoch": 0.68, + "grad_norm": 0.67578125, + "learning_rate": 5.5326422456694774e-05, + "loss": 1.0423, + "step": 3270 + }, + { + "epoch": 0.68, + "grad_norm": 0.33984375, + "learning_rate": 5.500073087233021e-05, + "loss": 0.9565, + "step": 3275 + }, + { + "epoch": 0.68, + "grad_norm": 0.353515625, + "learning_rate": 5.4675636720250554e-05, + "loss": 0.9838, + "step": 3280 + }, + { + "epoch": 0.69, + "grad_norm": 0.34375, + "learning_rate": 5.435114431656466e-05, + "loss": 0.9491, + "step": 3285 + }, + { + "epoch": 0.69, + "grad_norm": 0.396484375, + "learning_rate": 5.402725796939216e-05, + "loss": 1.1402, + "step": 3290 + }, + { + "epoch": 0.69, + "grad_norm": 0.375, + "learning_rate": 5.370398197880651e-05, + "loss": 0.9926, + "step": 3295 + }, + { + "epoch": 0.69, + "grad_norm": 0.60546875, + "learning_rate": 5.33813206367777e-05, + "loss": 1.0699, + "step": 3300 + }, + { + "epoch": 0.69, + "grad_norm": 0.390625, + "learning_rate": 5.3059278227115386e-05, + "loss": 1.0185, + "step": 3305 + }, + { + "epoch": 0.69, + "grad_norm": 0.388671875, + "learning_rate": 5.2737859025411954e-05, + "loss": 1.1353, + "step": 3310 + }, + { + "epoch": 0.69, + "grad_norm": 0.365234375, + "learning_rate": 5.241706729898579e-05, + "loss": 1.0359, + "step": 3315 + }, + { + "epoch": 0.69, + "grad_norm": 0.34765625, + "learning_rate": 5.209690730682457e-05, + "loss": 0.9474, + "step": 3320 + }, + { + "epoch": 0.69, + "grad_norm": 0.37109375, + "learning_rate": 5.177738329952888e-05, + "loss": 1.0324, + "step": 3325 + }, + { + "epoch": 0.7, + "grad_norm": 0.369140625, + "learning_rate": 5.145849951925544e-05, + "loss": 1.1495, + "step": 3330 + }, + { + "epoch": 0.7, + "grad_norm": 0.443359375, + "learning_rate": 5.11402601996612e-05, + "loss": 1.0776, + "step": 3335 + }, + { + "epoch": 0.7, + "grad_norm": 0.38671875, + "learning_rate": 5.082266956584687e-05, + "loss": 1.0373, + "step": 3340 + }, + { + "epoch": 0.7, + "grad_norm": 0.3828125, + "learning_rate": 5.0505731834300875e-05, + "loss": 1.1072, + "step": 3345 + }, + { + "epoch": 0.7, + "grad_norm": 0.369140625, + "learning_rate": 5.018945121284342e-05, + "loss": 1.0765, + "step": 3350 + }, + { + "epoch": 0.7, + "grad_norm": 0.43359375, + "learning_rate": 4.9873831900570554e-05, + "loss": 1.1338, + "step": 3355 + }, + { + "epoch": 0.7, + "grad_norm": 0.54296875, + "learning_rate": 4.9558878087798554e-05, + "loss": 1.2131, + "step": 3360 + }, + { + "epoch": 0.7, + "grad_norm": 0.412109375, + "learning_rate": 4.9244593956008046e-05, + "loss": 1.0869, + "step": 3365 + }, + { + "epoch": 0.7, + "grad_norm": 0.35546875, + "learning_rate": 4.893098367778877e-05, + "loss": 1.072, + "step": 3370 + }, + { + "epoch": 0.7, + "grad_norm": 0.353515625, + "learning_rate": 4.8618051416784006e-05, + "loss": 1.1649, + "step": 3375 + }, + { + "epoch": 0.71, + "grad_norm": 0.341796875, + "learning_rate": 4.8305801327635383e-05, + "loss": 0.9534, + "step": 3380 + }, + { + "epoch": 0.71, + "grad_norm": 0.439453125, + "learning_rate": 4.799423755592765e-05, + "loss": 1.0432, + "step": 3385 + }, + { + "epoch": 0.71, + "grad_norm": 0.3984375, + "learning_rate": 4.768336423813368e-05, + "loss": 0.9562, + "step": 3390 + }, + { + "epoch": 0.71, + "grad_norm": 0.32421875, + "learning_rate": 4.737318550155957e-05, + "loss": 0.8911, + "step": 3395 + }, + { + "epoch": 0.71, + "grad_norm": 0.357421875, + "learning_rate": 4.706370546428972e-05, + "loss": 0.9981, + "step": 3400 + }, + { + "epoch": 0.71, + "grad_norm": 0.349609375, + "learning_rate": 4.675492823513237e-05, + "loss": 0.9333, + "step": 3405 + }, + { + "epoch": 0.71, + "grad_norm": 0.3515625, + "learning_rate": 4.644685791356489e-05, + "loss": 1.065, + "step": 3410 + }, + { + "epoch": 0.71, + "grad_norm": 0.40234375, + "learning_rate": 4.613949858967938e-05, + "loss": 0.9751, + "step": 3415 + }, + { + "epoch": 0.71, + "grad_norm": 0.369140625, + "learning_rate": 4.583285434412845e-05, + "loss": 1.2406, + "step": 3420 + }, + { + "epoch": 0.71, + "grad_norm": 0.455078125, + "learning_rate": 4.5526929248070904e-05, + "loss": 1.0822, + "step": 3425 + }, + { + "epoch": 0.72, + "grad_norm": 0.3984375, + "learning_rate": 4.522172736311786e-05, + "loss": 0.9694, + "step": 3430 + }, + { + "epoch": 0.72, + "grad_norm": 0.349609375, + "learning_rate": 4.491725274127858e-05, + "loss": 1.0378, + "step": 3435 + }, + { + "epoch": 0.72, + "grad_norm": 0.369140625, + "learning_rate": 4.461350942490701e-05, + "loss": 1.0835, + "step": 3440 + }, + { + "epoch": 0.72, + "grad_norm": 0.353515625, + "learning_rate": 4.431050144664774e-05, + "loss": 1.0179, + "step": 3445 + }, + { + "epoch": 0.72, + "grad_norm": 0.373046875, + "learning_rate": 4.4008232829382824e-05, + "loss": 1.1077, + "step": 3450 + }, + { + "epoch": 0.72, + "grad_norm": 0.369140625, + "learning_rate": 4.370670758617808e-05, + "loss": 1.0818, + "step": 3455 + }, + { + "epoch": 0.72, + "grad_norm": 0.349609375, + "learning_rate": 4.340592972023e-05, + "loss": 1.0678, + "step": 3460 + }, + { + "epoch": 0.72, + "grad_norm": 0.36328125, + "learning_rate": 4.310590322481248e-05, + "loss": 1.0692, + "step": 3465 + }, + { + "epoch": 0.72, + "grad_norm": 0.349609375, + "learning_rate": 4.2806632083223886e-05, + "loss": 0.9918, + "step": 3470 + }, + { + "epoch": 0.73, + "grad_norm": 0.353515625, + "learning_rate": 4.2508120268734174e-05, + "loss": 1.1388, + "step": 3475 + }, + { + "epoch": 0.73, + "grad_norm": 0.388671875, + "learning_rate": 4.2210371744531965e-05, + "loss": 1.0109, + "step": 3480 + }, + { + "epoch": 0.73, + "grad_norm": 0.390625, + "learning_rate": 4.191339046367219e-05, + "loss": 1.0696, + "step": 3485 + }, + { + "epoch": 0.73, + "grad_norm": 0.34765625, + "learning_rate": 4.161718036902345e-05, + "loss": 1.0385, + "step": 3490 + }, + { + "epoch": 0.73, + "grad_norm": 0.34375, + "learning_rate": 4.132174539321572e-05, + "loss": 0.9972, + "step": 3495 + }, + { + "epoch": 0.73, + "grad_norm": 0.40234375, + "learning_rate": 4.1027089458588065e-05, + "loss": 1.1009, + "step": 3500 + }, + { + "epoch": 0.73, + "grad_norm": 0.41015625, + "learning_rate": 4.073321647713667e-05, + "loss": 0.9775, + "step": 3505 + }, + { + "epoch": 0.73, + "grad_norm": 0.55078125, + "learning_rate": 4.0440130350462865e-05, + "loss": 1.0561, + "step": 3510 + }, + { + "epoch": 0.73, + "grad_norm": 0.3515625, + "learning_rate": 4.014783496972121e-05, + "loss": 1.0012, + "step": 3515 + }, + { + "epoch": 0.73, + "grad_norm": 0.380859375, + "learning_rate": 3.9856334215568035e-05, + "loss": 1.0508, + "step": 3520 + }, + { + "epoch": 0.74, + "grad_norm": 0.345703125, + "learning_rate": 3.956563195810978e-05, + "loss": 1.0059, + "step": 3525 + }, + { + "epoch": 0.74, + "grad_norm": 0.48828125, + "learning_rate": 3.927573205685167e-05, + "loss": 1.1234, + "step": 3530 + }, + { + "epoch": 0.74, + "grad_norm": 0.353515625, + "learning_rate": 3.8986638360646443e-05, + "loss": 1.0895, + "step": 3535 + }, + { + "epoch": 0.74, + "grad_norm": 0.353515625, + "learning_rate": 3.8698354707643284e-05, + "loss": 1.0191, + "step": 3540 + }, + { + "epoch": 0.74, + "grad_norm": 0.34765625, + "learning_rate": 3.841088492523685e-05, + "loss": 1.0382, + "step": 3545 + }, + { + "epoch": 0.74, + "grad_norm": 0.48828125, + "learning_rate": 3.812423283001637e-05, + "loss": 1.1604, + "step": 3550 + }, + { + "epoch": 0.74, + "grad_norm": 0.40625, + "learning_rate": 3.783840222771518e-05, + "loss": 1.0974, + "step": 3555 + }, + { + "epoch": 0.74, + "grad_norm": 0.35546875, + "learning_rate": 3.755339691316e-05, + "loss": 1.0917, + "step": 3560 + }, + { + "epoch": 0.74, + "grad_norm": 0.39453125, + "learning_rate": 3.726922067022066e-05, + "loss": 0.8749, + "step": 3565 + }, + { + "epoch": 0.75, + "grad_norm": 0.3515625, + "learning_rate": 3.698587727175982e-05, + "loss": 1.0033, + "step": 3570 + }, + { + "epoch": 0.75, + "grad_norm": 0.345703125, + "learning_rate": 3.6703370479582875e-05, + "loss": 1.0835, + "step": 3575 + }, + { + "epoch": 0.75, + "grad_norm": 0.357421875, + "learning_rate": 3.642170404438809e-05, + "loss": 1.156, + "step": 3580 + }, + { + "epoch": 0.75, + "grad_norm": 0.94921875, + "learning_rate": 3.614088170571661e-05, + "loss": 1.0586, + "step": 3585 + }, + { + "epoch": 0.75, + "grad_norm": 0.380859375, + "learning_rate": 3.586090719190306e-05, + "loss": 1.0405, + "step": 3590 + }, + { + "epoch": 0.75, + "grad_norm": 0.357421875, + "learning_rate": 3.558178422002589e-05, + "loss": 1.027, + "step": 3595 + }, + { + "epoch": 0.75, + "grad_norm": 0.357421875, + "learning_rate": 3.5303516495858057e-05, + "loss": 1.1363, + "step": 3600 + }, + { + "epoch": 0.75, + "grad_norm": 0.314453125, + "learning_rate": 3.502610771381783e-05, + "loss": 1.106, + "step": 3605 + }, + { + "epoch": 0.75, + "grad_norm": 0.380859375, + "learning_rate": 3.4749561556919775e-05, + "loss": 0.9968, + "step": 3610 + }, + { + "epoch": 0.75, + "grad_norm": 0.337890625, + "learning_rate": 3.4473881696725816e-05, + "loss": 1.0708, + "step": 3615 + }, + { + "epoch": 0.76, + "grad_norm": 0.3515625, + "learning_rate": 3.419907179329641e-05, + "loss": 1.1052, + "step": 3620 + }, + { + "epoch": 0.76, + "grad_norm": 0.3984375, + "learning_rate": 3.392513549514219e-05, + "loss": 1.1304, + "step": 3625 + }, + { + "epoch": 0.76, + "grad_norm": 0.37890625, + "learning_rate": 3.3652076439175237e-05, + "loss": 1.1509, + "step": 3630 + }, + { + "epoch": 0.76, + "grad_norm": 0.412109375, + "learning_rate": 3.3379898250661034e-05, + "loss": 0.9853, + "step": 3635 + }, + { + "epoch": 0.76, + "grad_norm": 0.384765625, + "learning_rate": 3.3108604543170206e-05, + "loss": 1.0669, + "step": 3640 + }, + { + "epoch": 0.76, + "grad_norm": 0.38671875, + "learning_rate": 3.283819891853057e-05, + "loss": 1.1067, + "step": 3645 + }, + { + "epoch": 0.76, + "grad_norm": 0.33984375, + "learning_rate": 3.256868496677935e-05, + "loss": 0.9935, + "step": 3650 + }, + { + "epoch": 0.76, + "grad_norm": 0.361328125, + "learning_rate": 3.230006626611544e-05, + "loss": 0.9701, + "step": 3655 + }, + { + "epoch": 0.76, + "grad_norm": 0.33984375, + "learning_rate": 3.2032346382851995e-05, + "loss": 0.9201, + "step": 3660 + }, + { + "epoch": 0.77, + "grad_norm": 0.369140625, + "learning_rate": 3.1765528871368945e-05, + "loss": 1.1023, + "step": 3665 + }, + { + "epoch": 0.77, + "grad_norm": 0.44921875, + "learning_rate": 3.1499617274065986e-05, + "loss": 1.0817, + "step": 3670 + }, + { + "epoch": 0.77, + "grad_norm": 0.37890625, + "learning_rate": 3.12346151213154e-05, + "loss": 1.1742, + "step": 3675 + }, + { + "epoch": 0.77, + "grad_norm": 0.388671875, + "learning_rate": 3.097052593141527e-05, + "loss": 1.0696, + "step": 3680 + }, + { + "epoch": 0.77, + "grad_norm": 0.369140625, + "learning_rate": 3.070735321054271e-05, + "loss": 1.2406, + "step": 3685 + }, + { + "epoch": 0.77, + "grad_norm": 0.57421875, + "learning_rate": 3.0445100452707375e-05, + "loss": 1.0931, + "step": 3690 + }, + { + "epoch": 0.77, + "grad_norm": 0.40625, + "learning_rate": 3.018377113970503e-05, + "loss": 1.1022, + "step": 3695 + }, + { + "epoch": 0.77, + "grad_norm": 0.33984375, + "learning_rate": 2.992336874107129e-05, + "loss": 0.9678, + "step": 3700 + }, + { + "epoch": 0.77, + "grad_norm": 0.357421875, + "learning_rate": 2.9663896714035644e-05, + "loss": 1.009, + "step": 3705 + }, + { + "epoch": 0.77, + "grad_norm": 0.349609375, + "learning_rate": 2.9405358503475533e-05, + "loss": 0.9928, + "step": 3710 + }, + { + "epoch": 0.78, + "grad_norm": 0.3828125, + "learning_rate": 2.9147757541870546e-05, + "loss": 1.1043, + "step": 3715 + }, + { + "epoch": 0.78, + "grad_norm": 0.9296875, + "learning_rate": 2.8891097249256948e-05, + "loss": 1.0473, + "step": 3720 + }, + { + "epoch": 0.78, + "grad_norm": 0.357421875, + "learning_rate": 2.8635381033182175e-05, + "loss": 1.0229, + "step": 3725 + }, + { + "epoch": 0.78, + "grad_norm": 0.369140625, + "learning_rate": 2.83806122886597e-05, + "loss": 1.014, + "step": 3730 + }, + { + "epoch": 0.78, + "grad_norm": 0.35546875, + "learning_rate": 2.81267943981238e-05, + "loss": 1.0675, + "step": 3735 + }, + { + "epoch": 0.78, + "grad_norm": 0.353515625, + "learning_rate": 2.7873930731384845e-05, + "loss": 1.1727, + "step": 3740 + }, + { + "epoch": 0.78, + "grad_norm": 0.369140625, + "learning_rate": 2.7622024645584423e-05, + "loss": 1.0164, + "step": 3745 + }, + { + "epoch": 0.78, + "grad_norm": 0.361328125, + "learning_rate": 2.737107948515083e-05, + "loss": 1.183, + "step": 3750 + }, + { + "epoch": 0.78, + "grad_norm": 0.380859375, + "learning_rate": 2.7121098581754634e-05, + "loss": 1.0301, + "step": 3755 + }, + { + "epoch": 0.78, + "grad_norm": 0.35546875, + "learning_rate": 2.6872085254264446e-05, + "loss": 1.0661, + "step": 3760 + }, + { + "epoch": 0.79, + "grad_norm": 0.3671875, + "learning_rate": 2.662404280870292e-05, + "loss": 1.0385, + "step": 3765 + }, + { + "epoch": 0.79, + "grad_norm": 0.35546875, + "learning_rate": 2.6376974538202692e-05, + "loss": 1.0785, + "step": 3770 + }, + { + "epoch": 0.79, + "grad_norm": 0.392578125, + "learning_rate": 2.6130883722962886e-05, + "loss": 1.0484, + "step": 3775 + }, + { + "epoch": 0.79, + "grad_norm": 0.396484375, + "learning_rate": 2.5885773630205413e-05, + "loss": 1.1069, + "step": 3780 + }, + { + "epoch": 0.79, + "grad_norm": 0.3515625, + "learning_rate": 2.564164751413162e-05, + "loss": 0.9637, + "step": 3785 + }, + { + "epoch": 0.79, + "grad_norm": 0.349609375, + "learning_rate": 2.5398508615879126e-05, + "loss": 1.1005, + "step": 3790 + }, + { + "epoch": 0.79, + "grad_norm": 1.4453125, + "learning_rate": 2.5156360163478764e-05, + "loss": 1.0053, + "step": 3795 + }, + { + "epoch": 0.79, + "grad_norm": 0.359375, + "learning_rate": 2.4915205371811667e-05, + "loss": 1.0631, + "step": 3800 + }, + { + "epoch": 0.79, + "grad_norm": 0.388671875, + "learning_rate": 2.467504744256669e-05, + "loss": 1.106, + "step": 3805 + }, + { + "epoch": 0.8, + "grad_norm": 0.37109375, + "learning_rate": 2.443588956419791e-05, + "loss": 1.0389, + "step": 3810 + }, + { + "epoch": 0.8, + "grad_norm": 0.37109375, + "learning_rate": 2.4197734911882098e-05, + "loss": 1.133, + "step": 3815 + }, + { + "epoch": 0.8, + "grad_norm": 0.3515625, + "learning_rate": 2.3960586647476824e-05, + "loss": 1.1337, + "step": 3820 + }, + { + "epoch": 0.8, + "grad_norm": 0.3515625, + "learning_rate": 2.372444791947832e-05, + "loss": 0.9681, + "step": 3825 + }, + { + "epoch": 0.8, + "grad_norm": 0.515625, + "learning_rate": 2.3489321862979728e-05, + "loss": 1.062, + "step": 3830 + }, + { + "epoch": 0.8, + "grad_norm": 0.37109375, + "learning_rate": 2.3255211599629467e-05, + "loss": 1.0479, + "step": 3835 + }, + { + "epoch": 0.8, + "grad_norm": 0.396484375, + "learning_rate": 2.3022120237589783e-05, + "loss": 1.0702, + "step": 3840 + }, + { + "epoch": 0.8, + "grad_norm": 0.384765625, + "learning_rate": 2.2790050871495517e-05, + "loss": 1.1138, + "step": 3845 + }, + { + "epoch": 0.8, + "grad_norm": 0.376953125, + "learning_rate": 2.255900658241291e-05, + "loss": 0.992, + "step": 3850 + }, + { + "epoch": 0.8, + "grad_norm": 0.31640625, + "learning_rate": 2.232899043779889e-05, + "loss": 0.9199, + "step": 3855 + }, + { + "epoch": 0.81, + "grad_norm": 0.3984375, + "learning_rate": 2.2100005491460163e-05, + "loss": 1.0894, + "step": 3860 + }, + { + "epoch": 0.81, + "grad_norm": 0.5234375, + "learning_rate": 2.1872054783512775e-05, + "loss": 1.1446, + "step": 3865 + }, + { + "epoch": 0.81, + "grad_norm": 0.34765625, + "learning_rate": 2.16451413403417e-05, + "loss": 1.0609, + "step": 3870 + }, + { + "epoch": 0.81, + "grad_norm": 0.3515625, + "learning_rate": 2.1419268174560693e-05, + "loss": 0.9734, + "step": 3875 + }, + { + "epoch": 0.81, + "grad_norm": 0.349609375, + "learning_rate": 2.1194438284972294e-05, + "loss": 0.9959, + "step": 3880 + }, + { + "epoch": 0.81, + "grad_norm": 0.345703125, + "learning_rate": 2.0970654656527922e-05, + "loss": 0.941, + "step": 3885 + }, + { + "epoch": 0.81, + "grad_norm": 0.33984375, + "learning_rate": 2.074792026028841e-05, + "loss": 0.9481, + "step": 3890 + }, + { + "epoch": 0.81, + "grad_norm": 0.357421875, + "learning_rate": 2.052623805338444e-05, + "loss": 1.0345, + "step": 3895 + }, + { + "epoch": 0.81, + "grad_norm": 0.341796875, + "learning_rate": 2.0305610978977284e-05, + "loss": 1.0368, + "step": 3900 + }, + { + "epoch": 0.82, + "grad_norm": 0.353515625, + "learning_rate": 2.0086041966219827e-05, + "loss": 1.0414, + "step": 3905 + }, + { + "epoch": 0.82, + "grad_norm": 0.609375, + "learning_rate": 1.986753393021753e-05, + "loss": 1.0255, + "step": 3910 + }, + { + "epoch": 0.82, + "grad_norm": 0.376953125, + "learning_rate": 1.965008977198991e-05, + "loss": 1.0517, + "step": 3915 + }, + { + "epoch": 0.82, + "grad_norm": 0.376953125, + "learning_rate": 1.943371237843179e-05, + "loss": 1.202, + "step": 3920 + }, + { + "epoch": 0.82, + "grad_norm": 0.36328125, + "learning_rate": 1.9218404622275197e-05, + "loss": 1.0329, + "step": 3925 + }, + { + "epoch": 0.82, + "grad_norm": 0.349609375, + "learning_rate": 1.9004169362051116e-05, + "loss": 0.8898, + "step": 3930 + }, + { + "epoch": 0.82, + "grad_norm": 0.357421875, + "learning_rate": 1.8791009442051545e-05, + "loss": 1.0638, + "step": 3935 + }, + { + "epoch": 0.82, + "grad_norm": 0.40234375, + "learning_rate": 1.8578927692291737e-05, + "loss": 1.0068, + "step": 3940 + }, + { + "epoch": 0.82, + "grad_norm": 0.42578125, + "learning_rate": 1.8367926928472623e-05, + "loss": 1.0017, + "step": 3945 + }, + { + "epoch": 0.82, + "grad_norm": 0.333984375, + "learning_rate": 1.815800995194348e-05, + "loss": 1.0378, + "step": 3950 + }, + { + "epoch": 0.83, + "grad_norm": 0.33984375, + "learning_rate": 1.7949179549664606e-05, + "loss": 1.1059, + "step": 3955 + }, + { + "epoch": 0.83, + "grad_norm": 0.349609375, + "learning_rate": 1.7741438494170494e-05, + "loss": 0.9109, + "step": 3960 + }, + { + "epoch": 0.83, + "grad_norm": 0.330078125, + "learning_rate": 1.7534789543532894e-05, + "loss": 0.976, + "step": 3965 + }, + { + "epoch": 0.83, + "grad_norm": 0.353515625, + "learning_rate": 1.7329235441324253e-05, + "loss": 0.922, + "step": 3970 + }, + { + "epoch": 0.83, + "grad_norm": 0.33203125, + "learning_rate": 1.7124778916581297e-05, + "loss": 0.9421, + "step": 3975 + }, + { + "epoch": 0.83, + "grad_norm": 0.34375, + "learning_rate": 1.6921422683768706e-05, + "loss": 1.0366, + "step": 3980 + }, + { + "epoch": 0.83, + "grad_norm": 0.369140625, + "learning_rate": 1.671916944274321e-05, + "loss": 1.1087, + "step": 3985 + }, + { + "epoch": 0.83, + "grad_norm": 0.3671875, + "learning_rate": 1.6518021878717672e-05, + "loss": 1.1213, + "step": 3990 + }, + { + "epoch": 0.83, + "grad_norm": 0.34765625, + "learning_rate": 1.6317982662225462e-05, + "loss": 1.1077, + "step": 3995 + }, + { + "epoch": 0.83, + "grad_norm": 0.390625, + "learning_rate": 1.6119054449084902e-05, + "loss": 1.073, + "step": 4000 + }, + { + "epoch": 0.84, + "grad_norm": 0.34375, + "learning_rate": 1.592123988036419e-05, + "loss": 1.0458, + "step": 4005 + }, + { + "epoch": 0.84, + "grad_norm": 0.333984375, + "learning_rate": 1.5724541582346197e-05, + "loss": 1.0022, + "step": 4010 + }, + { + "epoch": 0.84, + "grad_norm": 0.3515625, + "learning_rate": 1.5528962166493642e-05, + "loss": 1.0358, + "step": 4015 + }, + { + "epoch": 0.84, + "grad_norm": 0.396484375, + "learning_rate": 1.5334504229414403e-05, + "loss": 1.0106, + "step": 4020 + }, + { + "epoch": 0.84, + "grad_norm": 0.4140625, + "learning_rate": 1.5141170352827095e-05, + "loss": 1.0975, + "step": 4025 + }, + { + "epoch": 0.84, + "grad_norm": 0.36328125, + "learning_rate": 1.4948963103526726e-05, + "loss": 1.013, + "step": 4030 + }, + { + "epoch": 0.84, + "grad_norm": 0.3515625, + "learning_rate": 1.4757885033350627e-05, + "loss": 1.006, + "step": 4035 + }, + { + "epoch": 0.84, + "grad_norm": 0.328125, + "learning_rate": 1.4567938679144633e-05, + "loss": 0.9755, + "step": 4040 + }, + { + "epoch": 0.84, + "grad_norm": 0.353515625, + "learning_rate": 1.4379126562729373e-05, + "loss": 1.0473, + "step": 4045 + }, + { + "epoch": 0.85, + "grad_norm": 0.361328125, + "learning_rate": 1.4191451190866756e-05, + "loss": 1.0553, + "step": 4050 + }, + { + "epoch": 0.85, + "grad_norm": 0.5234375, + "learning_rate": 1.400491505522673e-05, + "loss": 1.0685, + "step": 4055 + }, + { + "epoch": 0.85, + "grad_norm": 0.365234375, + "learning_rate": 1.3819520632354166e-05, + "loss": 1.0261, + "step": 4060 + }, + { + "epoch": 0.85, + "grad_norm": 0.345703125, + "learning_rate": 1.3635270383636046e-05, + "loss": 1.0794, + "step": 4065 + }, + { + "epoch": 0.85, + "grad_norm": 0.34765625, + "learning_rate": 1.3452166755268635e-05, + "loss": 0.9351, + "step": 4070 + }, + { + "epoch": 0.85, + "grad_norm": 0.9375, + "learning_rate": 1.3270212178225227e-05, + "loss": 0.9637, + "step": 4075 + }, + { + "epoch": 0.85, + "grad_norm": 0.390625, + "learning_rate": 1.308940906822368e-05, + "loss": 0.9613, + "step": 4080 + }, + { + "epoch": 0.85, + "grad_norm": 0.34765625, + "learning_rate": 1.2909759825694411e-05, + "loss": 1.0514, + "step": 4085 + }, + { + "epoch": 0.85, + "grad_norm": 0.30859375, + "learning_rate": 1.273126683574858e-05, + "loss": 1.0097, + "step": 4090 + }, + { + "epoch": 0.85, + "grad_norm": 0.4765625, + "learning_rate": 1.2553932468146312e-05, + "loss": 1.0627, + "step": 4095 + }, + { + "epoch": 0.86, + "grad_norm": 0.44140625, + "learning_rate": 1.2377759077265361e-05, + "loss": 1.0088, + "step": 4100 + }, + { + "epoch": 0.86, + "grad_norm": 0.404296875, + "learning_rate": 1.2202749002069691e-05, + "loss": 1.1016, + "step": 4105 + }, + { + "epoch": 0.86, + "grad_norm": 0.359375, + "learning_rate": 1.2028904566078602e-05, + "loss": 1.0947, + "step": 4110 + }, + { + "epoch": 0.86, + "grad_norm": 0.4609375, + "learning_rate": 1.1856228077335773e-05, + "loss": 1.0247, + "step": 4115 + }, + { + "epoch": 0.86, + "grad_norm": 0.345703125, + "learning_rate": 1.1684721828378642e-05, + "loss": 1.0542, + "step": 4120 + }, + { + "epoch": 0.86, + "grad_norm": 0.330078125, + "learning_rate": 1.1514388096207972e-05, + "loss": 0.974, + "step": 4125 + }, + { + "epoch": 0.86, + "grad_norm": 0.63671875, + "learning_rate": 1.1345229142257629e-05, + "loss": 1.1255, + "step": 4130 + }, + { + "epoch": 0.86, + "grad_norm": 0.349609375, + "learning_rate": 1.1177247212364528e-05, + "loss": 1.0134, + "step": 4135 + }, + { + "epoch": 0.86, + "grad_norm": 0.349609375, + "learning_rate": 1.1010444536738884e-05, + "loss": 1.0641, + "step": 4140 + }, + { + "epoch": 0.87, + "grad_norm": 0.34765625, + "learning_rate": 1.0844823329934472e-05, + "loss": 1.0691, + "step": 4145 + }, + { + "epoch": 0.87, + "grad_norm": 0.353515625, + "learning_rate": 1.0680385790819391e-05, + "loss": 1.0196, + "step": 4150 + }, + { + "epoch": 0.87, + "grad_norm": 0.39453125, + "learning_rate": 1.0517134102546766e-05, + "loss": 1.0477, + "step": 4155 + }, + { + "epoch": 0.87, + "grad_norm": 0.333984375, + "learning_rate": 1.0355070432525772e-05, + "loss": 1.0956, + "step": 4160 + }, + { + "epoch": 0.87, + "grad_norm": 0.361328125, + "learning_rate": 1.0194196932392874e-05, + "loss": 1.0475, + "step": 4165 + }, + { + "epoch": 0.87, + "grad_norm": 0.486328125, + "learning_rate": 1.0034515737983264e-05, + "loss": 1.083, + "step": 4170 + }, + { + "epoch": 0.87, + "grad_norm": 0.40234375, + "learning_rate": 9.87602896930252e-06, + "loss": 1.1112, + "step": 4175 + }, + { + "epoch": 0.87, + "grad_norm": 0.349609375, + "learning_rate": 9.718738730498422e-06, + "loss": 1.0079, + "step": 4180 + }, + { + "epoch": 0.87, + "grad_norm": 0.34375, + "learning_rate": 9.562647109833023e-06, + "loss": 1.0678, + "step": 4185 + }, + { + "epoch": 0.87, + "grad_norm": 0.357421875, + "learning_rate": 9.40775617965496e-06, + "loss": 0.9273, + "step": 4190 + }, + { + "epoch": 0.88, + "grad_norm": 0.353515625, + "learning_rate": 9.254067996371896e-06, + "loss": 1.1094, + "step": 4195 + }, + { + "epoch": 0.88, + "grad_norm": 0.53125, + "learning_rate": 9.101584600423263e-06, + "loss": 1.1018, + "step": 4200 + }, + { + "epoch": 0.88, + "grad_norm": 0.412109375, + "learning_rate": 8.950308016253129e-06, + "loss": 1.1271, + "step": 4205 + }, + { + "epoch": 0.88, + "grad_norm": 0.39453125, + "learning_rate": 8.800240252283343e-06, + "loss": 1.1965, + "step": 4210 + }, + { + "epoch": 0.88, + "grad_norm": 0.36328125, + "learning_rate": 8.651383300886884e-06, + "loss": 0.9942, + "step": 4215 + }, + { + "epoch": 0.88, + "grad_norm": 0.34765625, + "learning_rate": 8.503739138361344e-06, + "loss": 1.0004, + "step": 4220 + }, + { + "epoch": 0.88, + "grad_norm": 0.3671875, + "learning_rate": 8.357309724902773e-06, + "loss": 1.1956, + "step": 4225 + }, + { + "epoch": 0.88, + "grad_norm": 0.328125, + "learning_rate": 8.212097004579622e-06, + "loss": 1.0484, + "step": 4230 + }, + { + "epoch": 0.88, + "grad_norm": 0.3203125, + "learning_rate": 8.068102905306929e-06, + "loss": 0.9999, + "step": 4235 + }, + { + "epoch": 0.89, + "grad_norm": 0.38671875, + "learning_rate": 7.925329338820709e-06, + "loss": 0.9657, + "step": 4240 + }, + { + "epoch": 0.89, + "grad_norm": 0.396484375, + "learning_rate": 7.783778200652591e-06, + "loss": 1.1274, + "step": 4245 + }, + { + "epoch": 0.89, + "grad_norm": 0.375, + "learning_rate": 7.643451370104648e-06, + "loss": 1.1039, + "step": 4250 + }, + { + "epoch": 0.89, + "grad_norm": 0.7890625, + "learning_rate": 7.504350710224484e-06, + "loss": 1.1729, + "step": 4255 + }, + { + "epoch": 0.89, + "grad_norm": 0.384765625, + "learning_rate": 7.366478067780369e-06, + "loss": 1.0566, + "step": 4260 + }, + { + "epoch": 0.89, + "grad_norm": 0.3671875, + "learning_rate": 7.229835273236918e-06, + "loss": 1.0475, + "step": 4265 + }, + { + "epoch": 0.89, + "grad_norm": 0.357421875, + "learning_rate": 7.094424140730615e-06, + "loss": 1.0622, + "step": 4270 + }, + { + "epoch": 0.89, + "grad_norm": 0.376953125, + "learning_rate": 6.960246468045839e-06, + "loss": 0.9255, + "step": 4275 + }, + { + "epoch": 0.89, + "grad_norm": 0.390625, + "learning_rate": 6.82730403659092e-06, + "loss": 1.0548, + "step": 4280 + }, + { + "epoch": 0.89, + "grad_norm": 0.34765625, + "learning_rate": 6.695598611374554e-06, + "loss": 0.967, + "step": 4285 + }, + { + "epoch": 0.9, + "grad_norm": 0.326171875, + "learning_rate": 6.565131940982316e-06, + "loss": 1.204, + "step": 4290 + }, + { + "epoch": 0.9, + "grad_norm": 0.3203125, + "learning_rate": 6.4359057575534624e-06, + "loss": 1.0044, + "step": 4295 + }, + { + "epoch": 0.9, + "grad_norm": 0.328125, + "learning_rate": 6.307921776757953e-06, + "loss": 1.0688, + "step": 4300 + }, + { + "epoch": 0.9, + "grad_norm": 0.396484375, + "learning_rate": 6.181181697773664e-06, + "loss": 1.0628, + "step": 4305 + }, + { + "epoch": 0.9, + "grad_norm": 0.55859375, + "learning_rate": 6.055687203263794e-06, + "loss": 0.8719, + "step": 4310 + }, + { + "epoch": 0.9, + "grad_norm": 0.4765625, + "learning_rate": 5.9314399593545785e-06, + "loss": 0.9327, + "step": 4315 + }, + { + "epoch": 0.9, + "grad_norm": 0.3515625, + "learning_rate": 5.808441615613147e-06, + "loss": 1.1326, + "step": 4320 + }, + { + "epoch": 0.9, + "grad_norm": 0.365234375, + "learning_rate": 5.686693805025622e-06, + "loss": 1.0395, + "step": 4325 + }, + { + "epoch": 0.9, + "grad_norm": 0.361328125, + "learning_rate": 5.566198143975398e-06, + "loss": 1.0369, + "step": 4330 + }, + { + "epoch": 0.9, + "grad_norm": 0.322265625, + "learning_rate": 5.446956232221767e-06, + "loss": 1.1461, + "step": 4335 + }, + { + "epoch": 0.91, + "grad_norm": 0.37109375, + "learning_rate": 5.328969652878624e-06, + "loss": 1.1625, + "step": 4340 + }, + { + "epoch": 0.91, + "grad_norm": 0.33203125, + "learning_rate": 5.2122399723934066e-06, + "loss": 1.0224, + "step": 4345 + }, + { + "epoch": 0.91, + "grad_norm": 0.349609375, + "learning_rate": 5.096768740526426e-06, + "loss": 1.1103, + "step": 4350 + }, + { + "epoch": 0.91, + "grad_norm": 0.375, + "learning_rate": 4.982557490330175e-06, + "loss": 1.0779, + "step": 4355 + }, + { + "epoch": 0.91, + "grad_norm": 0.34375, + "learning_rate": 4.869607738129011e-06, + "loss": 1.1403, + "step": 4360 + }, + { + "epoch": 0.91, + "grad_norm": 0.365234375, + "learning_rate": 4.757920983499053e-06, + "loss": 1.1142, + "step": 4365 + }, + { + "epoch": 0.91, + "grad_norm": 0.40625, + "learning_rate": 4.6474987092481996e-06, + "loss": 1.1299, + "step": 4370 + }, + { + "epoch": 0.91, + "grad_norm": 0.330078125, + "learning_rate": 4.538342381396532e-06, + "loss": 0.9553, + "step": 4375 + }, + { + "epoch": 0.91, + "grad_norm": 0.3671875, + "learning_rate": 4.430453449156802e-06, + "loss": 1.0479, + "step": 4380 + }, + { + "epoch": 0.92, + "grad_norm": 0.35546875, + "learning_rate": 4.323833344915185e-06, + "loss": 1.0784, + "step": 4385 + }, + { + "epoch": 0.92, + "grad_norm": 0.396484375, + "learning_rate": 4.218483484212277e-06, + "loss": 1.0951, + "step": 4390 + }, + { + "epoch": 0.92, + "grad_norm": 0.37890625, + "learning_rate": 4.114405265724319e-06, + "loss": 1.067, + "step": 4395 + }, + { + "epoch": 0.92, + "grad_norm": 0.330078125, + "learning_rate": 4.011600071244592e-06, + "loss": 1.0339, + "step": 4400 + }, + { + "epoch": 0.92, + "grad_norm": 0.373046875, + "learning_rate": 3.910069265665106e-06, + "loss": 1.0851, + "step": 4405 + }, + { + "epoch": 0.92, + "grad_norm": 0.3125, + "learning_rate": 3.8098141969584167e-06, + "loss": 1.0284, + "step": 4410 + }, + { + "epoch": 0.92, + "grad_norm": 0.3515625, + "learning_rate": 3.710836196159806e-06, + "loss": 0.9502, + "step": 4415 + }, + { + "epoch": 0.92, + "grad_norm": 0.36328125, + "learning_rate": 3.613136577349596e-06, + "loss": 0.9807, + "step": 4420 + }, + { + "epoch": 0.92, + "grad_norm": 0.345703125, + "learning_rate": 3.516716637635664e-06, + "loss": 0.9914, + "step": 4425 + }, + { + "epoch": 0.92, + "grad_norm": 0.392578125, + "learning_rate": 3.4215776571362213e-06, + "loss": 1.1339, + "step": 4430 + }, + { + "epoch": 0.93, + "grad_norm": 0.326171875, + "learning_rate": 3.3277208989628826e-06, + "loss": 1.0262, + "step": 4435 + }, + { + "epoch": 0.93, + "grad_norm": 0.3359375, + "learning_rate": 3.2351476092038367e-06, + "loss": 0.9285, + "step": 4440 + }, + { + "epoch": 0.93, + "grad_norm": 0.345703125, + "learning_rate": 3.1438590169072914e-06, + "loss": 1.0673, + "step": 4445 + }, + { + "epoch": 0.93, + "grad_norm": 0.37109375, + "learning_rate": 3.0538563340651973e-06, + "loss": 1.104, + "step": 4450 + }, + { + "epoch": 0.93, + "grad_norm": 0.359375, + "learning_rate": 2.9651407555971734e-06, + "loss": 1.1608, + "step": 4455 + }, + { + "epoch": 0.93, + "grad_norm": 0.35546875, + "learning_rate": 2.8777134593345854e-06, + "loss": 1.1182, + "step": 4460 + }, + { + "epoch": 0.93, + "grad_norm": 0.36328125, + "learning_rate": 2.791575606004926e-06, + "loss": 1.0455, + "step": 4465 + }, + { + "epoch": 0.93, + "grad_norm": 0.34765625, + "learning_rate": 2.7067283392164354e-06, + "loss": 1.029, + "step": 4470 + }, + { + "epoch": 0.93, + "grad_norm": 0.3515625, + "learning_rate": 2.6231727854428843e-06, + "loss": 1.0893, + "step": 4475 + }, + { + "epoch": 0.94, + "grad_norm": 0.40625, + "learning_rate": 2.5409100540086274e-06, + "loss": 1.0287, + "step": 4480 + }, + { + "epoch": 0.94, + "grad_norm": 0.388671875, + "learning_rate": 2.4599412370738815e-06, + "loss": 0.948, + "step": 4485 + }, + { + "epoch": 0.94, + "grad_norm": 0.34375, + "learning_rate": 2.3802674096202404e-06, + "loss": 0.9194, + "step": 4490 + }, + { + "epoch": 0.94, + "grad_norm": 0.392578125, + "learning_rate": 2.301889629436349e-06, + "loss": 1.0878, + "step": 4495 + }, + { + "epoch": 0.94, + "grad_norm": 0.369140625, + "learning_rate": 2.224808937103917e-06, + "loss": 1.0251, + "step": 4500 + }, + { + "epoch": 0.94, + "grad_norm": 0.396484375, + "learning_rate": 2.149026355983896e-06, + "loss": 0.9898, + "step": 4505 + }, + { + "epoch": 0.94, + "grad_norm": 0.375, + "learning_rate": 2.0745428922028576e-06, + "loss": 1.0231, + "step": 4510 + }, + { + "epoch": 0.94, + "grad_norm": 0.376953125, + "learning_rate": 2.0013595346396462e-06, + "loss": 1.0693, + "step": 4515 + }, + { + "epoch": 0.94, + "grad_norm": 0.37109375, + "learning_rate": 1.9294772549122932e-06, + "loss": 0.9087, + "step": 4520 + }, + { + "epoch": 0.94, + "grad_norm": 0.3515625, + "learning_rate": 1.858897007365057e-06, + "loss": 1.015, + "step": 4525 + }, + { + "epoch": 0.95, + "grad_norm": 0.341796875, + "learning_rate": 1.7896197290557914e-06, + "loss": 1.0226, + "step": 4530 + }, + { + "epoch": 0.95, + "grad_norm": 0.365234375, + "learning_rate": 1.721646339743499e-06, + "loss": 1.0009, + "step": 4535 + }, + { + "epoch": 0.95, + "grad_norm": 0.375, + "learning_rate": 1.654977741876107e-06, + "loss": 1.1028, + "step": 4540 + }, + { + "epoch": 0.95, + "grad_norm": 0.337890625, + "learning_rate": 1.5896148205785e-06, + "loss": 1.0865, + "step": 4545 + }, + { + "epoch": 0.95, + "grad_norm": 0.365234375, + "learning_rate": 1.5255584436407622e-06, + "loss": 0.9828, + "step": 4550 + }, + { + "epoch": 0.95, + "grad_norm": 0.380859375, + "learning_rate": 1.4628094615066645e-06, + "loss": 0.9875, + "step": 4555 + }, + { + "epoch": 0.95, + "grad_norm": 1.078125, + "learning_rate": 1.4013687072623294e-06, + "loss": 1.1048, + "step": 4560 + }, + { + "epoch": 0.95, + "grad_norm": 0.333984375, + "learning_rate": 1.3412369966252392e-06, + "loss": 0.9696, + "step": 4565 + }, + { + "epoch": 0.95, + "grad_norm": 0.337890625, + "learning_rate": 1.2824151279333673e-06, + "loss": 1.1658, + "step": 4570 + }, + { + "epoch": 0.95, + "grad_norm": 0.396484375, + "learning_rate": 1.2249038821345982e-06, + "loss": 1.0483, + "step": 4575 + }, + { + "epoch": 0.96, + "grad_norm": 0.3828125, + "learning_rate": 1.1687040227763124e-06, + "loss": 1.0442, + "step": 4580 + }, + { + "epoch": 0.96, + "grad_norm": 0.388671875, + "learning_rate": 1.113816295995318e-06, + "loss": 1.1091, + "step": 4585 + }, + { + "epoch": 0.96, + "grad_norm": 0.3515625, + "learning_rate": 1.0602414305079023e-06, + "loss": 1.089, + "step": 4590 + }, + { + "epoch": 0.96, + "grad_norm": 0.34765625, + "learning_rate": 1.0079801376001398e-06, + "loss": 1.0591, + "step": 4595 + }, + { + "epoch": 0.96, + "grad_norm": 0.333984375, + "learning_rate": 9.570331111184883e-07, + "loss": 1.102, + "step": 4600 + }, + { + "epoch": 0.96, + "grad_norm": 0.3671875, + "learning_rate": 9.074010274605859e-07, + "loss": 1.075, + "step": 4605 + }, + { + "epoch": 0.96, + "grad_norm": 0.419921875, + "learning_rate": 8.590845455662133e-07, + "loss": 1.1246, + "step": 4610 + }, + { + "epoch": 0.96, + "grad_norm": 0.380859375, + "learning_rate": 8.120843069085893e-07, + "loss": 1.0597, + "step": 4615 + }, + { + "epoch": 0.96, + "grad_norm": 0.365234375, + "learning_rate": 7.66400935485867e-07, + "loss": 1.1967, + "step": 4620 + }, + { + "epoch": 0.97, + "grad_norm": 0.345703125, + "learning_rate": 7.220350378127961e-07, + "loss": 1.0108, + "step": 4625 + }, + { + "epoch": 0.97, + "grad_norm": 0.375, + "learning_rate": 6.789872029127064e-07, + "loss": 1.0265, + "step": 4630 + }, + { + "epoch": 0.97, + "grad_norm": 0.333984375, + "learning_rate": 6.37258002309693e-07, + "loss": 0.8866, + "step": 4635 + }, + { + "epoch": 0.97, + "grad_norm": 0.337890625, + "learning_rate": 5.968479900210211e-07, + "loss": 0.9801, + "step": 4640 + }, + { + "epoch": 0.97, + "grad_norm": 0.349609375, + "learning_rate": 5.577577025497438e-07, + "loss": 1.0485, + "step": 4645 + }, + { + "epoch": 0.97, + "grad_norm": 0.365234375, + "learning_rate": 5.199876588776409e-07, + "loss": 0.9552, + "step": 4650 + }, + { + "epoch": 0.97, + "grad_norm": 0.34765625, + "learning_rate": 4.835383604582467e-07, + "loss": 1.0244, + "step": 4655 + }, + { + "epoch": 0.97, + "grad_norm": 0.322265625, + "learning_rate": 4.484102912102661e-07, + "loss": 0.9478, + "step": 4660 + }, + { + "epoch": 0.97, + "grad_norm": 0.341796875, + "learning_rate": 4.1460391751110274e-07, + "loss": 1.0376, + "step": 4665 + }, + { + "epoch": 0.97, + "grad_norm": 0.384765625, + "learning_rate": 3.821196881907074e-07, + "loss": 1.1337, + "step": 4670 + }, + { + "epoch": 0.98, + "grad_norm": 0.4296875, + "learning_rate": 3.5095803452557254e-07, + "loss": 1.1147, + "step": 4675 + }, + { + "epoch": 0.98, + "grad_norm": 0.462890625, + "learning_rate": 3.211193702330362e-07, + "loss": 0.9671, + "step": 4680 + }, + { + "epoch": 0.98, + "grad_norm": 0.345703125, + "learning_rate": 2.9260409146578684e-07, + "loss": 1.0073, + "step": 4685 + }, + { + "epoch": 0.98, + "grad_norm": 0.34765625, + "learning_rate": 2.654125768066118e-07, + "loss": 1.1421, + "step": 4690 + }, + { + "epoch": 0.98, + "grad_norm": 0.376953125, + "learning_rate": 2.3954518726332365e-07, + "loss": 1.0143, + "step": 4695 + }, + { + "epoch": 0.98, + "grad_norm": 0.3671875, + "learning_rate": 2.1500226626404163e-07, + "loss": 0.8934, + "step": 4700 + }, + { + "epoch": 0.98, + "grad_norm": 0.35546875, + "learning_rate": 1.9178413965258435e-07, + "loss": 1.0531, + "step": 4705 + }, + { + "epoch": 0.98, + "grad_norm": 0.333984375, + "learning_rate": 1.6989111568411764e-07, + "loss": 0.9631, + "step": 4710 + }, + { + "epoch": 0.98, + "grad_norm": 0.35546875, + "learning_rate": 1.4932348502114669e-07, + "loss": 0.9886, + "step": 4715 + }, + { + "epoch": 0.99, + "grad_norm": 0.3671875, + "learning_rate": 1.3008152072958579e-07, + "loss": 1.0686, + "step": 4720 + }, + { + "epoch": 0.99, + "grad_norm": 0.419921875, + "learning_rate": 1.1216547827515022e-07, + "loss": 1.0989, + "step": 4725 + }, + { + "epoch": 0.99, + "grad_norm": 0.345703125, + "learning_rate": 9.557559551996998e-08, + "loss": 0.9859, + "step": 4730 + }, + { + "epoch": 0.99, + "grad_norm": 0.373046875, + "learning_rate": 8.031209271944784e-08, + "loss": 0.98, + "step": 4735 + }, + { + "epoch": 0.99, + "grad_norm": 0.376953125, + "learning_rate": 6.637517251928404e-08, + "loss": 1.0466, + "step": 4740 + }, + { + "epoch": 0.99, + "grad_norm": 0.333984375, + "learning_rate": 5.37650199528672e-08, + "loss": 1.169, + "step": 4745 + }, + { + "epoch": 0.99, + "grad_norm": 0.328125, + "learning_rate": 4.2481802438720794e-08, + "loss": 1.0257, + "step": 4750 + }, + { + "epoch": 0.99, + "grad_norm": 0.37109375, + "learning_rate": 3.252566977837157e-08, + "loss": 1.0731, + "step": 4755 + }, + { + "epoch": 0.99, + "grad_norm": 1.046875, + "learning_rate": 2.3896754154284497e-08, + "loss": 0.9857, + "step": 4760 + }, + { + "epoch": 0.99, + "grad_norm": 0.3671875, + "learning_rate": 1.659517012817524e-08, + "loss": 1.0577, + "step": 4765 + }, + { + "epoch": 1.0, + "grad_norm": 2.625, + "learning_rate": 1.062101463944476e-08, + "loss": 1.1085, + "step": 4770 + }, + { + "epoch": 1.0, + "grad_norm": 1.3203125, + "learning_rate": 5.974367003880321e-09, + "loss": 1.1863, + "step": 4775 + }, + { + "epoch": 1.0, + "grad_norm": 0.32421875, + "learning_rate": 2.6552889126563174e-09, + "loss": 1.0574, + "step": 4780 + }, + { + "epoch": 1.0, + "grad_norm": 0.34375, + "learning_rate": 6.638244314793874e-10, + "loss": 0.9897, + "step": 4785 + }, + { + "epoch": 1.0, + "grad_norm": 0.40234375, + "learning_rate": 0.0, + "loss": 1.1193, + "step": 4790 + }, + { + "epoch": 1.0, + "step": 4790, + "total_flos": 1.007099383382016e+18, + "train_loss": 1.0868773834192678, + "train_runtime": 33367.2438, + "train_samples_per_second": 1.149, + "train_steps_per_second": 0.144 + } + ], + "logging_steps": 5, + "max_steps": 4790, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.007099383382016e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}