diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,42033 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.875, + "eval_steps": 2000, + "global_step": 35000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00025, + "grad_norm": 31.5, + "learning_rate": 0.0001, + "loss": 7.633, + "loss/crossentropy": 2.065455098450184, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.20220321230590343, + "step": 10 + }, + { + "epoch": 0.0005, + "grad_norm": 35.0, + "grad_norm_var": 2.6895182291666666, + "learning_rate": 0.0001, + "loss": 7.4618, + "loss/crossentropy": 1.9399560801684856, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.19191570337861777, + "step": 20 + }, + { + "epoch": 0.00075, + "grad_norm": 37.5, + "grad_norm_var": 6.579622395833334, + "learning_rate": 0.0001, + "loss": 7.5972, + "loss/crossentropy": 2.130601316690445, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.20188977513462306, + "step": 30 + }, + { + "epoch": 0.001, + "grad_norm": 33.5, + "grad_norm_var": 6.253125, + "learning_rate": 0.0001, + "loss": 7.5917, + "loss/crossentropy": 2.2571407079696657, + "loss/hidden": 3.422265625, + "loss/jsd": 0.0, + "loss/logits": 0.19847887996584176, + "step": 40 + }, + { + "epoch": 0.00125, + "grad_norm": 32.25, + "grad_norm_var": 2.1619140625, + "learning_rate": 0.0001, + "loss": 7.6054, + "loss/crossentropy": 2.1717565625905992, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.20264342725276946, + "step": 50 + }, + { + "epoch": 0.0015, + "grad_norm": 35.5, + "grad_norm_var": 15.786393229166666, + "learning_rate": 0.0001, + "loss": 7.5513, + "loss/crossentropy": 2.070718301087618, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.19855907820165158, + "step": 60 + }, + { + "epoch": 0.00175, + "grad_norm": 31.0, + "grad_norm_var": 12.4625, + "learning_rate": 0.0001, + "loss": 7.5447, + "loss/crossentropy": 2.118075390160084, + "loss/hidden": 3.473828125, + "loss/jsd": 0.0, + "loss/logits": 0.20283062420785428, + "step": 70 + }, + { + "epoch": 0.002, + "grad_norm": 32.25, + "grad_norm_var": 1.2643229166666667, + "learning_rate": 0.0001, + "loss": 7.468, + "loss/crossentropy": 2.0006178975105287, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.18958428762853147, + "step": 80 + }, + { + "epoch": 0.00225, + "grad_norm": 30.625, + "grad_norm_var": 3.470572916666667, + "learning_rate": 0.0001, + "loss": 7.5061, + "loss/crossentropy": 1.9605075903236866, + "loss/hidden": 3.54375, + "loss/jsd": 0.0, + "loss/logits": 0.20559987109154462, + "step": 90 + }, + { + "epoch": 0.0025, + "grad_norm": 31.125, + "grad_norm_var": 6.763541666666667, + "learning_rate": 0.0001, + "loss": 7.4928, + "loss/crossentropy": 2.1205389350652695, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.19496036488562823, + "step": 100 + }, + { + "epoch": 0.00275, + "grad_norm": 31.0, + "grad_norm_var": 6.1509765625, + "learning_rate": 0.0001, + "loss": 7.595, + "loss/crossentropy": 2.1240097641944886, + "loss/hidden": 3.43671875, + "loss/jsd": 0.0, + "loss/logits": 0.19564666803926228, + "step": 110 + }, + { + "epoch": 0.003, + "grad_norm": 31.25, + "grad_norm_var": 3.348893229166667, + "learning_rate": 0.0001, + "loss": 7.5329, + "loss/crossentropy": 2.175096944719553, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.21303062327206135, + "step": 120 + }, + { + "epoch": 0.00325, + "grad_norm": 32.0, + "grad_norm_var": 2.8541666666666665, + "learning_rate": 0.0001, + "loss": 7.5536, + "loss/crossentropy": 2.1472502022981645, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.18929538186639547, + "step": 130 + }, + { + "epoch": 0.0035, + "grad_norm": 29.375, + "grad_norm_var": 29.683268229166668, + "learning_rate": 0.0001, + "loss": 7.5191, + "loss/crossentropy": 2.015011890232563, + "loss/hidden": 3.44296875, + "loss/jsd": 0.0, + "loss/logits": 0.20328481420874595, + "step": 140 + }, + { + "epoch": 0.00375, + "grad_norm": 28.75, + "grad_norm_var": 28.74765625, + "learning_rate": 0.0001, + "loss": 7.4158, + "loss/crossentropy": 1.9774167470633983, + "loss/hidden": 3.43515625, + "loss/jsd": 0.0, + "loss/logits": 0.19464388117194176, + "step": 150 + }, + { + "epoch": 0.004, + "grad_norm": 30.875, + "grad_norm_var": 1.3635416666666667, + "learning_rate": 0.0001, + "loss": 7.6354, + "loss/crossentropy": 2.320629420876503, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.20745602920651435, + "step": 160 + }, + { + "epoch": 0.00425, + "grad_norm": 31.5, + "grad_norm_var": 1.0270182291666667, + "learning_rate": 0.0001, + "loss": 7.4137, + "loss/crossentropy": 1.900385806709528, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.16769229620695114, + "step": 170 + }, + { + "epoch": 0.0045, + "grad_norm": 31.25, + "grad_norm_var": 0.9833333333333333, + "learning_rate": 0.0001, + "loss": 7.5763, + "loss/crossentropy": 2.129625543951988, + "loss/hidden": 3.5171875, + "loss/jsd": 0.0, + "loss/logits": 0.2102549459785223, + "step": 180 + }, + { + "epoch": 0.00475, + "grad_norm": 32.25, + "grad_norm_var": 3.05390625, + "learning_rate": 0.0001, + "loss": 7.6166, + "loss/crossentropy": 2.1552532628178596, + "loss/hidden": 3.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.2250068686902523, + "step": 190 + }, + { + "epoch": 0.005, + "grad_norm": 29.625, + "grad_norm_var": 3.8375, + "learning_rate": 0.0001, + "loss": 7.5745, + "loss/crossentropy": 1.9441482461988926, + "loss/hidden": 3.387890625, + "loss/jsd": 0.0, + "loss/logits": 0.195942450594157, + "step": 200 + }, + { + "epoch": 0.00525, + "grad_norm": 32.5, + "grad_norm_var": 18.396875, + "learning_rate": 0.0001, + "loss": 7.5292, + "loss/crossentropy": 1.9941987417638303, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.18264975901693106, + "step": 210 + }, + { + "epoch": 0.0055, + "grad_norm": 31.75, + "grad_norm_var": 20.736393229166666, + "learning_rate": 0.0001, + "loss": 7.4899, + "loss/crossentropy": 2.0191620789468288, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.18100650198757648, + "step": 220 + }, + { + "epoch": 0.00575, + "grad_norm": 30.375, + "grad_norm_var": 2.342643229166667, + "learning_rate": 0.0001, + "loss": 7.5199, + "loss/crossentropy": 2.001779730618, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.17959208656102418, + "step": 230 + }, + { + "epoch": 0.006, + "grad_norm": 30.75, + "grad_norm_var": 1.271875, + "learning_rate": 0.0001, + "loss": 7.6842, + "loss/crossentropy": 2.1846971333026888, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.2059234745800495, + "step": 240 + }, + { + "epoch": 0.00625, + "grad_norm": 29.5, + "grad_norm_var": 5.688541666666667, + "learning_rate": 0.0001, + "loss": 7.5196, + "loss/crossentropy": 2.174124576151371, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.20000722594559192, + "step": 250 + }, + { + "epoch": 0.0065, + "grad_norm": 28.75, + "grad_norm_var": 1.9572265625, + "learning_rate": 0.0001, + "loss": 7.3875, + "loss/crossentropy": 1.9285166233778, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.18449910767376423, + "step": 260 + }, + { + "epoch": 0.00675, + "grad_norm": 33.5, + "grad_norm_var": 2.0999348958333335, + "learning_rate": 0.0001, + "loss": 7.5877, + "loss/crossentropy": 2.0323276594281197, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.19395631980150937, + "step": 270 + }, + { + "epoch": 0.007, + "grad_norm": 30.5, + "grad_norm_var": 2.15390625, + "learning_rate": 0.0001, + "loss": 7.5791, + "loss/crossentropy": 2.126656140387058, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.21661139875650406, + "step": 280 + }, + { + "epoch": 0.00725, + "grad_norm": 29.5, + "grad_norm_var": 3.193489583333333, + "learning_rate": 0.0001, + "loss": 7.5587, + "loss/crossentropy": 2.200097793340683, + "loss/hidden": 3.529296875, + "loss/jsd": 0.0, + "loss/logits": 0.21046234332025052, + "step": 290 + }, + { + "epoch": 0.0075, + "grad_norm": 26.75, + "grad_norm_var": 4.27265625, + "learning_rate": 0.0001, + "loss": 7.5404, + "loss/crossentropy": 2.1184144005179406, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.20949590150266886, + "step": 300 + }, + { + "epoch": 0.00775, + "grad_norm": 33.0, + "grad_norm_var": 3.3643229166666666, + "learning_rate": 0.0001, + "loss": 7.5628, + "loss/crossentropy": 1.9984030593186617, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.18789457948878407, + "step": 310 + }, + { + "epoch": 0.008, + "grad_norm": 32.5, + "grad_norm_var": 2.5645182291666666, + "learning_rate": 0.0001, + "loss": 7.5695, + "loss/crossentropy": 2.143594169616699, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.19360470157116652, + "step": 320 + }, + { + "epoch": 0.00825, + "grad_norm": 29.375, + "grad_norm_var": 1.8749348958333334, + "learning_rate": 0.0001, + "loss": 7.3627, + "loss/crossentropy": 2.1077703177928924, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.19771252572536469, + "step": 330 + }, + { + "epoch": 0.0085, + "grad_norm": 29.75, + "grad_norm_var": 1.5978515625, + "learning_rate": 0.0001, + "loss": 7.4192, + "loss/crossentropy": 2.0583472289144993, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.20189273860305548, + "step": 340 + }, + { + "epoch": 0.00875, + "grad_norm": 29.875, + "grad_norm_var": 1.2872395833333334, + "learning_rate": 0.0001, + "loss": 7.5432, + "loss/crossentropy": 2.0804511278867723, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.19735569059848784, + "step": 350 + }, + { + "epoch": 0.009, + "grad_norm": 30.5, + "grad_norm_var": 18.731184895833334, + "learning_rate": 0.0001, + "loss": 7.4948, + "loss/crossentropy": 2.0466629534959795, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.18366040643304588, + "step": 360 + }, + { + "epoch": 0.00925, + "grad_norm": 30.875, + "grad_norm_var": 25.9916015625, + "learning_rate": 0.0001, + "loss": 7.5081, + "loss/crossentropy": 1.9005662694573402, + "loss/hidden": 3.501171875, + "loss/jsd": 0.0, + "loss/logits": 0.1900689721107483, + "step": 370 + }, + { + "epoch": 0.0095, + "grad_norm": 28.75, + "grad_norm_var": 2.451041666666667, + "learning_rate": 0.0001, + "loss": 7.4305, + "loss/crossentropy": 2.0674299761652946, + "loss/hidden": 3.517578125, + "loss/jsd": 0.0, + "loss/logits": 0.21062961965799332, + "step": 380 + }, + { + "epoch": 0.00975, + "grad_norm": 31.25, + "grad_norm_var": 5.645247395833334, + "learning_rate": 0.0001, + "loss": 7.5168, + "loss/crossentropy": 2.0279919117689134, + "loss/hidden": 3.503125, + "loss/jsd": 0.0, + "loss/logits": 0.20519332773983479, + "step": 390 + }, + { + "epoch": 0.01, + "grad_norm": 31.125, + "grad_norm_var": 5.928125, + "learning_rate": 0.0001, + "loss": 7.4985, + "loss/crossentropy": 2.0427632443606853, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.20287631042301654, + "step": 400 + }, + { + "epoch": 0.01025, + "grad_norm": 38.5, + "grad_norm_var": 438.43515625, + "learning_rate": 0.0001, + "loss": 7.5633, + "loss/crossentropy": 2.199043881893158, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.21130343191325665, + "step": 410 + }, + { + "epoch": 0.0105, + "grad_norm": 30.875, + "grad_norm_var": 43.14140625, + "learning_rate": 0.0001, + "loss": 7.4835, + "loss/crossentropy": 1.9102243572473525, + "loss/hidden": 3.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1895731385797262, + "step": 420 + }, + { + "epoch": 0.01075, + "grad_norm": 31.75, + "grad_norm_var": 5.658268229166667, + "learning_rate": 0.0001, + "loss": 7.3897, + "loss/crossentropy": 2.159160128980875, + "loss/hidden": 3.464453125, + "loss/jsd": 0.0, + "loss/logits": 0.20280379485338926, + "step": 430 + }, + { + "epoch": 0.011, + "grad_norm": 28.375, + "grad_norm_var": 16.3375, + "learning_rate": 0.0001, + "loss": 7.5463, + "loss/crossentropy": 2.1217672407627104, + "loss/hidden": 3.545703125, + "loss/jsd": 0.0, + "loss/logits": 0.23856931366026402, + "step": 440 + }, + { + "epoch": 0.01125, + "grad_norm": 30.5, + "grad_norm_var": 17.098372395833334, + "learning_rate": 0.0001, + "loss": 7.5225, + "loss/crossentropy": 1.969854873791337, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.19548849146813155, + "step": 450 + }, + { + "epoch": 0.0115, + "grad_norm": 29.875, + "grad_norm_var": 2.5677083333333335, + "learning_rate": 0.0001, + "loss": 7.5046, + "loss/crossentropy": 2.121321603655815, + "loss/hidden": 3.476171875, + "loss/jsd": 0.0, + "loss/logits": 0.19364523217082025, + "step": 460 + }, + { + "epoch": 0.01175, + "grad_norm": 32.25, + "grad_norm_var": 8.585416666666667, + "learning_rate": 0.0001, + "loss": 7.4558, + "loss/crossentropy": 1.9360710382461548, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.1893781816586852, + "step": 470 + }, + { + "epoch": 0.012, + "grad_norm": 29.875, + "grad_norm_var": 3.417122395833333, + "learning_rate": 0.0001, + "loss": 7.531, + "loss/crossentropy": 2.082458943128586, + "loss/hidden": 3.471875, + "loss/jsd": 0.0, + "loss/logits": 0.2220946006476879, + "step": 480 + }, + { + "epoch": 0.01225, + "grad_norm": 31.0, + "grad_norm_var": 48.96640625, + "learning_rate": 0.0001, + "loss": 7.5651, + "loss/crossentropy": 2.1382531195878984, + "loss/hidden": 3.480078125, + "loss/jsd": 0.0, + "loss/logits": 0.20847559962421655, + "step": 490 + }, + { + "epoch": 0.0125, + "grad_norm": 29.875, + "grad_norm_var": 49.2666015625, + "learning_rate": 0.0001, + "loss": 7.5679, + "loss/crossentropy": 2.0875915244221686, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.1850985599681735, + "step": 500 + }, + { + "epoch": 0.01275, + "grad_norm": 31.875, + "grad_norm_var": 1.45, + "learning_rate": 0.0001, + "loss": 7.5263, + "loss/crossentropy": 2.182442346215248, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.19555890336632728, + "step": 510 + }, + { + "epoch": 0.013, + "grad_norm": 34.0, + "grad_norm_var": 1.6931640625, + "learning_rate": 0.0001, + "loss": 7.5209, + "loss/crossentropy": 1.9812136888504028, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1965757070109248, + "step": 520 + }, + { + "epoch": 0.01325, + "grad_norm": 31.0, + "grad_norm_var": 2.101822916666667, + "learning_rate": 0.0001, + "loss": 7.6059, + "loss/crossentropy": 2.0372241511940956, + "loss/hidden": 3.564453125, + "loss/jsd": 0.0, + "loss/logits": 0.204646560549736, + "step": 530 + }, + { + "epoch": 0.0135, + "grad_norm": 29.125, + "grad_norm_var": 20.071875, + "learning_rate": 0.0001, + "loss": 7.5725, + "loss/crossentropy": 2.155761349201202, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.19602423422038556, + "step": 540 + }, + { + "epoch": 0.01375, + "grad_norm": 29.125, + "grad_norm_var": 20.506705729166665, + "learning_rate": 0.0001, + "loss": 7.5842, + "loss/crossentropy": 1.8869566857814788, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.20957522764801978, + "step": 550 + }, + { + "epoch": 0.014, + "grad_norm": 30.625, + "grad_norm_var": 10.025455729166667, + "learning_rate": 0.0001, + "loss": 7.4975, + "loss/crossentropy": 2.0370677679777147, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.19026046600192786, + "step": 560 + }, + { + "epoch": 0.01425, + "grad_norm": 33.0, + "grad_norm_var": 2.2270833333333333, + "learning_rate": 0.0001, + "loss": 7.5688, + "loss/crossentropy": 2.1931444257497787, + "loss/hidden": 3.415234375, + "loss/jsd": 0.0, + "loss/logits": 0.2036376902833581, + "step": 570 + }, + { + "epoch": 0.0145, + "grad_norm": 35.0, + "grad_norm_var": 3.5681640625, + "learning_rate": 0.0001, + "loss": 7.478, + "loss/crossentropy": 2.061052493005991, + "loss/hidden": 3.478125, + "loss/jsd": 0.0, + "loss/logits": 0.2282864760607481, + "step": 580 + }, + { + "epoch": 0.01475, + "grad_norm": 32.5, + "grad_norm_var": 2.8705729166666667, + "learning_rate": 0.0001, + "loss": 7.5957, + "loss/crossentropy": 2.0078392371535303, + "loss/hidden": 3.45, + "loss/jsd": 0.0, + "loss/logits": 0.19647251404821872, + "step": 590 + }, + { + "epoch": 0.015, + "grad_norm": 30.25, + "grad_norm_var": 31.449934895833334, + "learning_rate": 0.0001, + "loss": 7.5096, + "loss/crossentropy": 2.0417068414390087, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.19782953998073935, + "step": 600 + }, + { + "epoch": 0.01525, + "grad_norm": 30.5, + "grad_norm_var": 26.253059895833335, + "learning_rate": 0.0001, + "loss": 7.5368, + "loss/crossentropy": 2.1738049775362014, + "loss/hidden": 3.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.1996332859620452, + "step": 610 + }, + { + "epoch": 0.0155, + "grad_norm": 30.125, + "grad_norm_var": 2.334375, + "learning_rate": 0.0001, + "loss": 7.4868, + "loss/crossentropy": 1.7587297886610032, + "loss/hidden": 3.475390625, + "loss/jsd": 0.0, + "loss/logits": 0.18938990794122218, + "step": 620 + }, + { + "epoch": 0.01575, + "grad_norm": 29.25, + "grad_norm_var": 27.393684895833335, + "learning_rate": 0.0001, + "loss": 7.4833, + "loss/crossentropy": 1.9551145888864994, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.20075901364907622, + "step": 630 + }, + { + "epoch": 0.016, + "grad_norm": 29.75, + "grad_norm_var": 29.6947265625, + "learning_rate": 0.0001, + "loss": 7.4608, + "loss/crossentropy": 2.128718316555023, + "loss/hidden": 3.3625, + "loss/jsd": 0.0, + "loss/logits": 0.19077460393309592, + "step": 640 + }, + { + "epoch": 0.01625, + "grad_norm": 29.75, + "grad_norm_var": 27.322330729166666, + "learning_rate": 0.0001, + "loss": 7.6033, + "loss/crossentropy": 1.9678708665072917, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.18875791020691396, + "step": 650 + }, + { + "epoch": 0.0165, + "grad_norm": 30.375, + "grad_norm_var": 3.129622395833333, + "learning_rate": 0.0001, + "loss": 7.3873, + "loss/crossentropy": 1.9582339562475681, + "loss/hidden": 3.34765625, + "loss/jsd": 0.0, + "loss/logits": 0.18309127148240806, + "step": 660 + }, + { + "epoch": 0.01675, + "grad_norm": 32.75, + "grad_norm_var": 2.7009765625, + "learning_rate": 0.0001, + "loss": 7.4913, + "loss/crossentropy": 2.0773802563548087, + "loss/hidden": 3.505078125, + "loss/jsd": 0.0, + "loss/logits": 0.20910798981785775, + "step": 670 + }, + { + "epoch": 0.017, + "grad_norm": 34.0, + "grad_norm_var": 3.3854166666666665, + "learning_rate": 0.0001, + "loss": 7.4847, + "loss/crossentropy": 2.12913373708725, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.201920267008245, + "step": 680 + }, + { + "epoch": 0.01725, + "grad_norm": 30.75, + "grad_norm_var": 1.7176432291666666, + "learning_rate": 0.0001, + "loss": 7.5065, + "loss/crossentropy": 1.9141538538038732, + "loss/hidden": 3.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1841401271522045, + "step": 690 + }, + { + "epoch": 0.0175, + "grad_norm": 31.0, + "grad_norm_var": 1.6374348958333333, + "learning_rate": 0.0001, + "loss": 7.5897, + "loss/crossentropy": 2.207232800126076, + "loss/hidden": 3.399609375, + "loss/jsd": 0.0, + "loss/logits": 0.21376523859798907, + "step": 700 + }, + { + "epoch": 0.01775, + "grad_norm": 32.75, + "grad_norm_var": 2.3655598958333335, + "learning_rate": 0.0001, + "loss": 7.5075, + "loss/crossentropy": 2.03845998942852, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.1920805646572262, + "step": 710 + }, + { + "epoch": 0.018, + "grad_norm": 32.5, + "grad_norm_var": 1.3893229166666667, + "learning_rate": 0.0001, + "loss": 7.4669, + "loss/crossentropy": 2.054341807588935, + "loss/hidden": 3.489453125, + "loss/jsd": 0.0, + "loss/logits": 0.19716067584231495, + "step": 720 + }, + { + "epoch": 0.01825, + "grad_norm": 31.625, + "grad_norm_var": 3.54140625, + "learning_rate": 0.0001, + "loss": 7.517, + "loss/crossentropy": 2.2111608639359472, + "loss/hidden": 3.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.20262118335813284, + "step": 730 + }, + { + "epoch": 0.0185, + "grad_norm": 29.125, + "grad_norm_var": 4.692122395833334, + "learning_rate": 0.0001, + "loss": 7.4784, + "loss/crossentropy": 2.0551758617162705, + "loss/hidden": 3.446875, + "loss/jsd": 0.0, + "loss/logits": 0.20378697756677866, + "step": 740 + }, + { + "epoch": 0.01875, + "grad_norm": 33.0, + "grad_norm_var": 4.295572916666667, + "learning_rate": 0.0001, + "loss": 7.4016, + "loss/crossentropy": 2.128055375814438, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.19904747987166047, + "step": 750 + }, + { + "epoch": 0.019, + "grad_norm": 6106906624.0, + "grad_norm_var": 2.3308942582349476e+18, + "learning_rate": 0.0001, + "loss": 7.4633, + "loss/crossentropy": 2.248567137122154, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.19723597317934036, + "step": 760 + }, + { + "epoch": 0.01925, + "grad_norm": 28.5, + "grad_norm_var": 2.330894258158611e+18, + "learning_rate": 0.0001, + "loss": 7.4542, + "loss/crossentropy": 2.132212319970131, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.18174959290772677, + "step": 770 + }, + { + "epoch": 0.0195, + "grad_norm": 36.5, + "grad_norm_var": 4.833333333333333, + "learning_rate": 0.0001, + "loss": 7.465, + "loss/crossentropy": 2.046277052164078, + "loss/hidden": 3.491015625, + "loss/jsd": 0.0, + "loss/logits": 0.21161840241402388, + "step": 780 + }, + { + "epoch": 0.01975, + "grad_norm": 32.75, + "grad_norm_var": 5.137434895833334, + "learning_rate": 0.0001, + "loss": 7.4171, + "loss/crossentropy": 2.058088332414627, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.1815673651173711, + "step": 790 + }, + { + "epoch": 0.02, + "grad_norm": 30.125, + "grad_norm_var": 12.37265625, + "learning_rate": 0.0001, + "loss": 7.4153, + "loss/crossentropy": 2.064726157486439, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.19402222614735365, + "step": 800 + }, + { + "epoch": 0.02025, + "grad_norm": 32.0, + "grad_norm_var": 12.240625, + "learning_rate": 0.0001, + "loss": 7.3739, + "loss/crossentropy": 2.0926051691174505, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.21017331834882497, + "step": 810 + }, + { + "epoch": 0.0205, + "grad_norm": 31.875, + "grad_norm_var": 3.6853515625, + "learning_rate": 0.0001, + "loss": 7.409, + "loss/crossentropy": 2.016859006881714, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.20363395065069198, + "step": 820 + }, + { + "epoch": 0.02075, + "grad_norm": 34.0, + "grad_norm_var": 278.1108723958333, + "learning_rate": 0.0001, + "loss": 7.6725, + "loss/crossentropy": 2.03957434669137, + "loss/hidden": 3.4625, + "loss/jsd": 0.0, + "loss/logits": 0.19866096526384353, + "step": 830 + }, + { + "epoch": 0.021, + "grad_norm": 35.75, + "grad_norm_var": 281.2239583333333, + "learning_rate": 0.0001, + "loss": 7.4058, + "loss/crossentropy": 2.1190530106425287, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.19663097113370895, + "step": 840 + }, + { + "epoch": 0.02125, + "grad_norm": 32.25, + "grad_norm_var": 4.044791666666667, + "learning_rate": 0.0001, + "loss": 7.4687, + "loss/crossentropy": 2.1552326917648315, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.19604418501257898, + "step": 850 + }, + { + "epoch": 0.0215, + "grad_norm": 37.25, + "grad_norm_var": 2.7587362193217157e+18, + "learning_rate": 0.0001, + "loss": 7.5552, + "loss/crossentropy": 2.1164004117250443, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.19724889248609542, + "step": 860 + }, + { + "epoch": 0.02175, + "grad_norm": 35.25, + "grad_norm_var": 2.758736219342478e+18, + "learning_rate": 0.0001, + "loss": 7.5021, + "loss/crossentropy": 2.036998500674963, + "loss/hidden": 3.298828125, + "loss/jsd": 0.0, + "loss/logits": 0.18320635841228067, + "step": 870 + }, + { + "epoch": 0.022, + "grad_norm": 37.0, + "grad_norm_var": 16.9541015625, + "learning_rate": 0.0001, + "loss": 7.5059, + "loss/crossentropy": 1.9707016140222549, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.20436920877546072, + "step": 880 + }, + { + "epoch": 0.02225, + "grad_norm": 31.375, + "grad_norm_var": 30.538541666666667, + "learning_rate": 0.0001, + "loss": 7.4935, + "loss/crossentropy": 2.206394499540329, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.20495780408382416, + "step": 890 + }, + { + "epoch": 0.0225, + "grad_norm": 29.875, + "grad_norm_var": 28.020833333333332, + "learning_rate": 0.0001, + "loss": 7.4823, + "loss/crossentropy": 2.091763325035572, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.20592593550682067, + "step": 900 + }, + { + "epoch": 0.02275, + "grad_norm": 31.875, + "grad_norm_var": 3.5645182291666666, + "learning_rate": 0.0001, + "loss": 7.422, + "loss/crossentropy": 1.9740761511027813, + "loss/hidden": 3.494921875, + "loss/jsd": 0.0, + "loss/logits": 0.2015986293554306, + "step": 910 + }, + { + "epoch": 0.023, + "grad_norm": 32.0, + "grad_norm_var": 56.256184895833336, + "learning_rate": 0.0001, + "loss": 7.4528, + "loss/crossentropy": 2.030415116250515, + "loss/hidden": 3.205078125, + "loss/jsd": 0.0, + "loss/logits": 0.1614784031175077, + "step": 920 + }, + { + "epoch": 0.02325, + "grad_norm": 30.0, + "grad_norm_var": 57.1619140625, + "learning_rate": 0.0001, + "loss": 7.3713, + "loss/crossentropy": 2.0250086903572084, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.19023355115205048, + "step": 930 + }, + { + "epoch": 0.0235, + "grad_norm": 30.625, + "grad_norm_var": 1.3830729166666667, + "learning_rate": 0.0001, + "loss": 7.5277, + "loss/crossentropy": 2.222324788570404, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.19078677501529456, + "step": 940 + }, + { + "epoch": 0.02375, + "grad_norm": 31.0, + "grad_norm_var": 3.1455729166666666, + "learning_rate": 0.0001, + "loss": 7.5086, + "loss/crossentropy": 2.1299516543745995, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.21310927756130696, + "step": 950 + }, + { + "epoch": 0.024, + "grad_norm": 29.875, + "grad_norm_var": 8.883072916666666, + "learning_rate": 0.0001, + "loss": 7.5579, + "loss/crossentropy": 2.0535727672278883, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.18507701791822911, + "step": 960 + }, + { + "epoch": 0.02425, + "grad_norm": 32.75, + "grad_norm_var": 2.5916015625, + "learning_rate": 0.0001, + "loss": 7.537, + "loss/crossentropy": 2.1785535484552385, + "loss/hidden": 3.309765625, + "loss/jsd": 0.0, + "loss/logits": 0.1955953363329172, + "step": 970 + }, + { + "epoch": 0.0245, + "grad_norm": 36.5, + "grad_norm_var": 6.852083333333334, + "learning_rate": 0.0001, + "loss": 7.5091, + "loss/crossentropy": 2.0967498391866686, + "loss/hidden": 3.43515625, + "loss/jsd": 0.0, + "loss/logits": 0.2146583067253232, + "step": 980 + }, + { + "epoch": 0.02475, + "grad_norm": 29.625, + "grad_norm_var": 4.325455729166666, + "learning_rate": 0.0001, + "loss": 7.5901, + "loss/crossentropy": 2.1134474128484726, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.19056662563234567, + "step": 990 + }, + { + "epoch": 0.025, + "grad_norm": 42.0, + "grad_norm_var": 4.1552039405313587e+18, + "learning_rate": 0.0001, + "loss": 7.6082, + "loss/crossentropy": 2.0916516482830048, + "loss/hidden": 3.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.19376826155930757, + "step": 1000 + }, + { + "epoch": 0.02525, + "grad_norm": 29.625, + "grad_norm_var": 4.1552039416015355e+18, + "learning_rate": 0.0001, + "loss": 7.4528, + "loss/crossentropy": 2.003750593960285, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.18129821103066207, + "step": 1010 + }, + { + "epoch": 0.0255, + "grad_norm": 35.25, + "grad_norm_var": 24.095572916666665, + "learning_rate": 0.0001, + "loss": 7.5395, + "loss/crossentropy": 2.0453194856643675, + "loss/hidden": 3.477734375, + "loss/jsd": 0.0, + "loss/logits": 0.199107607267797, + "step": 1020 + }, + { + "epoch": 0.02575, + "grad_norm": 32.25, + "grad_norm_var": 19.5259765625, + "learning_rate": 0.0001, + "loss": 7.31, + "loss/crossentropy": 2.1016619503498077, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.184703135676682, + "step": 1030 + }, + { + "epoch": 0.026, + "grad_norm": 30.75, + "grad_norm_var": 1.87890625, + "learning_rate": 0.0001, + "loss": 7.5425, + "loss/crossentropy": 2.1467826470732687, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.20074132941663264, + "step": 1040 + }, + { + "epoch": 0.02625, + "grad_norm": 30.625, + "grad_norm_var": 0.7452473958333333, + "learning_rate": 0.0001, + "loss": 7.4114, + "loss/crossentropy": 2.049474111199379, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.20267941821366547, + "step": 1050 + }, + { + "epoch": 0.0265, + "grad_norm": 31.75, + "grad_norm_var": 3.124739583333333, + "learning_rate": 0.0001, + "loss": 7.4845, + "loss/crossentropy": 2.036583887040615, + "loss/hidden": 3.391796875, + "loss/jsd": 0.0, + "loss/logits": 0.1893632340244949, + "step": 1060 + }, + { + "epoch": 0.02675, + "grad_norm": 40.75, + "grad_norm_var": 3.405847188209664e+18, + "learning_rate": 0.0001, + "loss": 7.3982, + "loss/crossentropy": 2.124411530792713, + "loss/hidden": 3.4484375, + "loss/jsd": 0.0, + "loss/logits": 0.19454579129815103, + "step": 1070 + }, + { + "epoch": 0.027, + "grad_norm": 28.25, + "grad_norm_var": 3.4058471885941417e+18, + "learning_rate": 0.0001, + "loss": 7.3928, + "loss/crossentropy": 2.0034691862761975, + "loss/hidden": 3.503515625, + "loss/jsd": 0.0, + "loss/logits": 0.21349683087319135, + "step": 1080 + }, + { + "epoch": 0.02725, + "grad_norm": 29.875, + "grad_norm_var": 4.88515625, + "learning_rate": 0.0001, + "loss": 7.5095, + "loss/crossentropy": 1.9183670297265052, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.19249978363513948, + "step": 1090 + }, + { + "epoch": 0.0275, + "grad_norm": 30.5, + "grad_norm_var": 3.2728515625, + "learning_rate": 0.0001, + "loss": 7.37, + "loss/crossentropy": 2.145428071916103, + "loss/hidden": 3.35703125, + "loss/jsd": 0.0, + "loss/logits": 0.19729665387421846, + "step": 1100 + }, + { + "epoch": 0.02775, + "grad_norm": 31.25, + "grad_norm_var": 2.34765625, + "learning_rate": 0.0001, + "loss": 7.4772, + "loss/crossentropy": 2.10652961358428, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.19585925145074726, + "step": 1110 + }, + { + "epoch": 0.028, + "grad_norm": 31.25, + "grad_norm_var": 2.434477049308093e+18, + "learning_rate": 0.0001, + "loss": 7.4016, + "loss/crossentropy": 1.9645449101924897, + "loss/hidden": 3.44453125, + "loss/jsd": 0.0, + "loss/logits": 0.19977953620254993, + "step": 1120 + }, + { + "epoch": 0.02825, + "grad_norm": 32.0, + "grad_norm_var": 2.4344770492950907e+18, + "learning_rate": 0.0001, + "loss": 7.4453, + "loss/crossentropy": 2.131172102689743, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.2083016105927527, + "step": 1130 + }, + { + "epoch": 0.0285, + "grad_norm": 32.75, + "grad_norm_var": 3.7080729166666666, + "learning_rate": 0.0001, + "loss": 7.4009, + "loss/crossentropy": 2.003016713261604, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.18665643623098732, + "step": 1140 + }, + { + "epoch": 0.02875, + "grad_norm": 30.875, + "grad_norm_var": 1.34765625, + "learning_rate": 0.0001, + "loss": 7.5648, + "loss/crossentropy": 2.0709651306271555, + "loss/hidden": 3.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.18793081305921078, + "step": 1150 + }, + { + "epoch": 0.029, + "grad_norm": 32.25, + "grad_norm_var": 2.1582682291666666, + "learning_rate": 0.0001, + "loss": 7.4644, + "loss/crossentropy": 2.06434089243412, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.2109043262898922, + "step": 1160 + }, + { + "epoch": 0.02925, + "grad_norm": 31.375, + "grad_norm_var": 2.4010416666666665, + "learning_rate": 0.0001, + "loss": 7.4403, + "loss/crossentropy": 2.0107607185840606, + "loss/hidden": 3.498046875, + "loss/jsd": 0.0, + "loss/logits": 0.20349722560495137, + "step": 1170 + }, + { + "epoch": 0.0295, + "grad_norm": 33.25, + "grad_norm_var": 1.2260416666666667, + "learning_rate": 0.0001, + "loss": 7.4412, + "loss/crossentropy": 2.096436749398708, + "loss/hidden": 3.474609375, + "loss/jsd": 0.0, + "loss/logits": 0.20087064132094384, + "step": 1180 + }, + { + "epoch": 0.02975, + "grad_norm": 29.75, + "grad_norm_var": 1.8046223958333334, + "learning_rate": 0.0001, + "loss": 7.4458, + "loss/crossentropy": 1.972258360683918, + "loss/hidden": 3.583984375, + "loss/jsd": 0.0, + "loss/logits": 0.20998958311975002, + "step": 1190 + }, + { + "epoch": 0.03, + "grad_norm": 33.75, + "grad_norm_var": 3.7395833333333335, + "learning_rate": 0.0001, + "loss": 7.3931, + "loss/crossentropy": 1.8556599006056786, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.19810242671519518, + "step": 1200 + }, + { + "epoch": 0.03025, + "grad_norm": 29.0, + "grad_norm_var": 9.394791666666666, + "learning_rate": 0.0001, + "loss": 7.5849, + "loss/crossentropy": 2.0611833460628985, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.19216072149574756, + "step": 1210 + }, + { + "epoch": 0.0305, + "grad_norm": 31.75, + "grad_norm_var": 3.26640625, + "learning_rate": 0.0001, + "loss": 7.4844, + "loss/crossentropy": 2.0546294137835504, + "loss/hidden": 3.58828125, + "loss/jsd": 0.0, + "loss/logits": 0.21588555499911308, + "step": 1220 + }, + { + "epoch": 0.03075, + "grad_norm": 31.625, + "grad_norm_var": 2.3968098958333335, + "learning_rate": 0.0001, + "loss": 7.4858, + "loss/crossentropy": 2.0615282475948336, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.206529095210135, + "step": 1230 + }, + { + "epoch": 0.031, + "grad_norm": 32.0, + "grad_norm_var": 1.6124348958333334, + "learning_rate": 0.0001, + "loss": 7.4647, + "loss/crossentropy": 1.9786661133170127, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.17899234425276517, + "step": 1240 + }, + { + "epoch": 0.03125, + "grad_norm": 5838471168.0, + "grad_norm_var": 2.1304840753447437e+18, + "learning_rate": 0.0001, + "loss": 7.4926, + "loss/crossentropy": 2.04936410933733, + "loss/hidden": 3.714453125, + "loss/jsd": 0.0, + "loss/logits": 0.1995564555749297, + "step": 1250 + }, + { + "epoch": 0.0315, + "grad_norm": 31.25, + "grad_norm_var": 2.1304840747304878e+18, + "learning_rate": 0.0001, + "loss": 7.5078, + "loss/crossentropy": 2.1189576953649523, + "loss/hidden": 3.43515625, + "loss/jsd": 0.0, + "loss/logits": 0.19967459067702292, + "step": 1260 + }, + { + "epoch": 0.03175, + "grad_norm": 30.5, + "grad_norm_var": 3.178580729166667, + "learning_rate": 0.0001, + "loss": 7.4255, + "loss/crossentropy": 2.163596141338348, + "loss/hidden": 3.4546875, + "loss/jsd": 0.0, + "loss/logits": 0.19321363251656293, + "step": 1270 + }, + { + "epoch": 0.032, + "grad_norm": 33.25, + "grad_norm_var": 2.1639973958333334, + "learning_rate": 0.0001, + "loss": 7.4609, + "loss/crossentropy": 1.9938266813755035, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.18334759529680014, + "step": 1280 + }, + { + "epoch": 0.03225, + "grad_norm": 29.375, + "grad_norm_var": 1.67890625, + "learning_rate": 0.0001, + "loss": 7.4652, + "loss/crossentropy": 2.161333967000246, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.19740422032773494, + "step": 1290 + }, + { + "epoch": 0.0325, + "grad_norm": 32.75, + "grad_norm_var": 3.0385416666666667, + "learning_rate": 0.0001, + "loss": 7.3146, + "loss/crossentropy": 2.0165325723588468, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.19117104820907116, + "step": 1300 + }, + { + "epoch": 0.03275, + "grad_norm": 28.25, + "grad_norm_var": 9.158072916666667, + "learning_rate": 0.0001, + "loss": 7.4955, + "loss/crossentropy": 2.124955786764622, + "loss/hidden": 3.491015625, + "loss/jsd": 0.0, + "loss/logits": 0.19802952595055104, + "step": 1310 + }, + { + "epoch": 0.033, + "grad_norm": 30.75, + "grad_norm_var": 2.4535807291666667, + "learning_rate": 0.0001, + "loss": 7.4311, + "loss/crossentropy": 2.018800371140242, + "loss/hidden": 3.542578125, + "loss/jsd": 0.0, + "loss/logits": 0.2196814114227891, + "step": 1320 + }, + { + "epoch": 0.03325, + "grad_norm": 31.375, + "grad_norm_var": 2.39375, + "learning_rate": 0.0001, + "loss": 7.5164, + "loss/crossentropy": 2.0520452961325644, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.2013697015121579, + "step": 1330 + }, + { + "epoch": 0.0335, + "grad_norm": 32.5, + "grad_norm_var": 1.0431640625, + "learning_rate": 0.0001, + "loss": 7.5302, + "loss/crossentropy": 2.12932348549366, + "loss/hidden": 3.525, + "loss/jsd": 0.0, + "loss/logits": 0.20245677568018436, + "step": 1340 + }, + { + "epoch": 0.03375, + "grad_norm": 30.625, + "grad_norm_var": 3.3900390625, + "learning_rate": 0.0001, + "loss": 7.5292, + "loss/crossentropy": 2.031618994474411, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.19062725063413383, + "step": 1350 + }, + { + "epoch": 0.034, + "grad_norm": 32.0, + "grad_norm_var": 3.3447265625, + "learning_rate": 0.0001, + "loss": 7.5755, + "loss/crossentropy": 2.2257011234760284, + "loss/hidden": 3.447265625, + "loss/jsd": 0.0, + "loss/logits": 0.1979327043518424, + "step": 1360 + }, + { + "epoch": 0.03425, + "grad_norm": 30.625, + "grad_norm_var": 3.3421223958333335, + "learning_rate": 0.0001, + "loss": 7.4219, + "loss/crossentropy": 2.155778780579567, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.19018295016139747, + "step": 1370 + }, + { + "epoch": 0.0345, + "grad_norm": 30.25, + "grad_norm_var": 2.5872395833333335, + "learning_rate": 0.0001, + "loss": 7.4637, + "loss/crossentropy": 2.058405503630638, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.2114524593576789, + "step": 1380 + }, + { + "epoch": 0.03475, + "grad_norm": 32.5, + "grad_norm_var": 3.2994140625, + "learning_rate": 0.0001, + "loss": 7.5834, + "loss/crossentropy": 2.1654782712459566, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.2024593001231551, + "step": 1390 + }, + { + "epoch": 0.035, + "grad_norm": 31.125, + "grad_norm_var": 12.812239583333334, + "learning_rate": 0.0001, + "loss": 7.4442, + "loss/crossentropy": 2.0921876966953277, + "loss/hidden": 3.286328125, + "loss/jsd": 0.0, + "loss/logits": 0.19270132519304753, + "step": 1400 + }, + { + "epoch": 0.03525, + "grad_norm": 29.25, + "grad_norm_var": 1.5108723958333334, + "learning_rate": 0.0001, + "loss": 7.4779, + "loss/crossentropy": 1.9434148371219635, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.17576389852911234, + "step": 1410 + }, + { + "epoch": 0.0355, + "grad_norm": 30.125, + "grad_norm_var": 2.154166666666667, + "learning_rate": 0.0001, + "loss": 7.508, + "loss/crossentropy": 2.0766889482736586, + "loss/hidden": 3.485546875, + "loss/jsd": 0.0, + "loss/logits": 0.20394362770020963, + "step": 1420 + }, + { + "epoch": 0.03575, + "grad_norm": 30.125, + "grad_norm_var": 17.580208333333335, + "learning_rate": 0.0001, + "loss": 7.4612, + "loss/crossentropy": 2.00380075648427, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.18816210143268108, + "step": 1430 + }, + { + "epoch": 0.036, + "grad_norm": 31.375, + "grad_norm_var": 16.758268229166667, + "learning_rate": 0.0001, + "loss": 7.4602, + "loss/crossentropy": 2.1938020154833793, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.2016971528530121, + "step": 1440 + }, + { + "epoch": 0.03625, + "grad_norm": 30.875, + "grad_norm_var": 1.2556640625, + "learning_rate": 0.0001, + "loss": 7.4245, + "loss/crossentropy": 2.0232372283935547, + "loss/hidden": 3.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.19209201391786337, + "step": 1450 + }, + { + "epoch": 0.0365, + "grad_norm": 31.0, + "grad_norm_var": 1.4041015625, + "learning_rate": 0.0001, + "loss": 7.5518, + "loss/crossentropy": 2.2000616788864136, + "loss/hidden": 3.473046875, + "loss/jsd": 0.0, + "loss/logits": 0.22938326951116322, + "step": 1460 + }, + { + "epoch": 0.03675, + "grad_norm": 28.375, + "grad_norm_var": 2.0322916666666666, + "learning_rate": 0.0001, + "loss": 7.4397, + "loss/crossentropy": 2.0838582158088683, + "loss/hidden": 3.451953125, + "loss/jsd": 0.0, + "loss/logits": 0.20685861641541123, + "step": 1470 + }, + { + "epoch": 0.037, + "grad_norm": 32.0, + "grad_norm_var": 1.5020833333333334, + "learning_rate": 0.0001, + "loss": 7.4183, + "loss/crossentropy": 2.149951633810997, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.1984950641170144, + "step": 1480 + }, + { + "epoch": 0.03725, + "grad_norm": 33.75, + "grad_norm_var": 34.10826822916667, + "learning_rate": 0.0001, + "loss": 7.453, + "loss/crossentropy": 2.128306310623884, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.19783397912979125, + "step": 1490 + }, + { + "epoch": 0.0375, + "grad_norm": 29.5, + "grad_norm_var": 5.008072916666666, + "learning_rate": 0.0001, + "loss": 7.469, + "loss/crossentropy": 2.042660539597273, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.19274956732988358, + "step": 1500 + }, + { + "epoch": 0.03775, + "grad_norm": 33.0, + "grad_norm_var": 19.1775390625, + "learning_rate": 0.0001, + "loss": 7.4119, + "loss/crossentropy": 2.043857058137655, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.18266947232186795, + "step": 1510 + }, + { + "epoch": 0.038, + "grad_norm": 29.625, + "grad_norm_var": 14.303580729166667, + "learning_rate": 0.0001, + "loss": 7.4362, + "loss/crossentropy": 1.9492302805185318, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.1754497304558754, + "step": 1520 + }, + { + "epoch": 0.03825, + "grad_norm": 29.75, + "grad_norm_var": 23.764518229166665, + "learning_rate": 0.0001, + "loss": 7.4444, + "loss/crossentropy": 2.0668226674199106, + "loss/hidden": 3.473828125, + "loss/jsd": 0.0, + "loss/logits": 0.1921279976144433, + "step": 1530 + }, + { + "epoch": 0.0385, + "grad_norm": 32.75, + "grad_norm_var": 3.2226069790467994e+18, + "learning_rate": 0.0001, + "loss": 7.5077, + "loss/crossentropy": 2.1122784771025183, + "loss/hidden": 3.46953125, + "loss/jsd": 0.0, + "loss/logits": 0.22245875597000123, + "step": 1540 + }, + { + "epoch": 0.03875, + "grad_norm": 30.25, + "grad_norm_var": 5.382291666666666, + "learning_rate": 0.0001, + "loss": 7.4525, + "loss/crossentropy": 2.264697426557541, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.2075907403603196, + "step": 1550 + }, + { + "epoch": 0.039, + "grad_norm": 30.0, + "grad_norm_var": 6.353580729166667, + "learning_rate": 0.0001, + "loss": 7.5064, + "loss/crossentropy": 2.1150408178567885, + "loss/hidden": 3.5203125, + "loss/jsd": 0.0, + "loss/logits": 0.23207673486322164, + "step": 1560 + }, + { + "epoch": 0.03925, + "grad_norm": 34.25, + "grad_norm_var": 6.72265625, + "learning_rate": 0.0001, + "loss": 7.4578, + "loss/crossentropy": 2.188142140209675, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.20429779235273599, + "step": 1570 + }, + { + "epoch": 0.0395, + "grad_norm": 34.75, + "grad_norm_var": 897.6666015625, + "learning_rate": 0.0001, + "loss": 7.434, + "loss/crossentropy": 2.0795677445828913, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.18706642352044583, + "step": 1580 + }, + { + "epoch": 0.03975, + "grad_norm": 28.0, + "grad_norm_var": 903.6327473958333, + "learning_rate": 0.0001, + "loss": 7.5655, + "loss/crossentropy": 2.1025844663381577, + "loss/hidden": 3.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.1966788914054632, + "step": 1590 + }, + { + "epoch": 0.04, + "grad_norm": 28.625, + "grad_norm_var": 11.97890625, + "learning_rate": 0.0001, + "loss": 7.2578, + "loss/crossentropy": 2.050418493151665, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.20104087069630622, + "step": 1600 + }, + { + "epoch": 0.04025, + "grad_norm": 28.0, + "grad_norm_var": 2.255989583333333, + "learning_rate": 0.0001, + "loss": 7.4393, + "loss/crossentropy": 2.1767756581306457, + "loss/hidden": 3.5140625, + "loss/jsd": 0.0, + "loss/logits": 0.2213939843699336, + "step": 1610 + }, + { + "epoch": 0.0405, + "grad_norm": 29.75, + "grad_norm_var": 3.80390625, + "learning_rate": 0.0001, + "loss": 7.5026, + "loss/crossentropy": 2.126803469657898, + "loss/hidden": 3.39375, + "loss/jsd": 0.0, + "loss/logits": 0.19106289148330688, + "step": 1620 + }, + { + "epoch": 0.04075, + "grad_norm": 32.0, + "grad_norm_var": 3.1249348958333334, + "learning_rate": 0.0001, + "loss": 7.4274, + "loss/crossentropy": 2.144256164133549, + "loss/hidden": 3.424609375, + "loss/jsd": 0.0, + "loss/logits": 0.21435861438512802, + "step": 1630 + }, + { + "epoch": 0.041, + "grad_norm": 30.25, + "grad_norm_var": 29.265559895833334, + "learning_rate": 0.0001, + "loss": 7.5728, + "loss/crossentropy": 2.2575725719332693, + "loss/hidden": 3.4421875, + "loss/jsd": 0.0, + "loss/logits": 0.20658138059079648, + "step": 1640 + }, + { + "epoch": 0.04125, + "grad_norm": 30.5, + "grad_norm_var": 48.35390625, + "learning_rate": 0.0001, + "loss": 7.5776, + "loss/crossentropy": 2.096929042041302, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.18803389491513373, + "step": 1650 + }, + { + "epoch": 0.0415, + "grad_norm": 30.5, + "grad_norm_var": 1.1010416666666667, + "learning_rate": 0.0001, + "loss": 7.3792, + "loss/crossentropy": 2.0290944524109364, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.19023821037262678, + "step": 1660 + }, + { + "epoch": 0.04175, + "grad_norm": 28.125, + "grad_norm_var": 33.49270833333333, + "learning_rate": 0.0001, + "loss": 7.5018, + "loss/crossentropy": 2.0678361281752586, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.18862500675022603, + "step": 1670 + }, + { + "epoch": 0.042, + "grad_norm": 29.75, + "grad_norm_var": 2.2955729166666665, + "learning_rate": 0.0001, + "loss": 7.4432, + "loss/crossentropy": 2.0549797296524046, + "loss/hidden": 3.441796875, + "loss/jsd": 0.0, + "loss/logits": 0.19089050237089394, + "step": 1680 + }, + { + "epoch": 0.04225, + "grad_norm": 29.75, + "grad_norm_var": 1.8791666666666667, + "learning_rate": 0.0001, + "loss": 7.3842, + "loss/crossentropy": 2.0077505365014074, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.18722779098898173, + "step": 1690 + }, + { + "epoch": 0.0425, + "grad_norm": 29.375, + "grad_norm_var": 0.9434895833333333, + "learning_rate": 0.0001, + "loss": 7.4273, + "loss/crossentropy": 2.071325332671404, + "loss/hidden": 3.486328125, + "loss/jsd": 0.0, + "loss/logits": 0.20270166713744403, + "step": 1700 + }, + { + "epoch": 0.04275, + "grad_norm": 38.25, + "grad_norm_var": 7.669791666666667, + "learning_rate": 0.0001, + "loss": 7.4176, + "loss/crossentropy": 2.1353142291307448, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.19663168713450432, + "step": 1710 + }, + { + "epoch": 0.043, + "grad_norm": 28.25, + "grad_norm_var": 7.75, + "learning_rate": 0.0001, + "loss": 7.3818, + "loss/crossentropy": 1.9995346069335938, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.18310597026720643, + "step": 1720 + }, + { + "epoch": 0.04325, + "grad_norm": 29.5, + "grad_norm_var": 3.7619140625, + "learning_rate": 0.0001, + "loss": 7.4912, + "loss/crossentropy": 2.1415088951587675, + "loss/hidden": 3.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.22313783299177886, + "step": 1730 + }, + { + "epoch": 0.0435, + "grad_norm": 31.625, + "grad_norm_var": 3.0416666666666665, + "learning_rate": 0.0001, + "loss": 7.4999, + "loss/crossentropy": 2.1686330527067184, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.20409150077030064, + "step": 1740 + }, + { + "epoch": 0.04375, + "grad_norm": 31.375, + "grad_norm_var": 2.724739583333333, + "learning_rate": 0.0001, + "loss": 7.438, + "loss/crossentropy": 1.9411263287067413, + "loss/hidden": 3.304296875, + "loss/jsd": 0.0, + "loss/logits": 0.17631518254056572, + "step": 1750 + }, + { + "epoch": 0.044, + "grad_norm": 32.0, + "grad_norm_var": 1.9145833333333333, + "learning_rate": 0.0001, + "loss": 7.679, + "loss/crossentropy": 2.1614590853452684, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.194198589771986, + "step": 1760 + }, + { + "epoch": 0.04425, + "grad_norm": 28.5, + "grad_norm_var": 2.039322916666667, + "learning_rate": 0.0001, + "loss": 7.5095, + "loss/crossentropy": 2.282147654891014, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.19978236705064772, + "step": 1770 + }, + { + "epoch": 0.0445, + "grad_norm": 29.625, + "grad_norm_var": 2.34140625, + "learning_rate": 0.0001, + "loss": 7.5296, + "loss/crossentropy": 2.2078514605760575, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.19668537452816964, + "step": 1780 + }, + { + "epoch": 0.04475, + "grad_norm": 30.25, + "grad_norm_var": 2.70390625, + "learning_rate": 0.0001, + "loss": 7.5779, + "loss/crossentropy": 2.1053253799676894, + "loss/hidden": 3.433984375, + "loss/jsd": 0.0, + "loss/logits": 0.20323336366564035, + "step": 1790 + }, + { + "epoch": 0.045, + "grad_norm": 28.5, + "grad_norm_var": 4.8712890625, + "learning_rate": 0.0001, + "loss": 7.4866, + "loss/crossentropy": 2.060333488881588, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.18627767637372017, + "step": 1800 + }, + { + "epoch": 0.04525, + "grad_norm": 28.0, + "grad_norm_var": 14.480989583333333, + "learning_rate": 0.0001, + "loss": 7.5225, + "loss/crossentropy": 1.9755317773669958, + "loss/hidden": 3.54375, + "loss/jsd": 0.0, + "loss/logits": 0.20334282671101392, + "step": 1810 + }, + { + "epoch": 0.0455, + "grad_norm": 29.875, + "grad_norm_var": 12.935872395833334, + "learning_rate": 0.0001, + "loss": 7.4781, + "loss/crossentropy": 2.1289859026670457, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.1973018018528819, + "step": 1820 + }, + { + "epoch": 0.04575, + "grad_norm": 31.75, + "grad_norm_var": 2.123893229166667, + "learning_rate": 0.0001, + "loss": 7.3915, + "loss/crossentropy": 1.9609280914068221, + "loss/hidden": 3.386328125, + "loss/jsd": 0.0, + "loss/logits": 0.1916458262130618, + "step": 1830 + }, + { + "epoch": 0.046, + "grad_norm": 32.0, + "grad_norm_var": 1.6332682291666667, + "learning_rate": 0.0001, + "loss": 7.5095, + "loss/crossentropy": 2.0019985377788543, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.19768325993791222, + "step": 1840 + }, + { + "epoch": 0.04625, + "grad_norm": 29.875, + "grad_norm_var": 2.225455729166667, + "learning_rate": 0.0001, + "loss": 7.623, + "loss/crossentropy": 2.0607564479112623, + "loss/hidden": 3.507421875, + "loss/jsd": 0.0, + "loss/logits": 0.20858939345926047, + "step": 1850 + }, + { + "epoch": 0.0465, + "grad_norm": 29.5, + "grad_norm_var": 1.9863932291666666, + "learning_rate": 0.0001, + "loss": 7.3836, + "loss/crossentropy": 2.132562433928251, + "loss/hidden": 3.40859375, + "loss/jsd": 0.0, + "loss/logits": 0.1956317812204361, + "step": 1860 + }, + { + "epoch": 0.04675, + "grad_norm": 36.0, + "grad_norm_var": 3.2171223958333335, + "learning_rate": 0.0001, + "loss": 7.4803, + "loss/crossentropy": 2.0316790327429772, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.20630075875669718, + "step": 1870 + }, + { + "epoch": 0.047, + "grad_norm": 33.25, + "grad_norm_var": 16.304622395833334, + "learning_rate": 0.0001, + "loss": 7.576, + "loss/crossentropy": 2.161964085698128, + "loss/hidden": 3.513671875, + "loss/jsd": 0.0, + "loss/logits": 0.21842746511101724, + "step": 1880 + }, + { + "epoch": 0.04725, + "grad_norm": 29.75, + "grad_norm_var": 2.3541666666666665, + "learning_rate": 0.0001, + "loss": 7.5036, + "loss/crossentropy": 1.8695943117141725, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.18793469872325658, + "step": 1890 + }, + { + "epoch": 0.0475, + "grad_norm": 34.25, + "grad_norm_var": 2.1780598958333335, + "learning_rate": 0.0001, + "loss": 7.5623, + "loss/crossentropy": 2.2376974314451217, + "loss/hidden": 3.489453125, + "loss/jsd": 0.0, + "loss/logits": 0.21696731727570295, + "step": 1900 + }, + { + "epoch": 0.04775, + "grad_norm": 30.75, + "grad_norm_var": 14.924934895833333, + "learning_rate": 0.0001, + "loss": 7.388, + "loss/crossentropy": 1.9403380863368511, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.18128401823341847, + "step": 1910 + }, + { + "epoch": 0.048, + "grad_norm": 29.25, + "grad_norm_var": 25.1916015625, + "learning_rate": 0.0001, + "loss": 7.4109, + "loss/crossentropy": 2.1744547933340073, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.20097011709585785, + "step": 1920 + }, + { + "epoch": 0.04825, + "grad_norm": 29.25, + "grad_norm_var": 14.801822916666667, + "learning_rate": 0.0001, + "loss": 7.2893, + "loss/crossentropy": 2.101319019496441, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1921493023633957, + "step": 1930 + }, + { + "epoch": 0.0485, + "grad_norm": 30.125, + "grad_norm_var": 14.517708333333333, + "learning_rate": 0.0001, + "loss": 7.579, + "loss/crossentropy": 2.057158224284649, + "loss/hidden": 3.59140625, + "loss/jsd": 0.0, + "loss/logits": 0.21765361074358225, + "step": 1940 + }, + { + "epoch": 0.04875, + "grad_norm": 29.625, + "grad_norm_var": 15.790559895833333, + "learning_rate": 0.0001, + "loss": 7.3712, + "loss/crossentropy": 1.9415803879499436, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.18346730088815094, + "step": 1950 + }, + { + "epoch": 0.049, + "grad_norm": 27.625, + "grad_norm_var": 9.794791666666667, + "learning_rate": 0.0001, + "loss": 7.4902, + "loss/crossentropy": 2.035348242521286, + "loss/hidden": 3.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.20268035624176264, + "step": 1960 + }, + { + "epoch": 0.04925, + "grad_norm": 35.25, + "grad_norm_var": 12.768684895833333, + "learning_rate": 0.0001, + "loss": 7.4627, + "loss/crossentropy": 2.054542076587677, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.2003987120464444, + "step": 1970 + }, + { + "epoch": 0.0495, + "grad_norm": 36.0, + "grad_norm_var": 12.572916666666666, + "learning_rate": 0.0001, + "loss": 7.353, + "loss/crossentropy": 1.9634785205125809, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.17985089337453247, + "step": 1980 + }, + { + "epoch": 0.04975, + "grad_norm": 36.25, + "grad_norm_var": 9.2353515625, + "learning_rate": 0.0001, + "loss": 7.4473, + "loss/crossentropy": 2.059533824026585, + "loss/hidden": 3.3578125, + "loss/jsd": 0.0, + "loss/logits": 0.19096513148397207, + "step": 1990 + }, + { + "epoch": 0.05, + "grad_norm": 29.125, + "grad_norm_var": 13.320572916666666, + "learning_rate": 0.0001, + "loss": 7.3914, + "loss/crossentropy": 2.011685383319855, + "loss/hidden": 3.4421875, + "loss/jsd": 0.0, + "loss/logits": 0.19188414234668016, + "step": 2000 + }, + { + "epoch": 0.05025, + "grad_norm": 36.25, + "grad_norm_var": 14.026822916666667, + "learning_rate": 0.0001, + "loss": 7.4213, + "loss/crossentropy": 2.309766414761543, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.20372038893401623, + "step": 2010 + }, + { + "epoch": 0.0505, + "grad_norm": 29.0, + "grad_norm_var": 9.237239583333333, + "learning_rate": 0.0001, + "loss": 7.4145, + "loss/crossentropy": 2.1240487143397333, + "loss/hidden": 3.447265625, + "loss/jsd": 0.0, + "loss/logits": 0.20137840434908866, + "step": 2020 + }, + { + "epoch": 0.05075, + "grad_norm": 38.5, + "grad_norm_var": 89.21432291666666, + "learning_rate": 0.0001, + "loss": 7.3696, + "loss/crossentropy": 2.112667274475098, + "loss/hidden": 3.487109375, + "loss/jsd": 0.0, + "loss/logits": 0.19770587887614965, + "step": 2030 + }, + { + "epoch": 0.051, + "grad_norm": 27.75, + "grad_norm_var": 94.06015625, + "learning_rate": 0.0001, + "loss": 7.2471, + "loss/crossentropy": 1.9955052442848682, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.1880181163549423, + "step": 2040 + }, + { + "epoch": 0.05125, + "grad_norm": 35.25, + "grad_norm_var": 3.67265625, + "learning_rate": 0.0001, + "loss": 7.458, + "loss/crossentropy": 2.1320972844958304, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.18908526431769132, + "step": 2050 + }, + { + "epoch": 0.0515, + "grad_norm": 38.75, + "grad_norm_var": 10.776822916666667, + "learning_rate": 0.0001, + "loss": 7.3769, + "loss/crossentropy": 2.171598494052887, + "loss/hidden": 3.29765625, + "loss/jsd": 0.0, + "loss/logits": 0.18929236195981503, + "step": 2060 + }, + { + "epoch": 0.05175, + "grad_norm": 32.75, + "grad_norm_var": 10.53515625, + "learning_rate": 0.0001, + "loss": 7.5279, + "loss/crossentropy": 2.0172302186489106, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.2013201082125306, + "step": 2070 + }, + { + "epoch": 0.052, + "grad_norm": 32.0, + "grad_norm_var": 7.678125, + "learning_rate": 0.0001, + "loss": 7.3619, + "loss/crossentropy": 1.982726515084505, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.17850281894207, + "step": 2080 + }, + { + "epoch": 0.05225, + "grad_norm": 29.75, + "grad_norm_var": 63.6681640625, + "learning_rate": 0.0001, + "loss": 7.5109, + "loss/crossentropy": 2.121504098176956, + "loss/hidden": 3.50703125, + "loss/jsd": 0.0, + "loss/logits": 0.240205854550004, + "step": 2090 + }, + { + "epoch": 0.0525, + "grad_norm": 34.5, + "grad_norm_var": 7.506184895833333, + "learning_rate": 0.0001, + "loss": 7.4658, + "loss/crossentropy": 2.110687591135502, + "loss/hidden": 3.530078125, + "loss/jsd": 0.0, + "loss/logits": 0.2039638390764594, + "step": 2100 + }, + { + "epoch": 0.05275, + "grad_norm": 32.5, + "grad_norm_var": 19.075455729166666, + "learning_rate": 0.0001, + "loss": 7.5668, + "loss/crossentropy": 1.9557841390371322, + "loss/hidden": 3.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.18774209143593906, + "step": 2110 + }, + { + "epoch": 0.053, + "grad_norm": 31.125, + "grad_norm_var": 3.85390625, + "learning_rate": 0.0001, + "loss": 7.5735, + "loss/crossentropy": 2.0219520531594752, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.18533632289618254, + "step": 2120 + }, + { + "epoch": 0.05325, + "grad_norm": 32.25, + "grad_norm_var": 3.8910807291666667, + "learning_rate": 0.0001, + "loss": 7.4083, + "loss/crossentropy": 2.1359280541539194, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.1897095028311014, + "step": 2130 + }, + { + "epoch": 0.0535, + "grad_norm": 31.25, + "grad_norm_var": 2.5957682291666666, + "learning_rate": 0.0001, + "loss": 7.446, + "loss/crossentropy": 2.170258317142725, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.1826348526403308, + "step": 2140 + }, + { + "epoch": 0.05375, + "grad_norm": 31.25, + "grad_norm_var": 3.785416666666667, + "learning_rate": 0.0001, + "loss": 7.4014, + "loss/crossentropy": 2.131239393353462, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.18656531646847724, + "step": 2150 + }, + { + "epoch": 0.054, + "grad_norm": 31.0, + "grad_norm_var": 4.8666015625, + "learning_rate": 0.0001, + "loss": 7.5478, + "loss/crossentropy": 2.223896725475788, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.1951376979239285, + "step": 2160 + }, + { + "epoch": 0.05425, + "grad_norm": 30.375, + "grad_norm_var": 8.437955729166667, + "learning_rate": 0.0001, + "loss": 7.5562, + "loss/crossentropy": 2.1203987300395966, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.1970507999882102, + "step": 2170 + }, + { + "epoch": 0.0545, + "grad_norm": 32.0, + "grad_norm_var": 2.9488932291666665, + "learning_rate": 0.0001, + "loss": 7.5532, + "loss/crossentropy": 2.080265050381422, + "loss/hidden": 3.544140625, + "loss/jsd": 0.0, + "loss/logits": 0.2216239819303155, + "step": 2180 + }, + { + "epoch": 0.05475, + "grad_norm": 31.125, + "grad_norm_var": 8.1728515625, + "learning_rate": 0.0001, + "loss": 7.382, + "loss/crossentropy": 2.2114535331726075, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.20577374435961246, + "step": 2190 + }, + { + "epoch": 0.055, + "grad_norm": 28.875, + "grad_norm_var": 14.520833333333334, + "learning_rate": 0.0001, + "loss": 7.5766, + "loss/crossentropy": 2.1003271512687207, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.18811229150742292, + "step": 2200 + }, + { + "epoch": 0.05525, + "grad_norm": 33.5, + "grad_norm_var": 16.099739583333335, + "learning_rate": 0.0001, + "loss": 7.5553, + "loss/crossentropy": 2.1326127350330353, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.22006579730659723, + "step": 2210 + }, + { + "epoch": 0.0555, + "grad_norm": 32.25, + "grad_norm_var": 9.305143229166667, + "learning_rate": 0.0001, + "loss": 7.3766, + "loss/crossentropy": 2.1496046826243402, + "loss/hidden": 3.476171875, + "loss/jsd": 0.0, + "loss/logits": 0.1952402491122484, + "step": 2220 + }, + { + "epoch": 0.05575, + "grad_norm": 29.125, + "grad_norm_var": 6.805143229166666, + "learning_rate": 0.0001, + "loss": 7.3648, + "loss/crossentropy": 2.13938904479146, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.19394674636423587, + "step": 2230 + }, + { + "epoch": 0.056, + "grad_norm": 27.625, + "grad_norm_var": 15.0712890625, + "learning_rate": 0.0001, + "loss": 7.4292, + "loss/crossentropy": 2.0648645758628845, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.18520106598734856, + "step": 2240 + }, + { + "epoch": 0.05625, + "grad_norm": 29.25, + "grad_norm_var": 12.034309895833333, + "learning_rate": 0.0001, + "loss": 7.4469, + "loss/crossentropy": 2.080448921024799, + "loss/hidden": 3.3109375, + "loss/jsd": 0.0, + "loss/logits": 0.18507405128329993, + "step": 2250 + }, + { + "epoch": 0.0565, + "grad_norm": 31.375, + "grad_norm_var": 2.014518229166667, + "learning_rate": 0.0001, + "loss": 7.4325, + "loss/crossentropy": 2.0871294140815735, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.20059894528239966, + "step": 2260 + }, + { + "epoch": 0.05675, + "grad_norm": 28.75, + "grad_norm_var": 1.8103515625, + "learning_rate": 0.0001, + "loss": 7.4268, + "loss/crossentropy": 2.010594163835049, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.19413960948586464, + "step": 2270 + }, + { + "epoch": 0.057, + "grad_norm": 32.5, + "grad_norm_var": 4.0369140625, + "learning_rate": 0.0001, + "loss": 7.4346, + "loss/crossentropy": 2.1129174560308455, + "loss/hidden": 3.416015625, + "loss/jsd": 0.0, + "loss/logits": 0.1961110396310687, + "step": 2280 + }, + { + "epoch": 0.05725, + "grad_norm": 39.0, + "grad_norm_var": 30.42265625, + "learning_rate": 0.0001, + "loss": 7.4422, + "loss/crossentropy": 2.002947611361742, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.2081361676566303, + "step": 2290 + }, + { + "epoch": 0.0575, + "grad_norm": 37.25, + "grad_norm_var": 25.699934895833334, + "learning_rate": 0.0001, + "loss": 7.4312, + "loss/crossentropy": 2.06134437918663, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.18918452728539706, + "step": 2300 + }, + { + "epoch": 0.05775, + "grad_norm": 28.875, + "grad_norm_var": 9.115559895833334, + "learning_rate": 0.0001, + "loss": 7.4209, + "loss/crossentropy": 2.041922479122877, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.20907302405685185, + "step": 2310 + }, + { + "epoch": 0.058, + "grad_norm": 30.125, + "grad_norm_var": 22.248372395833332, + "learning_rate": 0.0001, + "loss": 7.6844, + "loss/crossentropy": 2.0152460247278214, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.1905667196959257, + "step": 2320 + }, + { + "epoch": 0.05825, + "grad_norm": 38.25, + "grad_norm_var": 31.398893229166667, + "learning_rate": 0.0001, + "loss": 7.4713, + "loss/crossentropy": 2.105386929959059, + "loss/hidden": 3.452734375, + "loss/jsd": 0.0, + "loss/logits": 0.1982942834496498, + "step": 2330 + }, + { + "epoch": 0.0585, + "grad_norm": 28.375, + "grad_norm_var": 54.94264322916667, + "learning_rate": 0.0001, + "loss": 7.4575, + "loss/crossentropy": 2.2358868844807147, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.19232469592243434, + "step": 2340 + }, + { + "epoch": 0.05875, + "grad_norm": 33.5, + "grad_norm_var": 165.74583333333334, + "learning_rate": 0.0001, + "loss": 7.2987, + "loss/crossentropy": 1.9657546751201154, + "loss/hidden": 3.3921875, + "loss/jsd": 0.0, + "loss/logits": 0.18062973748892547, + "step": 2350 + }, + { + "epoch": 0.059, + "grad_norm": 41.0, + "grad_norm_var": 15.376822916666667, + "learning_rate": 0.0001, + "loss": 7.4431, + "loss/crossentropy": 2.191007924079895, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.2068317520432174, + "step": 2360 + }, + { + "epoch": 0.05925, + "grad_norm": 30.625, + "grad_norm_var": 12.109375, + "learning_rate": 0.0001, + "loss": 7.3325, + "loss/crossentropy": 2.0140789330005644, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.18166892379522323, + "step": 2370 + }, + { + "epoch": 0.0595, + "grad_norm": 31.875, + "grad_norm_var": 6.941666666666666, + "learning_rate": 0.0001, + "loss": 7.4039, + "loss/crossentropy": 2.0221361994743345, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.1934544663876295, + "step": 2380 + }, + { + "epoch": 0.05975, + "grad_norm": 30.125, + "grad_norm_var": 10.472330729166666, + "learning_rate": 0.0001, + "loss": 7.5862, + "loss/crossentropy": 1.9840030640363693, + "loss/hidden": 3.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.19178631734102963, + "step": 2390 + }, + { + "epoch": 0.06, + "grad_norm": 29.875, + "grad_norm_var": 14.10625, + "learning_rate": 0.0001, + "loss": 7.4826, + "loss/crossentropy": 2.1700179904699324, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.1915024297311902, + "step": 2400 + }, + { + "epoch": 0.06025, + "grad_norm": 32.75, + "grad_norm_var": 7.370768229166667, + "learning_rate": 0.0001, + "loss": 7.3889, + "loss/crossentropy": 2.091843403875828, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.18695627991110086, + "step": 2410 + }, + { + "epoch": 0.0605, + "grad_norm": 29.0, + "grad_norm_var": 9.922330729166667, + "learning_rate": 0.0001, + "loss": 7.4655, + "loss/crossentropy": 2.172381104528904, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.20078962799161673, + "step": 2420 + }, + { + "epoch": 0.06075, + "grad_norm": 34.25, + "grad_norm_var": 8.637239583333333, + "learning_rate": 0.0001, + "loss": 7.519, + "loss/crossentropy": 1.995463601499796, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.1993358489125967, + "step": 2430 + }, + { + "epoch": 0.061, + "grad_norm": 31.25, + "grad_norm_var": 11.9431640625, + "learning_rate": 0.0001, + "loss": 7.5169, + "loss/crossentropy": 2.296917426586151, + "loss/hidden": 3.513671875, + "loss/jsd": 0.0, + "loss/logits": 0.23228074796497822, + "step": 2440 + }, + { + "epoch": 0.06125, + "grad_norm": 30.25, + "grad_norm_var": 3.4368798046573737e+18, + "learning_rate": 0.0001, + "loss": 7.5038, + "loss/crossentropy": 2.1944432735443113, + "loss/hidden": 3.3921875, + "loss/jsd": 0.0, + "loss/logits": 0.21073084995150565, + "step": 2450 + }, + { + "epoch": 0.0615, + "grad_norm": 33.5, + "grad_norm_var": 3.436879805205814e+18, + "learning_rate": 0.0001, + "loss": 7.4423, + "loss/crossentropy": 2.152103579044342, + "loss/hidden": 3.512109375, + "loss/jsd": 0.0, + "loss/logits": 0.20929353777319193, + "step": 2460 + }, + { + "epoch": 0.06175, + "grad_norm": 39.0, + "grad_norm_var": 2.2045823633093297e+18, + "learning_rate": 0.0001, + "loss": 7.4382, + "loss/crossentropy": 2.017627691477537, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.19590776292607187, + "step": 2470 + }, + { + "epoch": 0.062, + "grad_norm": 29.375, + "grad_norm_var": 2.2045823636681523e+18, + "learning_rate": 0.0001, + "loss": 7.4072, + "loss/crossentropy": 2.1076912328600885, + "loss/hidden": 3.433203125, + "loss/jsd": 0.0, + "loss/logits": 0.1988623272627592, + "step": 2480 + }, + { + "epoch": 0.06225, + "grad_norm": 30.125, + "grad_norm_var": 3.2494140625, + "learning_rate": 0.0001, + "loss": 7.3192, + "loss/crossentropy": 1.9777067750692368, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.20539684109389783, + "step": 2490 + }, + { + "epoch": 0.0625, + "grad_norm": 29.125, + "grad_norm_var": 5.580208333333333, + "learning_rate": 0.0001, + "loss": 7.3283, + "loss/crossentropy": 2.061080713570118, + "loss/hidden": 3.4953125, + "loss/jsd": 0.0, + "loss/logits": 0.20077812522649766, + "step": 2500 + }, + { + "epoch": 0.06275, + "grad_norm": 28.375, + "grad_norm_var": 5.618489583333333, + "learning_rate": 0.0001, + "loss": 7.4401, + "loss/crossentropy": 2.2099071338772776, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.2055276283994317, + "step": 2510 + }, + { + "epoch": 0.063, + "grad_norm": 28.125, + "grad_norm_var": 7.118684895833334, + "learning_rate": 0.0001, + "loss": 7.3509, + "loss/crossentropy": 1.962952435016632, + "loss/hidden": 3.421484375, + "loss/jsd": 0.0, + "loss/logits": 0.19731322024017572, + "step": 2520 + }, + { + "epoch": 0.06325, + "grad_norm": 31.375, + "grad_norm_var": 1.9681640625, + "learning_rate": 0.0001, + "loss": 7.3695, + "loss/crossentropy": 1.9843583509325982, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.2062232268974185, + "step": 2530 + }, + { + "epoch": 0.0635, + "grad_norm": 31.5, + "grad_norm_var": 3.7988932291666666, + "learning_rate": 0.0001, + "loss": 7.4485, + "loss/crossentropy": 2.1427679538726805, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.2011977185495198, + "step": 2540 + }, + { + "epoch": 0.06375, + "grad_norm": 30.0, + "grad_norm_var": 2.5885416666666665, + "learning_rate": 0.0001, + "loss": 7.4157, + "loss/crossentropy": 1.9085583783686162, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.17416954301297666, + "step": 2550 + }, + { + "epoch": 0.064, + "grad_norm": 31.25, + "grad_norm_var": 1.21015625, + "learning_rate": 0.0001, + "loss": 7.5141, + "loss/crossentropy": 1.9622327491641045, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.18756412472575903, + "step": 2560 + }, + { + "epoch": 0.06425, + "grad_norm": 30.0, + "grad_norm_var": 1.7143229166666667, + "learning_rate": 0.0001, + "loss": 7.4624, + "loss/crossentropy": 2.192887546122074, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1984951412305236, + "step": 2570 + }, + { + "epoch": 0.0645, + "grad_norm": 30.125, + "grad_norm_var": 1.9143229166666667, + "learning_rate": 0.0001, + "loss": 7.3947, + "loss/crossentropy": 2.102549520134926, + "loss/hidden": 3.463671875, + "loss/jsd": 0.0, + "loss/logits": 0.1989850653335452, + "step": 2580 + }, + { + "epoch": 0.06475, + "grad_norm": 32.25, + "grad_norm_var": 9.5322265625, + "learning_rate": 0.0001, + "loss": 7.5147, + "loss/crossentropy": 2.213281115144491, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.2027151037938893, + "step": 2590 + }, + { + "epoch": 0.065, + "grad_norm": 30.625, + "grad_norm_var": 2.3427083333333334, + "learning_rate": 0.0001, + "loss": 7.4691, + "loss/crossentropy": 2.1138279482722284, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.20825629755854608, + "step": 2600 + }, + { + "epoch": 0.06525, + "grad_norm": 36.0, + "grad_norm_var": 3.3395182291666665, + "learning_rate": 0.0001, + "loss": 7.4775, + "loss/crossentropy": 2.107349547743797, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.19337845854461194, + "step": 2610 + }, + { + "epoch": 0.0655, + "grad_norm": 29.25, + "grad_norm_var": 12.757291666666667, + "learning_rate": 0.0001, + "loss": 7.5438, + "loss/crossentropy": 2.0628502368927, + "loss/hidden": 3.4984375, + "loss/jsd": 0.0, + "loss/logits": 0.20967572089284658, + "step": 2620 + }, + { + "epoch": 0.06575, + "grad_norm": 28.625, + "grad_norm_var": 11.805208333333333, + "learning_rate": 0.0001, + "loss": 7.3354, + "loss/crossentropy": 2.1009589530527593, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.18132725274190306, + "step": 2630 + }, + { + "epoch": 0.066, + "grad_norm": 32.5, + "grad_norm_var": 2.730208333333333, + "learning_rate": 0.0001, + "loss": 7.4257, + "loss/crossentropy": 1.983342681080103, + "loss/hidden": 3.480078125, + "loss/jsd": 0.0, + "loss/logits": 0.19340286049991845, + "step": 2640 + }, + { + "epoch": 0.06625, + "grad_norm": 30.25, + "grad_norm_var": 3.7549465282226944e+18, + "learning_rate": 0.0001, + "loss": 7.309, + "loss/crossentropy": 2.0057250812649725, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.18936716187745334, + "step": 2650 + }, + { + "epoch": 0.0665, + "grad_norm": 36.25, + "grad_norm_var": 8.832747395833334, + "learning_rate": 0.0001, + "loss": 7.5442, + "loss/crossentropy": 2.054753464460373, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.2035602940246463, + "step": 2660 + }, + { + "epoch": 0.06675, + "grad_norm": 32.5, + "grad_norm_var": 4.8900390625, + "learning_rate": 0.0001, + "loss": 7.4106, + "loss/crossentropy": 2.0181221179664135, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.1878144398331642, + "step": 2670 + }, + { + "epoch": 0.067, + "grad_norm": 30.125, + "grad_norm_var": 4.280989583333334, + "learning_rate": 0.0001, + "loss": 7.4597, + "loss/crossentropy": 2.200540581345558, + "loss/hidden": 3.4046875, + "loss/jsd": 0.0, + "loss/logits": 0.20286752395331858, + "step": 2680 + }, + { + "epoch": 0.06725, + "grad_norm": 31.75, + "grad_norm_var": 3.8559895833333333, + "learning_rate": 0.0001, + "loss": 7.4643, + "loss/crossentropy": 2.0630861818790436, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.20401672925800085, + "step": 2690 + }, + { + "epoch": 0.0675, + "grad_norm": 33.0, + "grad_norm_var": 7.073958333333334, + "learning_rate": 0.0001, + "loss": 7.4001, + "loss/crossentropy": 1.927167509496212, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.17901942003518342, + "step": 2700 + }, + { + "epoch": 0.06775, + "grad_norm": 30.25, + "grad_norm_var": 8.9009765625, + "learning_rate": 0.0001, + "loss": 7.3461, + "loss/crossentropy": 2.0538916781544687, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.1864149821922183, + "step": 2710 + }, + { + "epoch": 0.068, + "grad_norm": 29.5, + "grad_norm_var": 2.218489583333333, + "learning_rate": 0.0001, + "loss": 7.526, + "loss/crossentropy": 2.211588367819786, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.20801848396658898, + "step": 2720 + }, + { + "epoch": 0.06825, + "grad_norm": 31.375, + "grad_norm_var": 1.0768229166666667, + "learning_rate": 0.0001, + "loss": 7.5535, + "loss/crossentropy": 2.268890543282032, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.21352684032171965, + "step": 2730 + }, + { + "epoch": 0.0685, + "grad_norm": 33.25, + "grad_norm_var": 5.663997395833333, + "learning_rate": 0.0001, + "loss": 7.411, + "loss/crossentropy": 1.902898482978344, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.1794701736420393, + "step": 2740 + }, + { + "epoch": 0.06875, + "grad_norm": 32.25, + "grad_norm_var": 6.167708333333334, + "learning_rate": 0.0001, + "loss": 7.3718, + "loss/crossentropy": 1.9450767874717712, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.18759301900863648, + "step": 2750 + }, + { + "epoch": 0.069, + "grad_norm": 31.125, + "grad_norm_var": 31.185872395833332, + "learning_rate": 0.0001, + "loss": 7.4359, + "loss/crossentropy": 2.0783849939703942, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.18503105416893958, + "step": 2760 + }, + { + "epoch": 0.06925, + "grad_norm": 36.5, + "grad_norm_var": 35.412434895833336, + "learning_rate": 0.0001, + "loss": 7.5806, + "loss/crossentropy": 2.2374701410532, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.19615829903632404, + "step": 2770 + }, + { + "epoch": 0.0695, + "grad_norm": 30.25, + "grad_norm_var": 19.787239583333335, + "learning_rate": 0.0001, + "loss": 7.3197, + "loss/crossentropy": 1.8297001466155052, + "loss/hidden": 3.3171875, + "loss/jsd": 0.0, + "loss/logits": 0.16481583826243879, + "step": 2780 + }, + { + "epoch": 0.06975, + "grad_norm": 428.0, + "grad_norm_var": 9873.31640625, + "learning_rate": 0.0001, + "loss": 7.5313, + "loss/crossentropy": 2.249661484360695, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.2018596636131406, + "step": 2790 + }, + { + "epoch": 0.07, + "grad_norm": 31.0, + "grad_norm_var": 9755.6625, + "learning_rate": 0.0001, + "loss": 7.3957, + "loss/crossentropy": 1.9368772380053998, + "loss/hidden": 3.48203125, + "loss/jsd": 0.0, + "loss/logits": 0.18386599626392125, + "step": 2800 + }, + { + "epoch": 0.07025, + "grad_norm": 30.75, + "grad_norm_var": 1.8317057291666667, + "learning_rate": 0.0001, + "loss": 7.4372, + "loss/crossentropy": 1.98307463824749, + "loss/hidden": 3.464453125, + "loss/jsd": 0.0, + "loss/logits": 0.19818334747105837, + "step": 2810 + }, + { + "epoch": 0.0705, + "grad_norm": 29.375, + "grad_norm_var": 2.589583333333333, + "learning_rate": 0.0001, + "loss": 7.5014, + "loss/crossentropy": 2.1463105253875256, + "loss/hidden": 3.5046875, + "loss/jsd": 0.0, + "loss/logits": 0.20105676222592592, + "step": 2820 + }, + { + "epoch": 0.07075, + "grad_norm": 60.5, + "grad_norm_var": 178.2556640625, + "learning_rate": 0.0001, + "loss": 7.4527, + "loss/crossentropy": 2.0776613369584083, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.19452448841184378, + "step": 2830 + }, + { + "epoch": 0.071, + "grad_norm": 29.25, + "grad_norm_var": 172.31451822916668, + "learning_rate": 0.0001, + "loss": 7.4802, + "loss/crossentropy": 2.1200039610266685, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.19831879772245883, + "step": 2840 + }, + { + "epoch": 0.07125, + "grad_norm": 69.0, + "grad_norm_var": 117.23098958333334, + "learning_rate": 0.0001, + "loss": 7.434, + "loss/crossentropy": 2.024143140017986, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.1836528332903981, + "step": 2850 + }, + { + "epoch": 0.0715, + "grad_norm": 31.375, + "grad_norm_var": 92.53723958333333, + "learning_rate": 0.0001, + "loss": 7.4934, + "loss/crossentropy": 2.2765417456626893, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.20736196860671044, + "step": 2860 + }, + { + "epoch": 0.07175, + "grad_norm": 31.625, + "grad_norm_var": 7.986393229166667, + "learning_rate": 0.0001, + "loss": 7.4826, + "loss/crossentropy": 2.269197002053261, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.19869209118187428, + "step": 2870 + }, + { + "epoch": 0.072, + "grad_norm": 31.25, + "grad_norm_var": 3.1806640625, + "learning_rate": 0.0001, + "loss": 7.4018, + "loss/crossentropy": 2.2985214799642564, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.20524807646870613, + "step": 2880 + }, + { + "epoch": 0.07225, + "grad_norm": 30.875, + "grad_norm_var": 4.801822916666667, + "learning_rate": 0.0001, + "loss": 7.5148, + "loss/crossentropy": 2.2387808740139006, + "loss/hidden": 3.46015625, + "loss/jsd": 0.0, + "loss/logits": 0.19951685946434736, + "step": 2890 + }, + { + "epoch": 0.0725, + "grad_norm": 28.875, + "grad_norm_var": 13.836458333333333, + "learning_rate": 0.0001, + "loss": 7.5232, + "loss/crossentropy": 2.049694790691137, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.19052465092390775, + "step": 2900 + }, + { + "epoch": 0.07275, + "grad_norm": 29.625, + "grad_norm_var": 17.91640625, + "learning_rate": 0.0001, + "loss": 7.3227, + "loss/crossentropy": 2.0360258772969244, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.18495636582374572, + "step": 2910 + }, + { + "epoch": 0.073, + "grad_norm": 32.0, + "grad_norm_var": 1.8926377214767268e+18, + "learning_rate": 0.0001, + "loss": 7.4512, + "loss/crossentropy": 2.13848315179348, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.18625867497175932, + "step": 2920 + }, + { + "epoch": 0.07325, + "grad_norm": 29.875, + "grad_norm_var": 1.8926377199175642e+18, + "learning_rate": 0.0001, + "loss": 7.5038, + "loss/crossentropy": 2.166595605015755, + "loss/hidden": 3.49375, + "loss/jsd": 0.0, + "loss/logits": 0.20948194600641729, + "step": 2930 + }, + { + "epoch": 0.0735, + "grad_norm": 28.5, + "grad_norm_var": 73.08020833333333, + "learning_rate": 0.0001, + "loss": 7.374, + "loss/crossentropy": 1.9849643550813199, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.18302082028239966, + "step": 2940 + }, + { + "epoch": 0.07375, + "grad_norm": 29.125, + "grad_norm_var": 24.825, + "learning_rate": 0.0001, + "loss": 7.3651, + "loss/crossentropy": 2.057874396443367, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.1866615541279316, + "step": 2950 + }, + { + "epoch": 0.074, + "grad_norm": 30.625, + "grad_norm_var": 883.6354166666666, + "learning_rate": 0.0001, + "loss": 7.5415, + "loss/crossentropy": 2.1631729155778885, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.20762786027044058, + "step": 2960 + }, + { + "epoch": 0.07425, + "grad_norm": 32.75, + "grad_norm_var": 887.2705729166667, + "learning_rate": 0.0001, + "loss": 7.4471, + "loss/crossentropy": 1.9493468508124352, + "loss/hidden": 3.3578125, + "loss/jsd": 0.0, + "loss/logits": 0.1884406829252839, + "step": 2970 + }, + { + "epoch": 0.0745, + "grad_norm": 28.875, + "grad_norm_var": 5.070768229166666, + "learning_rate": 0.0001, + "loss": 7.605, + "loss/crossentropy": 2.122344336658716, + "loss/hidden": 3.460546875, + "loss/jsd": 0.0, + "loss/logits": 0.21057356838136912, + "step": 2980 + }, + { + "epoch": 0.07475, + "grad_norm": 37.0, + "grad_norm_var": 21.535416666666666, + "learning_rate": 0.0001, + "loss": 7.469, + "loss/crossentropy": 2.008989527821541, + "loss/hidden": 3.54140625, + "loss/jsd": 0.0, + "loss/logits": 0.2172183733433485, + "step": 2990 + }, + { + "epoch": 0.075, + "grad_norm": 29.375, + "grad_norm_var": 18.198958333333334, + "learning_rate": 0.0001, + "loss": 7.3932, + "loss/crossentropy": 2.1922819674015046, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.20425879992544652, + "step": 3000 + }, + { + "epoch": 0.07525, + "grad_norm": 29.5, + "grad_norm_var": 2.668684895833333, + "learning_rate": 0.0001, + "loss": 7.3505, + "loss/crossentropy": 2.189265179634094, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.20808048862963915, + "step": 3010 + }, + { + "epoch": 0.0755, + "grad_norm": 30.75, + "grad_norm_var": 14.20625, + "learning_rate": 0.0001, + "loss": 7.5013, + "loss/crossentropy": 2.0573098927736284, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.18116160985082388, + "step": 3020 + }, + { + "epoch": 0.07575, + "grad_norm": 31.375, + "grad_norm_var": 16.983333333333334, + "learning_rate": 0.0001, + "loss": 7.4455, + "loss/crossentropy": 1.9735823571681976, + "loss/hidden": 3.459765625, + "loss/jsd": 0.0, + "loss/logits": 0.19495000168681145, + "step": 3030 + }, + { + "epoch": 0.076, + "grad_norm": 7247757312.0, + "grad_norm_var": 3.2831240991582193e+18, + "learning_rate": 0.0001, + "loss": 7.4881, + "loss/crossentropy": 1.971890377253294, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.18015608433634042, + "step": 3040 + }, + { + "epoch": 0.07625, + "grad_norm": 28.25, + "grad_norm_var": 3.283124098780732e+18, + "learning_rate": 0.0001, + "loss": 7.3664, + "loss/crossentropy": 1.8378953270614147, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.1741427879780531, + "step": 3050 + }, + { + "epoch": 0.0765, + "grad_norm": 31.75, + "grad_norm_var": 1.89140625, + "learning_rate": 0.0001, + "loss": 7.5137, + "loss/crossentropy": 2.141886255145073, + "loss/hidden": 3.443359375, + "loss/jsd": 0.0, + "loss/logits": 0.19584037065505983, + "step": 3060 + }, + { + "epoch": 0.07675, + "grad_norm": 27.25, + "grad_norm_var": 2.4244140625, + "learning_rate": 0.0001, + "loss": 7.4296, + "loss/crossentropy": 2.0373554110527037, + "loss/hidden": 3.5640625, + "loss/jsd": 0.0, + "loss/logits": 0.216986732929945, + "step": 3070 + }, + { + "epoch": 0.077, + "grad_norm": 35.25, + "grad_norm_var": 3.7322265625, + "learning_rate": 0.0001, + "loss": 7.5269, + "loss/crossentropy": 1.975497831404209, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.1780722170136869, + "step": 3080 + }, + { + "epoch": 0.07725, + "grad_norm": 32.75, + "grad_norm_var": 3.6895182291666666, + "learning_rate": 0.0001, + "loss": 7.4938, + "loss/crossentropy": 2.151789793372154, + "loss/hidden": 3.502734375, + "loss/jsd": 0.0, + "loss/logits": 0.21854450944811105, + "step": 3090 + }, + { + "epoch": 0.0775, + "grad_norm": 29.5, + "grad_norm_var": 6.82265625, + "learning_rate": 0.0001, + "loss": 7.4321, + "loss/crossentropy": 1.9484706297516823, + "loss/hidden": 3.506640625, + "loss/jsd": 0.0, + "loss/logits": 0.19896488767117262, + "step": 3100 + }, + { + "epoch": 0.07775, + "grad_norm": 29.75, + "grad_norm_var": 3.0780598958333334, + "learning_rate": 0.0001, + "loss": 7.5471, + "loss/crossentropy": 2.165594828128815, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.20095103643834591, + "step": 3110 + }, + { + "epoch": 0.078, + "grad_norm": 29.0, + "grad_norm_var": 2.2197916666666666, + "learning_rate": 0.0001, + "loss": 7.6334, + "loss/crossentropy": 2.1854751259088516, + "loss/hidden": 3.4640625, + "loss/jsd": 0.0, + "loss/logits": 0.21246263310313224, + "step": 3120 + }, + { + "epoch": 0.07825, + "grad_norm": 29.0, + "grad_norm_var": 3.71640625, + "learning_rate": 0.0001, + "loss": 7.4278, + "loss/crossentropy": 1.914103902876377, + "loss/hidden": 3.451953125, + "loss/jsd": 0.0, + "loss/logits": 0.18373754434287548, + "step": 3130 + }, + { + "epoch": 0.0785, + "grad_norm": 29.0, + "grad_norm_var": 1.2952473958333333, + "learning_rate": 0.0001, + "loss": 7.4487, + "loss/crossentropy": 1.9421842776238918, + "loss/hidden": 3.5296875, + "loss/jsd": 0.0, + "loss/logits": 0.19919300880283117, + "step": 3140 + }, + { + "epoch": 0.07875, + "grad_norm": 29.375, + "grad_norm_var": 1.8268229166666667, + "learning_rate": 0.0001, + "loss": 7.5818, + "loss/crossentropy": 2.0765694811940194, + "loss/hidden": 3.5171875, + "loss/jsd": 0.0, + "loss/logits": 0.19946561977267266, + "step": 3150 + }, + { + "epoch": 0.079, + "grad_norm": 28.125, + "grad_norm_var": 11.483268229166667, + "learning_rate": 0.0001, + "loss": 7.4372, + "loss/crossentropy": 2.013955050334334, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.20109358858317136, + "step": 3160 + }, + { + "epoch": 0.07925, + "grad_norm": 28.875, + "grad_norm_var": 12.871809895833334, + "learning_rate": 0.0001, + "loss": 7.4606, + "loss/crossentropy": 2.2802242666482924, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.21229397617280482, + "step": 3170 + }, + { + "epoch": 0.0795, + "grad_norm": 28.375, + "grad_norm_var": 1.6301432291666667, + "learning_rate": 0.0001, + "loss": 7.4691, + "loss/crossentropy": 2.134338477253914, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.18632632456719875, + "step": 3180 + }, + { + "epoch": 0.07975, + "grad_norm": 30.625, + "grad_norm_var": 2.6113932291666666, + "learning_rate": 0.0001, + "loss": 7.4903, + "loss/crossentropy": 2.192245528101921, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.19276445377618073, + "step": 3190 + }, + { + "epoch": 0.08, + "grad_norm": 27.875, + "grad_norm_var": 2.6830729166666667, + "learning_rate": 0.0001, + "loss": 7.4715, + "loss/crossentropy": 2.1333388604223726, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.1902673264965415, + "step": 3200 + }, + { + "epoch": 0.08025, + "grad_norm": 29.625, + "grad_norm_var": 2.7072265625, + "learning_rate": 0.0001, + "loss": 7.4646, + "loss/crossentropy": 2.1069626569747926, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.18933899328112602, + "step": 3210 + }, + { + "epoch": 0.0805, + "grad_norm": 33.0, + "grad_norm_var": 1.6457682291666667, + "learning_rate": 0.0001, + "loss": 7.3771, + "loss/crossentropy": 2.143903985619545, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.19841080270707606, + "step": 3220 + }, + { + "epoch": 0.08075, + "grad_norm": 29.5, + "grad_norm_var": 2.405143229166667, + "learning_rate": 0.0001, + "loss": 7.4629, + "loss/crossentropy": 1.9501185864210129, + "loss/hidden": 3.474609375, + "loss/jsd": 0.0, + "loss/logits": 0.2003694986924529, + "step": 3230 + }, + { + "epoch": 0.081, + "grad_norm": 35.0, + "grad_norm_var": 3.4619140625, + "learning_rate": 0.0001, + "loss": 7.6085, + "loss/crossentropy": 2.1099744185805323, + "loss/hidden": 3.33515625, + "loss/jsd": 0.0, + "loss/logits": 0.1865939747542143, + "step": 3240 + }, + { + "epoch": 0.08125, + "grad_norm": 38.0, + "grad_norm_var": 15.54140625, + "learning_rate": 0.0001, + "loss": 7.4858, + "loss/crossentropy": 1.8915734700858593, + "loss/hidden": 3.550390625, + "loss/jsd": 0.0, + "loss/logits": 0.20414282865822314, + "step": 3250 + }, + { + "epoch": 0.0815, + "grad_norm": 31.875, + "grad_norm_var": 15.074934895833334, + "learning_rate": 0.0001, + "loss": 7.4995, + "loss/crossentropy": 2.0746393710374833, + "loss/hidden": 3.448046875, + "loss/jsd": 0.0, + "loss/logits": 0.19025763403624296, + "step": 3260 + }, + { + "epoch": 0.08175, + "grad_norm": 29.625, + "grad_norm_var": 4.532291666666667, + "learning_rate": 0.0001, + "loss": 7.4517, + "loss/crossentropy": 2.201898355782032, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.1851862959563732, + "step": 3270 + }, + { + "epoch": 0.082, + "grad_norm": 32.25, + "grad_norm_var": 9.199739583333333, + "learning_rate": 0.0001, + "loss": 7.4085, + "loss/crossentropy": 1.9774614453315735, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1853517958894372, + "step": 3280 + }, + { + "epoch": 0.08225, + "grad_norm": 31.0, + "grad_norm_var": 13.801497395833334, + "learning_rate": 0.0001, + "loss": 7.4065, + "loss/crossentropy": 2.1263367265462874, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.18529028967022895, + "step": 3290 + }, + { + "epoch": 0.0825, + "grad_norm": 29.5, + "grad_norm_var": 2.967643229166667, + "learning_rate": 0.0001, + "loss": 7.4165, + "loss/crossentropy": 2.193544697761536, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.19897244460880756, + "step": 3300 + }, + { + "epoch": 0.08275, + "grad_norm": 33.75, + "grad_norm_var": 9.687239583333334, + "learning_rate": 0.0001, + "loss": 7.5716, + "loss/crossentropy": 2.0868531957268717, + "loss/hidden": 3.616796875, + "loss/jsd": 0.0, + "loss/logits": 0.21278488002717494, + "step": 3310 + }, + { + "epoch": 0.083, + "grad_norm": 31.0, + "grad_norm_var": 7.9478515625, + "learning_rate": 0.0001, + "loss": 7.5543, + "loss/crossentropy": 2.1392074063420297, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.21113577168434858, + "step": 3320 + }, + { + "epoch": 0.08325, + "grad_norm": 30.0, + "grad_norm_var": 2.0268229166666667, + "learning_rate": 0.0001, + "loss": 7.4454, + "loss/crossentropy": 2.0691144198179243, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.20186964478343725, + "step": 3330 + }, + { + "epoch": 0.0835, + "grad_norm": 31.5, + "grad_norm_var": 2.6211653265769103e+18, + "learning_rate": 0.0001, + "loss": 7.4481, + "loss/crossentropy": 2.0832756504416468, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.19915037509053946, + "step": 3340 + }, + { + "epoch": 0.08375, + "grad_norm": 32.5, + "grad_norm_var": 2.621165324337292e+18, + "learning_rate": 0.0001, + "loss": 7.3606, + "loss/crossentropy": 2.102260760962963, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.19333885367959738, + "step": 3350 + }, + { + "epoch": 0.084, + "grad_norm": 29.25, + "grad_norm_var": 85.575, + "learning_rate": 0.0001, + "loss": 7.4073, + "loss/crossentropy": 2.149528594315052, + "loss/hidden": 3.491796875, + "loss/jsd": 0.0, + "loss/logits": 0.2177526842802763, + "step": 3360 + }, + { + "epoch": 0.08425, + "grad_norm": 30.25, + "grad_norm_var": 2.8645833333333335, + "learning_rate": 0.0001, + "loss": 7.4642, + "loss/crossentropy": 2.085590344667435, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.17804578468203544, + "step": 3370 + }, + { + "epoch": 0.0845, + "grad_norm": 33.25, + "grad_norm_var": 2.996875, + "learning_rate": 0.0001, + "loss": 7.3953, + "loss/crossentropy": 2.0975965946912765, + "loss/hidden": 3.3, + "loss/jsd": 0.0, + "loss/logits": 0.1847201505675912, + "step": 3380 + }, + { + "epoch": 0.08475, + "grad_norm": 32.0, + "grad_norm_var": 2.470572916666667, + "learning_rate": 0.0001, + "loss": 7.4553, + "loss/crossentropy": 2.1018140748143197, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.18250287007540464, + "step": 3390 + }, + { + "epoch": 0.085, + "grad_norm": 30.25, + "grad_norm_var": 2.887955729166667, + "learning_rate": 0.0001, + "loss": 7.5238, + "loss/crossentropy": 2.1050665065646172, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.2124734738841653, + "step": 3400 + }, + { + "epoch": 0.08525, + "grad_norm": 30.0, + "grad_norm_var": 1.7143229166666667, + "learning_rate": 0.0001, + "loss": 7.2754, + "loss/crossentropy": 2.0948296964168547, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.178215317055583, + "step": 3410 + }, + { + "epoch": 0.0855, + "grad_norm": 30.875, + "grad_norm_var": 5.354622395833333, + "learning_rate": 0.0001, + "loss": 7.4191, + "loss/crossentropy": 2.0418393671512605, + "loss/hidden": 3.4734375, + "loss/jsd": 0.0, + "loss/logits": 0.18740264605730772, + "step": 3420 + }, + { + "epoch": 0.08575, + "grad_norm": 32.25, + "grad_norm_var": 6.430989583333333, + "learning_rate": 0.0001, + "loss": 7.5642, + "loss/crossentropy": 2.0279636546969413, + "loss/hidden": 3.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.20154636316001415, + "step": 3430 + }, + { + "epoch": 0.086, + "grad_norm": 29.125, + "grad_norm_var": 53.64791666666667, + "learning_rate": 0.0001, + "loss": 7.485, + "loss/crossentropy": 2.0705729112029077, + "loss/hidden": 3.456640625, + "loss/jsd": 0.0, + "loss/logits": 0.22035282999277114, + "step": 3440 + }, + { + "epoch": 0.08625, + "grad_norm": 30.375, + "grad_norm_var": 5.54765625, + "learning_rate": 0.0001, + "loss": 7.428, + "loss/crossentropy": 1.9830067940056324, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.19354272997006775, + "step": 3450 + }, + { + "epoch": 0.0865, + "grad_norm": 28.375, + "grad_norm_var": 2.758736220726598e+18, + "learning_rate": 0.0001, + "loss": 7.4342, + "loss/crossentropy": 2.1590976014733316, + "loss/hidden": 3.489453125, + "loss/jsd": 0.0, + "loss/logits": 0.20231232214719058, + "step": 3460 + }, + { + "epoch": 0.08675, + "grad_norm": 29.125, + "grad_norm_var": 2.470572916666667, + "learning_rate": 0.0001, + "loss": 7.3376, + "loss/crossentropy": 2.108407254517078, + "loss/hidden": 3.416796875, + "loss/jsd": 0.0, + "loss/logits": 0.18425025548785925, + "step": 3470 + }, + { + "epoch": 0.087, + "grad_norm": 32.5, + "grad_norm_var": 19.315559895833335, + "learning_rate": 0.0001, + "loss": 7.391, + "loss/crossentropy": 2.086346108466387, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.19492445401847364, + "step": 3480 + }, + { + "epoch": 0.08725, + "grad_norm": 30.75, + "grad_norm_var": 3.9009765625, + "learning_rate": 0.0001, + "loss": 7.454, + "loss/crossentropy": 2.0728737086057665, + "loss/hidden": 3.474609375, + "loss/jsd": 0.0, + "loss/logits": 0.21246139723807572, + "step": 3490 + }, + { + "epoch": 0.0875, + "grad_norm": 53.25, + "grad_norm_var": 34.962955729166666, + "learning_rate": 0.0001, + "loss": 7.4001, + "loss/crossentropy": 1.9173476293683052, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.18263984741643072, + "step": 3500 + }, + { + "epoch": 0.08775, + "grad_norm": 29.875, + "grad_norm_var": 36.22389322916667, + "learning_rate": 0.0001, + "loss": 7.5855, + "loss/crossentropy": 1.9761252515017986, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.20959299746900797, + "step": 3510 + }, + { + "epoch": 0.088, + "grad_norm": 32.25, + "grad_norm_var": 17.7337890625, + "learning_rate": 0.0001, + "loss": 7.4728, + "loss/crossentropy": 2.0416554152965545, + "loss/hidden": 3.4703125, + "loss/jsd": 0.0, + "loss/logits": 0.19014756735414268, + "step": 3520 + }, + { + "epoch": 0.08825, + "grad_norm": 29.375, + "grad_norm_var": 14.664322916666666, + "learning_rate": 0.0001, + "loss": 7.5608, + "loss/crossentropy": 2.2834356099367143, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.19908843878656626, + "step": 3530 + }, + { + "epoch": 0.0885, + "grad_norm": 31.875, + "grad_norm_var": 2.6702473958333335, + "learning_rate": 0.0001, + "loss": 7.4804, + "loss/crossentropy": 2.0417330890893934, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.20852382443845272, + "step": 3540 + }, + { + "epoch": 0.08875, + "grad_norm": 31.625, + "grad_norm_var": 2.460724589971584e+18, + "learning_rate": 0.0001, + "loss": 7.5559, + "loss/crossentropy": 2.1676768481731417, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.1989177169278264, + "step": 3550 + }, + { + "epoch": 0.089, + "grad_norm": 30.0, + "grad_norm_var": 6.881705729166667, + "learning_rate": 0.0001, + "loss": 7.4678, + "loss/crossentropy": 2.221273897588253, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.19402988757938147, + "step": 3560 + }, + { + "epoch": 0.08925, + "grad_norm": 31.375, + "grad_norm_var": 7.732747395833333, + "learning_rate": 0.0001, + "loss": 7.4508, + "loss/crossentropy": 2.1802149415016174, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.20121808685362338, + "step": 3570 + }, + { + "epoch": 0.0895, + "grad_norm": 52.5, + "grad_norm_var": 30.9775390625, + "learning_rate": 0.0001, + "loss": 7.3982, + "loss/crossentropy": 2.085124118626118, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.18448642026633025, + "step": 3580 + }, + { + "epoch": 0.08975, + "grad_norm": 30.875, + "grad_norm_var": 32.91295572916667, + "learning_rate": 0.0001, + "loss": 7.4381, + "loss/crossentropy": 2.1467449337244036, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.19393185302615165, + "step": 3590 + }, + { + "epoch": 0.09, + "grad_norm": 29.25, + "grad_norm_var": 1.4708333333333334, + "learning_rate": 0.0001, + "loss": 7.415, + "loss/crossentropy": 2.0135369554162024, + "loss/hidden": 3.37109375, + "loss/jsd": 0.0, + "loss/logits": 0.18443848174065353, + "step": 3600 + }, + { + "epoch": 0.09025, + "grad_norm": 31.375, + "grad_norm_var": 6.1962890625, + "learning_rate": 0.0001, + "loss": 7.4028, + "loss/crossentropy": 2.1443901300430297, + "loss/hidden": 3.440234375, + "loss/jsd": 0.0, + "loss/logits": 0.2054579086601734, + "step": 3610 + }, + { + "epoch": 0.0905, + "grad_norm": 26.5, + "grad_norm_var": 3.562239583333333, + "learning_rate": 0.0001, + "loss": 7.3255, + "loss/crossentropy": 1.799356396496296, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.17441922090947629, + "step": 3620 + }, + { + "epoch": 0.09075, + "grad_norm": 31.375, + "grad_norm_var": 2.2083333333333335, + "learning_rate": 0.0001, + "loss": 7.4272, + "loss/crossentropy": 1.9925116747617722, + "loss/hidden": 3.52578125, + "loss/jsd": 0.0, + "loss/logits": 0.21653544921427964, + "step": 3630 + }, + { + "epoch": 0.091, + "grad_norm": 30.125, + "grad_norm_var": 0.6125, + "learning_rate": 0.0001, + "loss": 7.3649, + "loss/crossentropy": 2.135761073231697, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.18989351522177458, + "step": 3640 + }, + { + "epoch": 0.09125, + "grad_norm": 31.375, + "grad_norm_var": 1.4330729166666667, + "learning_rate": 0.0001, + "loss": 7.4505, + "loss/crossentropy": 2.0986070543527604, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.18352905213832854, + "step": 3650 + }, + { + "epoch": 0.0915, + "grad_norm": 29.625, + "grad_norm_var": 2.5869140625, + "learning_rate": 0.0001, + "loss": 7.4199, + "loss/crossentropy": 2.1555575743317603, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19261632524430752, + "step": 3660 + }, + { + "epoch": 0.09175, + "grad_norm": 31.5, + "grad_norm_var": 2.371875, + "learning_rate": 0.0001, + "loss": 7.5463, + "loss/crossentropy": 2.1411691516637803, + "loss/hidden": 3.446875, + "loss/jsd": 0.0, + "loss/logits": 0.2046652188524604, + "step": 3670 + }, + { + "epoch": 0.092, + "grad_norm": 30.625, + "grad_norm_var": 4.703580729166666, + "learning_rate": 0.0001, + "loss": 7.404, + "loss/crossentropy": 2.142404294013977, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.20414466112852098, + "step": 3680 + }, + { + "epoch": 0.09225, + "grad_norm": 30.375, + "grad_norm_var": 3.25625, + "learning_rate": 0.0001, + "loss": 7.4774, + "loss/crossentropy": 2.187901920080185, + "loss/hidden": 3.480859375, + "loss/jsd": 0.0, + "loss/logits": 0.21911972090601922, + "step": 3690 + }, + { + "epoch": 0.0925, + "grad_norm": 31.875, + "grad_norm_var": 1.2166666666666666, + "learning_rate": 0.0001, + "loss": 7.5965, + "loss/crossentropy": 2.086391404271126, + "loss/hidden": 3.438671875, + "loss/jsd": 0.0, + "loss/logits": 0.2020766455680132, + "step": 3700 + }, + { + "epoch": 0.09275, + "grad_norm": 30.625, + "grad_norm_var": 2.147330729166667, + "learning_rate": 0.0001, + "loss": 7.4579, + "loss/crossentropy": 2.09081457182765, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.1868050311692059, + "step": 3710 + }, + { + "epoch": 0.093, + "grad_norm": 34.25, + "grad_norm_var": 2.467643229166667, + "learning_rate": 0.0001, + "loss": 7.522, + "loss/crossentropy": 2.12264247238636, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.18927707765251398, + "step": 3720 + }, + { + "epoch": 0.09325, + "grad_norm": 32.25, + "grad_norm_var": 3.981184895833333, + "learning_rate": 0.0001, + "loss": 7.4155, + "loss/crossentropy": 2.1118928104639054, + "loss/hidden": 3.44765625, + "loss/jsd": 0.0, + "loss/logits": 0.19489197488874196, + "step": 3730 + }, + { + "epoch": 0.0935, + "grad_norm": 34.0, + "grad_norm_var": 5.312434895833333, + "learning_rate": 0.0001, + "loss": 7.5053, + "loss/crossentropy": 2.1360882744193077, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.19313989579677582, + "step": 3740 + }, + { + "epoch": 0.09375, + "grad_norm": 29.125, + "grad_norm_var": 4.549739583333333, + "learning_rate": 0.0001, + "loss": 7.3275, + "loss/crossentropy": 2.010613538324833, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.18421147018671036, + "step": 3750 + }, + { + "epoch": 0.094, + "grad_norm": 31.625, + "grad_norm_var": 1.5541666666666667, + "learning_rate": 0.0001, + "loss": 7.4784, + "loss/crossentropy": 2.1465295113623144, + "loss/hidden": 3.323828125, + "loss/jsd": 0.0, + "loss/logits": 0.18987073097378016, + "step": 3760 + }, + { + "epoch": 0.09425, + "grad_norm": 32.75, + "grad_norm_var": 1.9018229166666667, + "learning_rate": 0.0001, + "loss": 7.3495, + "loss/crossentropy": 2.17747982442379, + "loss/hidden": 3.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.2016214355826378, + "step": 3770 + }, + { + "epoch": 0.0945, + "grad_norm": 30.875, + "grad_norm_var": 3.088997395833333, + "learning_rate": 0.0001, + "loss": 7.5384, + "loss/crossentropy": 2.179350584745407, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.19142594784498215, + "step": 3780 + }, + { + "epoch": 0.09475, + "grad_norm": 29.625, + "grad_norm_var": 1.1559895833333333, + "learning_rate": 0.0001, + "loss": 7.4035, + "loss/crossentropy": 2.155378046631813, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.19720839541405438, + "step": 3790 + }, + { + "epoch": 0.095, + "grad_norm": 30.625, + "grad_norm_var": 1.1999348958333333, + "learning_rate": 0.0001, + "loss": 7.4441, + "loss/crossentropy": 2.0597486779093743, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.19279775265604257, + "step": 3800 + }, + { + "epoch": 0.09525, + "grad_norm": 33.5, + "grad_norm_var": 2.1666666666666665, + "learning_rate": 0.0001, + "loss": 7.5146, + "loss/crossentropy": 2.1966816753149034, + "loss/hidden": 3.44375, + "loss/jsd": 0.0, + "loss/logits": 0.20174810625612735, + "step": 3810 + }, + { + "epoch": 0.0955, + "grad_norm": 31.5, + "grad_norm_var": 1.9593098958333333, + "learning_rate": 0.0001, + "loss": 7.539, + "loss/crossentropy": 2.165803623199463, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.1953417781740427, + "step": 3820 + }, + { + "epoch": 0.09575, + "grad_norm": 32.0, + "grad_norm_var": 6.690625, + "learning_rate": 0.0001, + "loss": 7.514, + "loss/crossentropy": 2.0817860513925552, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.20838446952402592, + "step": 3830 + }, + { + "epoch": 0.096, + "grad_norm": 32.75, + "grad_norm_var": 7.6431640625, + "learning_rate": 0.0001, + "loss": 7.5472, + "loss/crossentropy": 2.231910442560911, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.21717903479002415, + "step": 3840 + }, + { + "epoch": 0.09625, + "grad_norm": 32.0, + "grad_norm_var": 16.134375, + "learning_rate": 0.0001, + "loss": 7.5807, + "loss/crossentropy": 2.0746277555823327, + "loss/hidden": 3.47578125, + "loss/jsd": 0.0, + "loss/logits": 0.20851925816386938, + "step": 3850 + }, + { + "epoch": 0.0965, + "grad_norm": 30.625, + "grad_norm_var": 16.132747395833334, + "learning_rate": 0.0001, + "loss": 7.3749, + "loss/crossentropy": 2.1463438466191294, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.19305863380432128, + "step": 3860 + }, + { + "epoch": 0.09675, + "grad_norm": 32.5, + "grad_norm_var": 1.0895182291666667, + "learning_rate": 0.0001, + "loss": 7.5499, + "loss/crossentropy": 2.2108413323760034, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.20310868676751853, + "step": 3870 + }, + { + "epoch": 0.097, + "grad_norm": 30.75, + "grad_norm_var": 1.4559895833333334, + "learning_rate": 0.0001, + "loss": 7.4788, + "loss/crossentropy": 2.0900154620409013, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.18780422061681748, + "step": 3880 + }, + { + "epoch": 0.09725, + "grad_norm": 30.625, + "grad_norm_var": 13.917643229166666, + "learning_rate": 0.0001, + "loss": 7.4391, + "loss/crossentropy": 2.0574848279356956, + "loss/hidden": 3.3875, + "loss/jsd": 0.0, + "loss/logits": 0.19390027467161416, + "step": 3890 + }, + { + "epoch": 0.0975, + "grad_norm": 27.375, + "grad_norm_var": 13.55, + "learning_rate": 0.0001, + "loss": 7.4327, + "loss/crossentropy": 2.2832688719034193, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.20608801003545524, + "step": 3900 + }, + { + "epoch": 0.09775, + "grad_norm": 29.0, + "grad_norm_var": 3.296875, + "learning_rate": 0.0001, + "loss": 7.3691, + "loss/crossentropy": 1.9183307077735663, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.1917601386550814, + "step": 3910 + }, + { + "epoch": 0.098, + "grad_norm": 34.0, + "grad_norm_var": 3.24765625, + "learning_rate": 0.0001, + "loss": 7.4628, + "loss/crossentropy": 2.0630046002566815, + "loss/hidden": 3.338671875, + "loss/jsd": 0.0, + "loss/logits": 0.1871832549571991, + "step": 3920 + }, + { + "epoch": 0.09825, + "grad_norm": 31.75, + "grad_norm_var": 1.5384765625, + "learning_rate": 0.0001, + "loss": 7.4868, + "loss/crossentropy": 2.061261148750782, + "loss/hidden": 3.415234375, + "loss/jsd": 0.0, + "loss/logits": 0.18525551967322826, + "step": 3930 + }, + { + "epoch": 0.0985, + "grad_norm": 29.75, + "grad_norm_var": 1.584375, + "learning_rate": 0.0001, + "loss": 7.5498, + "loss/crossentropy": 2.0895790114998816, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.1932330032810569, + "step": 3940 + }, + { + "epoch": 0.09875, + "grad_norm": 30.625, + "grad_norm_var": 25.79765625, + "learning_rate": 0.0001, + "loss": 7.6502, + "loss/crossentropy": 2.1616804771125318, + "loss/hidden": 3.365625, + "loss/jsd": 0.0, + "loss/logits": 0.18905209768563508, + "step": 3950 + }, + { + "epoch": 0.099, + "grad_norm": 30.5, + "grad_norm_var": 28.547916666666666, + "learning_rate": 0.0001, + "loss": 7.3334, + "loss/crossentropy": 2.1435488507151605, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.1910943292081356, + "step": 3960 + }, + { + "epoch": 0.09925, + "grad_norm": 32.75, + "grad_norm_var": 6.3650390625, + "learning_rate": 0.0001, + "loss": 7.542, + "loss/crossentropy": 2.176460310816765, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.18821860365569593, + "step": 3970 + }, + { + "epoch": 0.0995, + "grad_norm": 31.625, + "grad_norm_var": 3.9905598958333335, + "learning_rate": 0.0001, + "loss": 7.5231, + "loss/crossentropy": 2.2077176332473756, + "loss/hidden": 3.4515625, + "loss/jsd": 0.0, + "loss/logits": 0.21911400128155947, + "step": 3980 + }, + { + "epoch": 0.09975, + "grad_norm": 31.125, + "grad_norm_var": 1.75625, + "learning_rate": 0.0001, + "loss": 7.4868, + "loss/crossentropy": 2.105836200714111, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.1997914554551244, + "step": 3990 + }, + { + "epoch": 0.1, + "grad_norm": 38.0, + "grad_norm_var": 4.710416666666666, + "learning_rate": 0.0001, + "loss": 7.5675, + "loss/crossentropy": 2.233233967423439, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.20876242108643056, + "step": 4000 + }, + { + "epoch": 0.10025, + "grad_norm": 28.625, + "grad_norm_var": 7.56640625, + "learning_rate": 0.0001, + "loss": 7.4736, + "loss/crossentropy": 2.103509198874235, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.1953927006572485, + "step": 4010 + }, + { + "epoch": 0.1005, + "grad_norm": 28.875, + "grad_norm_var": 4.119791666666667, + "learning_rate": 0.0001, + "loss": 7.4509, + "loss/crossentropy": 1.9697775058448315, + "loss/hidden": 3.308984375, + "loss/jsd": 0.0, + "loss/logits": 0.17186311883851885, + "step": 4020 + }, + { + "epoch": 0.10075, + "grad_norm": 29.5, + "grad_norm_var": 1.3177083333333333, + "learning_rate": 0.0001, + "loss": 7.333, + "loss/crossentropy": 2.0519870311021804, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.1872571600601077, + "step": 4030 + }, + { + "epoch": 0.101, + "grad_norm": 29.5, + "grad_norm_var": 1.2785807291666667, + "learning_rate": 0.0001, + "loss": 7.3466, + "loss/crossentropy": 2.0663713179528713, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.18582073990255593, + "step": 4040 + }, + { + "epoch": 0.10125, + "grad_norm": 30.375, + "grad_norm_var": 1.9577473958333333, + "learning_rate": 0.0001, + "loss": 7.3812, + "loss/crossentropy": 2.1256399258971213, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.19628962082788348, + "step": 4050 + }, + { + "epoch": 0.1015, + "grad_norm": 30.625, + "grad_norm_var": 0.53125, + "learning_rate": 0.0001, + "loss": 7.3726, + "loss/crossentropy": 2.1235328309237955, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.18646292947232723, + "step": 4060 + }, + { + "epoch": 0.10175, + "grad_norm": 29.0, + "grad_norm_var": 3.19255952647709e+18, + "learning_rate": 0.0001, + "loss": 7.4564, + "loss/crossentropy": 2.0213126331567763, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.19607899691909553, + "step": 4070 + }, + { + "epoch": 0.102, + "grad_norm": 28.75, + "grad_norm_var": 3.48515625, + "learning_rate": 0.0001, + "loss": 7.3886, + "loss/crossentropy": 2.0899658009409903, + "loss/hidden": 3.340625, + "loss/jsd": 0.0, + "loss/logits": 0.1851665174588561, + "step": 4080 + }, + { + "epoch": 0.10225, + "grad_norm": 29.5, + "grad_norm_var": 1.8692057291666666, + "learning_rate": 0.0001, + "loss": 7.4838, + "loss/crossentropy": 2.027493818849325, + "loss/hidden": 3.49765625, + "loss/jsd": 0.0, + "loss/logits": 0.19640162959694862, + "step": 4090 + }, + { + "epoch": 0.1025, + "grad_norm": 29.125, + "grad_norm_var": 11.762434895833334, + "learning_rate": 0.0001, + "loss": 7.5099, + "loss/crossentropy": 2.056584618985653, + "loss/hidden": 3.32265625, + "loss/jsd": 0.0, + "loss/logits": 0.17638762388378382, + "step": 4100 + }, + { + "epoch": 0.10275, + "grad_norm": 30.125, + "grad_norm_var": 12.459375, + "learning_rate": 0.0001, + "loss": 7.5255, + "loss/crossentropy": 2.0713445380330087, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.18587317056953906, + "step": 4110 + }, + { + "epoch": 0.103, + "grad_norm": 33.25, + "grad_norm_var": 1.9958333333333333, + "learning_rate": 0.0001, + "loss": 7.4437, + "loss/crossentropy": 2.2338072419166566, + "loss/hidden": 3.36171875, + "loss/jsd": 0.0, + "loss/logits": 0.18814200926572083, + "step": 4120 + }, + { + "epoch": 0.10325, + "grad_norm": 32.75, + "grad_norm_var": 3.1259765625, + "learning_rate": 0.0001, + "loss": 7.3184, + "loss/crossentropy": 2.0210259817540646, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.18816483654081823, + "step": 4130 + }, + { + "epoch": 0.1035, + "grad_norm": 29.5, + "grad_norm_var": 2.870247395833333, + "learning_rate": 0.0001, + "loss": 7.5124, + "loss/crossentropy": 2.0151045128703116, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.19255878478288652, + "step": 4140 + }, + { + "epoch": 0.10375, + "grad_norm": 30.625, + "grad_norm_var": 1.3926432291666666, + "learning_rate": 0.0001, + "loss": 7.5096, + "loss/crossentropy": 1.9808883003890514, + "loss/hidden": 3.449609375, + "loss/jsd": 0.0, + "loss/logits": 0.19115560222417116, + "step": 4150 + }, + { + "epoch": 0.104, + "grad_norm": 30.75, + "grad_norm_var": 1.6979166666666667, + "learning_rate": 0.0001, + "loss": 7.549, + "loss/crossentropy": 2.1932784736156465, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.20479805655777455, + "step": 4160 + }, + { + "epoch": 0.10425, + "grad_norm": 30.125, + "grad_norm_var": 2.3333333333333335, + "learning_rate": 0.0001, + "loss": 7.3875, + "loss/crossentropy": 1.8820222720503808, + "loss/hidden": 3.337109375, + "loss/jsd": 0.0, + "loss/logits": 0.17310038600116967, + "step": 4170 + }, + { + "epoch": 0.1045, + "grad_norm": 33.0, + "grad_norm_var": 3.7728515625, + "learning_rate": 0.0001, + "loss": 7.4212, + "loss/crossentropy": 2.082476270198822, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.19099258184432982, + "step": 4180 + }, + { + "epoch": 0.10475, + "grad_norm": 30.875, + "grad_norm_var": 11.408268229166667, + "learning_rate": 0.0001, + "loss": 7.4991, + "loss/crossentropy": 2.287242355942726, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.1982285875827074, + "step": 4190 + }, + { + "epoch": 0.105, + "grad_norm": 28.75, + "grad_norm_var": 2.999739583333333, + "learning_rate": 0.0001, + "loss": 7.5959, + "loss/crossentropy": 2.1783332407474516, + "loss/hidden": 3.422265625, + "loss/jsd": 0.0, + "loss/logits": 0.2117959801107645, + "step": 4200 + }, + { + "epoch": 0.10525, + "grad_norm": 30.0, + "grad_norm_var": 4.708268229166666, + "learning_rate": 0.0001, + "loss": 7.3363, + "loss/crossentropy": 1.955865352600813, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.18177355360239744, + "step": 4210 + }, + { + "epoch": 0.1055, + "grad_norm": 30.625, + "grad_norm_var": 3.0254557291666666, + "learning_rate": 0.0001, + "loss": 7.4673, + "loss/crossentropy": 1.833389012515545, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.1878132861107588, + "step": 4220 + }, + { + "epoch": 0.10575, + "grad_norm": 32.0, + "grad_norm_var": 3.05, + "learning_rate": 0.0001, + "loss": 7.3969, + "loss/crossentropy": 1.9096243590116502, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.17025592969730496, + "step": 4230 + }, + { + "epoch": 0.106, + "grad_norm": 30.875, + "grad_norm_var": 1.82265625, + "learning_rate": 0.0001, + "loss": 7.4638, + "loss/crossentropy": 2.0454175233840943, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.20515710916370153, + "step": 4240 + }, + { + "epoch": 0.10625, + "grad_norm": 30.125, + "grad_norm_var": 3.1333333333333333, + "learning_rate": 0.0001, + "loss": 7.5126, + "loss/crossentropy": 2.089062933623791, + "loss/hidden": 3.4328125, + "loss/jsd": 0.0, + "loss/logits": 0.19156677946448325, + "step": 4250 + }, + { + "epoch": 0.1065, + "grad_norm": 29.0, + "grad_norm_var": 4.311393229166667, + "learning_rate": 0.0001, + "loss": 7.4468, + "loss/crossentropy": 2.0564094200730323, + "loss/hidden": 3.433984375, + "loss/jsd": 0.0, + "loss/logits": 0.19553639348596336, + "step": 4260 + }, + { + "epoch": 0.10675, + "grad_norm": 32.0, + "grad_norm_var": 3.2587890625, + "learning_rate": 0.0001, + "loss": 7.4186, + "loss/crossentropy": 2.13806764036417, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.19822277761995793, + "step": 4270 + }, + { + "epoch": 0.107, + "grad_norm": 28.0, + "grad_norm_var": 1.6926432291666667, + "learning_rate": 0.0001, + "loss": 7.4595, + "loss/crossentropy": 2.0767486467957497, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.1884168043732643, + "step": 4280 + }, + { + "epoch": 0.10725, + "grad_norm": 33.0, + "grad_norm_var": 2.3059895833333335, + "learning_rate": 0.0001, + "loss": 7.4481, + "loss/crossentropy": 2.033916361629963, + "loss/hidden": 3.45, + "loss/jsd": 0.0, + "loss/logits": 0.20558829829096795, + "step": 4290 + }, + { + "epoch": 0.1075, + "grad_norm": 31.0, + "grad_norm_var": 2.9375, + "learning_rate": 0.0001, + "loss": 7.4871, + "loss/crossentropy": 2.078028707951307, + "loss/hidden": 3.37578125, + "loss/jsd": 0.0, + "loss/logits": 0.188079852424562, + "step": 4300 + }, + { + "epoch": 0.10775, + "grad_norm": 33.25, + "grad_norm_var": 2.1020833333333333, + "learning_rate": 0.0001, + "loss": 7.5379, + "loss/crossentropy": 2.003500834107399, + "loss/hidden": 3.544921875, + "loss/jsd": 0.0, + "loss/logits": 0.20521650360897184, + "step": 4310 + }, + { + "epoch": 0.108, + "grad_norm": 29.625, + "grad_norm_var": 2.8447916666666666, + "learning_rate": 0.0001, + "loss": 7.3536, + "loss/crossentropy": 2.043112625181675, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.19910661596804857, + "step": 4320 + }, + { + "epoch": 0.10825, + "grad_norm": 28.5, + "grad_norm_var": 4.000455729166666, + "learning_rate": 0.0001, + "loss": 7.3717, + "loss/crossentropy": 2.1422011658549307, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.19375871792435645, + "step": 4330 + }, + { + "epoch": 0.1085, + "grad_norm": 29.0, + "grad_norm_var": 3.6259765625, + "learning_rate": 0.0001, + "loss": 7.5021, + "loss/crossentropy": 2.131446525454521, + "loss/hidden": 3.480078125, + "loss/jsd": 0.0, + "loss/logits": 0.2063008865341544, + "step": 4340 + }, + { + "epoch": 0.10875, + "grad_norm": 32.0, + "grad_norm_var": 5.9525390625, + "learning_rate": 0.0001, + "loss": 7.4749, + "loss/crossentropy": 2.085691845417023, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.1889802658930421, + "step": 4350 + }, + { + "epoch": 0.109, + "grad_norm": 30.75, + "grad_norm_var": 3.154166666666667, + "learning_rate": 0.0001, + "loss": 7.3816, + "loss/crossentropy": 1.8972876839339734, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.17174729090183974, + "step": 4360 + }, + { + "epoch": 0.10925, + "grad_norm": 29.875, + "grad_norm_var": 1.7509765625, + "learning_rate": 0.0001, + "loss": 7.4444, + "loss/crossentropy": 2.127763804793358, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.18679574280977249, + "step": 4370 + }, + { + "epoch": 0.1095, + "grad_norm": 29.875, + "grad_norm_var": 2.16015625, + "learning_rate": 0.0001, + "loss": 7.4682, + "loss/crossentropy": 2.1872297644615175, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.183891461789608, + "step": 4380 + }, + { + "epoch": 0.10975, + "grad_norm": 28.875, + "grad_norm_var": 3.3692057291666666, + "learning_rate": 0.0001, + "loss": 7.429, + "loss/crossentropy": 2.19267495572567, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.20111876968294382, + "step": 4390 + }, + { + "epoch": 0.11, + "grad_norm": 29.375, + "grad_norm_var": 1.6858723958333333, + "learning_rate": 0.0001, + "loss": 7.556, + "loss/crossentropy": 2.1324411287903784, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.2090261412784457, + "step": 4400 + }, + { + "epoch": 0.11025, + "grad_norm": 33.5, + "grad_norm_var": 3.374739583333333, + "learning_rate": 0.0001, + "loss": 7.4081, + "loss/crossentropy": 1.9800483137369156, + "loss/hidden": 3.584375, + "loss/jsd": 0.0, + "loss/logits": 0.19881114605814218, + "step": 4410 + }, + { + "epoch": 0.1105, + "grad_norm": 31.75, + "grad_norm_var": 4.13515625, + "learning_rate": 0.0001, + "loss": 7.4904, + "loss/crossentropy": 2.053773292154074, + "loss/hidden": 3.323828125, + "loss/jsd": 0.0, + "loss/logits": 0.18270381446927786, + "step": 4420 + }, + { + "epoch": 0.11075, + "grad_norm": 30.75, + "grad_norm_var": 2.3499348958333335, + "learning_rate": 0.0001, + "loss": 7.4509, + "loss/crossentropy": 2.0641689248383046, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.19245190378278493, + "step": 4430 + }, + { + "epoch": 0.111, + "grad_norm": 33.25, + "grad_norm_var": 3.158333333333333, + "learning_rate": 0.0001, + "loss": 7.3576, + "loss/crossentropy": 2.073286408931017, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.1892416624352336, + "step": 4440 + }, + { + "epoch": 0.11125, + "grad_norm": 35.75, + "grad_norm_var": 6.167122395833333, + "learning_rate": 0.0001, + "loss": 7.456, + "loss/crossentropy": 2.191167525947094, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.19327596500515937, + "step": 4450 + }, + { + "epoch": 0.1115, + "grad_norm": 28.0, + "grad_norm_var": 6.762239583333334, + "learning_rate": 0.0001, + "loss": 7.4254, + "loss/crossentropy": 1.9917161837220192, + "loss/hidden": 3.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18673346154391765, + "step": 4460 + }, + { + "epoch": 0.11175, + "grad_norm": 31.0, + "grad_norm_var": 2.763541666666667, + "learning_rate": 0.0001, + "loss": 7.4458, + "loss/crossentropy": 2.0167058646678924, + "loss/hidden": 3.477734375, + "loss/jsd": 0.0, + "loss/logits": 0.20151916183531285, + "step": 4470 + }, + { + "epoch": 0.112, + "grad_norm": 30.5, + "grad_norm_var": 7.175455729166667, + "learning_rate": 0.0001, + "loss": 7.4057, + "loss/crossentropy": 2.013149876892567, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.1819242848083377, + "step": 4480 + }, + { + "epoch": 0.11225, + "grad_norm": 43.25, + "grad_norm_var": 13.478580729166667, + "learning_rate": 0.0001, + "loss": 7.4416, + "loss/crossentropy": 2.111778366565704, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.20088088884949684, + "step": 4490 + }, + { + "epoch": 0.1125, + "grad_norm": 30.125, + "grad_norm_var": 11.905143229166667, + "learning_rate": 0.0001, + "loss": 7.4435, + "loss/crossentropy": 2.0223396182060243, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.1967620700597763, + "step": 4500 + }, + { + "epoch": 0.11275, + "grad_norm": 28.375, + "grad_norm_var": 2.2978515625, + "learning_rate": 0.0001, + "loss": 7.3969, + "loss/crossentropy": 1.9966137878596784, + "loss/hidden": 3.416796875, + "loss/jsd": 0.0, + "loss/logits": 0.19062119219452142, + "step": 4510 + }, + { + "epoch": 0.113, + "grad_norm": 29.75, + "grad_norm_var": 3.1759765625, + "learning_rate": 0.0001, + "loss": 7.2845, + "loss/crossentropy": 1.8878834903240205, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.16922880560159684, + "step": 4520 + }, + { + "epoch": 0.11325, + "grad_norm": 33.5, + "grad_norm_var": 3.78515625, + "learning_rate": 0.0001, + "loss": 7.5223, + "loss/crossentropy": 2.0424712359905244, + "loss/hidden": 3.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.18261839263141155, + "step": 4530 + }, + { + "epoch": 0.1135, + "grad_norm": 41.75, + "grad_norm_var": 13.172330729166667, + "learning_rate": 0.0001, + "loss": 7.4917, + "loss/crossentropy": 2.1800880253314974, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.1875661849975586, + "step": 4540 + }, + { + "epoch": 0.11375, + "grad_norm": 29.5, + "grad_norm_var": 13.737239583333333, + "learning_rate": 0.0001, + "loss": 7.4929, + "loss/crossentropy": 2.1130245834589005, + "loss/hidden": 3.514453125, + "loss/jsd": 0.0, + "loss/logits": 0.20742647554725407, + "step": 4550 + }, + { + "epoch": 0.114, + "grad_norm": 31.875, + "grad_norm_var": 3.1447265625, + "learning_rate": 0.0001, + "loss": 7.4885, + "loss/crossentropy": 2.0878429099917413, + "loss/hidden": 3.4625, + "loss/jsd": 0.0, + "loss/logits": 0.19807947240769863, + "step": 4560 + }, + { + "epoch": 0.11425, + "grad_norm": 32.0, + "grad_norm_var": 1.9080729166666666, + "learning_rate": 0.0001, + "loss": 7.412, + "loss/crossentropy": 2.045598204433918, + "loss/hidden": 3.43984375, + "loss/jsd": 0.0, + "loss/logits": 0.19935160782188177, + "step": 4570 + }, + { + "epoch": 0.1145, + "grad_norm": 31.25, + "grad_norm_var": 2.703285650940459e+18, + "learning_rate": 0.0001, + "loss": 7.4112, + "loss/crossentropy": 1.9612677067518234, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1939171139150858, + "step": 4580 + }, + { + "epoch": 0.11475, + "grad_norm": 30.125, + "grad_norm_var": 9.067708333333334, + "learning_rate": 0.0001, + "loss": 7.4109, + "loss/crossentropy": 2.066862888634205, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.20057452656328678, + "step": 4590 + }, + { + "epoch": 0.115, + "grad_norm": 29.25, + "grad_norm_var": 6.670833333333333, + "learning_rate": 0.0001, + "loss": 7.3857, + "loss/crossentropy": 2.0378803849220275, + "loss/hidden": 3.495703125, + "loss/jsd": 0.0, + "loss/logits": 0.19217969439923763, + "step": 4600 + }, + { + "epoch": 0.11525, + "grad_norm": 32.0, + "grad_norm_var": 8.108268229166667, + "learning_rate": 0.0001, + "loss": 7.4449, + "loss/crossentropy": 1.9883966132998467, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.1796421378850937, + "step": 4610 + }, + { + "epoch": 0.1155, + "grad_norm": 28.5, + "grad_norm_var": 2.8853515625, + "learning_rate": 0.0001, + "loss": 7.43, + "loss/crossentropy": 2.2122382700443266, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.20737907551229, + "step": 4620 + }, + { + "epoch": 0.11575, + "grad_norm": 30.375, + "grad_norm_var": 3.7968098958333334, + "learning_rate": 0.0001, + "loss": 7.3858, + "loss/crossentropy": 2.0896764233708383, + "loss/hidden": 3.540234375, + "loss/jsd": 0.0, + "loss/logits": 0.20905990786850454, + "step": 4630 + }, + { + "epoch": 0.116, + "grad_norm": 27.5, + "grad_norm_var": 3.6879557291666667, + "learning_rate": 0.0001, + "loss": 7.5145, + "loss/crossentropy": 2.104724445939064, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.19548750538378953, + "step": 4640 + }, + { + "epoch": 0.11625, + "grad_norm": 29.875, + "grad_norm_var": 8.7056640625, + "learning_rate": 0.0001, + "loss": 7.4009, + "loss/crossentropy": 2.155320603400469, + "loss/hidden": 3.47578125, + "loss/jsd": 0.0, + "loss/logits": 0.2002986514940858, + "step": 4650 + }, + { + "epoch": 0.1165, + "grad_norm": 27.0, + "grad_norm_var": 5.1541015625, + "learning_rate": 0.0001, + "loss": 7.3193, + "loss/crossentropy": 2.085461828112602, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.1905359473079443, + "step": 4660 + }, + { + "epoch": 0.11675, + "grad_norm": 30.5, + "grad_norm_var": 1.5926432291666666, + "learning_rate": 0.0001, + "loss": 7.3125, + "loss/crossentropy": 1.9927285239100456, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.17640038076788186, + "step": 4670 + }, + { + "epoch": 0.117, + "grad_norm": 33.75, + "grad_norm_var": 4.747330729166666, + "learning_rate": 0.0001, + "loss": 7.469, + "loss/crossentropy": 2.1633560836315153, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.1862495567649603, + "step": 4680 + }, + { + "epoch": 0.11725, + "grad_norm": 28.25, + "grad_norm_var": 7.198372395833333, + "learning_rate": 0.0001, + "loss": 7.4318, + "loss/crossentropy": 2.2390024289488792, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.2097862558439374, + "step": 4690 + }, + { + "epoch": 0.1175, + "grad_norm": 31.375, + "grad_norm_var": 5.760872395833333, + "learning_rate": 0.0001, + "loss": 7.4669, + "loss/crossentropy": 2.0608770951628683, + "loss/hidden": 3.4390625, + "loss/jsd": 0.0, + "loss/logits": 0.19615320730954408, + "step": 4700 + }, + { + "epoch": 0.11775, + "grad_norm": 34.25, + "grad_norm_var": 4.1894735190686346e+18, + "learning_rate": 0.0001, + "loss": 7.4596, + "loss/crossentropy": 2.0900899082422257, + "loss/hidden": 3.360546875, + "loss/jsd": 0.0, + "loss/logits": 0.17933723451569678, + "step": 4710 + }, + { + "epoch": 0.118, + "grad_norm": 29.625, + "grad_norm_var": 58.10729166666667, + "learning_rate": 0.0001, + "loss": 7.3979, + "loss/crossentropy": 2.094898019731045, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.20720194689929486, + "step": 4720 + }, + { + "epoch": 0.11825, + "grad_norm": 30.25, + "grad_norm_var": 1.98515625, + "learning_rate": 0.0001, + "loss": 7.4519, + "loss/crossentropy": 2.083225329220295, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.20777787994593383, + "step": 4730 + }, + { + "epoch": 0.1185, + "grad_norm": 30.375, + "grad_norm_var": 4.818684895833333, + "learning_rate": 0.0001, + "loss": 7.4795, + "loss/crossentropy": 2.1974314540624618, + "loss/hidden": 3.38046875, + "loss/jsd": 0.0, + "loss/logits": 0.19978385213762523, + "step": 4740 + }, + { + "epoch": 0.11875, + "grad_norm": 32.5, + "grad_norm_var": 3.439322916666667, + "learning_rate": 0.0001, + "loss": 7.3843, + "loss/crossentropy": 1.9562335655093193, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.18924889974296094, + "step": 4750 + }, + { + "epoch": 0.119, + "grad_norm": 30.625, + "grad_norm_var": 1.3015402743274143e+18, + "learning_rate": 0.0001, + "loss": 7.5729, + "loss/crossentropy": 2.0693807609379293, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.18801879994571208, + "step": 4760 + }, + { + "epoch": 0.11925, + "grad_norm": 35.25, + "grad_norm_var": 258.8791015625, + "learning_rate": 0.0001, + "loss": 7.3013, + "loss/crossentropy": 2.0631250627338886, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.18974527437239885, + "step": 4770 + }, + { + "epoch": 0.1195, + "grad_norm": 28.625, + "grad_norm_var": 301.52233072916664, + "learning_rate": 0.0001, + "loss": 7.4639, + "loss/crossentropy": 2.1473939388990404, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.19722200892865657, + "step": 4780 + }, + { + "epoch": 0.11975, + "grad_norm": 31.125, + "grad_norm_var": 25.472330729166668, + "learning_rate": 0.0001, + "loss": 7.3161, + "loss/crossentropy": 2.1767601929605007, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.20041130091995002, + "step": 4790 + }, + { + "epoch": 0.12, + "grad_norm": 29.5, + "grad_norm_var": 2.8580729166666665, + "learning_rate": 0.0001, + "loss": 7.3077, + "loss/crossentropy": 2.0214909121394156, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.19553480856120586, + "step": 4800 + }, + { + "epoch": 0.12025, + "grad_norm": 34.25, + "grad_norm_var": 2.3666015625, + "learning_rate": 0.0001, + "loss": 7.4537, + "loss/crossentropy": 2.092876334488392, + "loss/hidden": 3.276171875, + "loss/jsd": 0.0, + "loss/logits": 0.19079044535756112, + "step": 4810 + }, + { + "epoch": 0.1205, + "grad_norm": 28.75, + "grad_norm_var": 2.1494140625, + "learning_rate": 0.0001, + "loss": 7.3579, + "loss/crossentropy": 2.159788618981838, + "loss/hidden": 3.447265625, + "loss/jsd": 0.0, + "loss/logits": 0.20938555523753166, + "step": 4820 + }, + { + "epoch": 0.12075, + "grad_norm": 31.625, + "grad_norm_var": 1.2635411529466906e+18, + "learning_rate": 0.0001, + "loss": 7.3822, + "loss/crossentropy": 2.221826246380806, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.18899439387023448, + "step": 4830 + }, + { + "epoch": 0.121, + "grad_norm": 29.375, + "grad_norm_var": 7.171875, + "learning_rate": 0.0001, + "loss": 7.3649, + "loss/crossentropy": 2.2076950490474703, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.1911212421953678, + "step": 4840 + }, + { + "epoch": 0.12125, + "grad_norm": 28.875, + "grad_norm_var": 5.397916666666666, + "learning_rate": 0.0001, + "loss": 7.2934, + "loss/crossentropy": 2.1398009806871414, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.18104367554187775, + "step": 4850 + }, + { + "epoch": 0.1215, + "grad_norm": 33.25, + "grad_norm_var": 2.292122395833333, + "learning_rate": 0.0001, + "loss": 7.3944, + "loss/crossentropy": 2.0568679124116898, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.19066975675523282, + "step": 4860 + }, + { + "epoch": 0.12175, + "grad_norm": 31.75, + "grad_norm_var": 1.5145182291666666, + "learning_rate": 0.0001, + "loss": 7.5365, + "loss/crossentropy": 2.2600763499736787, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.20988074019551278, + "step": 4870 + }, + { + "epoch": 0.122, + "grad_norm": 30.125, + "grad_norm_var": 0.8442057291666667, + "learning_rate": 0.0001, + "loss": 7.4425, + "loss/crossentropy": 2.087808459997177, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.20126468148082494, + "step": 4880 + }, + { + "epoch": 0.12225, + "grad_norm": 29.25, + "grad_norm_var": 1.9455729166666667, + "learning_rate": 0.0001, + "loss": 7.4649, + "loss/crossentropy": 2.089573635160923, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.18984669484198094, + "step": 4890 + }, + { + "epoch": 0.1225, + "grad_norm": 29.125, + "grad_norm_var": 2.7552083333333335, + "learning_rate": 0.0001, + "loss": 7.4894, + "loss/crossentropy": 2.1424145482480528, + "loss/hidden": 3.47890625, + "loss/jsd": 0.0, + "loss/logits": 0.20886036530137062, + "step": 4900 + }, + { + "epoch": 0.12275, + "grad_norm": 31.0, + "grad_norm_var": 4.751497395833334, + "learning_rate": 0.0001, + "loss": 7.5033, + "loss/crossentropy": 2.104494086652994, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.1945918256416917, + "step": 4910 + }, + { + "epoch": 0.123, + "grad_norm": 28.125, + "grad_norm_var": 5.330989583333333, + "learning_rate": 0.0001, + "loss": 7.4954, + "loss/crossentropy": 2.0843611776828768, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.1925347488373518, + "step": 4920 + }, + { + "epoch": 0.12325, + "grad_norm": 28.625, + "grad_norm_var": 3.8166015625, + "learning_rate": 0.0001, + "loss": 7.4404, + "loss/crossentropy": 2.205425333976746, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.18580489940941333, + "step": 4930 + }, + { + "epoch": 0.1235, + "grad_norm": 29.375, + "grad_norm_var": 14.980208333333334, + "learning_rate": 0.0001, + "loss": 7.3481, + "loss/crossentropy": 1.9896500617265702, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.1904701752588153, + "step": 4940 + }, + { + "epoch": 0.12375, + "grad_norm": 32.75, + "grad_norm_var": 19.178580729166665, + "learning_rate": 0.0001, + "loss": 7.5252, + "loss/crossentropy": 2.1207278318703175, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.19760717861354352, + "step": 4950 + }, + { + "epoch": 0.124, + "grad_norm": 32.5, + "grad_norm_var": 17.264583333333334, + "learning_rate": 0.0001, + "loss": 7.2678, + "loss/crossentropy": 1.9271991185843944, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.19860625620931388, + "step": 4960 + }, + { + "epoch": 0.12425, + "grad_norm": 28.625, + "grad_norm_var": 11.196809895833333, + "learning_rate": 0.0001, + "loss": 7.3703, + "loss/crossentropy": 2.0659097760915754, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.18224728610366583, + "step": 4970 + }, + { + "epoch": 0.1245, + "grad_norm": 37.75, + "grad_norm_var": 10.03515625, + "learning_rate": 0.0001, + "loss": 7.5041, + "loss/crossentropy": 1.9809176340699195, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.19965030066668987, + "step": 4980 + }, + { + "epoch": 0.12475, + "grad_norm": 27.125, + "grad_norm_var": 11.567708333333334, + "learning_rate": 0.0001, + "loss": 7.327, + "loss/crossentropy": 2.0197409205138683, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.18525638189166785, + "step": 4990 + }, + { + "epoch": 0.125, + "grad_norm": 34.75, + "grad_norm_var": 8.558268229166666, + "learning_rate": 0.0001, + "loss": 7.393, + "loss/crossentropy": 2.100055608153343, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.19607669236138464, + "step": 5000 + }, + { + "epoch": 0.12525, + "grad_norm": 29.75, + "grad_norm_var": 5.602083333333334, + "learning_rate": 0.0001, + "loss": 7.4165, + "loss/crossentropy": 1.9898378394544125, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.18131834492087365, + "step": 5010 + }, + { + "epoch": 0.1255, + "grad_norm": 34.75, + "grad_norm_var": 5.866666666666666, + "learning_rate": 0.0001, + "loss": 7.555, + "loss/crossentropy": 2.086017055809498, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.1889553153887391, + "step": 5020 + }, + { + "epoch": 0.12575, + "grad_norm": 33.25, + "grad_norm_var": 9.083268229166666, + "learning_rate": 0.0001, + "loss": 7.4098, + "loss/crossentropy": 2.1133529357612133, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.1881294794380665, + "step": 5030 + }, + { + "epoch": 0.126, + "grad_norm": 37.25, + "grad_norm_var": 14.268489583333333, + "learning_rate": 0.0001, + "loss": 7.4117, + "loss/crossentropy": 2.1818468660116195, + "loss/hidden": 3.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.19351670220494271, + "step": 5040 + }, + { + "epoch": 0.12625, + "grad_norm": 27.75, + "grad_norm_var": 15.270572916666667, + "learning_rate": 0.0001, + "loss": 7.448, + "loss/crossentropy": 2.133790023624897, + "loss/hidden": 3.267578125, + "loss/jsd": 0.0, + "loss/logits": 0.17900315206497908, + "step": 5050 + }, + { + "epoch": 0.1265, + "grad_norm": 27.75, + "grad_norm_var": 12.469791666666667, + "learning_rate": 0.0001, + "loss": 7.4514, + "loss/crossentropy": 2.013399636745453, + "loss/hidden": 3.502734375, + "loss/jsd": 0.0, + "loss/logits": 0.19984339475631713, + "step": 5060 + }, + { + "epoch": 0.12675, + "grad_norm": 28.625, + "grad_norm_var": 6.479622395833333, + "learning_rate": 0.0001, + "loss": 7.4411, + "loss/crossentropy": 2.1552533119916917, + "loss/hidden": 3.4265625, + "loss/jsd": 0.0, + "loss/logits": 0.20999168753623962, + "step": 5070 + }, + { + "epoch": 0.127, + "grad_norm": 30.125, + "grad_norm_var": 4.280989583333334, + "learning_rate": 0.0001, + "loss": 7.4238, + "loss/crossentropy": 2.1047082796692846, + "loss/hidden": 3.44375, + "loss/jsd": 0.0, + "loss/logits": 0.19757428932935, + "step": 5080 + }, + { + "epoch": 0.12725, + "grad_norm": 29.25, + "grad_norm_var": 3.971875, + "learning_rate": 0.0001, + "loss": 7.547, + "loss/crossentropy": 2.2064288735389708, + "loss/hidden": 3.472265625, + "loss/jsd": 0.0, + "loss/logits": 0.2037733059376478, + "step": 5090 + }, + { + "epoch": 0.1275, + "grad_norm": 30.125, + "grad_norm_var": 3.809309895833333, + "learning_rate": 0.0001, + "loss": 7.5359, + "loss/crossentropy": 2.307460626959801, + "loss/hidden": 3.405078125, + "loss/jsd": 0.0, + "loss/logits": 0.2208320491015911, + "step": 5100 + }, + { + "epoch": 0.12775, + "grad_norm": 30.0, + "grad_norm_var": 6.887434895833334, + "learning_rate": 0.0001, + "loss": 7.3695, + "loss/crossentropy": 2.1241589702665804, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.18603499811142682, + "step": 5110 + }, + { + "epoch": 0.128, + "grad_norm": 30.125, + "grad_norm_var": 1.8353515625, + "learning_rate": 0.0001, + "loss": 7.4045, + "loss/crossentropy": 2.1248120576143266, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.19218573588877916, + "step": 5120 + }, + { + "epoch": 0.12825, + "grad_norm": 30.875, + "grad_norm_var": 18.843489583333334, + "learning_rate": 0.0001, + "loss": 7.3683, + "loss/crossentropy": 2.0221078641712666, + "loss/hidden": 3.396484375, + "loss/jsd": 0.0, + "loss/logits": 0.19441262539476156, + "step": 5130 + }, + { + "epoch": 0.1285, + "grad_norm": 30.25, + "grad_norm_var": 19.755989583333335, + "learning_rate": 0.0001, + "loss": 7.4467, + "loss/crossentropy": 2.0746863678097727, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.20940047055482863, + "step": 5140 + }, + { + "epoch": 0.12875, + "grad_norm": 29.75, + "grad_norm_var": 7.226497395833333, + "learning_rate": 0.0001, + "loss": 7.4125, + "loss/crossentropy": 2.127023458480835, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.19320496991276742, + "step": 5150 + }, + { + "epoch": 0.129, + "grad_norm": 30.875, + "grad_norm_var": 8.332747395833334, + "learning_rate": 0.0001, + "loss": 7.3609, + "loss/crossentropy": 2.0404578357934953, + "loss/hidden": 3.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.18524497244507074, + "step": 5160 + }, + { + "epoch": 0.12925, + "grad_norm": 30.375, + "grad_norm_var": 5.566080729166667, + "learning_rate": 0.0001, + "loss": 7.3806, + "loss/crossentropy": 2.05174797475338, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.20412184661254287, + "step": 5170 + }, + { + "epoch": 0.1295, + "grad_norm": 30.875, + "grad_norm_var": 72.65201822916667, + "learning_rate": 0.0001, + "loss": 7.4596, + "loss/crossentropy": 2.0945761643350123, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.1920377543196082, + "step": 5180 + }, + { + "epoch": 0.12975, + "grad_norm": 34.25, + "grad_norm_var": 1.8330729166666666, + "learning_rate": 0.0001, + "loss": 7.3953, + "loss/crossentropy": 2.0848742216825484, + "loss/hidden": 3.548046875, + "loss/jsd": 0.0, + "loss/logits": 0.22283064387738705, + "step": 5190 + }, + { + "epoch": 0.13, + "grad_norm": 31.25, + "grad_norm_var": 2.4244140625, + "learning_rate": 0.0001, + "loss": 7.5061, + "loss/crossentropy": 1.997230054438114, + "loss/hidden": 3.488671875, + "loss/jsd": 0.0, + "loss/logits": 0.198976163379848, + "step": 5200 + }, + { + "epoch": 0.13025, + "grad_norm": 42.0, + "grad_norm_var": 9.762239583333333, + "learning_rate": 0.0001, + "loss": 7.523, + "loss/crossentropy": 2.169138702750206, + "loss/hidden": 3.458984375, + "loss/jsd": 0.0, + "loss/logits": 0.2180183682590723, + "step": 5210 + }, + { + "epoch": 0.1305, + "grad_norm": 27.75, + "grad_norm_var": 13.330989583333333, + "learning_rate": 0.0001, + "loss": 7.4174, + "loss/crossentropy": 2.0436717979609966, + "loss/hidden": 3.4265625, + "loss/jsd": 0.0, + "loss/logits": 0.18874377477914095, + "step": 5220 + }, + { + "epoch": 0.13075, + "grad_norm": 33.5, + "grad_norm_var": 5.186393229166667, + "learning_rate": 0.0001, + "loss": 7.4735, + "loss/crossentropy": 2.1061771392822264, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.1841479053720832, + "step": 5230 + }, + { + "epoch": 0.131, + "grad_norm": 28.25, + "grad_norm_var": 3.480208333333333, + "learning_rate": 0.0001, + "loss": 7.4379, + "loss/crossentropy": 1.9957973182201385, + "loss/hidden": 3.46171875, + "loss/jsd": 0.0, + "loss/logits": 0.19976749327033758, + "step": 5240 + }, + { + "epoch": 0.13125, + "grad_norm": 31.0, + "grad_norm_var": 2.2249348958333335, + "learning_rate": 0.0001, + "loss": 7.445, + "loss/crossentropy": 2.089694794267416, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.18123079631477595, + "step": 5250 + }, + { + "epoch": 0.1315, + "grad_norm": 30.625, + "grad_norm_var": 2.062239583333333, + "learning_rate": 0.0001, + "loss": 7.3525, + "loss/crossentropy": 2.096596322953701, + "loss/hidden": 3.3875, + "loss/jsd": 0.0, + "loss/logits": 0.17838086038827897, + "step": 5260 + }, + { + "epoch": 0.13175, + "grad_norm": 27.75, + "grad_norm_var": 2.6619140625, + "learning_rate": 0.0001, + "loss": 7.4042, + "loss/crossentropy": 2.086874121427536, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.18623477015644313, + "step": 5270 + }, + { + "epoch": 0.132, + "grad_norm": 31.125, + "grad_norm_var": 1.8416015625, + "learning_rate": 0.0001, + "loss": 7.3634, + "loss/crossentropy": 2.131754931807518, + "loss/hidden": 3.416796875, + "loss/jsd": 0.0, + "loss/logits": 0.20106223467737436, + "step": 5280 + }, + { + "epoch": 0.13225, + "grad_norm": 32.5, + "grad_norm_var": 6.598372395833334, + "learning_rate": 0.0001, + "loss": 7.4969, + "loss/crossentropy": 2.1548122704029082, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.20439809635281564, + "step": 5290 + }, + { + "epoch": 0.1325, + "grad_norm": 28.125, + "grad_norm_var": 5.252083333333333, + "learning_rate": 0.0001, + "loss": 7.4373, + "loss/crossentropy": 2.1770398393273354, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.19656166546046733, + "step": 5300 + }, + { + "epoch": 0.13275, + "grad_norm": 29.875, + "grad_norm_var": 1.2473307291666667, + "learning_rate": 0.0001, + "loss": 7.4308, + "loss/crossentropy": 2.152033807337284, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.18985964702442287, + "step": 5310 + }, + { + "epoch": 0.133, + "grad_norm": 31.625, + "grad_norm_var": 1.0041666666666667, + "learning_rate": 0.0001, + "loss": 7.4327, + "loss/crossentropy": 2.087932828068733, + "loss/hidden": 3.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.184254783205688, + "step": 5320 + }, + { + "epoch": 0.13325, + "grad_norm": 31.0, + "grad_norm_var": 2.4344770491390623e+18, + "learning_rate": 0.0001, + "loss": 7.3366, + "loss/crossentropy": 2.034941144287586, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.1904723599553108, + "step": 5330 + }, + { + "epoch": 0.1335, + "grad_norm": 31.5, + "grad_norm_var": 6.255989583333333, + "learning_rate": 0.0001, + "loss": 7.4492, + "loss/crossentropy": 2.152811796963215, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.18841406889259815, + "step": 5340 + }, + { + "epoch": 0.13375, + "grad_norm": 31.625, + "grad_norm_var": 1.6822916666666667, + "learning_rate": 0.0001, + "loss": 7.465, + "loss/crossentropy": 2.1877569228410723, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.19606791157275438, + "step": 5350 + }, + { + "epoch": 0.134, + "grad_norm": 30.25, + "grad_norm_var": 2.17890625, + "learning_rate": 0.0001, + "loss": 7.3975, + "loss/crossentropy": 2.0353255167603495, + "loss/hidden": 3.340234375, + "loss/jsd": 0.0, + "loss/logits": 0.1815076546743512, + "step": 5360 + }, + { + "epoch": 0.13425, + "grad_norm": 31.0, + "grad_norm_var": 12.4525390625, + "learning_rate": 0.0001, + "loss": 7.3462, + "loss/crossentropy": 1.919140312820673, + "loss/hidden": 3.370703125, + "loss/jsd": 0.0, + "loss/logits": 0.18229803508147596, + "step": 5370 + }, + { + "epoch": 0.1345, + "grad_norm": 29.625, + "grad_norm_var": 7.26640625, + "learning_rate": 0.0001, + "loss": 7.506, + "loss/crossentropy": 2.11019846200943, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.19277678560465575, + "step": 5380 + }, + { + "epoch": 0.13475, + "grad_norm": 32.0, + "grad_norm_var": 0.9275390625, + "learning_rate": 0.0001, + "loss": 7.3668, + "loss/crossentropy": 2.1108837127685547, + "loss/hidden": 3.493359375, + "loss/jsd": 0.0, + "loss/logits": 0.20013203900307416, + "step": 5390 + }, + { + "epoch": 0.135, + "grad_norm": 28.125, + "grad_norm_var": 1.7330729166666667, + "learning_rate": 0.0001, + "loss": 7.5524, + "loss/crossentropy": 2.16382010653615, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.1924815428443253, + "step": 5400 + }, + { + "epoch": 0.13525, + "grad_norm": 30.25, + "grad_norm_var": 2.5893229166666667, + "learning_rate": 0.0001, + "loss": 7.5393, + "loss/crossentropy": 2.0622613176703455, + "loss/hidden": 3.498828125, + "loss/jsd": 0.0, + "loss/logits": 0.20035731326788664, + "step": 5410 + }, + { + "epoch": 0.1355, + "grad_norm": 29.625, + "grad_norm_var": 2.8499348958333335, + "learning_rate": 0.0001, + "loss": 7.3466, + "loss/crossentropy": 2.169532992690802, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.2085475005209446, + "step": 5420 + }, + { + "epoch": 0.13575, + "grad_norm": 31.0, + "grad_norm_var": 2.8212890625, + "learning_rate": 0.0001, + "loss": 7.4299, + "loss/crossentropy": 2.056758251786232, + "loss/hidden": 3.460546875, + "loss/jsd": 0.0, + "loss/logits": 0.19217969793826342, + "step": 5430 + }, + { + "epoch": 0.136, + "grad_norm": 32.25, + "grad_norm_var": 3.9749348958333335, + "learning_rate": 0.0001, + "loss": 7.4334, + "loss/crossentropy": 2.1805212616920473, + "loss/hidden": 3.437109375, + "loss/jsd": 0.0, + "loss/logits": 0.21980819348245859, + "step": 5440 + }, + { + "epoch": 0.13625, + "grad_norm": 29.5, + "grad_norm_var": 2.0218098958333335, + "learning_rate": 0.0001, + "loss": 7.5382, + "loss/crossentropy": 2.1516773015260697, + "loss/hidden": 3.50703125, + "loss/jsd": 0.0, + "loss/logits": 0.208776849322021, + "step": 5450 + }, + { + "epoch": 0.1365, + "grad_norm": 31.375, + "grad_norm_var": 2.082747395833333, + "learning_rate": 0.0001, + "loss": 7.4684, + "loss/crossentropy": 2.1602507561445234, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.1842126866802573, + "step": 5460 + }, + { + "epoch": 0.13675, + "grad_norm": 31.25, + "grad_norm_var": 2.6197265625, + "learning_rate": 0.0001, + "loss": 7.3823, + "loss/crossentropy": 2.081377077102661, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.1906685210764408, + "step": 5470 + }, + { + "epoch": 0.137, + "grad_norm": 30.875, + "grad_norm_var": 2.21875, + "learning_rate": 0.0001, + "loss": 7.4128, + "loss/crossentropy": 2.138934540748596, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.18631890565156936, + "step": 5480 + }, + { + "epoch": 0.13725, + "grad_norm": 31.0, + "grad_norm_var": 3.958268229166667, + "learning_rate": 0.0001, + "loss": 7.568, + "loss/crossentropy": 2.02208868265152, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.19863407909870148, + "step": 5490 + }, + { + "epoch": 0.1375, + "grad_norm": 33.0, + "grad_norm_var": 6.1775390625, + "learning_rate": 0.0001, + "loss": 7.3806, + "loss/crossentropy": 2.0247954726219177, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.18538292730227113, + "step": 5500 + }, + { + "epoch": 0.13775, + "grad_norm": 30.625, + "grad_norm_var": 6.076041666666667, + "learning_rate": 0.0001, + "loss": 7.6241, + "loss/crossentropy": 2.2269895624369385, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.197488261340186, + "step": 5510 + }, + { + "epoch": 0.138, + "grad_norm": 30.25, + "grad_norm_var": 4.1947265625, + "learning_rate": 0.0001, + "loss": 7.4968, + "loss/crossentropy": 2.077942840754986, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.19135653134435415, + "step": 5520 + }, + { + "epoch": 0.13825, + "grad_norm": 27.75, + "grad_norm_var": 2.701041666666667, + "learning_rate": 0.0001, + "loss": 7.3842, + "loss/crossentropy": 2.104434663057327, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.18928063409402968, + "step": 5530 + }, + { + "epoch": 0.1385, + "grad_norm": 31.5, + "grad_norm_var": 6.4087890625, + "learning_rate": 0.0001, + "loss": 7.443, + "loss/crossentropy": 2.0420807294547556, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.18584198467433452, + "step": 5540 + }, + { + "epoch": 0.13875, + "grad_norm": 32.5, + "grad_norm_var": 4.7556640625, + "learning_rate": 0.0001, + "loss": 7.418, + "loss/crossentropy": 2.0941856279969215, + "loss/hidden": 3.311328125, + "loss/jsd": 0.0, + "loss/logits": 0.18290557386353612, + "step": 5550 + }, + { + "epoch": 0.139, + "grad_norm": 34.0, + "grad_norm_var": 3.0947916666666666, + "learning_rate": 0.0001, + "loss": 7.3594, + "loss/crossentropy": 2.1482032746076585, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19914243686944247, + "step": 5560 + }, + { + "epoch": 0.13925, + "grad_norm": 31.875, + "grad_norm_var": 4.449739583333334, + "learning_rate": 0.0001, + "loss": 7.5652, + "loss/crossentropy": 2.150991679728031, + "loss/hidden": 3.4984375, + "loss/jsd": 0.0, + "loss/logits": 0.20609580241143705, + "step": 5570 + }, + { + "epoch": 0.1395, + "grad_norm": 30.125, + "grad_norm_var": 8.527083333333334, + "learning_rate": 0.0001, + "loss": 7.3112, + "loss/crossentropy": 2.1712302803993224, + "loss/hidden": 3.38203125, + "loss/jsd": 0.0, + "loss/logits": 0.18877983894199132, + "step": 5580 + }, + { + "epoch": 0.13975, + "grad_norm": 28.5, + "grad_norm_var": 2.1197265625, + "learning_rate": 0.0001, + "loss": 7.4033, + "loss/crossentropy": 2.1502134561538697, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.21474836114794016, + "step": 5590 + }, + { + "epoch": 0.14, + "grad_norm": 30.25, + "grad_norm_var": 3.6809895833333335, + "learning_rate": 0.0001, + "loss": 7.5089, + "loss/crossentropy": 2.230164831876755, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.2127472611144185, + "step": 5600 + }, + { + "epoch": 0.14025, + "grad_norm": 30.625, + "grad_norm_var": 52.13333333333333, + "learning_rate": 0.0001, + "loss": 7.5328, + "loss/crossentropy": 2.1207681491971018, + "loss/hidden": 3.459765625, + "loss/jsd": 0.0, + "loss/logits": 0.20428987089544534, + "step": 5610 + }, + { + "epoch": 0.1405, + "grad_norm": 30.625, + "grad_norm_var": 2.037239583333333, + "learning_rate": 0.0001, + "loss": 7.3972, + "loss/crossentropy": 2.065328547358513, + "loss/hidden": 3.43671875, + "loss/jsd": 0.0, + "loss/logits": 0.19245364069938659, + "step": 5620 + }, + { + "epoch": 0.14075, + "grad_norm": 29.375, + "grad_norm_var": 2.40625, + "learning_rate": 0.0001, + "loss": 7.2803, + "loss/crossentropy": 2.0791175961494446, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.1857742078602314, + "step": 5630 + }, + { + "epoch": 0.141, + "grad_norm": 29.0, + "grad_norm_var": 2.687239583333333, + "learning_rate": 0.0001, + "loss": 7.3324, + "loss/crossentropy": 2.054654690623283, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.18879605047404766, + "step": 5640 + }, + { + "epoch": 0.14125, + "grad_norm": 27.625, + "grad_norm_var": 3.06015625, + "learning_rate": 0.0001, + "loss": 7.4804, + "loss/crossentropy": 2.163857588917017, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.2018537001684308, + "step": 5650 + }, + { + "epoch": 0.1415, + "grad_norm": 31.625, + "grad_norm_var": 3.54140625, + "learning_rate": 0.0001, + "loss": 7.4337, + "loss/crossentropy": 2.1230690620839594, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.18610329292714595, + "step": 5660 + }, + { + "epoch": 0.14175, + "grad_norm": 30.0, + "grad_norm_var": 2.0171223958333333, + "learning_rate": 0.0001, + "loss": 7.484, + "loss/crossentropy": 2.1232656478881835, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.18209880087524652, + "step": 5670 + }, + { + "epoch": 0.142, + "grad_norm": 30.625, + "grad_norm_var": 3.6244140625, + "learning_rate": 0.0001, + "loss": 7.3601, + "loss/crossentropy": 1.9925632011145353, + "loss/hidden": 3.487109375, + "loss/jsd": 0.0, + "loss/logits": 0.18384792990982532, + "step": 5680 + }, + { + "epoch": 0.14225, + "grad_norm": 28.875, + "grad_norm_var": 4.595247395833334, + "learning_rate": 0.0001, + "loss": 7.3492, + "loss/crossentropy": 1.9747695334255695, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.18710496351122857, + "step": 5690 + }, + { + "epoch": 0.1425, + "grad_norm": 30.0, + "grad_norm_var": 5.160872395833334, + "learning_rate": 0.0001, + "loss": 7.1869, + "loss/crossentropy": 1.9229816131293773, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.1764959843829274, + "step": 5700 + }, + { + "epoch": 0.14275, + "grad_norm": 31.375, + "grad_norm_var": 1.1177083333333333, + "learning_rate": 0.0001, + "loss": 7.5353, + "loss/crossentropy": 2.1759460479021073, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.2011772884055972, + "step": 5710 + }, + { + "epoch": 0.143, + "grad_norm": 30.875, + "grad_norm_var": 19.32265625, + "learning_rate": 0.0001, + "loss": 7.4364, + "loss/crossentropy": 1.9983633741736413, + "loss/hidden": 3.44453125, + "loss/jsd": 0.0, + "loss/logits": 0.2036839971318841, + "step": 5720 + }, + { + "epoch": 0.14325, + "grad_norm": 31.75, + "grad_norm_var": 6.330989583333333, + "learning_rate": 0.0001, + "loss": 7.3473, + "loss/crossentropy": 2.2608693316578865, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.20173839703202248, + "step": 5730 + }, + { + "epoch": 0.1435, + "grad_norm": 29.0, + "grad_norm_var": 30.8306640625, + "learning_rate": 0.0001, + "loss": 7.4861, + "loss/crossentropy": 2.191919285058975, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.1848000530153513, + "step": 5740 + }, + { + "epoch": 0.14375, + "grad_norm": 32.5, + "grad_norm_var": 8.167643229166666, + "learning_rate": 0.0001, + "loss": 7.5179, + "loss/crossentropy": 2.098912109434605, + "loss/hidden": 3.544921875, + "loss/jsd": 0.0, + "loss/logits": 0.22364525627344847, + "step": 5750 + }, + { + "epoch": 0.144, + "grad_norm": 30.5, + "grad_norm_var": 2.5479166666666666, + "learning_rate": 0.0001, + "loss": 7.4241, + "loss/crossentropy": 2.089163874089718, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.19125983892008663, + "step": 5760 + }, + { + "epoch": 0.14425, + "grad_norm": 29.875, + "grad_norm_var": 8.223958333333334, + "learning_rate": 0.0001, + "loss": 7.45, + "loss/crossentropy": 2.2600366115570067, + "loss/hidden": 3.444140625, + "loss/jsd": 0.0, + "loss/logits": 0.1984367400407791, + "step": 5770 + }, + { + "epoch": 0.1445, + "grad_norm": 32.0, + "grad_norm_var": 14.806705729166667, + "learning_rate": 0.0001, + "loss": 7.3632, + "loss/crossentropy": 1.9510320864617825, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.18547183061018585, + "step": 5780 + }, + { + "epoch": 0.14475, + "grad_norm": 30.0, + "grad_norm_var": 9.655989583333334, + "learning_rate": 0.0001, + "loss": 7.5503, + "loss/crossentropy": 2.143619356304407, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.20010631643235682, + "step": 5790 + }, + { + "epoch": 0.145, + "grad_norm": 28.375, + "grad_norm_var": 9.556184895833333, + "learning_rate": 0.0001, + "loss": 7.5076, + "loss/crossentropy": 1.9316529139876366, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.18306834027171134, + "step": 5800 + }, + { + "epoch": 0.14525, + "grad_norm": 37.25, + "grad_norm_var": 10.718489583333334, + "learning_rate": 0.0001, + "loss": 7.5004, + "loss/crossentropy": 2.136544609069824, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.20682645812630654, + "step": 5810 + }, + { + "epoch": 0.1455, + "grad_norm": 34.75, + "grad_norm_var": 9.395572916666667, + "learning_rate": 0.0001, + "loss": 7.4109, + "loss/crossentropy": 2.0811544865369798, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.187607554346323, + "step": 5820 + }, + { + "epoch": 0.14575, + "grad_norm": 30.125, + "grad_norm_var": 11.476041666666667, + "learning_rate": 0.0001, + "loss": 7.4615, + "loss/crossentropy": 2.2464685887098312, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.21834317222237587, + "step": 5830 + }, + { + "epoch": 0.146, + "grad_norm": 31.875, + "grad_norm_var": 8.106705729166666, + "learning_rate": 0.0001, + "loss": 7.379, + "loss/crossentropy": 2.1494223892688753, + "loss/hidden": 3.4875, + "loss/jsd": 0.0, + "loss/logits": 0.20599585752934219, + "step": 5840 + }, + { + "epoch": 0.14625, + "grad_norm": 32.25, + "grad_norm_var": 119.0541015625, + "learning_rate": 0.0001, + "loss": 7.4223, + "loss/crossentropy": 2.013238602876663, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.17934355642646552, + "step": 5850 + }, + { + "epoch": 0.1465, + "grad_norm": 55.5, + "grad_norm_var": 40.619205729166666, + "learning_rate": 0.0001, + "loss": 7.4766, + "loss/crossentropy": 2.1309464499354362, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1953899236395955, + "step": 5860 + }, + { + "epoch": 0.14675, + "grad_norm": 31.125, + "grad_norm_var": 51.764322916666664, + "learning_rate": 0.0001, + "loss": 7.5385, + "loss/crossentropy": 2.203585295379162, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.20035494081676006, + "step": 5870 + }, + { + "epoch": 0.147, + "grad_norm": 30.125, + "grad_norm_var": 8.2541015625, + "learning_rate": 0.0001, + "loss": 7.3861, + "loss/crossentropy": 2.057890709489584, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.18680873457342387, + "step": 5880 + }, + { + "epoch": 0.14725, + "grad_norm": 30.25, + "grad_norm_var": 4.555989583333333, + "learning_rate": 0.0001, + "loss": 7.4283, + "loss/crossentropy": 2.049139867722988, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.1856512013822794, + "step": 5890 + }, + { + "epoch": 0.1475, + "grad_norm": 30.375, + "grad_norm_var": 10.570768229166667, + "learning_rate": 0.0001, + "loss": 7.4651, + "loss/crossentropy": 2.0553019613027574, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.18676785845309496, + "step": 5900 + }, + { + "epoch": 0.14775, + "grad_norm": 30.125, + "grad_norm_var": 14.59140625, + "learning_rate": 0.0001, + "loss": 7.4727, + "loss/crossentropy": 2.0098409935832025, + "loss/hidden": 3.48125, + "loss/jsd": 0.0, + "loss/logits": 0.20080684809945523, + "step": 5910 + }, + { + "epoch": 0.148, + "grad_norm": 28.625, + "grad_norm_var": 8.489322916666667, + "learning_rate": 0.0001, + "loss": 7.4203, + "loss/crossentropy": 2.2615666806697847, + "loss/hidden": 3.3, + "loss/jsd": 0.0, + "loss/logits": 0.18850413355976342, + "step": 5920 + }, + { + "epoch": 0.14825, + "grad_norm": 30.0, + "grad_norm_var": 3.6233723958333335, + "learning_rate": 0.0001, + "loss": 7.4441, + "loss/crossentropy": 2.178256964683533, + "loss/hidden": 3.29609375, + "loss/jsd": 0.0, + "loss/logits": 0.19248567353934048, + "step": 5930 + }, + { + "epoch": 0.1485, + "grad_norm": 31.0, + "grad_norm_var": 3.455208333333333, + "learning_rate": 0.0001, + "loss": 7.4573, + "loss/crossentropy": 2.246034747362137, + "loss/hidden": 3.45859375, + "loss/jsd": 0.0, + "loss/logits": 0.2096735591068864, + "step": 5940 + }, + { + "epoch": 0.14875, + "grad_norm": 30.625, + "grad_norm_var": 3.8494140625, + "learning_rate": 0.0001, + "loss": 7.4811, + "loss/crossentropy": 2.180899788439274, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.19460927378386259, + "step": 5950 + }, + { + "epoch": 0.149, + "grad_norm": 30.5, + "grad_norm_var": 3.4385416666666666, + "learning_rate": 0.0001, + "loss": 7.3829, + "loss/crossentropy": 2.258976912498474, + "loss/hidden": 3.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.19133044108748437, + "step": 5960 + }, + { + "epoch": 0.14925, + "grad_norm": 29.0, + "grad_norm_var": 13.80390625, + "learning_rate": 0.0001, + "loss": 7.5384, + "loss/crossentropy": 2.012222741544247, + "loss/hidden": 3.4640625, + "loss/jsd": 0.0, + "loss/logits": 0.19505210760980846, + "step": 5970 + }, + { + "epoch": 0.1495, + "grad_norm": 27.875, + "grad_norm_var": 12.9369140625, + "learning_rate": 0.0001, + "loss": 7.4033, + "loss/crossentropy": 2.0392286255955696, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.1930427584797144, + "step": 5980 + }, + { + "epoch": 0.14975, + "grad_norm": 28.25, + "grad_norm_var": 18.174739583333334, + "learning_rate": 0.0001, + "loss": 7.399, + "loss/crossentropy": 1.9029529005289079, + "loss/hidden": 3.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.198425155505538, + "step": 5990 + }, + { + "epoch": 0.15, + "grad_norm": 31.875, + "grad_norm_var": 3.0737770860662226e+18, + "learning_rate": 0.0001, + "loss": 7.4994, + "loss/crossentropy": 1.8985859856009484, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.1951824951916933, + "step": 6000 + }, + { + "epoch": 0.15025, + "grad_norm": 36.0, + "grad_norm_var": 3.073777086665239e+18, + "learning_rate": 0.0001, + "loss": 7.4659, + "loss/crossentropy": 2.097201645374298, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.18253911342471837, + "step": 6010 + }, + { + "epoch": 0.1505, + "grad_norm": 27.875, + "grad_norm_var": 6.801041666666666, + "learning_rate": 0.0001, + "loss": 7.2415, + "loss/crossentropy": 2.0210610911250115, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.197306059114635, + "step": 6020 + }, + { + "epoch": 0.15075, + "grad_norm": 31.0, + "grad_norm_var": 14.46640625, + "learning_rate": 0.0001, + "loss": 7.4519, + "loss/crossentropy": 2.1985476523637772, + "loss/hidden": 3.475, + "loss/jsd": 0.0, + "loss/logits": 0.20262509360909461, + "step": 6030 + }, + { + "epoch": 0.151, + "grad_norm": 30.625, + "grad_norm_var": 6.254622395833334, + "learning_rate": 0.0001, + "loss": 7.3353, + "loss/crossentropy": 2.0093181416392327, + "loss/hidden": 3.316015625, + "loss/jsd": 0.0, + "loss/logits": 0.17620250331237913, + "step": 6040 + }, + { + "epoch": 0.15125, + "grad_norm": 31.625, + "grad_norm_var": 56.4291015625, + "learning_rate": 0.0001, + "loss": 7.4034, + "loss/crossentropy": 2.177773226797581, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.20441538300365208, + "step": 6050 + }, + { + "epoch": 0.1515, + "grad_norm": 26.75, + "grad_norm_var": 55.889322916666664, + "learning_rate": 0.0001, + "loss": 7.3245, + "loss/crossentropy": 2.1666259437799456, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.19311951845884323, + "step": 6060 + }, + { + "epoch": 0.15175, + "grad_norm": 30.25, + "grad_norm_var": 91.0103515625, + "learning_rate": 0.0001, + "loss": 7.368, + "loss/crossentropy": 2.063462796807289, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.1834208857268095, + "step": 6070 + }, + { + "epoch": 0.152, + "grad_norm": 30.5, + "grad_norm_var": 18.1212890625, + "learning_rate": 0.0001, + "loss": 7.4335, + "loss/crossentropy": 1.9907098844647408, + "loss/hidden": 3.53671875, + "loss/jsd": 0.0, + "loss/logits": 0.20707368329167367, + "step": 6080 + }, + { + "epoch": 0.15225, + "grad_norm": 38.25, + "grad_norm_var": 11.470247395833333, + "learning_rate": 0.0001, + "loss": 7.3789, + "loss/crossentropy": 2.083692157268524, + "loss/hidden": 3.465625, + "loss/jsd": 0.0, + "loss/logits": 0.1846569798886776, + "step": 6090 + }, + { + "epoch": 0.1525, + "grad_norm": 28.25, + "grad_norm_var": 21.829622395833333, + "learning_rate": 0.0001, + "loss": 7.3767, + "loss/crossentropy": 2.1113929279148578, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.1990992769598961, + "step": 6100 + }, + { + "epoch": 0.15275, + "grad_norm": 33.75, + "grad_norm_var": 20.676497395833334, + "learning_rate": 0.0001, + "loss": 7.339, + "loss/crossentropy": 2.1296695113182067, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.1936045665293932, + "step": 6110 + }, + { + "epoch": 0.153, + "grad_norm": 36.25, + "grad_norm_var": 7.8166015625, + "learning_rate": 0.0001, + "loss": 7.5027, + "loss/crossentropy": 2.1011226207017897, + "loss/hidden": 3.471875, + "loss/jsd": 0.0, + "loss/logits": 0.20695240292698144, + "step": 6120 + }, + { + "epoch": 0.15325, + "grad_norm": 30.625, + "grad_norm_var": 2.1304840750224113e+18, + "learning_rate": 0.0001, + "loss": 7.506, + "loss/crossentropy": 2.2427969723939896, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1966634625568986, + "step": 6130 + }, + { + "epoch": 0.1535, + "grad_norm": 34.25, + "grad_norm_var": 36.542643229166664, + "learning_rate": 0.0001, + "loss": 7.4413, + "loss/crossentropy": 2.0855264641344546, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.19436217453330756, + "step": 6140 + }, + { + "epoch": 0.15375, + "grad_norm": 32.75, + "grad_norm_var": 10.153580729166666, + "learning_rate": 0.0001, + "loss": 7.3096, + "loss/crossentropy": 2.0322439685463904, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.1753252800554037, + "step": 6150 + }, + { + "epoch": 0.154, + "grad_norm": 31.0, + "grad_norm_var": 10.216080729166666, + "learning_rate": 0.0001, + "loss": 7.2481, + "loss/crossentropy": 2.074477408081293, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.17578690703958272, + "step": 6160 + }, + { + "epoch": 0.15425, + "grad_norm": 33.25, + "grad_norm_var": 28.79765625, + "learning_rate": 0.0001, + "loss": 7.4403, + "loss/crossentropy": 2.0863804474473, + "loss/hidden": 3.431640625, + "loss/jsd": 0.0, + "loss/logits": 0.20699662044644357, + "step": 6170 + }, + { + "epoch": 0.1545, + "grad_norm": 33.5, + "grad_norm_var": 24.84375, + "learning_rate": 0.0001, + "loss": 7.4609, + "loss/crossentropy": 2.0696858704090118, + "loss/hidden": 3.458984375, + "loss/jsd": 0.0, + "loss/logits": 0.2128069180995226, + "step": 6180 + }, + { + "epoch": 0.15475, + "grad_norm": 31.375, + "grad_norm_var": 5.099739583333333, + "learning_rate": 0.0001, + "loss": 7.2575, + "loss/crossentropy": 2.182169410586357, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.181897877715528, + "step": 6190 + }, + { + "epoch": 0.155, + "grad_norm": 34.25, + "grad_norm_var": 4.699934895833334, + "learning_rate": 0.0001, + "loss": 7.4975, + "loss/crossentropy": 2.165008749067783, + "loss/hidden": 3.492578125, + "loss/jsd": 0.0, + "loss/logits": 0.19800901636481286, + "step": 6200 + }, + { + "epoch": 0.15525, + "grad_norm": 33.75, + "grad_norm_var": 2.7738932291666667, + "learning_rate": 0.0001, + "loss": 7.3992, + "loss/crossentropy": 2.0653695166110992, + "loss/hidden": 3.406640625, + "loss/jsd": 0.0, + "loss/logits": 0.19180236533284187, + "step": 6210 + }, + { + "epoch": 0.1555, + "grad_norm": 35.25, + "grad_norm_var": 5.31015625, + "learning_rate": 0.0001, + "loss": 7.3638, + "loss/crossentropy": 2.1244568385183813, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.18238217020407319, + "step": 6220 + }, + { + "epoch": 0.15575, + "grad_norm": 32.25, + "grad_norm_var": 3.2817057291666667, + "learning_rate": 0.0001, + "loss": 7.5073, + "loss/crossentropy": 2.1881898671388624, + "loss/hidden": 3.498828125, + "loss/jsd": 0.0, + "loss/logits": 0.2045454490929842, + "step": 6230 + }, + { + "epoch": 0.156, + "grad_norm": 28.25, + "grad_norm_var": 2.6572265625, + "learning_rate": 0.0001, + "loss": 7.4609, + "loss/crossentropy": 2.14600064009428, + "loss/hidden": 3.4265625, + "loss/jsd": 0.0, + "loss/logits": 0.18945380430668593, + "step": 6240 + }, + { + "epoch": 0.15625, + "grad_norm": 30.75, + "grad_norm_var": 35.73118489583333, + "learning_rate": 0.0001, + "loss": 7.3786, + "loss/crossentropy": 2.168429624289274, + "loss/hidden": 3.296484375, + "loss/jsd": 0.0, + "loss/logits": 0.18439108245074748, + "step": 6250 + }, + { + "epoch": 0.1565, + "grad_norm": 52.5, + "grad_norm_var": 64.9962890625, + "learning_rate": 0.0001, + "loss": 7.3511, + "loss/crossentropy": 2.1293379329144955, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.1828605517745018, + "step": 6260 + }, + { + "epoch": 0.15675, + "grad_norm": 29.25, + "grad_norm_var": 59.703125, + "learning_rate": 0.0001, + "loss": 7.3978, + "loss/crossentropy": 1.8641120925545693, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.18577109538018705, + "step": 6270 + }, + { + "epoch": 0.157, + "grad_norm": 28.75, + "grad_norm_var": 32.1994140625, + "learning_rate": 0.0001, + "loss": 7.4066, + "loss/crossentropy": 2.0997040398418902, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.18688563201576472, + "step": 6280 + }, + { + "epoch": 0.15725, + "grad_norm": 31.375, + "grad_norm_var": 17.302018229166666, + "learning_rate": 0.0001, + "loss": 7.4043, + "loss/crossentropy": 1.9712626039981842, + "loss/hidden": 3.45078125, + "loss/jsd": 0.0, + "loss/logits": 0.20054549565538765, + "step": 6290 + }, + { + "epoch": 0.1575, + "grad_norm": 31.5, + "grad_norm_var": 17.8431640625, + "learning_rate": 0.0001, + "loss": 7.4502, + "loss/crossentropy": 2.0252815186977386, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.18488127905875446, + "step": 6300 + }, + { + "epoch": 0.15775, + "grad_norm": 30.75, + "grad_norm_var": 7.995572916666666, + "learning_rate": 0.0001, + "loss": 7.3829, + "loss/crossentropy": 2.030302118510008, + "loss/hidden": 3.400390625, + "loss/jsd": 0.0, + "loss/logits": 0.19101340658962726, + "step": 6310 + }, + { + "epoch": 0.158, + "grad_norm": 30.125, + "grad_norm_var": 5.805143229166666, + "learning_rate": 0.0001, + "loss": 7.3852, + "loss/crossentropy": 1.9795936658978461, + "loss/hidden": 3.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.19110800279304385, + "step": 6320 + }, + { + "epoch": 0.15825, + "grad_norm": 34.0, + "grad_norm_var": 6.91640625, + "learning_rate": 0.0001, + "loss": 7.4417, + "loss/crossentropy": 2.0620448149740698, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.194018579646945, + "step": 6330 + }, + { + "epoch": 0.1585, + "grad_norm": 28.125, + "grad_norm_var": 31.058268229166668, + "learning_rate": 0.0001, + "loss": 7.4142, + "loss/crossentropy": 2.012200343608856, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.21260247621685266, + "step": 6340 + }, + { + "epoch": 0.15875, + "grad_norm": 36.75, + "grad_norm_var": 35.18118489583333, + "learning_rate": 0.0001, + "loss": 7.3776, + "loss/crossentropy": 1.9757203698158263, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.200297892652452, + "step": 6350 + }, + { + "epoch": 0.159, + "grad_norm": 31.125, + "grad_norm_var": 17.764583333333334, + "learning_rate": 0.0001, + "loss": 7.4762, + "loss/crossentropy": 2.195678301155567, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.1989523505792022, + "step": 6360 + }, + { + "epoch": 0.15925, + "grad_norm": 29.625, + "grad_norm_var": 12.851041666666667, + "learning_rate": 0.0001, + "loss": 7.4661, + "loss/crossentropy": 2.0537394002079963, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.20311654023826123, + "step": 6370 + }, + { + "epoch": 0.1595, + "grad_norm": 30.0, + "grad_norm_var": 10.0994140625, + "learning_rate": 0.0001, + "loss": 7.2759, + "loss/crossentropy": 2.02838040292263, + "loss/hidden": 3.45859375, + "loss/jsd": 0.0, + "loss/logits": 0.19148585237562657, + "step": 6380 + }, + { + "epoch": 0.15975, + "grad_norm": 39.0, + "grad_norm_var": 2324.6707682291667, + "learning_rate": 0.0001, + "loss": 7.3973, + "loss/crossentropy": 2.0951177358627318, + "loss/hidden": 3.430859375, + "loss/jsd": 0.0, + "loss/logits": 0.2158473737537861, + "step": 6390 + }, + { + "epoch": 0.16, + "grad_norm": 40.0, + "grad_norm_var": 21.121809895833334, + "learning_rate": 0.0001, + "loss": 7.2877, + "loss/crossentropy": 1.878954614698887, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.18514612764120103, + "step": 6400 + }, + { + "epoch": 0.16025, + "grad_norm": 33.5, + "grad_norm_var": 23.1666015625, + "learning_rate": 0.0001, + "loss": 7.3598, + "loss/crossentropy": 2.123918867111206, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.18728599287569522, + "step": 6410 + }, + { + "epoch": 0.1605, + "grad_norm": 29.25, + "grad_norm_var": 11.230143229166666, + "learning_rate": 0.0001, + "loss": 7.476, + "loss/crossentropy": 2.168968527019024, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.20153266489505767, + "step": 6420 + }, + { + "epoch": 0.16075, + "grad_norm": 30.0, + "grad_norm_var": 90.8056640625, + "learning_rate": 0.0001, + "loss": 7.4289, + "loss/crossentropy": 2.0426762118935584, + "loss/hidden": 3.367578125, + "loss/jsd": 0.0, + "loss/logits": 0.19033107869327068, + "step": 6430 + }, + { + "epoch": 0.161, + "grad_norm": 38.25, + "grad_norm_var": 15.570247395833333, + "learning_rate": 0.0001, + "loss": 7.3992, + "loss/crossentropy": 2.0535445332527162, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.19420330366119742, + "step": 6440 + }, + { + "epoch": 0.16125, + "grad_norm": 45.5, + "grad_norm_var": 30.326822916666668, + "learning_rate": 0.0001, + "loss": 7.3881, + "loss/crossentropy": 1.949498599767685, + "loss/hidden": 3.337890625, + "loss/jsd": 0.0, + "loss/logits": 0.18119702748954297, + "step": 6450 + }, + { + "epoch": 0.1615, + "grad_norm": 30.625, + "grad_norm_var": 96.64837239583333, + "learning_rate": 0.0001, + "loss": 7.432, + "loss/crossentropy": 2.2534308552742006, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.20010371711105107, + "step": 6460 + }, + { + "epoch": 0.16175, + "grad_norm": 34.0, + "grad_norm_var": 82.66015625, + "learning_rate": 0.0001, + "loss": 7.3555, + "loss/crossentropy": 2.114310759305954, + "loss/hidden": 3.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.19822147954255342, + "step": 6470 + }, + { + "epoch": 0.162, + "grad_norm": 29.875, + "grad_norm_var": 8.4541015625, + "learning_rate": 0.0001, + "loss": 7.1893, + "loss/crossentropy": 2.062894639372826, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.17771479729562997, + "step": 6480 + }, + { + "epoch": 0.16225, + "grad_norm": 30.375, + "grad_norm_var": 15.72265625, + "learning_rate": 0.0001, + "loss": 7.3665, + "loss/crossentropy": 2.0109994761645793, + "loss/hidden": 3.45625, + "loss/jsd": 0.0, + "loss/logits": 0.19815738410688938, + "step": 6490 + }, + { + "epoch": 0.1625, + "grad_norm": 33.75, + "grad_norm_var": 162.74680989583334, + "learning_rate": 0.0001, + "loss": 7.4105, + "loss/crossentropy": 2.100720961391926, + "loss/hidden": 3.26484375, + "loss/jsd": 0.0, + "loss/logits": 0.18080853056162596, + "step": 6500 + }, + { + "epoch": 0.16275, + "grad_norm": 36.0, + "grad_norm_var": 10.530143229166667, + "learning_rate": 0.0001, + "loss": 7.4993, + "loss/crossentropy": 2.208073277771473, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.18481182418763636, + "step": 6510 + }, + { + "epoch": 0.163, + "grad_norm": 37.0, + "grad_norm_var": 8.981705729166666, + "learning_rate": 0.0001, + "loss": 7.5196, + "loss/crossentropy": 2.2666310742497444, + "loss/hidden": 3.40703125, + "loss/jsd": 0.0, + "loss/logits": 0.20527655016630889, + "step": 6520 + }, + { + "epoch": 0.16325, + "grad_norm": 29.625, + "grad_norm_var": 13.959830729166667, + "learning_rate": 0.0001, + "loss": 7.4239, + "loss/crossentropy": 2.2184250116348267, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.19118925426155328, + "step": 6530 + }, + { + "epoch": 0.1635, + "grad_norm": 29.375, + "grad_norm_var": 8.820833333333333, + "learning_rate": 0.0001, + "loss": 7.3298, + "loss/crossentropy": 2.119840921461582, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.20046985391527414, + "step": 6540 + }, + { + "epoch": 0.16375, + "grad_norm": 31.375, + "grad_norm_var": 3.1910807291666665, + "learning_rate": 0.0001, + "loss": 7.4507, + "loss/crossentropy": 2.109931045770645, + "loss/hidden": 3.387890625, + "loss/jsd": 0.0, + "loss/logits": 0.18326662238687277, + "step": 6550 + }, + { + "epoch": 0.164, + "grad_norm": 35.25, + "grad_norm_var": 8.6625, + "learning_rate": 0.0001, + "loss": 7.43, + "loss/crossentropy": 2.090756069123745, + "loss/hidden": 3.348046875, + "loss/jsd": 0.0, + "loss/logits": 0.1922204466536641, + "step": 6560 + }, + { + "epoch": 0.16425, + "grad_norm": 30.625, + "grad_norm_var": 14.567643229166666, + "learning_rate": 0.0001, + "loss": 7.1796, + "loss/crossentropy": 1.9266313910484314, + "loss/hidden": 3.465234375, + "loss/jsd": 0.0, + "loss/logits": 0.20681370329111814, + "step": 6570 + }, + { + "epoch": 0.1645, + "grad_norm": 37.5, + "grad_norm_var": 12.9447265625, + "learning_rate": 0.0001, + "loss": 7.5097, + "loss/crossentropy": 1.875116103887558, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.2045787101611495, + "step": 6580 + }, + { + "epoch": 0.16475, + "grad_norm": 29.875, + "grad_norm_var": 5.706184895833333, + "learning_rate": 0.0001, + "loss": 7.329, + "loss/crossentropy": 2.116366655379534, + "loss/hidden": 3.4546875, + "loss/jsd": 0.0, + "loss/logits": 0.18577212654054165, + "step": 6590 + }, + { + "epoch": 0.165, + "grad_norm": 33.75, + "grad_norm_var": 2.3650390625, + "learning_rate": 0.0001, + "loss": 7.3765, + "loss/crossentropy": 2.0037689693272114, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.1971780034713447, + "step": 6600 + }, + { + "epoch": 0.16525, + "grad_norm": 29.625, + "grad_norm_var": 4.601497395833333, + "learning_rate": 0.0001, + "loss": 7.4001, + "loss/crossentropy": 2.1523181863129137, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.192273567058146, + "step": 6610 + }, + { + "epoch": 0.1655, + "grad_norm": 29.875, + "grad_norm_var": 7.6525390625, + "learning_rate": 0.0001, + "loss": 7.3804, + "loss/crossentropy": 2.0919234342873096, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.20757663380354643, + "step": 6620 + }, + { + "epoch": 0.16575, + "grad_norm": 31.875, + "grad_norm_var": 6.917708333333334, + "learning_rate": 0.0001, + "loss": 7.3889, + "loss/crossentropy": 2.035097151994705, + "loss/hidden": 3.31484375, + "loss/jsd": 0.0, + "loss/logits": 0.19357213731855155, + "step": 6630 + }, + { + "epoch": 0.166, + "grad_norm": 28.125, + "grad_norm_var": 2.4607245906905574e+18, + "learning_rate": 0.0001, + "loss": 7.5149, + "loss/crossentropy": 2.114539227634668, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.19631449952721597, + "step": 6640 + }, + { + "epoch": 0.16625, + "grad_norm": 28.375, + "grad_norm_var": 2.4607245908931773e+18, + "learning_rate": 0.0001, + "loss": 7.3157, + "loss/crossentropy": 2.0170676171779633, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.1893145913258195, + "step": 6650 + }, + { + "epoch": 0.1665, + "grad_norm": 28.0, + "grad_norm_var": 32.25807291666667, + "learning_rate": 0.0001, + "loss": 7.2947, + "loss/crossentropy": 1.9412188947200775, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.18349691890180111, + "step": 6660 + }, + { + "epoch": 0.16675, + "grad_norm": 30.25, + "grad_norm_var": 48.2375, + "learning_rate": 0.0001, + "loss": 7.4086, + "loss/crossentropy": 2.1157006829977036, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.1936411712318659, + "step": 6670 + }, + { + "epoch": 0.167, + "grad_norm": 29.75, + "grad_norm_var": 36.18333333333333, + "learning_rate": 0.0001, + "loss": 7.4096, + "loss/crossentropy": 1.9384170174598694, + "loss/hidden": 3.46015625, + "loss/jsd": 0.0, + "loss/logits": 0.1831628430634737, + "step": 6680 + }, + { + "epoch": 0.16725, + "grad_norm": 30.0, + "grad_norm_var": 62.6150390625, + "learning_rate": 0.0001, + "loss": 7.4604, + "loss/crossentropy": 2.152873657643795, + "loss/hidden": 3.421484375, + "loss/jsd": 0.0, + "loss/logits": 0.20049556214362382, + "step": 6690 + }, + { + "epoch": 0.1675, + "grad_norm": 29.75, + "grad_norm_var": 28.671875, + "learning_rate": 0.0001, + "loss": 7.5739, + "loss/crossentropy": 2.1935679107904433, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.20188184324651956, + "step": 6700 + }, + { + "epoch": 0.16775, + "grad_norm": 28.25, + "grad_norm_var": 2.278580729166667, + "learning_rate": 0.0001, + "loss": 7.4709, + "loss/crossentropy": 2.062030902504921, + "loss/hidden": 3.4328125, + "loss/jsd": 0.0, + "loss/logits": 0.18874304387718438, + "step": 6710 + }, + { + "epoch": 0.168, + "grad_norm": 29.125, + "grad_norm_var": 3.3150390625, + "learning_rate": 0.0001, + "loss": 7.3655, + "loss/crossentropy": 1.999978879839182, + "loss/hidden": 3.3875, + "loss/jsd": 0.0, + "loss/logits": 0.18526637642644345, + "step": 6720 + }, + { + "epoch": 0.16825, + "grad_norm": 35.25, + "grad_norm_var": 6.237239583333333, + "learning_rate": 0.0001, + "loss": 7.4037, + "loss/crossentropy": 2.0561595499515533, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.20046296287328005, + "step": 6730 + }, + { + "epoch": 0.1685, + "grad_norm": 28.75, + "grad_norm_var": 5.5619140625, + "learning_rate": 0.0001, + "loss": 7.368, + "loss/crossentropy": 2.0664093092083933, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.1885912848636508, + "step": 6740 + }, + { + "epoch": 0.16875, + "grad_norm": 30.875, + "grad_norm_var": 5.843489583333334, + "learning_rate": 0.0001, + "loss": 7.4899, + "loss/crossentropy": 2.1205774366855623, + "loss/hidden": 3.491796875, + "loss/jsd": 0.0, + "loss/logits": 0.21412673257291318, + "step": 6750 + }, + { + "epoch": 0.169, + "grad_norm": 32.0, + "grad_norm_var": 6.3775390625, + "learning_rate": 0.0001, + "loss": 7.3708, + "loss/crossentropy": 2.0314668610692026, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.18550706487149, + "step": 6760 + }, + { + "epoch": 0.16925, + "grad_norm": 32.5, + "grad_norm_var": 7.378059895833333, + "learning_rate": 0.0001, + "loss": 7.4055, + "loss/crossentropy": 2.1428691864013674, + "loss/hidden": 3.506640625, + "loss/jsd": 0.0, + "loss/logits": 0.21024896781891583, + "step": 6770 + }, + { + "epoch": 0.1695, + "grad_norm": 31.375, + "grad_norm_var": 3.9791666666666665, + "learning_rate": 0.0001, + "loss": 7.4444, + "loss/crossentropy": 1.9500044576823712, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.19028451843187213, + "step": 6780 + }, + { + "epoch": 0.16975, + "grad_norm": 29.75, + "grad_norm_var": 3.2853515625, + "learning_rate": 0.0001, + "loss": 7.4369, + "loss/crossentropy": 2.1563921123743057, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.18290520180016756, + "step": 6790 + }, + { + "epoch": 0.17, + "grad_norm": 38.0, + "grad_norm_var": 8.0947265625, + "learning_rate": 0.0001, + "loss": 7.3756, + "loss/crossentropy": 2.133736363053322, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.1851572971791029, + "step": 6800 + }, + { + "epoch": 0.17025, + "grad_norm": 34.75, + "grad_norm_var": 14.757747395833333, + "learning_rate": 0.0001, + "loss": 7.2181, + "loss/crossentropy": 2.054655596613884, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.18349520340561867, + "step": 6810 + }, + { + "epoch": 0.1705, + "grad_norm": 30.375, + "grad_norm_var": 5.17890625, + "learning_rate": 0.0001, + "loss": 7.303, + "loss/crossentropy": 2.024763736128807, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.19044207576662303, + "step": 6820 + }, + { + "epoch": 0.17075, + "grad_norm": 32.25, + "grad_norm_var": 3.7955729166666665, + "learning_rate": 0.0001, + "loss": 7.4884, + "loss/crossentropy": 1.9924081854522229, + "loss/hidden": 3.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.20656490996479987, + "step": 6830 + }, + { + "epoch": 0.171, + "grad_norm": 30.875, + "grad_norm_var": 3.215559895833333, + "learning_rate": 0.0001, + "loss": 7.4845, + "loss/crossentropy": 2.1104799427092074, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.18316805781796575, + "step": 6840 + }, + { + "epoch": 0.17125, + "grad_norm": 31.625, + "grad_norm_var": 25.610872395833333, + "learning_rate": 0.0001, + "loss": 7.3771, + "loss/crossentropy": 2.02793128117919, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.17605492258444427, + "step": 6850 + }, + { + "epoch": 0.1715, + "grad_norm": 29.375, + "grad_norm_var": 34.992122395833334, + "learning_rate": 0.0001, + "loss": 7.5157, + "loss/crossentropy": 2.1305345237255096, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.19113806802779437, + "step": 6860 + }, + { + "epoch": 0.17175, + "grad_norm": 28.375, + "grad_norm_var": 12.389322916666666, + "learning_rate": 0.0001, + "loss": 7.2752, + "loss/crossentropy": 2.0788576349616052, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.19617441901937127, + "step": 6870 + }, + { + "epoch": 0.172, + "grad_norm": 27.625, + "grad_norm_var": 17.758072916666666, + "learning_rate": 0.0001, + "loss": 7.4366, + "loss/crossentropy": 2.068412736058235, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.18466003462672234, + "step": 6880 + }, + { + "epoch": 0.17225, + "grad_norm": 30.375, + "grad_norm_var": 19.836393229166667, + "learning_rate": 0.0001, + "loss": 7.5047, + "loss/crossentropy": 2.07881121635437, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.19641269743442535, + "step": 6890 + }, + { + "epoch": 0.1725, + "grad_norm": 29.625, + "grad_norm_var": 14.70625, + "learning_rate": 0.0001, + "loss": 7.4086, + "loss/crossentropy": 1.84702168405056, + "loss/hidden": 3.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.1932983512058854, + "step": 6900 + }, + { + "epoch": 0.17275, + "grad_norm": 31.875, + "grad_norm_var": 31.006705729166665, + "learning_rate": 0.0001, + "loss": 7.3715, + "loss/crossentropy": 2.099086304008961, + "loss/hidden": 3.406640625, + "loss/jsd": 0.0, + "loss/logits": 0.20250021573156118, + "step": 6910 + }, + { + "epoch": 0.173, + "grad_norm": 29.0, + "grad_norm_var": 26.07265625, + "learning_rate": 0.0001, + "loss": 7.3343, + "loss/crossentropy": 2.0957569405436516, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.2010388659313321, + "step": 6920 + }, + { + "epoch": 0.17325, + "grad_norm": 33.0, + "grad_norm_var": 3.753059895833333, + "learning_rate": 0.0001, + "loss": 7.4937, + "loss/crossentropy": 2.071177572757006, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.19133195597678423, + "step": 6930 + }, + { + "epoch": 0.1735, + "grad_norm": 31.125, + "grad_norm_var": 5.88515625, + "learning_rate": 0.0001, + "loss": 7.4757, + "loss/crossentropy": 2.0803056344389916, + "loss/hidden": 3.51875, + "loss/jsd": 0.0, + "loss/logits": 0.20995833892375232, + "step": 6940 + }, + { + "epoch": 0.17375, + "grad_norm": 28.5, + "grad_norm_var": 7.328059895833333, + "learning_rate": 0.0001, + "loss": 7.2293, + "loss/crossentropy": 1.9285863403230905, + "loss/hidden": 3.283203125, + "loss/jsd": 0.0, + "loss/logits": 0.1665677004493773, + "step": 6950 + }, + { + "epoch": 0.174, + "grad_norm": 27.375, + "grad_norm_var": 11.585872395833333, + "learning_rate": 0.0001, + "loss": 7.313, + "loss/crossentropy": 2.0258478805422784, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.1821833540685475, + "step": 6960 + }, + { + "epoch": 0.17425, + "grad_norm": 30.5, + "grad_norm_var": 10.760416666666666, + "learning_rate": 0.0001, + "loss": 7.5413, + "loss/crossentropy": 2.1308654129505156, + "loss/hidden": 3.43203125, + "loss/jsd": 0.0, + "loss/logits": 0.21524183861911297, + "step": 6970 + }, + { + "epoch": 0.1745, + "grad_norm": 29.625, + "grad_norm_var": 4.138541666666667, + "learning_rate": 0.0001, + "loss": 7.4692, + "loss/crossentropy": 2.1182237058877944, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.18946228343993426, + "step": 6980 + }, + { + "epoch": 0.17475, + "grad_norm": 31.0, + "grad_norm_var": 5.499934895833333, + "learning_rate": 0.0001, + "loss": 7.4658, + "loss/crossentropy": 2.0863646306097507, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.2054815970361233, + "step": 6990 + }, + { + "epoch": 0.175, + "grad_norm": 31.375, + "grad_norm_var": 1.7067057291666667, + "learning_rate": 0.0001, + "loss": 7.3196, + "loss/crossentropy": 2.1002516582608224, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.19273097421973945, + "step": 7000 + }, + { + "epoch": 0.17525, + "grad_norm": 31.25, + "grad_norm_var": 1.7999348958333334, + "learning_rate": 0.0001, + "loss": 7.3643, + "loss/crossentropy": 2.015849883854389, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.18935495987534523, + "step": 7010 + }, + { + "epoch": 0.1755, + "grad_norm": 28.875, + "grad_norm_var": 3.3645833333333335, + "learning_rate": 0.0001, + "loss": 7.3973, + "loss/crossentropy": 2.072261115908623, + "loss/hidden": 3.482421875, + "loss/jsd": 0.0, + "loss/logits": 0.19815812185406684, + "step": 7020 + }, + { + "epoch": 0.17575, + "grad_norm": 29.375, + "grad_norm_var": 8.442708333333334, + "learning_rate": 0.0001, + "loss": 7.4193, + "loss/crossentropy": 2.1367180705070496, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.1970324844121933, + "step": 7030 + }, + { + "epoch": 0.176, + "grad_norm": 30.375, + "grad_norm_var": 5.426822916666667, + "learning_rate": 0.0001, + "loss": 7.518, + "loss/crossentropy": 2.210773140192032, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.1955376474186778, + "step": 7040 + }, + { + "epoch": 0.17625, + "grad_norm": 30.125, + "grad_norm_var": 3.1791015625, + "learning_rate": 0.0001, + "loss": 7.3883, + "loss/crossentropy": 2.1343745410442354, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.19158909022808074, + "step": 7050 + }, + { + "epoch": 0.1765, + "grad_norm": 29.25, + "grad_norm_var": 4.112434895833333, + "learning_rate": 0.0001, + "loss": 7.4451, + "loss/crossentropy": 1.9646480686962604, + "loss/hidden": 3.465234375, + "loss/jsd": 0.0, + "loss/logits": 0.19925388041883707, + "step": 7060 + }, + { + "epoch": 0.17675, + "grad_norm": 31.5, + "grad_norm_var": 3.3275390625, + "learning_rate": 0.0001, + "loss": 7.4874, + "loss/crossentropy": 2.1882633604109287, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.19836742132902146, + "step": 7070 + }, + { + "epoch": 0.177, + "grad_norm": 34.75, + "grad_norm_var": 5.7291015625, + "learning_rate": 0.0001, + "loss": 7.4907, + "loss/crossentropy": 2.2362487465143204, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.19153916742652655, + "step": 7080 + }, + { + "epoch": 0.17725, + "grad_norm": 29.5, + "grad_norm_var": 5.002083333333333, + "learning_rate": 0.0001, + "loss": 7.4459, + "loss/crossentropy": 2.164398466050625, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.20325577780604362, + "step": 7090 + }, + { + "epoch": 0.1775, + "grad_norm": 32.25, + "grad_norm_var": 21.282747395833333, + "learning_rate": 0.0001, + "loss": 7.4933, + "loss/crossentropy": 2.1975876331329345, + "loss/hidden": 3.455078125, + "loss/jsd": 0.0, + "loss/logits": 0.19638306740671396, + "step": 7100 + }, + { + "epoch": 0.17775, + "grad_norm": 28.5, + "grad_norm_var": 32.18020833333333, + "learning_rate": 0.0001, + "loss": 7.358, + "loss/crossentropy": 1.9753169894218445, + "loss/hidden": 3.416015625, + "loss/jsd": 0.0, + "loss/logits": 0.17774544414132834, + "step": 7110 + }, + { + "epoch": 0.178, + "grad_norm": 30.0, + "grad_norm_var": 11.808333333333334, + "learning_rate": 0.0001, + "loss": 7.4718, + "loss/crossentropy": 2.0591190218925477, + "loss/hidden": 3.4578125, + "loss/jsd": 0.0, + "loss/logits": 0.19963474106043577, + "step": 7120 + }, + { + "epoch": 0.17825, + "grad_norm": 29.25, + "grad_norm_var": 7.753580729166667, + "learning_rate": 0.0001, + "loss": 7.3979, + "loss/crossentropy": 2.062809920310974, + "loss/hidden": 3.437109375, + "loss/jsd": 0.0, + "loss/logits": 0.19096513669937848, + "step": 7130 + }, + { + "epoch": 0.1785, + "grad_norm": 30.125, + "grad_norm_var": 6.6025390625, + "learning_rate": 0.0001, + "loss": 7.3036, + "loss/crossentropy": 2.0240534149110316, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.18496394343674183, + "step": 7140 + }, + { + "epoch": 0.17875, + "grad_norm": 31.375, + "grad_norm_var": 2.4942057291666666, + "learning_rate": 0.0001, + "loss": 7.4778, + "loss/crossentropy": 2.124583348631859, + "loss/hidden": 3.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.2003519142046571, + "step": 7150 + }, + { + "epoch": 0.179, + "grad_norm": 29.0, + "grad_norm_var": 13.8978515625, + "learning_rate": 0.0001, + "loss": 7.3866, + "loss/crossentropy": 2.035899819433689, + "loss/hidden": 3.41484375, + "loss/jsd": 0.0, + "loss/logits": 0.18300745636224747, + "step": 7160 + }, + { + "epoch": 0.17925, + "grad_norm": 28.125, + "grad_norm_var": 18.375455729166667, + "learning_rate": 0.0001, + "loss": 7.3499, + "loss/crossentropy": 2.086082286387682, + "loss/hidden": 3.26875, + "loss/jsd": 0.0, + "loss/logits": 0.17294995756819845, + "step": 7170 + }, + { + "epoch": 0.1795, + "grad_norm": 28.5, + "grad_norm_var": 22.834830729166665, + "learning_rate": 0.0001, + "loss": 7.4265, + "loss/crossentropy": 2.0105697728693483, + "loss/hidden": 3.393359375, + "loss/jsd": 0.0, + "loss/logits": 0.19119318593293427, + "step": 7180 + }, + { + "epoch": 0.17975, + "grad_norm": 33.0, + "grad_norm_var": 19.51640625, + "learning_rate": 0.0001, + "loss": 7.4722, + "loss/crossentropy": 2.1506593719124796, + "loss/hidden": 3.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.19607089888304471, + "step": 7190 + }, + { + "epoch": 0.18, + "grad_norm": 31.5, + "grad_norm_var": 9.854622395833333, + "learning_rate": 0.0001, + "loss": 7.3205, + "loss/crossentropy": 2.0767677523195744, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.19086614530533552, + "step": 7200 + }, + { + "epoch": 0.18025, + "grad_norm": 27.625, + "grad_norm_var": 8.885416666666666, + "learning_rate": 0.0001, + "loss": 7.2908, + "loss/crossentropy": 2.230179136991501, + "loss/hidden": 3.26953125, + "loss/jsd": 0.0, + "loss/logits": 0.18500201255083085, + "step": 7210 + }, + { + "epoch": 0.1805, + "grad_norm": 38.25, + "grad_norm_var": 8.60625, + "learning_rate": 0.0001, + "loss": 7.4664, + "loss/crossentropy": 2.192136238515377, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.1965922711417079, + "step": 7220 + }, + { + "epoch": 0.18075, + "grad_norm": 33.0, + "grad_norm_var": 7.258333333333334, + "learning_rate": 0.0001, + "loss": 7.3796, + "loss/crossentropy": 2.010923378914595, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.19027266185730696, + "step": 7230 + }, + { + "epoch": 0.181, + "grad_norm": 39.75, + "grad_norm_var": 11.3353515625, + "learning_rate": 0.0001, + "loss": 7.5537, + "loss/crossentropy": 2.055556283891201, + "loss/hidden": 3.47421875, + "loss/jsd": 0.0, + "loss/logits": 0.19257053220644593, + "step": 7240 + }, + { + "epoch": 0.18125, + "grad_norm": 27.5, + "grad_norm_var": 14.424739583333333, + "learning_rate": 0.0001, + "loss": 7.4216, + "loss/crossentropy": 2.136477355659008, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.20195687096565962, + "step": 7250 + }, + { + "epoch": 0.1815, + "grad_norm": 30.125, + "grad_norm_var": 9.70390625, + "learning_rate": 0.0001, + "loss": 7.4656, + "loss/crossentropy": 2.0134367659687995, + "loss/hidden": 3.523046875, + "loss/jsd": 0.0, + "loss/logits": 0.22354123163968326, + "step": 7260 + }, + { + "epoch": 0.18175, + "grad_norm": 30.0, + "grad_norm_var": 6.324739583333334, + "learning_rate": 0.0001, + "loss": 7.3959, + "loss/crossentropy": 2.1245115220546724, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.19102834183722733, + "step": 7270 + }, + { + "epoch": 0.182, + "grad_norm": 29.125, + "grad_norm_var": 18.903580729166666, + "learning_rate": 0.0001, + "loss": 7.3761, + "loss/crossentropy": 2.035867254436016, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.19426564145833253, + "step": 7280 + }, + { + "epoch": 0.18225, + "grad_norm": 30.5, + "grad_norm_var": 20.406705729166667, + "learning_rate": 0.0001, + "loss": 7.3884, + "loss/crossentropy": 1.9805133253335954, + "loss/hidden": 3.370703125, + "loss/jsd": 0.0, + "loss/logits": 0.19129701480269432, + "step": 7290 + }, + { + "epoch": 0.1825, + "grad_norm": 31.375, + "grad_norm_var": 5.842708333333333, + "learning_rate": 0.0001, + "loss": 7.3833, + "loss/crossentropy": 2.0621849209070207, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.19919742476195096, + "step": 7300 + }, + { + "epoch": 0.18275, + "grad_norm": 47.0, + "grad_norm_var": 2.675771470406222e+18, + "learning_rate": 0.0001, + "loss": 7.2357, + "loss/crossentropy": 2.1282688602805138, + "loss/hidden": 3.298046875, + "loss/jsd": 0.0, + "loss/logits": 0.18128441767767073, + "step": 7310 + }, + { + "epoch": 0.183, + "grad_norm": 29.25, + "grad_norm_var": 28.158333333333335, + "learning_rate": 0.0001, + "loss": 7.4531, + "loss/crossentropy": 2.1078659296035767, + "loss/hidden": 3.580078125, + "loss/jsd": 0.0, + "loss/logits": 0.2171280149370432, + "step": 7320 + }, + { + "epoch": 0.18325, + "grad_norm": 35.25, + "grad_norm_var": 25.365625, + "learning_rate": 0.0001, + "loss": 7.3538, + "loss/crossentropy": 2.200744313001633, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.19498275145888327, + "step": 7330 + }, + { + "epoch": 0.1835, + "grad_norm": 28.375, + "grad_norm_var": 23.859830729166667, + "learning_rate": 0.0001, + "loss": 7.3437, + "loss/crossentropy": 1.9968993581831456, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.1875888810493052, + "step": 7340 + }, + { + "epoch": 0.18375, + "grad_norm": 28.0, + "grad_norm_var": 8.989518229166666, + "learning_rate": 0.0001, + "loss": 7.3984, + "loss/crossentropy": 2.0748091831803324, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.1947355069220066, + "step": 7350 + }, + { + "epoch": 0.184, + "grad_norm": 29.875, + "grad_norm_var": 7.728125, + "learning_rate": 0.0001, + "loss": 7.3008, + "loss/crossentropy": 2.067877373099327, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.18336500320583582, + "step": 7360 + }, + { + "epoch": 0.18425, + "grad_norm": 29.25, + "grad_norm_var": 13.189583333333333, + "learning_rate": 0.0001, + "loss": 7.4547, + "loss/crossentropy": 2.152864509820938, + "loss/hidden": 3.451953125, + "loss/jsd": 0.0, + "loss/logits": 0.2102669222280383, + "step": 7370 + }, + { + "epoch": 0.1845, + "grad_norm": 28.5, + "grad_norm_var": 8.7556640625, + "learning_rate": 0.0001, + "loss": 7.286, + "loss/crossentropy": 1.9991149730980395, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.18364950213581324, + "step": 7380 + }, + { + "epoch": 0.18475, + "grad_norm": 31.75, + "grad_norm_var": 7.770572916666667, + "learning_rate": 0.0001, + "loss": 7.3546, + "loss/crossentropy": 2.0513292245566843, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.1970413200557232, + "step": 7390 + }, + { + "epoch": 0.185, + "grad_norm": 30.125, + "grad_norm_var": 7.620572916666666, + "learning_rate": 0.0001, + "loss": 7.5116, + "loss/crossentropy": 2.102216296643019, + "loss/hidden": 3.51875, + "loss/jsd": 0.0, + "loss/logits": 0.21091360161080958, + "step": 7400 + }, + { + "epoch": 0.18525, + "grad_norm": 32.75, + "grad_norm_var": 9.191666666666666, + "learning_rate": 0.0001, + "loss": 7.3421, + "loss/crossentropy": 1.9926266744732857, + "loss/hidden": 3.455078125, + "loss/jsd": 0.0, + "loss/logits": 0.19832582902163268, + "step": 7410 + }, + { + "epoch": 0.1855, + "grad_norm": 29.5, + "grad_norm_var": 5.945833333333334, + "learning_rate": 0.0001, + "loss": 7.4563, + "loss/crossentropy": 2.141331580281258, + "loss/hidden": 3.408203125, + "loss/jsd": 0.0, + "loss/logits": 0.20020943265408278, + "step": 7420 + }, + { + "epoch": 0.18575, + "grad_norm": 31.0, + "grad_norm_var": 7.198372395833333, + "learning_rate": 0.0001, + "loss": 7.4227, + "loss/crossentropy": 1.9694693490862847, + "loss/hidden": 3.421484375, + "loss/jsd": 0.0, + "loss/logits": 0.18249646089971067, + "step": 7430 + }, + { + "epoch": 0.186, + "grad_norm": 29.375, + "grad_norm_var": 5.9697265625, + "learning_rate": 0.0001, + "loss": 7.4119, + "loss/crossentropy": 2.1407265037298204, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.17720893137156962, + "step": 7440 + }, + { + "epoch": 0.18625, + "grad_norm": 29.875, + "grad_norm_var": 0.8072916666666666, + "learning_rate": 0.0001, + "loss": 7.3441, + "loss/crossentropy": 2.124198019504547, + "loss/hidden": 3.424609375, + "loss/jsd": 0.0, + "loss/logits": 0.1946978410705924, + "step": 7450 + }, + { + "epoch": 0.1865, + "grad_norm": 32.75, + "grad_norm_var": 1.9634765625, + "learning_rate": 0.0001, + "loss": 7.4652, + "loss/crossentropy": 2.131994958221912, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.19461573138833047, + "step": 7460 + }, + { + "epoch": 0.18675, + "grad_norm": 30.875, + "grad_norm_var": 2.7280598958333333, + "learning_rate": 0.0001, + "loss": 7.3278, + "loss/crossentropy": 2.117748848348856, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.18584235943853855, + "step": 7470 + }, + { + "epoch": 0.187, + "grad_norm": 31.625, + "grad_norm_var": 2.466080729166667, + "learning_rate": 0.0001, + "loss": 7.5817, + "loss/crossentropy": 2.1364282086491584, + "loss/hidden": 3.460546875, + "loss/jsd": 0.0, + "loss/logits": 0.21069204956293106, + "step": 7480 + }, + { + "epoch": 0.18725, + "grad_norm": 31.25, + "grad_norm_var": 3.012434895833333, + "learning_rate": 0.0001, + "loss": 7.5197, + "loss/crossentropy": 2.0523312032222747, + "loss/hidden": 3.50078125, + "loss/jsd": 0.0, + "loss/logits": 0.21221144162118435, + "step": 7490 + }, + { + "epoch": 0.1875, + "grad_norm": 33.75, + "grad_norm_var": 4.995768229166667, + "learning_rate": 0.0001, + "loss": 7.4317, + "loss/crossentropy": 2.0852270901203154, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.17908250950276852, + "step": 7500 + }, + { + "epoch": 0.18775, + "grad_norm": 30.875, + "grad_norm_var": 2.4518229166666665, + "learning_rate": 0.0001, + "loss": 7.5444, + "loss/crossentropy": 2.059383874386549, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.19440155941992998, + "step": 7510 + }, + { + "epoch": 0.188, + "grad_norm": 27.625, + "grad_norm_var": 2.851822916666667, + "learning_rate": 0.0001, + "loss": 7.5199, + "loss/crossentropy": 2.1382876858115196, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.19888029601424934, + "step": 7520 + }, + { + "epoch": 0.18825, + "grad_norm": 29.375, + "grad_norm_var": 3.073372395833333, + "learning_rate": 0.0001, + "loss": 7.3012, + "loss/crossentropy": 2.0625696159899234, + "loss/hidden": 3.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.18495072829537093, + "step": 7530 + }, + { + "epoch": 0.1885, + "grad_norm": 32.0, + "grad_norm_var": 2.2122395833333335, + "learning_rate": 0.0001, + "loss": 7.3643, + "loss/crossentropy": 2.124967637658119, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.18375679664313793, + "step": 7540 + }, + { + "epoch": 0.18875, + "grad_norm": 29.125, + "grad_norm_var": 3.2681640625, + "learning_rate": 0.0001, + "loss": 7.351, + "loss/crossentropy": 2.0680116668343542, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.18827605471014977, + "step": 7550 + }, + { + "epoch": 0.189, + "grad_norm": 31.75, + "grad_norm_var": 1.5337890625, + "learning_rate": 0.0001, + "loss": 7.4207, + "loss/crossentropy": 2.079096484184265, + "loss/hidden": 3.443359375, + "loss/jsd": 0.0, + "loss/logits": 0.20160295628011227, + "step": 7560 + }, + { + "epoch": 0.18925, + "grad_norm": 30.625, + "grad_norm_var": 18.2197265625, + "learning_rate": 0.0001, + "loss": 7.4789, + "loss/crossentropy": 2.058067685365677, + "loss/hidden": 3.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.21216327100992202, + "step": 7570 + }, + { + "epoch": 0.1895, + "grad_norm": 34.25, + "grad_norm_var": 14.415625, + "learning_rate": 0.0001, + "loss": 7.4987, + "loss/crossentropy": 2.0142914205789566, + "loss/hidden": 3.580078125, + "loss/jsd": 0.0, + "loss/logits": 0.20794902741909027, + "step": 7580 + }, + { + "epoch": 0.18975, + "grad_norm": 30.5, + "grad_norm_var": 1.9809895833333333, + "learning_rate": 0.0001, + "loss": 7.4585, + "loss/crossentropy": 2.299562671780586, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.19880922697484493, + "step": 7590 + }, + { + "epoch": 0.19, + "grad_norm": 30.875, + "grad_norm_var": 15.101822916666666, + "learning_rate": 0.0001, + "loss": 7.3204, + "loss/crossentropy": 2.1472302600741386, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.18796155080199242, + "step": 7600 + }, + { + "epoch": 0.19025, + "grad_norm": 29.0, + "grad_norm_var": 2.5940733610451533e+18, + "learning_rate": 0.0001, + "loss": 7.5335, + "loss/crossentropy": 2.1664531916379928, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.18874028734862805, + "step": 7610 + }, + { + "epoch": 0.1905, + "grad_norm": 29.125, + "grad_norm_var": 0.8843098958333333, + "learning_rate": 0.0001, + "loss": 7.4846, + "loss/crossentropy": 2.0765088513493537, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.18674521408975125, + "step": 7620 + }, + { + "epoch": 0.19075, + "grad_norm": 27.25, + "grad_norm_var": 4.381184895833333, + "learning_rate": 0.0001, + "loss": 7.3401, + "loss/crossentropy": 1.7713539503514766, + "loss/hidden": 3.515234375, + "loss/jsd": 0.0, + "loss/logits": 0.1749590938910842, + "step": 7630 + }, + { + "epoch": 0.191, + "grad_norm": 33.5, + "grad_norm_var": 5.26640625, + "learning_rate": 0.0001, + "loss": 7.4838, + "loss/crossentropy": 1.922049730271101, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.18183694053441285, + "step": 7640 + }, + { + "epoch": 0.19125, + "grad_norm": 30.5, + "grad_norm_var": 4.030989583333334, + "learning_rate": 0.0001, + "loss": 7.3887, + "loss/crossentropy": 2.094516658782959, + "loss/hidden": 3.452734375, + "loss/jsd": 0.0, + "loss/logits": 0.18757299687713386, + "step": 7650 + }, + { + "epoch": 0.1915, + "grad_norm": 32.0, + "grad_norm_var": 1.5035807291666667, + "learning_rate": 0.0001, + "loss": 7.4445, + "loss/crossentropy": 1.999229770898819, + "loss/hidden": 3.516015625, + "loss/jsd": 0.0, + "loss/logits": 0.19523975029587745, + "step": 7660 + }, + { + "epoch": 0.19175, + "grad_norm": 32.25, + "grad_norm_var": 3.0275390625, + "learning_rate": 0.0001, + "loss": 7.4626, + "loss/crossentropy": 2.0366951674222946, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.18243371956050397, + "step": 7670 + }, + { + "epoch": 0.192, + "grad_norm": 32.75, + "grad_norm_var": 2.5931640625, + "learning_rate": 0.0001, + "loss": 7.5274, + "loss/crossentropy": 2.0489632681012155, + "loss/hidden": 3.564453125, + "loss/jsd": 0.0, + "loss/logits": 0.21230401135981083, + "step": 7680 + }, + { + "epoch": 0.19225, + "grad_norm": 32.75, + "grad_norm_var": 3.738997395833333, + "learning_rate": 0.0001, + "loss": 7.4128, + "loss/crossentropy": 2.0826203912496566, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.18644160348922015, + "step": 7690 + }, + { + "epoch": 0.1925, + "grad_norm": 29.875, + "grad_norm_var": 4.351041666666666, + "learning_rate": 0.0001, + "loss": 7.5195, + "loss/crossentropy": 2.1674430795013904, + "loss/hidden": 3.3640625, + "loss/jsd": 0.0, + "loss/logits": 0.19499621093273162, + "step": 7700 + }, + { + "epoch": 0.19275, + "grad_norm": 28.625, + "grad_norm_var": 4.939322916666667, + "learning_rate": 0.0001, + "loss": 7.4236, + "loss/crossentropy": 2.092283549904823, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.19506504610180855, + "step": 7710 + }, + { + "epoch": 0.193, + "grad_norm": 33.0, + "grad_norm_var": 9.080208333333333, + "learning_rate": 0.0001, + "loss": 7.3501, + "loss/crossentropy": 2.0526101261377336, + "loss/hidden": 3.51328125, + "loss/jsd": 0.0, + "loss/logits": 0.19770189765840768, + "step": 7720 + }, + { + "epoch": 0.19325, + "grad_norm": 29.0, + "grad_norm_var": 9.731184895833334, + "learning_rate": 0.0001, + "loss": 7.4607, + "loss/crossentropy": 2.05912861302495, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.19287334326654673, + "step": 7730 + }, + { + "epoch": 0.1935, + "grad_norm": 30.5, + "grad_norm_var": 1.8052083333333333, + "learning_rate": 0.0001, + "loss": 7.4243, + "loss/crossentropy": 1.8983285859227181, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.18626301139593124, + "step": 7740 + }, + { + "epoch": 0.19375, + "grad_norm": 40.75, + "grad_norm_var": 9.1541015625, + "learning_rate": 0.0001, + "loss": 7.4332, + "loss/crossentropy": 2.099681233614683, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.20556394904851913, + "step": 7750 + }, + { + "epoch": 0.194, + "grad_norm": 37.0, + "grad_norm_var": 12.584375, + "learning_rate": 0.0001, + "loss": 7.516, + "loss/crossentropy": 2.0436215907335282, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.17505232142284513, + "step": 7760 + }, + { + "epoch": 0.19425, + "grad_norm": 30.125, + "grad_norm_var": 5.2681640625, + "learning_rate": 0.0001, + "loss": 7.5486, + "loss/crossentropy": 2.0449838273227217, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.19324074545875192, + "step": 7770 + }, + { + "epoch": 0.1945, + "grad_norm": 30.125, + "grad_norm_var": 1.4518229166666667, + "learning_rate": 0.0001, + "loss": 7.4052, + "loss/crossentropy": 2.1020638972520826, + "loss/hidden": 3.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.1987349819391966, + "step": 7780 + }, + { + "epoch": 0.19475, + "grad_norm": 32.25, + "grad_norm_var": 1.525, + "learning_rate": 0.0001, + "loss": 7.3148, + "loss/crossentropy": 2.0913542471826077, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.1900358498096466, + "step": 7790 + }, + { + "epoch": 0.195, + "grad_norm": 29.875, + "grad_norm_var": 1.2280598958333333, + "learning_rate": 0.0001, + "loss": 7.43, + "loss/crossentropy": 1.9448820307850838, + "loss/hidden": 3.423828125, + "loss/jsd": 0.0, + "loss/logits": 0.18228193083778024, + "step": 7800 + }, + { + "epoch": 0.19525, + "grad_norm": 28.75, + "grad_norm_var": 2.3791015625, + "learning_rate": 0.0001, + "loss": 7.4772, + "loss/crossentropy": 2.0547610491514208, + "loss/hidden": 3.459765625, + "loss/jsd": 0.0, + "loss/logits": 0.19370344914495946, + "step": 7810 + }, + { + "epoch": 0.1955, + "grad_norm": 31.375, + "grad_norm_var": 6.208072916666667, + "learning_rate": 0.0001, + "loss": 7.4448, + "loss/crossentropy": 2.0798824220895766, + "loss/hidden": 3.431640625, + "loss/jsd": 0.0, + "loss/logits": 0.18972196318209172, + "step": 7820 + }, + { + "epoch": 0.19575, + "grad_norm": 29.5, + "grad_norm_var": 6.248893229166667, + "learning_rate": 0.0001, + "loss": 7.5358, + "loss/crossentropy": 2.2324195951223373, + "loss/hidden": 3.4375, + "loss/jsd": 0.0, + "loss/logits": 0.2047037549316883, + "step": 7830 + }, + { + "epoch": 0.196, + "grad_norm": 29.375, + "grad_norm_var": 4.453059895833333, + "learning_rate": 0.0001, + "loss": 7.3114, + "loss/crossentropy": 2.1020479179918765, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.19328910131007432, + "step": 7840 + }, + { + "epoch": 0.19625, + "grad_norm": 33.5, + "grad_norm_var": 590.0817057291666, + "learning_rate": 0.0001, + "loss": 7.4281, + "loss/crossentropy": 2.0953447744250298, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.1854689259082079, + "step": 7850 + }, + { + "epoch": 0.1965, + "grad_norm": 33.0, + "grad_norm_var": 625.6192057291667, + "learning_rate": 0.0001, + "loss": 7.5283, + "loss/crossentropy": 2.061315707862377, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.18506519980728625, + "step": 7860 + }, + { + "epoch": 0.19675, + "grad_norm": 32.0, + "grad_norm_var": 69.38430989583334, + "learning_rate": 0.0001, + "loss": 7.5314, + "loss/crossentropy": 2.1605025470256805, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.20316088199615479, + "step": 7870 + }, + { + "epoch": 0.197, + "grad_norm": 31.375, + "grad_norm_var": 1.1067057291666667, + "learning_rate": 0.0001, + "loss": 7.4244, + "loss/crossentropy": 2.1977868393063544, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.1992826245725155, + "step": 7880 + }, + { + "epoch": 0.19725, + "grad_norm": 29.25, + "grad_norm_var": 9.578059895833333, + "learning_rate": 0.0001, + "loss": 7.502, + "loss/crossentropy": 2.0480964958667753, + "loss/hidden": 3.636328125, + "loss/jsd": 0.0, + "loss/logits": 0.21483200527727603, + "step": 7890 + }, + { + "epoch": 0.1975, + "grad_norm": 30.875, + "grad_norm_var": 3.6372395833333333, + "learning_rate": 0.0001, + "loss": 7.548, + "loss/crossentropy": 2.157261362671852, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.2084518164396286, + "step": 7900 + }, + { + "epoch": 0.19775, + "grad_norm": 28.5, + "grad_norm_var": 2.095572916666667, + "learning_rate": 0.0001, + "loss": 7.3864, + "loss/crossentropy": 2.1441849052906035, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.20142039898782968, + "step": 7910 + }, + { + "epoch": 0.198, + "grad_norm": 37.5, + "grad_norm_var": 16.170247395833332, + "learning_rate": 0.0001, + "loss": 7.429, + "loss/crossentropy": 2.001983726769686, + "loss/hidden": 3.448828125, + "loss/jsd": 0.0, + "loss/logits": 0.19066998092457652, + "step": 7920 + }, + { + "epoch": 0.19825, + "grad_norm": 29.375, + "grad_norm_var": 15.561393229166667, + "learning_rate": 0.0001, + "loss": 7.4093, + "loss/crossentropy": 2.225774070620537, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.1993710033595562, + "step": 7930 + }, + { + "epoch": 0.1985, + "grad_norm": 34.0, + "grad_norm_var": 25.343489583333334, + "learning_rate": 0.0001, + "loss": 7.4932, + "loss/crossentropy": 2.0083594918251038, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.19412665143609048, + "step": 7940 + }, + { + "epoch": 0.19875, + "grad_norm": 31.625, + "grad_norm_var": 23.877083333333335, + "learning_rate": 0.0001, + "loss": 7.441, + "loss/crossentropy": 2.08128562271595, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.17889103144407273, + "step": 7950 + }, + { + "epoch": 0.199, + "grad_norm": 27.875, + "grad_norm_var": 1.5921223958333333, + "learning_rate": 0.0001, + "loss": 7.4251, + "loss/crossentropy": 2.1019147261977196, + "loss/hidden": 3.571484375, + "loss/jsd": 0.0, + "loss/logits": 0.2211546439677477, + "step": 7960 + }, + { + "epoch": 0.19925, + "grad_norm": 32.0, + "grad_norm_var": 3.3059895833333335, + "learning_rate": 0.0001, + "loss": 7.4213, + "loss/crossentropy": 2.14247687458992, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.18870262056589127, + "step": 7970 + }, + { + "epoch": 0.1995, + "grad_norm": 31.5, + "grad_norm_var": 2.0947265625, + "learning_rate": 0.0001, + "loss": 7.5788, + "loss/crossentropy": 2.1896591186523438, + "loss/hidden": 3.479296875, + "loss/jsd": 0.0, + "loss/logits": 0.20883973222225904, + "step": 7980 + }, + { + "epoch": 0.19975, + "grad_norm": 28.25, + "grad_norm_var": 2.982291666666667, + "learning_rate": 0.0001, + "loss": 7.5217, + "loss/crossentropy": 2.1884095311164855, + "loss/hidden": 3.44296875, + "loss/jsd": 0.0, + "loss/logits": 0.2022842913866043, + "step": 7990 + }, + { + "epoch": 0.2, + "grad_norm": 28.5, + "grad_norm_var": 2.379622395833333, + "learning_rate": 0.0001, + "loss": 7.4975, + "loss/crossentropy": 2.225523295998573, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.2032675376161933, + "step": 8000 + }, + { + "epoch": 0.20025, + "grad_norm": 30.625, + "grad_norm_var": 4.181705729166667, + "learning_rate": 0.0001, + "loss": 7.3874, + "loss/crossentropy": 1.9566738605499268, + "loss/hidden": 3.576953125, + "loss/jsd": 0.0, + "loss/logits": 0.19702311754226684, + "step": 8010 + }, + { + "epoch": 0.2005, + "grad_norm": 32.75, + "grad_norm_var": 6.3509765625, + "learning_rate": 0.0001, + "loss": 7.4528, + "loss/crossentropy": 2.1517204724252226, + "loss/hidden": 3.466015625, + "loss/jsd": 0.0, + "loss/logits": 0.2005010774359107, + "step": 8020 + }, + { + "epoch": 0.20075, + "grad_norm": 30.875, + "grad_norm_var": 5.44140625, + "learning_rate": 0.0001, + "loss": 7.5306, + "loss/crossentropy": 2.0300184957683087, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.18887464031577111, + "step": 8030 + }, + { + "epoch": 0.201, + "grad_norm": 31.75, + "grad_norm_var": 3.1302083333333335, + "learning_rate": 0.0001, + "loss": 7.4025, + "loss/crossentropy": 2.0889772072434427, + "loss/hidden": 3.27109375, + "loss/jsd": 0.0, + "loss/logits": 0.17016669576987625, + "step": 8040 + }, + { + "epoch": 0.20125, + "grad_norm": 27.125, + "grad_norm_var": 94.78125, + "learning_rate": 0.0001, + "loss": 7.4379, + "loss/crossentropy": 2.158484524488449, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.19672231934964657, + "step": 8050 + }, + { + "epoch": 0.2015, + "grad_norm": 38.0, + "grad_norm_var": 13.220572916666667, + "learning_rate": 0.0001, + "loss": 7.4314, + "loss/crossentropy": 1.9623262777924537, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.1833876773715019, + "step": 8060 + }, + { + "epoch": 0.20175, + "grad_norm": 29.875, + "grad_norm_var": 7.858268229166667, + "learning_rate": 0.0001, + "loss": 7.4662, + "loss/crossentropy": 2.2177498638629913, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.1954023003578186, + "step": 8070 + }, + { + "epoch": 0.202, + "grad_norm": 28.875, + "grad_norm_var": 7.627083333333333, + "learning_rate": 0.0001, + "loss": 7.4792, + "loss/crossentropy": 2.1019868202507497, + "loss/hidden": 3.44375, + "loss/jsd": 0.0, + "loss/logits": 0.1995641984976828, + "step": 8080 + }, + { + "epoch": 0.20225, + "grad_norm": 44.75, + "grad_norm_var": 20.4306640625, + "learning_rate": 0.0001, + "loss": 7.5664, + "loss/crossentropy": 2.299755599349737, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.19075682908296585, + "step": 8090 + }, + { + "epoch": 0.2025, + "grad_norm": 37.75, + "grad_norm_var": 2.5671221292944763e+18, + "learning_rate": 0.0001, + "loss": 7.4526, + "loss/crossentropy": 2.131952489167452, + "loss/hidden": 3.475390625, + "loss/jsd": 0.0, + "loss/logits": 0.1955398641526699, + "step": 8100 + }, + { + "epoch": 0.20275, + "grad_norm": 31.25, + "grad_norm_var": 20.342643229166665, + "learning_rate": 0.0001, + "loss": 7.4687, + "loss/crossentropy": 1.9825796701014042, + "loss/hidden": 3.475, + "loss/jsd": 0.0, + "loss/logits": 0.20226136669516565, + "step": 8110 + }, + { + "epoch": 0.203, + "grad_norm": 27.125, + "grad_norm_var": 15.192643229166666, + "learning_rate": 0.0001, + "loss": 7.1604, + "loss/crossentropy": 2.0296560734510423, + "loss/hidden": 3.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.1786259189248085, + "step": 8120 + }, + { + "epoch": 0.20325, + "grad_norm": 31.0, + "grad_norm_var": 7.1775390625, + "learning_rate": 0.0001, + "loss": 7.2815, + "loss/crossentropy": 2.104996609687805, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.19307580199092628, + "step": 8130 + }, + { + "epoch": 0.2035, + "grad_norm": 34.75, + "grad_norm_var": 6.820572916666666, + "learning_rate": 0.0001, + "loss": 7.3774, + "loss/crossentropy": 2.1900447353720667, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.20103423558175565, + "step": 8140 + }, + { + "epoch": 0.20375, + "grad_norm": 31.0, + "grad_norm_var": 5.48125, + "learning_rate": 0.0001, + "loss": 7.3628, + "loss/crossentropy": 2.0671978294849396, + "loss/hidden": 3.338671875, + "loss/jsd": 0.0, + "loss/logits": 0.18412660714238882, + "step": 8150 + }, + { + "epoch": 0.204, + "grad_norm": 41.25, + "grad_norm_var": 14.245572916666667, + "learning_rate": 0.0001, + "loss": 7.5547, + "loss/crossentropy": 2.05537860840559, + "loss/hidden": 3.48515625, + "loss/jsd": 0.0, + "loss/logits": 0.1820721985772252, + "step": 8160 + }, + { + "epoch": 0.20425, + "grad_norm": 29.125, + "grad_norm_var": 13.8625, + "learning_rate": 0.0001, + "loss": 7.345, + "loss/crossentropy": 1.9336151838302613, + "loss/hidden": 3.564453125, + "loss/jsd": 0.0, + "loss/logits": 0.20152895338833332, + "step": 8170 + }, + { + "epoch": 0.2045, + "grad_norm": 28.125, + "grad_norm_var": 7.096875, + "learning_rate": 0.0001, + "loss": 7.5005, + "loss/crossentropy": 2.055295965075493, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.1865659100934863, + "step": 8180 + }, + { + "epoch": 0.20475, + "grad_norm": 28.25, + "grad_norm_var": 8.805989583333334, + "learning_rate": 0.0001, + "loss": 7.5324, + "loss/crossentropy": 1.9976115971803665, + "loss/hidden": 3.508203125, + "loss/jsd": 0.0, + "loss/logits": 0.2114253517240286, + "step": 8190 + }, + { + "epoch": 0.205, + "grad_norm": 29.875, + "grad_norm_var": 12.435416666666667, + "learning_rate": 0.0001, + "loss": 7.4232, + "loss/crossentropy": 2.1122905567288397, + "loss/hidden": 3.400390625, + "loss/jsd": 0.0, + "loss/logits": 0.21649520397186278, + "step": 8200 + }, + { + "epoch": 0.20525, + "grad_norm": 28.125, + "grad_norm_var": 10.055208333333333, + "learning_rate": 0.0001, + "loss": 7.4875, + "loss/crossentropy": 2.1132961876690386, + "loss/hidden": 3.47734375, + "loss/jsd": 0.0, + "loss/logits": 0.2026100393384695, + "step": 8210 + }, + { + "epoch": 0.2055, + "grad_norm": 29.125, + "grad_norm_var": 6.793489583333334, + "learning_rate": 0.0001, + "loss": 7.3834, + "loss/crossentropy": 2.191230720281601, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19885572660714387, + "step": 8220 + }, + { + "epoch": 0.20575, + "grad_norm": 30.75, + "grad_norm_var": 4.630208333333333, + "learning_rate": 0.0001, + "loss": 7.4544, + "loss/crossentropy": 2.080083931982517, + "loss/hidden": 3.474609375, + "loss/jsd": 0.0, + "loss/logits": 0.2282587742432952, + "step": 8230 + }, + { + "epoch": 0.206, + "grad_norm": 31.375, + "grad_norm_var": 11.458268229166666, + "learning_rate": 0.0001, + "loss": 7.449, + "loss/crossentropy": 2.1384637162089346, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.1842938730493188, + "step": 8240 + }, + { + "epoch": 0.20625, + "grad_norm": 31.875, + "grad_norm_var": 1.7087890625, + "learning_rate": 0.0001, + "loss": 7.3946, + "loss/crossentropy": 2.1355771869421005, + "loss/hidden": 3.264453125, + "loss/jsd": 0.0, + "loss/logits": 0.18052869867533444, + "step": 8250 + }, + { + "epoch": 0.2065, + "grad_norm": 31.75, + "grad_norm_var": 13.0447265625, + "learning_rate": 0.0001, + "loss": 7.6316, + "loss/crossentropy": 2.0203320410102608, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.192235934920609, + "step": 8260 + }, + { + "epoch": 0.20675, + "grad_norm": 29.5, + "grad_norm_var": 12.270247395833334, + "learning_rate": 0.0001, + "loss": 7.4572, + "loss/crossentropy": 1.9869592547416688, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.202651490829885, + "step": 8270 + }, + { + "epoch": 0.207, + "grad_norm": 30.125, + "grad_norm_var": 1.4822916666666666, + "learning_rate": 0.0001, + "loss": 7.5227, + "loss/crossentropy": 2.051181730628014, + "loss/hidden": 3.43984375, + "loss/jsd": 0.0, + "loss/logits": 0.20994498692452906, + "step": 8280 + }, + { + "epoch": 0.20725, + "grad_norm": 43.75, + "grad_norm_var": 12.776822916666667, + "learning_rate": 0.0001, + "loss": 7.3664, + "loss/crossentropy": 1.9618318520486355, + "loss/hidden": 3.558984375, + "loss/jsd": 0.0, + "loss/logits": 0.19053993374109268, + "step": 8290 + }, + { + "epoch": 0.2075, + "grad_norm": 30.5, + "grad_norm_var": 14.92890625, + "learning_rate": 0.0001, + "loss": 7.318, + "loss/crossentropy": 2.0114831268787383, + "loss/hidden": 3.459375, + "loss/jsd": 0.0, + "loss/logits": 0.20329152811318635, + "step": 8300 + }, + { + "epoch": 0.20775, + "grad_norm": 34.0, + "grad_norm_var": 2.8869140625, + "learning_rate": 0.0001, + "loss": 7.5489, + "loss/crossentropy": 2.1187786638736723, + "loss/hidden": 3.512890625, + "loss/jsd": 0.0, + "loss/logits": 0.20839224103838205, + "step": 8310 + }, + { + "epoch": 0.208, + "grad_norm": 27.875, + "grad_norm_var": 2.565559895833333, + "learning_rate": 0.0001, + "loss": 7.3443, + "loss/crossentropy": 2.1263110756874086, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.18627664018422366, + "step": 8320 + }, + { + "epoch": 0.20825, + "grad_norm": 30.375, + "grad_norm_var": 2.755989583333333, + "learning_rate": 0.0001, + "loss": 7.3857, + "loss/crossentropy": 1.944861602783203, + "loss/hidden": 3.468359375, + "loss/jsd": 0.0, + "loss/logits": 0.18570939563214778, + "step": 8330 + }, + { + "epoch": 0.2085, + "grad_norm": 31.125, + "grad_norm_var": 2.340625, + "learning_rate": 0.0001, + "loss": 7.5344, + "loss/crossentropy": 2.1811724051833155, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.19298948515206577, + "step": 8340 + }, + { + "epoch": 0.20875, + "grad_norm": 31.5, + "grad_norm_var": 1.56015625, + "learning_rate": 0.0001, + "loss": 7.3826, + "loss/crossentropy": 2.152976579964161, + "loss/hidden": 3.305859375, + "loss/jsd": 0.0, + "loss/logits": 0.190250195749104, + "step": 8350 + }, + { + "epoch": 0.209, + "grad_norm": 28.875, + "grad_norm_var": 1.3416015625, + "learning_rate": 0.0001, + "loss": 7.323, + "loss/crossentropy": 2.2053099036216737, + "loss/hidden": 3.344140625, + "loss/jsd": 0.0, + "loss/logits": 0.1929330924525857, + "step": 8360 + }, + { + "epoch": 0.20925, + "grad_norm": 29.875, + "grad_norm_var": 1.65625, + "learning_rate": 0.0001, + "loss": 7.4376, + "loss/crossentropy": 2.011850906908512, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.20794765576720237, + "step": 8370 + }, + { + "epoch": 0.2095, + "grad_norm": 29.25, + "grad_norm_var": 2.381705729166667, + "learning_rate": 0.0001, + "loss": 7.4518, + "loss/crossentropy": 2.284806078672409, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.2203810729086399, + "step": 8380 + }, + { + "epoch": 0.20975, + "grad_norm": 43.25, + "grad_norm_var": 17.695247395833334, + "learning_rate": 0.0001, + "loss": 7.418, + "loss/crossentropy": 2.1161764934659004, + "loss/hidden": 3.475390625, + "loss/jsd": 0.0, + "loss/logits": 0.1863908626139164, + "step": 8390 + }, + { + "epoch": 0.21, + "grad_norm": 29.625, + "grad_norm_var": 19.8697265625, + "learning_rate": 0.0001, + "loss": 7.4564, + "loss/crossentropy": 2.0293585821986198, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.1884205201640725, + "step": 8400 + }, + { + "epoch": 0.21025, + "grad_norm": 27.25, + "grad_norm_var": 11.422330729166667, + "learning_rate": 0.0001, + "loss": 7.4317, + "loss/crossentropy": 2.2351802065968513, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.20424611177295446, + "step": 8410 + }, + { + "epoch": 0.2105, + "grad_norm": 30.125, + "grad_norm_var": 10.734830729166667, + "learning_rate": 0.0001, + "loss": 7.3989, + "loss/crossentropy": 2.1349810734391212, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.18979013338685036, + "step": 8420 + }, + { + "epoch": 0.21075, + "grad_norm": 30.875, + "grad_norm_var": 2.387955729166667, + "learning_rate": 0.0001, + "loss": 7.3957, + "loss/crossentropy": 2.080636392533779, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.18893024744465947, + "step": 8430 + }, + { + "epoch": 0.211, + "grad_norm": 28.75, + "grad_norm_var": 20.089518229166668, + "learning_rate": 0.0001, + "loss": 7.4847, + "loss/crossentropy": 1.9267802774906158, + "loss/hidden": 3.31484375, + "loss/jsd": 0.0, + "loss/logits": 0.18156335428357123, + "step": 8440 + }, + { + "epoch": 0.21125, + "grad_norm": 29.125, + "grad_norm_var": 2.2650390625, + "learning_rate": 0.0001, + "loss": 7.3839, + "loss/crossentropy": 2.14048397988081, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.1889717074111104, + "step": 8450 + }, + { + "epoch": 0.2115, + "grad_norm": 28.375, + "grad_norm_var": 3.283124099188418e+18, + "learning_rate": 0.0001, + "loss": 7.4661, + "loss/crossentropy": 2.1406930878758432, + "loss/hidden": 3.699609375, + "loss/jsd": 0.0, + "loss/logits": 0.20904620084911585, + "step": 8460 + }, + { + "epoch": 0.21175, + "grad_norm": 30.75, + "grad_norm_var": 23.509830729166666, + "learning_rate": 0.0001, + "loss": 7.2867, + "loss/crossentropy": 2.1153231114149094, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.18998019583523273, + "step": 8470 + }, + { + "epoch": 0.212, + "grad_norm": 31.0, + "grad_norm_var": 2.3139973958333333, + "learning_rate": 0.0001, + "loss": 7.3997, + "loss/crossentropy": 2.169650764763355, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.17556187361478806, + "step": 8480 + }, + { + "epoch": 0.21225, + "grad_norm": 29.625, + "grad_norm_var": 2.312239583333333, + "learning_rate": 0.0001, + "loss": 7.4486, + "loss/crossentropy": 2.128412726521492, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.1975807584822178, + "step": 8490 + }, + { + "epoch": 0.2125, + "grad_norm": 30.75, + "grad_norm_var": 2.903125, + "learning_rate": 0.0001, + "loss": 7.4526, + "loss/crossentropy": 2.029564914107323, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.19610330546274782, + "step": 8500 + }, + { + "epoch": 0.21275, + "grad_norm": 31.375, + "grad_norm_var": 4.004622395833334, + "learning_rate": 0.0001, + "loss": 7.2845, + "loss/crossentropy": 1.9631854377686977, + "loss/hidden": 3.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.1855375848710537, + "step": 8510 + }, + { + "epoch": 0.213, + "grad_norm": 30.0, + "grad_norm_var": 3.476822916666667, + "learning_rate": 0.0001, + "loss": 7.4124, + "loss/crossentropy": 2.1494273841381073, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.19540442936122418, + "step": 8520 + }, + { + "epoch": 0.21325, + "grad_norm": 31.625, + "grad_norm_var": 0.9452473958333333, + "learning_rate": 0.0001, + "loss": 7.4659, + "loss/crossentropy": 2.0508621491491796, + "loss/hidden": 3.3078125, + "loss/jsd": 0.0, + "loss/logits": 0.18023168351501226, + "step": 8530 + }, + { + "epoch": 0.2135, + "grad_norm": 31.25, + "grad_norm_var": 2.5809895833333334, + "learning_rate": 0.0001, + "loss": 7.4522, + "loss/crossentropy": 2.090715576708317, + "loss/hidden": 3.3875, + "loss/jsd": 0.0, + "loss/logits": 0.18382459450513125, + "step": 8540 + }, + { + "epoch": 0.21375, + "grad_norm": 28.75, + "grad_norm_var": 2.986168000807314e+18, + "learning_rate": 0.0001, + "loss": 7.4917, + "loss/crossentropy": 2.177123633027077, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.197673611715436, + "step": 8550 + }, + { + "epoch": 0.214, + "grad_norm": 27.75, + "grad_norm_var": 4.035872395833334, + "learning_rate": 0.0001, + "loss": 7.3008, + "loss/crossentropy": 2.028589369356632, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.196718043461442, + "step": 8560 + }, + { + "epoch": 0.21425, + "grad_norm": 30.5, + "grad_norm_var": 4.758072916666666, + "learning_rate": 0.0001, + "loss": 7.3829, + "loss/crossentropy": 2.02908306196332, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.1757409404963255, + "step": 8570 + }, + { + "epoch": 0.2145, + "grad_norm": 30.875, + "grad_norm_var": 1.9754557291666666, + "learning_rate": 0.0001, + "loss": 7.4107, + "loss/crossentropy": 2.0411842301487924, + "loss/hidden": 3.580078125, + "loss/jsd": 0.0, + "loss/logits": 0.2309743857011199, + "step": 8580 + }, + { + "epoch": 0.21475, + "grad_norm": 30.0, + "grad_norm_var": 25.7244140625, + "learning_rate": 0.0001, + "loss": 7.4038, + "loss/crossentropy": 2.1026074662804604, + "loss/hidden": 3.490234375, + "loss/jsd": 0.0, + "loss/logits": 0.202506691403687, + "step": 8590 + }, + { + "epoch": 0.215, + "grad_norm": 28.625, + "grad_norm_var": 3.386458333333333, + "learning_rate": 0.0001, + "loss": 7.3559, + "loss/crossentropy": 2.1690615713596344, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.19095612335950135, + "step": 8600 + }, + { + "epoch": 0.21525, + "grad_norm": 29.0, + "grad_norm_var": 23.880989583333335, + "learning_rate": 0.0001, + "loss": 7.4164, + "loss/crossentropy": 2.099227898567915, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.19162636022083462, + "step": 8610 + }, + { + "epoch": 0.2155, + "grad_norm": 34.25, + "grad_norm_var": 23.880208333333332, + "learning_rate": 0.0001, + "loss": 7.3725, + "loss/crossentropy": 1.9689884655177594, + "loss/hidden": 3.52265625, + "loss/jsd": 0.0, + "loss/logits": 0.1830376190133393, + "step": 8620 + }, + { + "epoch": 0.21575, + "grad_norm": 29.875, + "grad_norm_var": 2.77890625, + "learning_rate": 0.0001, + "loss": 7.4511, + "loss/crossentropy": 2.0263702854514123, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.18029460608959197, + "step": 8630 + }, + { + "epoch": 0.216, + "grad_norm": 32.0, + "grad_norm_var": 23.001822916666665, + "learning_rate": 0.0001, + "loss": 7.3863, + "loss/crossentropy": 1.9046964697539805, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1935629203915596, + "step": 8640 + }, + { + "epoch": 0.21625, + "grad_norm": 33.5, + "grad_norm_var": 25.009375, + "learning_rate": 0.0001, + "loss": 7.3083, + "loss/crossentropy": 2.129426471889019, + "loss/hidden": 3.448046875, + "loss/jsd": 0.0, + "loss/logits": 0.20037918202579022, + "step": 8650 + }, + { + "epoch": 0.2165, + "grad_norm": 28.375, + "grad_norm_var": 12.242643229166667, + "learning_rate": 0.0001, + "loss": 7.4603, + "loss/crossentropy": 2.2266604125499727, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.18952292017638683, + "step": 8660 + }, + { + "epoch": 0.21675, + "grad_norm": 34.75, + "grad_norm_var": 19.517708333333335, + "learning_rate": 0.0001, + "loss": 7.4031, + "loss/crossentropy": 1.9756697475910188, + "loss/hidden": 3.529296875, + "loss/jsd": 0.0, + "loss/logits": 0.20053059812635182, + "step": 8670 + }, + { + "epoch": 0.217, + "grad_norm": 29.25, + "grad_norm_var": 8.315625, + "learning_rate": 0.0001, + "loss": 7.4013, + "loss/crossentropy": 2.13729098290205, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.19440573658794164, + "step": 8680 + }, + { + "epoch": 0.21725, + "grad_norm": 31.5, + "grad_norm_var": 1.9936848958333333, + "learning_rate": 0.0001, + "loss": 7.2982, + "loss/crossentropy": 2.0935462579131126, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.200434254668653, + "step": 8690 + }, + { + "epoch": 0.2175, + "grad_norm": 31.625, + "grad_norm_var": 26.66640625, + "learning_rate": 0.0001, + "loss": 7.4596, + "loss/crossentropy": 2.0081637501716614, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.196201959438622, + "step": 8700 + }, + { + "epoch": 0.21775, + "grad_norm": 28.125, + "grad_norm_var": 1.869627142435242e+18, + "learning_rate": 0.0001, + "loss": 7.4726, + "loss/crossentropy": 2.057238683104515, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.18527965154498816, + "step": 8710 + }, + { + "epoch": 0.218, + "grad_norm": 30.25, + "grad_norm_var": 4.806705729166667, + "learning_rate": 0.0001, + "loss": 7.424, + "loss/crossentropy": 2.225848586857319, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.20839223694056272, + "step": 8720 + }, + { + "epoch": 0.21825, + "grad_norm": 29.5, + "grad_norm_var": 3.2572265625, + "learning_rate": 0.0001, + "loss": 7.3713, + "loss/crossentropy": 1.9737806752324105, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.18166052605956792, + "step": 8730 + }, + { + "epoch": 0.2185, + "grad_norm": 31.25, + "grad_norm_var": 3.49765625, + "learning_rate": 0.0001, + "loss": 7.3192, + "loss/crossentropy": 2.1625877559185027, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.196309875510633, + "step": 8740 + }, + { + "epoch": 0.21875, + "grad_norm": 31.625, + "grad_norm_var": 15.733333333333333, + "learning_rate": 0.0001, + "loss": 7.3994, + "loss/crossentropy": 2.1306996777653695, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.17798179090023042, + "step": 8750 + }, + { + "epoch": 0.219, + "grad_norm": 31.25, + "grad_norm_var": 5.465625, + "learning_rate": 0.0001, + "loss": 7.4225, + "loss/crossentropy": 2.168031161278486, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.19842574130743743, + "step": 8760 + }, + { + "epoch": 0.21925, + "grad_norm": 29.5, + "grad_norm_var": 3.457747395833333, + "learning_rate": 0.0001, + "loss": 7.4515, + "loss/crossentropy": 2.081401216983795, + "loss/hidden": 3.3875, + "loss/jsd": 0.0, + "loss/logits": 0.1922367751598358, + "step": 8770 + }, + { + "epoch": 0.2195, + "grad_norm": 28.625, + "grad_norm_var": 2.9389973958333333, + "learning_rate": 0.0001, + "loss": 7.4641, + "loss/crossentropy": 1.9896476596593857, + "loss/hidden": 3.367578125, + "loss/jsd": 0.0, + "loss/logits": 0.18900877684354783, + "step": 8780 + }, + { + "epoch": 0.21975, + "grad_norm": 29.375, + "grad_norm_var": 3.5931640625, + "learning_rate": 0.0001, + "loss": 7.5034, + "loss/crossentropy": 2.131504286080599, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.1981994620524347, + "step": 8790 + }, + { + "epoch": 0.22, + "grad_norm": 29.75, + "grad_norm_var": 17.612955729166668, + "learning_rate": 0.0001, + "loss": 7.3602, + "loss/crossentropy": 2.367118790745735, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.20705808699131012, + "step": 8800 + }, + { + "epoch": 0.22025, + "grad_norm": 30.625, + "grad_norm_var": 3.588997395833333, + "learning_rate": 0.0001, + "loss": 7.3512, + "loss/crossentropy": 1.915165586769581, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.18030493911355733, + "step": 8810 + }, + { + "epoch": 0.2205, + "grad_norm": 31.625, + "grad_norm_var": 2.732291666666667, + "learning_rate": 0.0001, + "loss": 7.3832, + "loss/crossentropy": 2.1191729307174683, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.20246538575738668, + "step": 8820 + }, + { + "epoch": 0.22075, + "grad_norm": 32.0, + "grad_norm_var": 2.034309895833333, + "learning_rate": 0.0001, + "loss": 7.4184, + "loss/crossentropy": 2.280582541972399, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.1913912059739232, + "step": 8830 + }, + { + "epoch": 0.221, + "grad_norm": 28.75, + "grad_norm_var": 1.4629557291666666, + "learning_rate": 0.0001, + "loss": 7.5177, + "loss/crossentropy": 2.07154730707407, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.18417379464954137, + "step": 8840 + }, + { + "epoch": 0.22125, + "grad_norm": 29.25, + "grad_norm_var": 2.0306640625, + "learning_rate": 0.0001, + "loss": 7.4098, + "loss/crossentropy": 2.0918928742408753, + "loss/hidden": 3.478515625, + "loss/jsd": 0.0, + "loss/logits": 0.21339783817529678, + "step": 8850 + }, + { + "epoch": 0.2215, + "grad_norm": 34.75, + "grad_norm_var": 2.4098307291666665, + "learning_rate": 0.0001, + "loss": 7.3907, + "loss/crossentropy": 2.0262165658175944, + "loss/hidden": 3.4796875, + "loss/jsd": 0.0, + "loss/logits": 0.1963033676147461, + "step": 8860 + }, + { + "epoch": 0.22175, + "grad_norm": 29.625, + "grad_norm_var": 3.1041015625, + "learning_rate": 0.0001, + "loss": 7.3643, + "loss/crossentropy": 2.119324280321598, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.19243048634380103, + "step": 8870 + }, + { + "epoch": 0.222, + "grad_norm": 29.25, + "grad_norm_var": 3.3593098958333334, + "learning_rate": 0.0001, + "loss": 7.335, + "loss/crossentropy": 2.1042064100503923, + "loss/hidden": 3.43671875, + "loss/jsd": 0.0, + "loss/logits": 0.2055317424237728, + "step": 8880 + }, + { + "epoch": 0.22225, + "grad_norm": 29.0, + "grad_norm_var": 4.35390625, + "learning_rate": 0.0001, + "loss": 7.3974, + "loss/crossentropy": 2.110988216102123, + "loss/hidden": 3.344140625, + "loss/jsd": 0.0, + "loss/logits": 0.18536690715700388, + "step": 8890 + }, + { + "epoch": 0.2225, + "grad_norm": 30.0, + "grad_norm_var": 6.014322916666667, + "learning_rate": 0.0001, + "loss": 7.4155, + "loss/crossentropy": 2.033397987484932, + "loss/hidden": 3.367578125, + "loss/jsd": 0.0, + "loss/logits": 0.19111265633255242, + "step": 8900 + }, + { + "epoch": 0.22275, + "grad_norm": 32.25, + "grad_norm_var": 4.747916666666667, + "learning_rate": 0.0001, + "loss": 7.4631, + "loss/crossentropy": 2.090746468305588, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.20997797157615422, + "step": 8910 + }, + { + "epoch": 0.223, + "grad_norm": 32.75, + "grad_norm_var": 2.0247395833333335, + "learning_rate": 0.0001, + "loss": 7.3647, + "loss/crossentropy": 2.107650229334831, + "loss/hidden": 3.41171875, + "loss/jsd": 0.0, + "loss/logits": 0.18095682561397552, + "step": 8920 + }, + { + "epoch": 0.22325, + "grad_norm": 33.0, + "grad_norm_var": 4.983072916666667, + "learning_rate": 0.0001, + "loss": 7.4161, + "loss/crossentropy": 2.1033721581101417, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.1837721960619092, + "step": 8930 + }, + { + "epoch": 0.2235, + "grad_norm": 30.0, + "grad_norm_var": 5.160872395833334, + "learning_rate": 0.0001, + "loss": 7.381, + "loss/crossentropy": 2.2039036631584166, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.20040026511996983, + "step": 8940 + }, + { + "epoch": 0.22375, + "grad_norm": 32.75, + "grad_norm_var": 5.625455729166666, + "learning_rate": 0.0001, + "loss": 7.4581, + "loss/crossentropy": 2.0015091970562935, + "loss/hidden": 3.4046875, + "loss/jsd": 0.0, + "loss/logits": 0.1968998895958066, + "step": 8950 + }, + { + "epoch": 0.224, + "grad_norm": 30.125, + "grad_norm_var": 7.976497395833333, + "learning_rate": 0.0001, + "loss": 7.446, + "loss/crossentropy": 2.1770635031163694, + "loss/hidden": 3.290234375, + "loss/jsd": 0.0, + "loss/logits": 0.18164771795272827, + "step": 8960 + }, + { + "epoch": 0.22425, + "grad_norm": 26.875, + "grad_norm_var": 2.037239583333333, + "learning_rate": 0.0001, + "loss": 7.2677, + "loss/crossentropy": 1.9846746385097505, + "loss/hidden": 3.45859375, + "loss/jsd": 0.0, + "loss/logits": 0.19175144601613284, + "step": 8970 + }, + { + "epoch": 0.2245, + "grad_norm": 30.25, + "grad_norm_var": 2.8353515625, + "learning_rate": 0.0001, + "loss": 7.4645, + "loss/crossentropy": 2.169223573803902, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.18991702441126107, + "step": 8980 + }, + { + "epoch": 0.22475, + "grad_norm": 29.875, + "grad_norm_var": 10.3228515625, + "learning_rate": 0.0001, + "loss": 7.4607, + "loss/crossentropy": 2.1323930069804193, + "loss/hidden": 3.348046875, + "loss/jsd": 0.0, + "loss/logits": 0.18631141390651465, + "step": 8990 + }, + { + "epoch": 0.225, + "grad_norm": 29.625, + "grad_norm_var": 11.431705729166667, + "learning_rate": 0.0001, + "loss": 7.4203, + "loss/crossentropy": 2.1948125064373016, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.21075339019298553, + "step": 9000 + }, + { + "epoch": 0.22525, + "grad_norm": 31.0, + "grad_norm_var": 3.21875, + "learning_rate": 0.0001, + "loss": 7.3937, + "loss/crossentropy": 2.061431697010994, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.20084240343421697, + "step": 9010 + }, + { + "epoch": 0.2255, + "grad_norm": 30.0, + "grad_norm_var": 9.7125, + "learning_rate": 0.0001, + "loss": 7.5364, + "loss/crossentropy": 2.1422473564743996, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.20135180205106734, + "step": 9020 + }, + { + "epoch": 0.22575, + "grad_norm": 29.0, + "grad_norm_var": 10.320247395833333, + "learning_rate": 0.0001, + "loss": 7.3638, + "loss/crossentropy": 2.064805781841278, + "loss/hidden": 3.509375, + "loss/jsd": 0.0, + "loss/logits": 0.19638751186430453, + "step": 9030 + }, + { + "epoch": 0.226, + "grad_norm": 29.0, + "grad_norm_var": 7.7306640625, + "learning_rate": 0.0001, + "loss": 7.367, + "loss/crossentropy": 2.0203323513269424, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.1730576554313302, + "step": 9040 + }, + { + "epoch": 0.22625, + "grad_norm": 28.0, + "grad_norm_var": 7.5291015625, + "learning_rate": 0.0001, + "loss": 7.4083, + "loss/crossentropy": 2.1449467122554777, + "loss/hidden": 3.428125, + "loss/jsd": 0.0, + "loss/logits": 0.20745128113776445, + "step": 9050 + }, + { + "epoch": 0.2265, + "grad_norm": 29.625, + "grad_norm_var": 10.059830729166666, + "learning_rate": 0.0001, + "loss": 7.3739, + "loss/crossentropy": 2.277996188402176, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.1961175424978137, + "step": 9060 + }, + { + "epoch": 0.22675, + "grad_norm": 29.75, + "grad_norm_var": 2.9497395833333333, + "learning_rate": 0.0001, + "loss": 7.425, + "loss/crossentropy": 2.128904873877764, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.1912422338500619, + "step": 9070 + }, + { + "epoch": 0.227, + "grad_norm": 30.0, + "grad_norm_var": 2.4291666666666667, + "learning_rate": 0.0001, + "loss": 7.3724, + "loss/crossentropy": 1.8966167330741883, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.1772749178111553, + "step": 9080 + }, + { + "epoch": 0.22725, + "grad_norm": 30.625, + "grad_norm_var": 2.2955729166666665, + "learning_rate": 0.0001, + "loss": 7.4341, + "loss/crossentropy": 1.9993775576353072, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.17548400331288577, + "step": 9090 + }, + { + "epoch": 0.2275, + "grad_norm": 30.125, + "grad_norm_var": 2.3212890625, + "learning_rate": 0.0001, + "loss": 7.4512, + "loss/crossentropy": 2.1553252935409546, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.1904754728078842, + "step": 9100 + }, + { + "epoch": 0.22775, + "grad_norm": 31.875, + "grad_norm_var": 1.8264973958333333, + "learning_rate": 0.0001, + "loss": 7.4299, + "loss/crossentropy": 2.0791067980229854, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.18264174591749907, + "step": 9110 + }, + { + "epoch": 0.228, + "grad_norm": 30.625, + "grad_norm_var": 8.670572916666666, + "learning_rate": 0.0001, + "loss": 7.3621, + "loss/crossentropy": 2.2495081633329392, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.19295338317751884, + "step": 9120 + }, + { + "epoch": 0.22825, + "grad_norm": 34.25, + "grad_norm_var": 9.628580729166666, + "learning_rate": 0.0001, + "loss": 7.4069, + "loss/crossentropy": 2.0221784450113773, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.18131311442703008, + "step": 9130 + }, + { + "epoch": 0.2285, + "grad_norm": 31.375, + "grad_norm_var": 3.8372395833333335, + "learning_rate": 0.0001, + "loss": 7.3788, + "loss/crossentropy": 2.096472094208002, + "loss/hidden": 3.533984375, + "loss/jsd": 0.0, + "loss/logits": 0.19181215222924947, + "step": 9140 + }, + { + "epoch": 0.22875, + "grad_norm": 32.75, + "grad_norm_var": 2.9369140625, + "learning_rate": 0.0001, + "loss": 7.4625, + "loss/crossentropy": 2.161876367032528, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.19645992666482925, + "step": 9150 + }, + { + "epoch": 0.229, + "grad_norm": 36.5, + "grad_norm_var": 13.144791666666666, + "learning_rate": 0.0001, + "loss": 7.3883, + "loss/crossentropy": 1.870260328054428, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.19089705124497414, + "step": 9160 + }, + { + "epoch": 0.22925, + "grad_norm": 30.125, + "grad_norm_var": 11.746809895833334, + "learning_rate": 0.0001, + "loss": 7.3778, + "loss/crossentropy": 2.304895442724228, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.19963221047073604, + "step": 9170 + }, + { + "epoch": 0.2295, + "grad_norm": 30.875, + "grad_norm_var": 2.64839803006287e+18, + "learning_rate": 0.0001, + "loss": 7.4899, + "loss/crossentropy": 2.1439118653535845, + "loss/hidden": 3.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.20680125001817942, + "step": 9180 + }, + { + "epoch": 0.22975, + "grad_norm": 29.625, + "grad_norm_var": 16.839518229166668, + "learning_rate": 0.0001, + "loss": 7.3153, + "loss/crossentropy": 1.9939407154917717, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.19113040501251816, + "step": 9190 + }, + { + "epoch": 0.23, + "grad_norm": 30.125, + "grad_norm_var": 1.8309895833333334, + "learning_rate": 0.0001, + "loss": 7.5357, + "loss/crossentropy": 2.088300883769989, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.19538584928959607, + "step": 9200 + }, + { + "epoch": 0.23025, + "grad_norm": 30.75, + "grad_norm_var": 1.66640625, + "learning_rate": 0.0001, + "loss": 7.3255, + "loss/crossentropy": 2.0176690459251403, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.1823873495683074, + "step": 9210 + }, + { + "epoch": 0.2305, + "grad_norm": 32.75, + "grad_norm_var": 4.837955729166667, + "learning_rate": 0.0001, + "loss": 7.3334, + "loss/crossentropy": 2.061754436790943, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.18769590836018324, + "step": 9220 + }, + { + "epoch": 0.23075, + "grad_norm": 31.75, + "grad_norm_var": 4.683268229166667, + "learning_rate": 0.0001, + "loss": 7.3932, + "loss/crossentropy": 2.0443666532635687, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.19146692994982004, + "step": 9230 + }, + { + "epoch": 0.231, + "grad_norm": 33.25, + "grad_norm_var": 2.594073359575469e+18, + "learning_rate": 0.0001, + "loss": 7.5992, + "loss/crossentropy": 2.189889648556709, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.19348510541021824, + "step": 9240 + }, + { + "epoch": 0.23125, + "grad_norm": 29.125, + "grad_norm_var": 2.594073359702976e+18, + "learning_rate": 0.0001, + "loss": 7.4217, + "loss/crossentropy": 2.0103170931339265, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.19076041504740715, + "step": 9250 + }, + { + "epoch": 0.2315, + "grad_norm": 31.375, + "grad_norm_var": 13.5134765625, + "learning_rate": 0.0001, + "loss": 7.3004, + "loss/crossentropy": 2.0597189858555796, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.18948603458702565, + "step": 9260 + }, + { + "epoch": 0.23175, + "grad_norm": 31.625, + "grad_norm_var": 13.402083333333334, + "learning_rate": 0.0001, + "loss": 7.3946, + "loss/crossentropy": 2.0123729363083838, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.1885578565299511, + "step": 9270 + }, + { + "epoch": 0.232, + "grad_norm": 29.875, + "grad_norm_var": 5.879622395833334, + "learning_rate": 0.0001, + "loss": 7.1578, + "loss/crossentropy": 2.2501363843679427, + "loss/hidden": 3.286328125, + "loss/jsd": 0.0, + "loss/logits": 0.19268405642360448, + "step": 9280 + }, + { + "epoch": 0.23225, + "grad_norm": 30.25, + "grad_norm_var": 16.982291666666665, + "learning_rate": 0.0001, + "loss": 7.3481, + "loss/crossentropy": 2.076053886115551, + "loss/hidden": 3.571875, + "loss/jsd": 0.0, + "loss/logits": 0.21077420487999915, + "step": 9290 + }, + { + "epoch": 0.2325, + "grad_norm": 29.5, + "grad_norm_var": 21.769791666666666, + "learning_rate": 0.0001, + "loss": 7.2711, + "loss/crossentropy": 2.193411388993263, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.18807493168860673, + "step": 9300 + }, + { + "epoch": 0.23275, + "grad_norm": 34.25, + "grad_norm_var": 12.355143229166666, + "learning_rate": 0.0001, + "loss": 7.3819, + "loss/crossentropy": 2.166665832698345, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.18620893750339745, + "step": 9310 + }, + { + "epoch": 0.233, + "grad_norm": 29.875, + "grad_norm_var": 3.002083333333333, + "learning_rate": 0.0001, + "loss": 7.4076, + "loss/crossentropy": 2.022654353827238, + "loss/hidden": 3.2984375, + "loss/jsd": 0.0, + "loss/logits": 0.18289813362061977, + "step": 9320 + }, + { + "epoch": 0.23325, + "grad_norm": 30.0, + "grad_norm_var": 3.0509765625, + "learning_rate": 0.0001, + "loss": 7.3206, + "loss/crossentropy": 2.020027980953455, + "loss/hidden": 3.425390625, + "loss/jsd": 0.0, + "loss/logits": 0.18972874553874136, + "step": 9330 + }, + { + "epoch": 0.2335, + "grad_norm": 31.125, + "grad_norm_var": 2.098893229166667, + "learning_rate": 0.0001, + "loss": 7.4091, + "loss/crossentropy": 2.029365235567093, + "loss/hidden": 3.43125, + "loss/jsd": 0.0, + "loss/logits": 0.2005317559465766, + "step": 9340 + }, + { + "epoch": 0.23375, + "grad_norm": 29.5, + "grad_norm_var": 10.520768229166666, + "learning_rate": 0.0001, + "loss": 7.3704, + "loss/crossentropy": 1.95634398534894, + "loss/hidden": 3.433203125, + "loss/jsd": 0.0, + "loss/logits": 0.18962019477039577, + "step": 9350 + }, + { + "epoch": 0.234, + "grad_norm": 27.75, + "grad_norm_var": 13.475, + "learning_rate": 0.0001, + "loss": 7.3797, + "loss/crossentropy": 2.063827896118164, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.19280518041923642, + "step": 9360 + }, + { + "epoch": 0.23425, + "grad_norm": 30.5, + "grad_norm_var": 2.0869140625, + "learning_rate": 0.0001, + "loss": 7.4918, + "loss/crossentropy": 2.068470099568367, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.19361184667795897, + "step": 9370 + }, + { + "epoch": 0.2345, + "grad_norm": 28.0, + "grad_norm_var": 6.534830729166667, + "learning_rate": 0.0001, + "loss": 7.4265, + "loss/crossentropy": 2.125886672735214, + "loss/hidden": 3.401171875, + "loss/jsd": 0.0, + "loss/logits": 0.19355848152190447, + "step": 9380 + }, + { + "epoch": 0.23475, + "grad_norm": 30.375, + "grad_norm_var": 2.2770182291666665, + "learning_rate": 0.0001, + "loss": 7.3769, + "loss/crossentropy": 2.2057121500372885, + "loss/hidden": 3.28515625, + "loss/jsd": 0.0, + "loss/logits": 0.1843592157587409, + "step": 9390 + }, + { + "epoch": 0.235, + "grad_norm": 87.5, + "grad_norm_var": 202.43932291666667, + "learning_rate": 0.0001, + "loss": 7.4444, + "loss/crossentropy": 2.17584248483181, + "loss/hidden": 3.4515625, + "loss/jsd": 0.0, + "loss/logits": 0.1986805137246847, + "step": 9400 + }, + { + "epoch": 0.23525, + "grad_norm": 33.0, + "grad_norm_var": 205.446875, + "learning_rate": 0.0001, + "loss": 7.5575, + "loss/crossentropy": 1.9989879056811333, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.21514312122017146, + "step": 9410 + }, + { + "epoch": 0.2355, + "grad_norm": 33.25, + "grad_norm_var": 3.4775390625, + "learning_rate": 0.0001, + "loss": 7.3965, + "loss/crossentropy": 2.31248200237751, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.20192647576332093, + "step": 9420 + }, + { + "epoch": 0.23575, + "grad_norm": 34.0, + "grad_norm_var": 2.5723307291666666, + "learning_rate": 0.0001, + "loss": 7.425, + "loss/crossentropy": 2.0246524304151534, + "loss/hidden": 3.43125, + "loss/jsd": 0.0, + "loss/logits": 0.1936120893806219, + "step": 9430 + }, + { + "epoch": 0.236, + "grad_norm": 31.5, + "grad_norm_var": 3.8207682291666667, + "learning_rate": 0.0001, + "loss": 7.3458, + "loss/crossentropy": 2.0714851915836334, + "loss/hidden": 3.315625, + "loss/jsd": 0.0, + "loss/logits": 0.19567677434533834, + "step": 9440 + }, + { + "epoch": 0.23625, + "grad_norm": 32.25, + "grad_norm_var": 2.0452473958333335, + "learning_rate": 0.0001, + "loss": 7.3934, + "loss/crossentropy": 2.024892423301935, + "loss/hidden": 3.541796875, + "loss/jsd": 0.0, + "loss/logits": 0.19850265365093947, + "step": 9450 + }, + { + "epoch": 0.2365, + "grad_norm": 32.5, + "grad_norm_var": 3.16640625, + "learning_rate": 0.0001, + "loss": 7.4192, + "loss/crossentropy": 2.146814212203026, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.20048882197588683, + "step": 9460 + }, + { + "epoch": 0.23675, + "grad_norm": 27.875, + "grad_norm_var": 12.364322916666667, + "learning_rate": 0.0001, + "loss": 7.3745, + "loss/crossentropy": 2.0004019677639007, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.19019459020346402, + "step": 9470 + }, + { + "epoch": 0.237, + "grad_norm": 28.75, + "grad_norm_var": 12.885416666666666, + "learning_rate": 0.0001, + "loss": 7.2858, + "loss/crossentropy": 2.0364832431077957, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.18539349418133497, + "step": 9480 + }, + { + "epoch": 0.23725, + "grad_norm": 26.75, + "grad_norm_var": 4.488997395833334, + "learning_rate": 0.0001, + "loss": 7.2865, + "loss/crossentropy": 2.257227724790573, + "loss/hidden": 3.247265625, + "loss/jsd": 0.0, + "loss/logits": 0.1774066084995866, + "step": 9490 + }, + { + "epoch": 0.2375, + "grad_norm": 28.25, + "grad_norm_var": 5.15625, + "learning_rate": 0.0001, + "loss": 7.3742, + "loss/crossentropy": 2.192533364892006, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.18531391881406306, + "step": 9500 + }, + { + "epoch": 0.23775, + "grad_norm": 28.5, + "grad_norm_var": 8.7025390625, + "learning_rate": 0.0001, + "loss": 7.4007, + "loss/crossentropy": 1.9954842567443847, + "loss/hidden": 3.512890625, + "loss/jsd": 0.0, + "loss/logits": 0.21250668447464705, + "step": 9510 + }, + { + "epoch": 0.238, + "grad_norm": 27.0, + "grad_norm_var": 3.2018229166666665, + "learning_rate": 0.0001, + "loss": 7.3589, + "loss/crossentropy": 2.0501152604818342, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19774708338081837, + "step": 9520 + }, + { + "epoch": 0.23825, + "grad_norm": 31.125, + "grad_norm_var": 2.471875, + "learning_rate": 0.0001, + "loss": 7.3611, + "loss/crossentropy": 2.1446721121668815, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.19521092902868986, + "step": 9530 + }, + { + "epoch": 0.2385, + "grad_norm": 33.25, + "grad_norm_var": 2.911393229166667, + "learning_rate": 0.0001, + "loss": 7.4037, + "loss/crossentropy": 2.050874675065279, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.18872325737029313, + "step": 9540 + }, + { + "epoch": 0.23875, + "grad_norm": 32.25, + "grad_norm_var": 1.69375, + "learning_rate": 0.0001, + "loss": 7.3851, + "loss/crossentropy": 2.092629846930504, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1858744696713984, + "step": 9550 + }, + { + "epoch": 0.239, + "grad_norm": 30.25, + "grad_norm_var": 1.6229166666666666, + "learning_rate": 0.0001, + "loss": 7.324, + "loss/crossentropy": 2.1389416724443437, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.18115091007202863, + "step": 9560 + }, + { + "epoch": 0.23925, + "grad_norm": 28.375, + "grad_norm_var": 0.7893229166666667, + "learning_rate": 0.0001, + "loss": 7.3684, + "loss/crossentropy": 2.1148220866918566, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.18993774689733983, + "step": 9570 + }, + { + "epoch": 0.2395, + "grad_norm": 29.125, + "grad_norm_var": 1.0260416666666667, + "learning_rate": 0.0001, + "loss": 7.4711, + "loss/crossentropy": 1.987958113849163, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.19200538750737906, + "step": 9580 + }, + { + "epoch": 0.23975, + "grad_norm": 29.625, + "grad_norm_var": 0.6018229166666667, + "learning_rate": 0.0001, + "loss": 7.3578, + "loss/crossentropy": 2.076607885956764, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.20001101978123187, + "step": 9590 + }, + { + "epoch": 0.24, + "grad_norm": 30.75, + "grad_norm_var": 1.4124348958333333, + "learning_rate": 0.0001, + "loss": 7.4458, + "loss/crossentropy": 2.1496243715286254, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.1889566643163562, + "step": 9600 + }, + { + "epoch": 0.24025, + "grad_norm": 29.875, + "grad_norm_var": 1.04140625, + "learning_rate": 0.0001, + "loss": 7.4632, + "loss/crossentropy": 2.162689308822155, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.19204493910074233, + "step": 9610 + }, + { + "epoch": 0.2405, + "grad_norm": 27.375, + "grad_norm_var": 1.5863932291666667, + "learning_rate": 0.0001, + "loss": 7.3897, + "loss/crossentropy": 2.1323146484792233, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.20718522872775794, + "step": 9620 + }, + { + "epoch": 0.24075, + "grad_norm": 32.25, + "grad_norm_var": 2.09375, + "learning_rate": 0.0001, + "loss": 7.2848, + "loss/crossentropy": 2.0804166465997698, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.1966247998178005, + "step": 9630 + }, + { + "epoch": 0.241, + "grad_norm": 31.25, + "grad_norm_var": 1.3559895833333333, + "learning_rate": 0.0001, + "loss": 7.4174, + "loss/crossentropy": 2.3036757931113243, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.18416995517909526, + "step": 9640 + }, + { + "epoch": 0.24125, + "grad_norm": 33.0, + "grad_norm_var": 2.41015625, + "learning_rate": 0.0001, + "loss": 7.2853, + "loss/crossentropy": 1.9936259984970093, + "loss/hidden": 3.428125, + "loss/jsd": 0.0, + "loss/logits": 0.21977887880057095, + "step": 9650 + }, + { + "epoch": 0.2415, + "grad_norm": 32.5, + "grad_norm_var": 6.1228515625, + "learning_rate": 0.0001, + "loss": 7.4938, + "loss/crossentropy": 2.2000851720571517, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.20301534831523896, + "step": 9660 + }, + { + "epoch": 0.24175, + "grad_norm": 31.0, + "grad_norm_var": 3.59765625, + "learning_rate": 0.0001, + "loss": 7.5133, + "loss/crossentropy": 2.192278115451336, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.19489070847630502, + "step": 9670 + }, + { + "epoch": 0.242, + "grad_norm": 28.875, + "grad_norm_var": 6.8978515625, + "learning_rate": 0.0001, + "loss": 7.4247, + "loss/crossentropy": 2.194097451120615, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.20746590523049235, + "step": 9680 + }, + { + "epoch": 0.24225, + "grad_norm": 29.75, + "grad_norm_var": 4.0587890625, + "learning_rate": 0.0001, + "loss": 7.3413, + "loss/crossentropy": 2.1902857303619383, + "loss/hidden": 3.187109375, + "loss/jsd": 0.0, + "loss/logits": 0.17439354099333287, + "step": 9690 + }, + { + "epoch": 0.2425, + "grad_norm": 41.5, + "grad_norm_var": 3.952541960104418e+18, + "learning_rate": 0.0001, + "loss": 7.5429, + "loss/crossentropy": 2.046304853260517, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.18022917695343493, + "step": 9700 + }, + { + "epoch": 0.24275, + "grad_norm": 30.375, + "grad_norm_var": 3.952541960261809e+18, + "learning_rate": 0.0001, + "loss": 7.2729, + "loss/crossentropy": 2.082195009291172, + "loss/hidden": 3.5140625, + "loss/jsd": 0.0, + "loss/logits": 0.1983122780919075, + "step": 9710 + }, + { + "epoch": 0.243, + "grad_norm": 28.375, + "grad_norm_var": 2.6056640625, + "learning_rate": 0.0001, + "loss": 7.2271, + "loss/crossentropy": 1.970278625190258, + "loss/hidden": 3.474609375, + "loss/jsd": 0.0, + "loss/logits": 0.19305091574788094, + "step": 9720 + }, + { + "epoch": 0.24325, + "grad_norm": 30.375, + "grad_norm_var": 2.0576524262052987e+18, + "learning_rate": 0.0001, + "loss": 7.3979, + "loss/crossentropy": 2.1140229746699335, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.19698369763791562, + "step": 9730 + }, + { + "epoch": 0.2435, + "grad_norm": 29.0, + "grad_norm_var": 2.057652425978177e+18, + "learning_rate": 0.0001, + "loss": 7.2491, + "loss/crossentropy": 1.9204902969300748, + "loss/hidden": 3.40703125, + "loss/jsd": 0.0, + "loss/logits": 0.18290270324796437, + "step": 9740 + }, + { + "epoch": 0.24375, + "grad_norm": 49.25, + "grad_norm_var": 27.333072916666666, + "learning_rate": 0.0001, + "loss": 7.4168, + "loss/crossentropy": 2.203968660533428, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.19210707377642394, + "step": 9750 + }, + { + "epoch": 0.244, + "grad_norm": 27.75, + "grad_norm_var": 28.5837890625, + "learning_rate": 0.0001, + "loss": 7.3667, + "loss/crossentropy": 2.0645239472389223, + "loss/hidden": 3.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1849998442456126, + "step": 9760 + }, + { + "epoch": 0.24425, + "grad_norm": 30.0, + "grad_norm_var": 7.441666666666666, + "learning_rate": 0.0001, + "loss": 7.3958, + "loss/crossentropy": 2.0423281893134115, + "loss/hidden": 3.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.19458430632948875, + "step": 9770 + }, + { + "epoch": 0.2445, + "grad_norm": 28.25, + "grad_norm_var": 25.725, + "learning_rate": 0.0001, + "loss": 7.371, + "loss/crossentropy": 2.2162967801094053, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.19894264116883278, + "step": 9780 + }, + { + "epoch": 0.24475, + "grad_norm": 31.875, + "grad_norm_var": 25.701822916666668, + "learning_rate": 0.0001, + "loss": 7.4136, + "loss/crossentropy": 2.052091246843338, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.19768061954528093, + "step": 9790 + }, + { + "epoch": 0.245, + "grad_norm": 29.375, + "grad_norm_var": 2.4247395833333334, + "learning_rate": 0.0001, + "loss": 7.3752, + "loss/crossentropy": 2.0378331199288366, + "loss/hidden": 3.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.1888222724199295, + "step": 9800 + }, + { + "epoch": 0.24525, + "grad_norm": 29.75, + "grad_norm_var": 17.073958333333334, + "learning_rate": 0.0001, + "loss": 7.389, + "loss/crossentropy": 2.0574826121330263, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.1817958688363433, + "step": 9810 + }, + { + "epoch": 0.2455, + "grad_norm": 28.125, + "grad_norm_var": 24.198372395833335, + "learning_rate": 0.0001, + "loss": 7.284, + "loss/crossentropy": 2.068347904086113, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.1905125178396702, + "step": 9820 + }, + { + "epoch": 0.24575, + "grad_norm": 28.5, + "grad_norm_var": 9.8041015625, + "learning_rate": 0.0001, + "loss": 7.3867, + "loss/crossentropy": 2.0113710410892964, + "loss/hidden": 3.318359375, + "loss/jsd": 0.0, + "loss/logits": 0.1825895557180047, + "step": 9830 + }, + { + "epoch": 0.246, + "grad_norm": 30.5, + "grad_norm_var": 1.16015625, + "learning_rate": 0.0001, + "loss": 7.3496, + "loss/crossentropy": 2.0473890252411366, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.18930337531492114, + "step": 9840 + }, + { + "epoch": 0.24625, + "grad_norm": 29.875, + "grad_norm_var": 1.75, + "learning_rate": 0.0001, + "loss": 7.4305, + "loss/crossentropy": 2.167546259611845, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.18493952229619026, + "step": 9850 + }, + { + "epoch": 0.2465, + "grad_norm": 28.375, + "grad_norm_var": 2.7625, + "learning_rate": 0.0001, + "loss": 7.444, + "loss/crossentropy": 2.1046394810080526, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.18062518630176783, + "step": 9860 + }, + { + "epoch": 0.24675, + "grad_norm": 28.75, + "grad_norm_var": 7819.056705729166, + "learning_rate": 0.0001, + "loss": 7.4016, + "loss/crossentropy": 2.0700221791863442, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.190355353243649, + "step": 9870 + }, + { + "epoch": 0.247, + "grad_norm": 29.0, + "grad_norm_var": 7843.72265625, + "learning_rate": 0.0001, + "loss": 7.2875, + "loss/crossentropy": 2.057565826922655, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.18850490506738424, + "step": 9880 + }, + { + "epoch": 0.24725, + "grad_norm": 32.5, + "grad_norm_var": 3.778059895833333, + "learning_rate": 0.0001, + "loss": 7.4329, + "loss/crossentropy": 2.0141181223094464, + "loss/hidden": 3.446875, + "loss/jsd": 0.0, + "loss/logits": 0.19769613686949014, + "step": 9890 + }, + { + "epoch": 0.2475, + "grad_norm": 30.375, + "grad_norm_var": 6.312239583333334, + "learning_rate": 0.0001, + "loss": 7.336, + "loss/crossentropy": 2.146551664918661, + "loss/hidden": 3.34296875, + "loss/jsd": 0.0, + "loss/logits": 0.18510441221296786, + "step": 9900 + }, + { + "epoch": 0.24775, + "grad_norm": 33.0, + "grad_norm_var": 6.36640625, + "learning_rate": 0.0001, + "loss": 7.306, + "loss/crossentropy": 2.2865438759326935, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.19417236726731063, + "step": 9910 + }, + { + "epoch": 0.248, + "grad_norm": 33.5, + "grad_norm_var": 2.405989583333333, + "learning_rate": 0.0001, + "loss": 7.3855, + "loss/crossentropy": 2.133404280245304, + "loss/hidden": 3.37578125, + "loss/jsd": 0.0, + "loss/logits": 0.19065556656569244, + "step": 9920 + }, + { + "epoch": 0.24825, + "grad_norm": 34.25, + "grad_norm_var": 3.1626528109486234e+18, + "learning_rate": 0.0001, + "loss": 7.4378, + "loss/crossentropy": 2.0854051023721696, + "loss/hidden": 3.459765625, + "loss/jsd": 0.0, + "loss/logits": 0.20351361632347106, + "step": 9930 + }, + { + "epoch": 0.2485, + "grad_norm": 32.25, + "grad_norm_var": 3.1626528099408717e+18, + "learning_rate": 0.0001, + "loss": 7.3826, + "loss/crossentropy": 2.087091060727835, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.19672764679417015, + "step": 9940 + }, + { + "epoch": 0.24875, + "grad_norm": 31.875, + "grad_norm_var": 3.729622395833333, + "learning_rate": 0.0001, + "loss": 7.4303, + "loss/crossentropy": 1.9529958970844745, + "loss/hidden": 3.461328125, + "loss/jsd": 0.0, + "loss/logits": 0.1957404987886548, + "step": 9950 + }, + { + "epoch": 0.249, + "grad_norm": 27.0, + "grad_norm_var": 3.515625, + "learning_rate": 0.0001, + "loss": 7.2963, + "loss/crossentropy": 2.1291355013847353, + "loss/hidden": 3.278125, + "loss/jsd": 0.0, + "loss/logits": 0.1832482174038887, + "step": 9960 + }, + { + "epoch": 0.24925, + "grad_norm": 31.5, + "grad_norm_var": 2.403580729166667, + "learning_rate": 0.0001, + "loss": 7.3011, + "loss/crossentropy": 2.073649263381958, + "loss/hidden": 3.2859375, + "loss/jsd": 0.0, + "loss/logits": 0.17194083742797375, + "step": 9970 + }, + { + "epoch": 0.2495, + "grad_norm": 30.5, + "grad_norm_var": 3.7934895833333333, + "learning_rate": 0.0001, + "loss": 7.3253, + "loss/crossentropy": 2.312130460143089, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.19424791410565376, + "step": 9980 + }, + { + "epoch": 0.24975, + "grad_norm": 31.875, + "grad_norm_var": 6.4666015625, + "learning_rate": 0.0001, + "loss": 7.3395, + "loss/crossentropy": 1.9769040577113628, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.1865438589360565, + "step": 9990 + }, + { + "epoch": 0.25, + "grad_norm": 37.0, + "grad_norm_var": 5.488997395833334, + "learning_rate": 0.0001, + "loss": 7.3121, + "loss/crossentropy": 2.0656296610832214, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.19510807991027831, + "step": 10000 + }, + { + "epoch": 0.25025, + "grad_norm": 30.5, + "grad_norm_var": 6.201822916666667, + "learning_rate": 0.0001, + "loss": 7.3059, + "loss/crossentropy": 2.1552528128027917, + "loss/hidden": 3.338671875, + "loss/jsd": 0.0, + "loss/logits": 0.1871857862919569, + "step": 10010 + }, + { + "epoch": 0.2505, + "grad_norm": 29.375, + "grad_norm_var": 3.7697265625, + "learning_rate": 0.0001, + "loss": 7.402, + "loss/crossentropy": 2.1086191907525063, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.20956437531858682, + "step": 10020 + }, + { + "epoch": 0.25075, + "grad_norm": 27.0, + "grad_norm_var": 79.871875, + "learning_rate": 0.0001, + "loss": 7.2906, + "loss/crossentropy": 2.24168366715312, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.1914551755413413, + "step": 10030 + }, + { + "epoch": 0.251, + "grad_norm": 28.25, + "grad_norm_var": 3.5348307291666665, + "learning_rate": 0.0001, + "loss": 7.3933, + "loss/crossentropy": 1.9110309466719628, + "loss/hidden": 3.52421875, + "loss/jsd": 0.0, + "loss/logits": 0.1938932467252016, + "step": 10040 + }, + { + "epoch": 0.25125, + "grad_norm": 29.5, + "grad_norm_var": 3.318489583333333, + "learning_rate": 0.0001, + "loss": 7.3066, + "loss/crossentropy": 2.2151729106903075, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.1898823155090213, + "step": 10050 + }, + { + "epoch": 0.2515, + "grad_norm": 27.125, + "grad_norm_var": 5.114583333333333, + "learning_rate": 0.0001, + "loss": 7.3306, + "loss/crossentropy": 1.9325201705098152, + "loss/hidden": 3.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.1914183372631669, + "step": 10060 + }, + { + "epoch": 0.25175, + "grad_norm": 28.375, + "grad_norm_var": 5.568489583333333, + "learning_rate": 0.0001, + "loss": 7.4302, + "loss/crossentropy": 2.053273378312588, + "loss/hidden": 3.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.193053549900651, + "step": 10070 + }, + { + "epoch": 0.252, + "grad_norm": 32.25, + "grad_norm_var": 1.5947265625, + "learning_rate": 0.0001, + "loss": 7.3679, + "loss/crossentropy": 2.1425619572401047, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.19612068887799977, + "step": 10080 + }, + { + "epoch": 0.25225, + "grad_norm": 28.125, + "grad_norm_var": 1.2666666666666666, + "learning_rate": 0.0001, + "loss": 7.4696, + "loss/crossentropy": 2.1615766674280166, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.21496526505798103, + "step": 10090 + }, + { + "epoch": 0.2525, + "grad_norm": 31.375, + "grad_norm_var": 87.72389322916666, + "learning_rate": 0.0001, + "loss": 7.3933, + "loss/crossentropy": 2.093507520854473, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.18447848707437514, + "step": 10100 + }, + { + "epoch": 0.25275, + "grad_norm": 29.125, + "grad_norm_var": 87.9962890625, + "learning_rate": 0.0001, + "loss": 7.42, + "loss/crossentropy": 2.2579648420214653, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.19929521884769202, + "step": 10110 + }, + { + "epoch": 0.253, + "grad_norm": 31.0, + "grad_norm_var": 3.0921223958333335, + "learning_rate": 0.0001, + "loss": 7.4462, + "loss/crossentropy": 2.187203352898359, + "loss/hidden": 3.360546875, + "loss/jsd": 0.0, + "loss/logits": 0.18776683378964662, + "step": 10120 + }, + { + "epoch": 0.25325, + "grad_norm": 35.0, + "grad_norm_var": 37.43333333333333, + "learning_rate": 0.0001, + "loss": 7.493, + "loss/crossentropy": 2.156028524041176, + "loss/hidden": 3.41171875, + "loss/jsd": 0.0, + "loss/logits": 0.20926487501710653, + "step": 10130 + }, + { + "epoch": 0.2535, + "grad_norm": 30.625, + "grad_norm_var": 36.87180989583333, + "learning_rate": 0.0001, + "loss": 7.4227, + "loss/crossentropy": 1.9757653154432773, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.1927978384308517, + "step": 10140 + }, + { + "epoch": 0.25375, + "grad_norm": 30.75, + "grad_norm_var": 2.1582682291666666, + "learning_rate": 0.0001, + "loss": 7.3403, + "loss/crossentropy": 2.1322492100298405, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.1974259439855814, + "step": 10150 + }, + { + "epoch": 0.254, + "grad_norm": 28.125, + "grad_norm_var": 2.9259765625, + "learning_rate": 0.0001, + "loss": 7.3083, + "loss/crossentropy": 2.0464363016188143, + "loss/hidden": 3.27734375, + "loss/jsd": 0.0, + "loss/logits": 0.17546537732705475, + "step": 10160 + }, + { + "epoch": 0.25425, + "grad_norm": 28.625, + "grad_norm_var": 3.7514973958333333, + "learning_rate": 0.0001, + "loss": 7.3988, + "loss/crossentropy": 2.059445019811392, + "loss/hidden": 3.396484375, + "loss/jsd": 0.0, + "loss/logits": 0.19312301548197866, + "step": 10170 + }, + { + "epoch": 0.2545, + "grad_norm": 29.5, + "grad_norm_var": 3.36015625, + "learning_rate": 0.0001, + "loss": 7.3178, + "loss/crossentropy": 2.0300123430788517, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.18283235086128116, + "step": 10180 + }, + { + "epoch": 0.25475, + "grad_norm": 31.625, + "grad_norm_var": 2.7192057291666667, + "learning_rate": 0.0001, + "loss": 7.4117, + "loss/crossentropy": 2.052568303793669, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.17590052969753742, + "step": 10190 + }, + { + "epoch": 0.255, + "grad_norm": 30.875, + "grad_norm_var": 3.0660807291666665, + "learning_rate": 0.0001, + "loss": 7.4384, + "loss/crossentropy": 2.1124994076788424, + "loss/hidden": 3.4421875, + "loss/jsd": 0.0, + "loss/logits": 0.20564852859824895, + "step": 10200 + }, + { + "epoch": 0.25525, + "grad_norm": 32.25, + "grad_norm_var": 1.8223307291666666, + "learning_rate": 0.0001, + "loss": 7.4006, + "loss/crossentropy": 1.997454211115837, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.189224443025887, + "step": 10210 + }, + { + "epoch": 0.2555, + "grad_norm": 30.75, + "grad_norm_var": 12.555989583333334, + "learning_rate": 0.0001, + "loss": 7.5754, + "loss/crossentropy": 2.095755438506603, + "loss/hidden": 3.470703125, + "loss/jsd": 0.0, + "loss/logits": 0.1905398152768612, + "step": 10220 + }, + { + "epoch": 0.25575, + "grad_norm": 30.25, + "grad_norm_var": 2.0666015625, + "learning_rate": 0.0001, + "loss": 7.4499, + "loss/crossentropy": 2.031408229470253, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.1826389298774302, + "step": 10230 + }, + { + "epoch": 0.256, + "grad_norm": 30.875, + "grad_norm_var": 1.1973307291666666, + "learning_rate": 0.0001, + "loss": 7.4736, + "loss/crossentropy": 2.103855460882187, + "loss/hidden": 3.479296875, + "loss/jsd": 0.0, + "loss/logits": 0.21460786666721104, + "step": 10240 + }, + { + "epoch": 0.25625, + "grad_norm": 31.125, + "grad_norm_var": 2.6556640625, + "learning_rate": 0.0001, + "loss": 7.5078, + "loss/crossentropy": 2.0419384971261025, + "loss/hidden": 3.44609375, + "loss/jsd": 0.0, + "loss/logits": 0.1855954358354211, + "step": 10250 + }, + { + "epoch": 0.2565, + "grad_norm": 31.0, + "grad_norm_var": 3.025, + "learning_rate": 0.0001, + "loss": 7.3116, + "loss/crossentropy": 2.2164057731628417, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.18515868298709393, + "step": 10260 + }, + { + "epoch": 0.25675, + "grad_norm": 30.75, + "grad_norm_var": 2.653059895833333, + "learning_rate": 0.0001, + "loss": 7.2962, + "loss/crossentropy": 2.0967153370380402, + "loss/hidden": 3.5140625, + "loss/jsd": 0.0, + "loss/logits": 0.2144587781280279, + "step": 10270 + }, + { + "epoch": 0.257, + "grad_norm": 27.625, + "grad_norm_var": 2874.977018229167, + "learning_rate": 0.0001, + "loss": 7.3634, + "loss/crossentropy": 2.197367396950722, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.1903674839064479, + "step": 10280 + }, + { + "epoch": 0.25725, + "grad_norm": 30.25, + "grad_norm_var": 2830.1666015625, + "learning_rate": 0.0001, + "loss": 7.4461, + "loss/crossentropy": 1.9168385773897172, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.18945675920695065, + "step": 10290 + }, + { + "epoch": 0.2575, + "grad_norm": 30.625, + "grad_norm_var": 10.3634765625, + "learning_rate": 0.0001, + "loss": 7.4383, + "loss/crossentropy": 2.196033439040184, + "loss/hidden": 3.389453125, + "loss/jsd": 0.0, + "loss/logits": 0.20106703452765942, + "step": 10300 + }, + { + "epoch": 0.25775, + "grad_norm": 29.75, + "grad_norm_var": 2.41015625, + "learning_rate": 0.0001, + "loss": 7.4, + "loss/crossentropy": 1.9811425000429153, + "loss/hidden": 3.2875, + "loss/jsd": 0.0, + "loss/logits": 0.18086131382733583, + "step": 10310 + }, + { + "epoch": 0.258, + "grad_norm": 28.625, + "grad_norm_var": 1.3056640625, + "learning_rate": 0.0001, + "loss": 7.3814, + "loss/crossentropy": 2.141839873790741, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.19040859565138818, + "step": 10320 + }, + { + "epoch": 0.25825, + "grad_norm": 29.75, + "grad_norm_var": 0.9582682291666667, + "learning_rate": 0.0001, + "loss": 7.3988, + "loss/crossentropy": 2.0412203505635262, + "loss/hidden": 3.471484375, + "loss/jsd": 0.0, + "loss/logits": 0.19841136487666516, + "step": 10330 + }, + { + "epoch": 0.2585, + "grad_norm": 31.0, + "grad_norm_var": 1.94765625, + "learning_rate": 0.0001, + "loss": 7.464, + "loss/crossentropy": 2.124895977973938, + "loss/hidden": 3.42890625, + "loss/jsd": 0.0, + "loss/logits": 0.19423805810511113, + "step": 10340 + }, + { + "epoch": 0.25875, + "grad_norm": 27.125, + "grad_norm_var": 20.603125, + "learning_rate": 0.0001, + "loss": 7.3612, + "loss/crossentropy": 2.271273523569107, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.19086832217872143, + "step": 10350 + }, + { + "epoch": 0.259, + "grad_norm": 31.75, + "grad_norm_var": 41.87265625, + "learning_rate": 0.0001, + "loss": 7.3645, + "loss/crossentropy": 2.148430307209492, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.18722805399447678, + "step": 10360 + }, + { + "epoch": 0.25925, + "grad_norm": 31.125, + "grad_norm_var": 28.49140625, + "learning_rate": 0.0001, + "loss": 7.2841, + "loss/crossentropy": 2.088160905241966, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.19452540278434755, + "step": 10370 + }, + { + "epoch": 0.2595, + "grad_norm": 30.375, + "grad_norm_var": 3.8275390625, + "learning_rate": 0.0001, + "loss": 7.3269, + "loss/crossentropy": 2.037956405431032, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.19043162725865842, + "step": 10380 + }, + { + "epoch": 0.25975, + "grad_norm": 28.25, + "grad_norm_var": 8.718489583333334, + "learning_rate": 0.0001, + "loss": 7.2968, + "loss/crossentropy": 1.9823431193828582, + "loss/hidden": 3.36171875, + "loss/jsd": 0.0, + "loss/logits": 0.19301451742649078, + "step": 10390 + }, + { + "epoch": 0.26, + "grad_norm": 31.625, + "grad_norm_var": 8.5625, + "learning_rate": 0.0001, + "loss": 7.3314, + "loss/crossentropy": 2.0179347068071367, + "loss/hidden": 3.393359375, + "loss/jsd": 0.0, + "loss/logits": 0.18727120459079744, + "step": 10400 + }, + { + "epoch": 0.26025, + "grad_norm": 29.375, + "grad_norm_var": 7.3625, + "learning_rate": 0.0001, + "loss": 7.3878, + "loss/crossentropy": 2.0856245614588262, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.18157497737556696, + "step": 10410 + }, + { + "epoch": 0.2605, + "grad_norm": 33.0, + "grad_norm_var": 3.0134765625, + "learning_rate": 0.0001, + "loss": 7.3355, + "loss/crossentropy": 2.0798820555210114, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.1937016123905778, + "step": 10420 + }, + { + "epoch": 0.26075, + "grad_norm": 30.625, + "grad_norm_var": 2.5400390625, + "learning_rate": 0.0001, + "loss": 7.4527, + "loss/crossentropy": 2.0538913279771807, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.19370690621435643, + "step": 10430 + }, + { + "epoch": 0.261, + "grad_norm": 30.125, + "grad_norm_var": 3.42890625, + "learning_rate": 0.0001, + "loss": 7.46, + "loss/crossentropy": 2.181585241854191, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.1953628245741129, + "step": 10440 + }, + { + "epoch": 0.26125, + "grad_norm": 27.75, + "grad_norm_var": 3.783072916666667, + "learning_rate": 0.0001, + "loss": 7.3934, + "loss/crossentropy": 2.183496044576168, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.20456707030534743, + "step": 10450 + }, + { + "epoch": 0.2615, + "grad_norm": 30.125, + "grad_norm_var": 1.4379557291666667, + "learning_rate": 0.0001, + "loss": 7.3211, + "loss/crossentropy": 2.132909268140793, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.18731370605528355, + "step": 10460 + }, + { + "epoch": 0.26175, + "grad_norm": 31.25, + "grad_norm_var": 2.5791666666666666, + "learning_rate": 0.0001, + "loss": 7.3587, + "loss/crossentropy": 2.0115842171013356, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.18043918311595916, + "step": 10470 + }, + { + "epoch": 0.262, + "grad_norm": 30.25, + "grad_norm_var": 3.8268229166666665, + "learning_rate": 0.0001, + "loss": 7.2962, + "loss/crossentropy": 2.1343379452824593, + "loss/hidden": 3.305859375, + "loss/jsd": 0.0, + "loss/logits": 0.18047874867916108, + "step": 10480 + }, + { + "epoch": 0.26225, + "grad_norm": 36.0, + "grad_norm_var": 8.967708333333333, + "learning_rate": 0.0001, + "loss": 7.2903, + "loss/crossentropy": 1.8975451827049254, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.17858757209032775, + "step": 10490 + }, + { + "epoch": 0.2625, + "grad_norm": 31.75, + "grad_norm_var": 10.077083333333333, + "learning_rate": 0.0001, + "loss": 7.3074, + "loss/crossentropy": 2.0936397939920424, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.19368096999824047, + "step": 10500 + }, + { + "epoch": 0.26275, + "grad_norm": 29.375, + "grad_norm_var": 2.020768229166667, + "learning_rate": 0.0001, + "loss": 7.3675, + "loss/crossentropy": 2.06072843298316, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.17674694433808327, + "step": 10510 + }, + { + "epoch": 0.263, + "grad_norm": 29.125, + "grad_norm_var": 3.4686848958333334, + "learning_rate": 0.0001, + "loss": 7.3847, + "loss/crossentropy": 2.057940775156021, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.18738706540316344, + "step": 10520 + }, + { + "epoch": 0.26325, + "grad_norm": 31.625, + "grad_norm_var": 3.225, + "learning_rate": 0.0001, + "loss": 7.3449, + "loss/crossentropy": 2.051366009563208, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.18555977791547776, + "step": 10530 + }, + { + "epoch": 0.2635, + "grad_norm": 29.75, + "grad_norm_var": 32.99108072916667, + "learning_rate": 0.0001, + "loss": 7.3795, + "loss/crossentropy": 2.061921717226505, + "loss/hidden": 3.427734375, + "loss/jsd": 0.0, + "loss/logits": 0.1927286960184574, + "step": 10540 + }, + { + "epoch": 0.26375, + "grad_norm": 31.875, + "grad_norm_var": 34.16451822916667, + "learning_rate": 0.0001, + "loss": 7.3305, + "loss/crossentropy": 2.051614002883434, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.20516632981598376, + "step": 10550 + }, + { + "epoch": 0.264, + "grad_norm": 30.25, + "grad_norm_var": 2.2802083333333334, + "learning_rate": 0.0001, + "loss": 7.3251, + "loss/crossentropy": 2.208609312772751, + "loss/hidden": 3.302734375, + "loss/jsd": 0.0, + "loss/logits": 0.1906878400593996, + "step": 10560 + }, + { + "epoch": 0.26425, + "grad_norm": 28.0, + "grad_norm_var": 2.8785807291666665, + "learning_rate": 0.0001, + "loss": 7.4584, + "loss/crossentropy": 2.2136432066559792, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.1983118023723364, + "step": 10570 + }, + { + "epoch": 0.2645, + "grad_norm": 30.5, + "grad_norm_var": 3.562239583333333, + "learning_rate": 0.0001, + "loss": 7.3017, + "loss/crossentropy": 2.2243916779756545, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.19225324541330338, + "step": 10580 + }, + { + "epoch": 0.26475, + "grad_norm": 31.75, + "grad_norm_var": 2.47265625, + "learning_rate": 0.0001, + "loss": 7.2623, + "loss/crossentropy": 2.085415804386139, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.1769318761304021, + "step": 10590 + }, + { + "epoch": 0.265, + "grad_norm": 30.5, + "grad_norm_var": 1.8895182291666666, + "learning_rate": 0.0001, + "loss": 7.4543, + "loss/crossentropy": 2.0929081469774244, + "loss/hidden": 3.431640625, + "loss/jsd": 0.0, + "loss/logits": 0.21267817355692387, + "step": 10600 + }, + { + "epoch": 0.26525, + "grad_norm": 28.75, + "grad_norm_var": 2.505989583333333, + "learning_rate": 0.0001, + "loss": 7.2136, + "loss/crossentropy": 1.885066507011652, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.17078990247100592, + "step": 10610 + }, + { + "epoch": 0.2655, + "grad_norm": 27.25, + "grad_norm_var": 2.18515625, + "learning_rate": 0.0001, + "loss": 7.258, + "loss/crossentropy": 1.9193078093230724, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.18096741195768118, + "step": 10620 + }, + { + "epoch": 0.26575, + "grad_norm": 29.5, + "grad_norm_var": 2.499739583333333, + "learning_rate": 0.0001, + "loss": 7.256, + "loss/crossentropy": 1.9643688909709454, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.16975617725402117, + "step": 10630 + }, + { + "epoch": 0.266, + "grad_norm": 41.25, + "grad_norm_var": 2.3308942592018744e+18, + "learning_rate": 0.0001, + "loss": 7.3436, + "loss/crossentropy": 1.8120718583464623, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.17283489797264337, + "step": 10640 + }, + { + "epoch": 0.26625, + "grad_norm": 30.5, + "grad_norm_var": 2.3308942571726003e+18, + "learning_rate": 0.0001, + "loss": 7.398, + "loss/crossentropy": 2.2462258487939835, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.20116339288651944, + "step": 10650 + }, + { + "epoch": 0.2665, + "grad_norm": 32.0, + "grad_norm_var": 21.258268229166667, + "learning_rate": 0.0001, + "loss": 7.2994, + "loss/crossentropy": 2.044428373128176, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.1989501324482262, + "step": 10660 + }, + { + "epoch": 0.26675, + "grad_norm": 29.875, + "grad_norm_var": 1.8681640625, + "learning_rate": 0.0001, + "loss": 7.3881, + "loss/crossentropy": 1.9573393151164056, + "loss/hidden": 3.27421875, + "loss/jsd": 0.0, + "loss/logits": 0.17021252401173115, + "step": 10670 + }, + { + "epoch": 0.267, + "grad_norm": 33.0, + "grad_norm_var": 2.6197265625, + "learning_rate": 0.0001, + "loss": 7.449, + "loss/crossentropy": 2.1520077705383303, + "loss/hidden": 3.498046875, + "loss/jsd": 0.0, + "loss/logits": 0.20022546350955964, + "step": 10680 + }, + { + "epoch": 0.26725, + "grad_norm": 30.25, + "grad_norm_var": 69.97473958333333, + "learning_rate": 0.0001, + "loss": 7.4428, + "loss/crossentropy": 2.1147203534841537, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.18886133264750243, + "step": 10690 + }, + { + "epoch": 0.2675, + "grad_norm": 33.25, + "grad_norm_var": 27.978125, + "learning_rate": 0.0001, + "loss": 7.3953, + "loss/crossentropy": 2.1656775265932082, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.18777367267757655, + "step": 10700 + }, + { + "epoch": 0.26775, + "grad_norm": 30.75, + "grad_norm_var": 39.307291666666664, + "learning_rate": 0.0001, + "loss": 7.2137, + "loss/crossentropy": 2.2314133137464522, + "loss/hidden": 3.280859375, + "loss/jsd": 0.0, + "loss/logits": 0.18172509353607894, + "step": 10710 + }, + { + "epoch": 0.268, + "grad_norm": 30.5, + "grad_norm_var": 43.19055989583333, + "learning_rate": 0.0001, + "loss": 7.3973, + "loss/crossentropy": 2.0373408157378434, + "loss/hidden": 3.500390625, + "loss/jsd": 0.0, + "loss/logits": 0.19881470818072558, + "step": 10720 + }, + { + "epoch": 0.26825, + "grad_norm": 28.125, + "grad_norm_var": 7.35625, + "learning_rate": 0.0001, + "loss": 7.3064, + "loss/crossentropy": 2.065886814892292, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.19001882169395684, + "step": 10730 + }, + { + "epoch": 0.2685, + "grad_norm": 30.0, + "grad_norm_var": 2.437239583333333, + "learning_rate": 0.0001, + "loss": 7.4337, + "loss/crossentropy": 2.1057795181870462, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.19459401965141296, + "step": 10740 + }, + { + "epoch": 0.26875, + "grad_norm": 29.75, + "grad_norm_var": 3.24765625, + "learning_rate": 0.0001, + "loss": 7.4157, + "loss/crossentropy": 1.7320681288838387, + "loss/hidden": 3.533984375, + "loss/jsd": 0.0, + "loss/logits": 0.20864265877753496, + "step": 10750 + }, + { + "epoch": 0.269, + "grad_norm": 31.0, + "grad_norm_var": 2.9098307291666665, + "learning_rate": 0.0001, + "loss": 7.3267, + "loss/crossentropy": 2.1352997794747353, + "loss/hidden": 3.398828125, + "loss/jsd": 0.0, + "loss/logits": 0.19193184426985682, + "step": 10760 + }, + { + "epoch": 0.26925, + "grad_norm": 32.25, + "grad_norm_var": 3.0444333193999944e+18, + "learning_rate": 0.0001, + "loss": 7.3861, + "loss/crossentropy": 2.070178285241127, + "loss/hidden": 3.4390625, + "loss/jsd": 0.0, + "loss/logits": 0.19077087007462978, + "step": 10770 + }, + { + "epoch": 0.2695, + "grad_norm": 28.5, + "grad_norm_var": 23.91640625, + "learning_rate": 0.0001, + "loss": 7.4494, + "loss/crossentropy": 2.147341425716877, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.18359180726110935, + "step": 10780 + }, + { + "epoch": 0.26975, + "grad_norm": 30.875, + "grad_norm_var": 1.5843098958333333, + "learning_rate": 0.0001, + "loss": 7.3566, + "loss/crossentropy": 2.0286200530827045, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.1995188482105732, + "step": 10790 + }, + { + "epoch": 0.27, + "grad_norm": 30.625, + "grad_norm_var": 1.8247395833333333, + "learning_rate": 0.0001, + "loss": 7.3752, + "loss/crossentropy": 2.1403896272182465, + "loss/hidden": 3.408203125, + "loss/jsd": 0.0, + "loss/logits": 0.19066570214927198, + "step": 10800 + }, + { + "epoch": 0.27025, + "grad_norm": 32.25, + "grad_norm_var": 3.2462890625, + "learning_rate": 0.0001, + "loss": 7.3297, + "loss/crossentropy": 2.109960842132568, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.19408894181251526, + "step": 10810 + }, + { + "epoch": 0.2705, + "grad_norm": 31.0, + "grad_norm_var": 2.125, + "learning_rate": 0.0001, + "loss": 7.4115, + "loss/crossentropy": 2.170359855145216, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.1864105872809887, + "step": 10820 + }, + { + "epoch": 0.27075, + "grad_norm": 30.125, + "grad_norm_var": 2.5025390625, + "learning_rate": 0.0001, + "loss": 7.3568, + "loss/crossentropy": 2.0696767389774324, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.18454026989638805, + "step": 10830 + }, + { + "epoch": 0.271, + "grad_norm": 29.625, + "grad_norm_var": 3.187239583333333, + "learning_rate": 0.0001, + "loss": 7.3986, + "loss/crossentropy": 2.1671934336423875, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.19040703494101763, + "step": 10840 + }, + { + "epoch": 0.27125, + "grad_norm": 29.25, + "grad_norm_var": 3.470833333333333, + "learning_rate": 0.0001, + "loss": 7.3105, + "loss/crossentropy": 1.9061052799224854, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.1798303933814168, + "step": 10850 + }, + { + "epoch": 0.2715, + "grad_norm": 33.0, + "grad_norm_var": 3.2087890625, + "learning_rate": 0.0001, + "loss": 7.3692, + "loss/crossentropy": 2.1218462653458117, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.20523111913353204, + "step": 10860 + }, + { + "epoch": 0.27175, + "grad_norm": 29.375, + "grad_norm_var": 3.544205729166667, + "learning_rate": 0.0001, + "loss": 7.3206, + "loss/crossentropy": 2.006873355805874, + "loss/hidden": 3.4734375, + "loss/jsd": 0.0, + "loss/logits": 0.2144127245992422, + "step": 10870 + }, + { + "epoch": 0.272, + "grad_norm": 29.875, + "grad_norm_var": 1.5832967245463552e+18, + "learning_rate": 0.0001, + "loss": 7.4255, + "loss/crossentropy": 2.1331238821148872, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.19302980024367572, + "step": 10880 + }, + { + "epoch": 0.27225, + "grad_norm": 31.875, + "grad_norm_var": 1.5832967244414976e+18, + "learning_rate": 0.0001, + "loss": 7.3477, + "loss/crossentropy": 2.041455736756325, + "loss/hidden": 3.4515625, + "loss/jsd": 0.0, + "loss/logits": 0.230126572214067, + "step": 10890 + }, + { + "epoch": 0.2725, + "grad_norm": 32.25, + "grad_norm_var": 16.21015625, + "learning_rate": 0.0001, + "loss": 7.3668, + "loss/crossentropy": 2.11685880869627, + "loss/hidden": 3.32890625, + "loss/jsd": 0.0, + "loss/logits": 0.18238812778145075, + "step": 10900 + }, + { + "epoch": 0.27275, + "grad_norm": 29.625, + "grad_norm_var": 15.566666666666666, + "learning_rate": 0.0001, + "loss": 7.349, + "loss/crossentropy": 2.1108590021729468, + "loss/hidden": 3.448828125, + "loss/jsd": 0.0, + "loss/logits": 0.2031965653412044, + "step": 10910 + }, + { + "epoch": 0.273, + "grad_norm": 30.5, + "grad_norm_var": 6.133072916666666, + "learning_rate": 0.0001, + "loss": 7.3685, + "loss/crossentropy": 1.9768708415329457, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.1786771345883608, + "step": 10920 + }, + { + "epoch": 0.27325, + "grad_norm": 28.75, + "grad_norm_var": 6.448372395833333, + "learning_rate": 0.0001, + "loss": 7.3505, + "loss/crossentropy": 1.9710132874548436, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.18313440950587392, + "step": 10930 + }, + { + "epoch": 0.2735, + "grad_norm": 29.5, + "grad_norm_var": 2.63515625, + "learning_rate": 0.0001, + "loss": 7.4147, + "loss/crossentropy": 2.1898112446069717, + "loss/hidden": 3.3546875, + "loss/jsd": 0.0, + "loss/logits": 0.1952790988609195, + "step": 10940 + }, + { + "epoch": 0.27375, + "grad_norm": 31.25, + "grad_norm_var": 1.8978515625, + "learning_rate": 0.0001, + "loss": 7.3679, + "loss/crossentropy": 2.135420022904873, + "loss/hidden": 3.43203125, + "loss/jsd": 0.0, + "loss/logits": 0.21568923965096473, + "step": 10950 + }, + { + "epoch": 0.274, + "grad_norm": 31.625, + "grad_norm_var": 3.5184895833333334, + "learning_rate": 0.0001, + "loss": 7.2611, + "loss/crossentropy": 1.9667419284582137, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.1834744794294238, + "step": 10960 + }, + { + "epoch": 0.27425, + "grad_norm": 30.625, + "grad_norm_var": 1.3561848958333333, + "learning_rate": 0.0001, + "loss": 7.4273, + "loss/crossentropy": 2.074997512996197, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.18813611660152674, + "step": 10970 + }, + { + "epoch": 0.2745, + "grad_norm": 31.75, + "grad_norm_var": 2.325455729166667, + "learning_rate": 0.0001, + "loss": 7.3688, + "loss/crossentropy": 2.234239089488983, + "loss/hidden": 3.254296875, + "loss/jsd": 0.0, + "loss/logits": 0.18571207812055945, + "step": 10980 + }, + { + "epoch": 0.27475, + "grad_norm": 33.25, + "grad_norm_var": 5.548958333333333, + "learning_rate": 0.0001, + "loss": 7.4528, + "loss/crossentropy": 2.1355573579669, + "loss/hidden": 3.489453125, + "loss/jsd": 0.0, + "loss/logits": 0.20484505780041218, + "step": 10990 + }, + { + "epoch": 0.275, + "grad_norm": 26.625, + "grad_norm_var": 5.2759765625, + "learning_rate": 0.0001, + "loss": 7.4261, + "loss/crossentropy": 2.2509318992495535, + "loss/hidden": 3.3265625, + "loss/jsd": 0.0, + "loss/logits": 0.19250140003859997, + "step": 11000 + }, + { + "epoch": 0.27525, + "grad_norm": 31.0, + "grad_norm_var": 5.387239583333334, + "learning_rate": 0.0001, + "loss": 7.3389, + "loss/crossentropy": 2.114951176941395, + "loss/hidden": 3.3640625, + "loss/jsd": 0.0, + "loss/logits": 0.198347245156765, + "step": 11010 + }, + { + "epoch": 0.2755, + "grad_norm": 30.375, + "grad_norm_var": 1.5468098958333334, + "learning_rate": 0.0001, + "loss": 7.3199, + "loss/crossentropy": 2.110514160990715, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.1900305664166808, + "step": 11020 + }, + { + "epoch": 0.27575, + "grad_norm": 27.25, + "grad_norm_var": 2.003059895833333, + "learning_rate": 0.0001, + "loss": 7.3304, + "loss/crossentropy": 2.201597589254379, + "loss/hidden": 3.393359375, + "loss/jsd": 0.0, + "loss/logits": 0.20035982932895421, + "step": 11030 + }, + { + "epoch": 0.276, + "grad_norm": 32.25, + "grad_norm_var": 3.0139973958333335, + "learning_rate": 0.0001, + "loss": 7.4256, + "loss/crossentropy": 2.1428385630249975, + "loss/hidden": 3.287890625, + "loss/jsd": 0.0, + "loss/logits": 0.18116684164851904, + "step": 11040 + }, + { + "epoch": 0.27625, + "grad_norm": 31.25, + "grad_norm_var": 2.4671223958333335, + "learning_rate": 0.0001, + "loss": 7.2733, + "loss/crossentropy": 2.043394061923027, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.17936107851564884, + "step": 11050 + }, + { + "epoch": 0.2765, + "grad_norm": 32.75, + "grad_norm_var": 1.3830729166666667, + "learning_rate": 0.0001, + "loss": 7.4904, + "loss/crossentropy": 2.1975908786058427, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.21010352578014135, + "step": 11060 + }, + { + "epoch": 0.27675, + "grad_norm": 30.0, + "grad_norm_var": 4.671875, + "learning_rate": 0.0001, + "loss": 7.3718, + "loss/crossentropy": 2.1417069509625435, + "loss/hidden": 3.35390625, + "loss/jsd": 0.0, + "loss/logits": 0.17585361283272505, + "step": 11070 + }, + { + "epoch": 0.277, + "grad_norm": 29.375, + "grad_norm_var": 4.309375, + "learning_rate": 0.0001, + "loss": 7.2855, + "loss/crossentropy": 2.041763362288475, + "loss/hidden": 3.221875, + "loss/jsd": 0.0, + "loss/logits": 0.17130352668464183, + "step": 11080 + }, + { + "epoch": 0.27725, + "grad_norm": 28.375, + "grad_norm_var": 11.15390625, + "learning_rate": 0.0001, + "loss": 7.3131, + "loss/crossentropy": 2.1522015750408174, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.18296321779489516, + "step": 11090 + }, + { + "epoch": 0.2775, + "grad_norm": 30.375, + "grad_norm_var": 11.266080729166667, + "learning_rate": 0.0001, + "loss": 7.3499, + "loss/crossentropy": 2.1265663146972655, + "loss/hidden": 3.318359375, + "loss/jsd": 0.0, + "loss/logits": 0.18307223506271839, + "step": 11100 + }, + { + "epoch": 0.27775, + "grad_norm": 30.75, + "grad_norm_var": 1.6994140625, + "learning_rate": 0.0001, + "loss": 7.3764, + "loss/crossentropy": 2.126886320114136, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.1963003095239401, + "step": 11110 + }, + { + "epoch": 0.278, + "grad_norm": 29.75, + "grad_norm_var": 1.9780598958333333, + "learning_rate": 0.0001, + "loss": 7.3686, + "loss/crossentropy": 2.209390402585268, + "loss/hidden": 3.284375, + "loss/jsd": 0.0, + "loss/logits": 0.18762579131871462, + "step": 11120 + }, + { + "epoch": 0.27825, + "grad_norm": 29.875, + "grad_norm_var": 2.3806640625, + "learning_rate": 0.0001, + "loss": 7.4086, + "loss/crossentropy": 2.0636716064065697, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.18702280572615565, + "step": 11130 + }, + { + "epoch": 0.2785, + "grad_norm": 30.0, + "grad_norm_var": 11.084375, + "learning_rate": 0.0001, + "loss": 7.3611, + "loss/crossentropy": 1.9728175386786462, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.18249935954809188, + "step": 11140 + }, + { + "epoch": 0.27875, + "grad_norm": 30.75, + "grad_norm_var": 13.183333333333334, + "learning_rate": 0.0001, + "loss": 7.2901, + "loss/crossentropy": 1.9563469380140304, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.1720572842285037, + "step": 11150 + }, + { + "epoch": 0.279, + "grad_norm": 40.75, + "grad_norm_var": 11.42890625, + "learning_rate": 0.0001, + "loss": 7.2633, + "loss/crossentropy": 2.1441919445991515, + "loss/hidden": 3.30703125, + "loss/jsd": 0.0, + "loss/logits": 0.1793568328022957, + "step": 11160 + }, + { + "epoch": 0.27925, + "grad_norm": 31.125, + "grad_norm_var": 9.245833333333334, + "learning_rate": 0.0001, + "loss": 7.3546, + "loss/crossentropy": 2.0893258482217787, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.18886512089520693, + "step": 11170 + }, + { + "epoch": 0.2795, + "grad_norm": 29.25, + "grad_norm_var": 15.84140625, + "learning_rate": 0.0001, + "loss": 7.3265, + "loss/crossentropy": 2.028592649102211, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.1916877321898937, + "step": 11180 + }, + { + "epoch": 0.27975, + "grad_norm": 29.0, + "grad_norm_var": 5.451822916666667, + "learning_rate": 0.0001, + "loss": 7.4362, + "loss/crossentropy": 2.1735318168997764, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.18261966463178397, + "step": 11190 + }, + { + "epoch": 0.28, + "grad_norm": 30.75, + "grad_norm_var": 3.405143229166667, + "learning_rate": 0.0001, + "loss": 7.3648, + "loss/crossentropy": 2.129372274875641, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.19846085608005523, + "step": 11200 + }, + { + "epoch": 0.28025, + "grad_norm": 30.125, + "grad_norm_var": 3.4462890625, + "learning_rate": 0.0001, + "loss": 7.4302, + "loss/crossentropy": 2.1702652648091316, + "loss/hidden": 3.466015625, + "loss/jsd": 0.0, + "loss/logits": 0.20027328319847584, + "step": 11210 + }, + { + "epoch": 0.2805, + "grad_norm": 30.625, + "grad_norm_var": 2.58515625, + "learning_rate": 0.0001, + "loss": 7.3575, + "loss/crossentropy": 2.134139196574688, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.1903523735702038, + "step": 11220 + }, + { + "epoch": 0.28075, + "grad_norm": 32.25, + "grad_norm_var": 3.1093098958333334, + "learning_rate": 0.0001, + "loss": 7.43, + "loss/crossentropy": 2.026133489608765, + "loss/hidden": 3.45390625, + "loss/jsd": 0.0, + "loss/logits": 0.201400394923985, + "step": 11230 + }, + { + "epoch": 0.281, + "grad_norm": 31.25, + "grad_norm_var": 12.038997395833333, + "learning_rate": 0.0001, + "loss": 7.4162, + "loss/crossentropy": 2.1442901253700257, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.20070651173591614, + "step": 11240 + }, + { + "epoch": 0.28125, + "grad_norm": 28.875, + "grad_norm_var": 162.22962239583333, + "learning_rate": 0.0001, + "loss": 7.4058, + "loss/crossentropy": 2.1855412632226945, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.1976642705500126, + "step": 11250 + }, + { + "epoch": 0.2815, + "grad_norm": 31.75, + "grad_norm_var": 10.683072916666667, + "learning_rate": 0.0001, + "loss": 7.3535, + "loss/crossentropy": 2.030480499565601, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.20611500162631274, + "step": 11260 + }, + { + "epoch": 0.28175, + "grad_norm": 36.75, + "grad_norm_var": 11.08515625, + "learning_rate": 0.0001, + "loss": 7.287, + "loss/crossentropy": 2.01583767645061, + "loss/hidden": 3.312109375, + "loss/jsd": 0.0, + "loss/logits": 0.18007306316867472, + "step": 11270 + }, + { + "epoch": 0.282, + "grad_norm": 30.625, + "grad_norm_var": 7.605989583333334, + "learning_rate": 0.0001, + "loss": 7.3012, + "loss/crossentropy": 2.2034949243068693, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.18539215214550495, + "step": 11280 + }, + { + "epoch": 0.28225, + "grad_norm": 29.125, + "grad_norm_var": 4.1791015625, + "learning_rate": 0.0001, + "loss": 7.3566, + "loss/crossentropy": 2.0296434491872786, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.18707384672015906, + "step": 11290 + }, + { + "epoch": 0.2825, + "grad_norm": 30.375, + "grad_norm_var": 7.01015625, + "learning_rate": 0.0001, + "loss": 7.2793, + "loss/crossentropy": 2.0983324527740477, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.17865268159657716, + "step": 11300 + }, + { + "epoch": 0.28275, + "grad_norm": 34.0, + "grad_norm_var": 7.029622395833333, + "learning_rate": 0.0001, + "loss": 7.3716, + "loss/crossentropy": 2.127777361869812, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.20420280396938323, + "step": 11310 + }, + { + "epoch": 0.283, + "grad_norm": 32.5, + "grad_norm_var": 38.66712239583333, + "learning_rate": 0.0001, + "loss": 7.3326, + "loss/crossentropy": 2.1566628091037274, + "loss/hidden": 3.28828125, + "loss/jsd": 0.0, + "loss/logits": 0.180232659727335, + "step": 11320 + }, + { + "epoch": 0.28325, + "grad_norm": 30.5, + "grad_norm_var": 44.13743489583333, + "learning_rate": 0.0001, + "loss": 7.3567, + "loss/crossentropy": 2.091618612408638, + "loss/hidden": 3.330078125, + "loss/jsd": 0.0, + "loss/logits": 0.18182430015876888, + "step": 11330 + }, + { + "epoch": 0.2835, + "grad_norm": 29.625, + "grad_norm_var": 4.887434895833334, + "learning_rate": 0.0001, + "loss": 7.3312, + "loss/crossentropy": 2.1611782908439636, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.1828035417944193, + "step": 11340 + }, + { + "epoch": 0.28375, + "grad_norm": 34.25, + "grad_norm_var": 6.423893229166667, + "learning_rate": 0.0001, + "loss": 7.1759, + "loss/crossentropy": 1.9736236594617367, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.18899882938712836, + "step": 11350 + }, + { + "epoch": 0.284, + "grad_norm": 30.0, + "grad_norm_var": 9.488541666666666, + "learning_rate": 0.0001, + "loss": 7.1771, + "loss/crossentropy": 2.0265045419335364, + "loss/hidden": 3.273828125, + "loss/jsd": 0.0, + "loss/logits": 0.1712389207445085, + "step": 11360 + }, + { + "epoch": 0.28425, + "grad_norm": 33.75, + "grad_norm_var": 12.092122395833334, + "learning_rate": 0.0001, + "loss": 7.361, + "loss/crossentropy": 2.2598242223262788, + "loss/hidden": 3.3265625, + "loss/jsd": 0.0, + "loss/logits": 0.19073396287858485, + "step": 11370 + }, + { + "epoch": 0.2845, + "grad_norm": 29.0, + "grad_norm_var": 8.455989583333333, + "learning_rate": 0.0001, + "loss": 7.3234, + "loss/crossentropy": 2.144850969314575, + "loss/hidden": 3.321484375, + "loss/jsd": 0.0, + "loss/logits": 0.18078193911351265, + "step": 11380 + }, + { + "epoch": 0.28475, + "grad_norm": 29.25, + "grad_norm_var": 1.7212890625, + "learning_rate": 0.0001, + "loss": 7.3, + "loss/crossentropy": 2.1646609872579576, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.18855173885822296, + "step": 11390 + }, + { + "epoch": 0.285, + "grad_norm": 31.875, + "grad_norm_var": 2.5837890625, + "learning_rate": 0.0001, + "loss": 7.2976, + "loss/crossentropy": 2.120090515911579, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.17507072482258082, + "step": 11400 + }, + { + "epoch": 0.28525, + "grad_norm": 29.375, + "grad_norm_var": 2.0785807291666667, + "learning_rate": 0.0001, + "loss": 7.4018, + "loss/crossentropy": 2.0800730645656587, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.20872990731149912, + "step": 11410 + }, + { + "epoch": 0.2855, + "grad_norm": 30.25, + "grad_norm_var": 13.87890625, + "learning_rate": 0.0001, + "loss": 7.1549, + "loss/crossentropy": 2.1735212251544, + "loss/hidden": 3.25625, + "loss/jsd": 0.0, + "loss/logits": 0.18148906547576188, + "step": 11420 + }, + { + "epoch": 0.28575, + "grad_norm": 30.375, + "grad_norm_var": 15.132747395833333, + "learning_rate": 0.0001, + "loss": 7.206, + "loss/crossentropy": 1.8314741916954518, + "loss/hidden": 3.272265625, + "loss/jsd": 0.0, + "loss/logits": 0.17273464631289243, + "step": 11430 + }, + { + "epoch": 0.286, + "grad_norm": 28.875, + "grad_norm_var": 2.2330729166666665, + "learning_rate": 0.0001, + "loss": 7.3502, + "loss/crossentropy": 1.9636109337210654, + "loss/hidden": 3.329296875, + "loss/jsd": 0.0, + "loss/logits": 0.16999556813389063, + "step": 11440 + }, + { + "epoch": 0.28625, + "grad_norm": 30.5, + "grad_norm_var": 1.8009765625, + "learning_rate": 0.0001, + "loss": 7.2893, + "loss/crossentropy": 1.9146189287304878, + "loss/hidden": 3.425390625, + "loss/jsd": 0.0, + "loss/logits": 0.1754764079116285, + "step": 11450 + }, + { + "epoch": 0.2865, + "grad_norm": 31.75, + "grad_norm_var": 1.9197265625, + "learning_rate": 0.0001, + "loss": 7.3233, + "loss/crossentropy": 2.0301330149173737, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.1742406915873289, + "step": 11460 + }, + { + "epoch": 0.28675, + "grad_norm": 34.25, + "grad_norm_var": 3.8009765625, + "learning_rate": 0.0001, + "loss": 7.4231, + "loss/crossentropy": 2.1827415734529496, + "loss/hidden": 3.33515625, + "loss/jsd": 0.0, + "loss/logits": 0.18332260251045226, + "step": 11470 + }, + { + "epoch": 0.287, + "grad_norm": 30.875, + "grad_norm_var": 3.479622395833333, + "learning_rate": 0.0001, + "loss": 7.3842, + "loss/crossentropy": 1.9815428338944912, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.17839936017990113, + "step": 11480 + }, + { + "epoch": 0.28725, + "grad_norm": 30.125, + "grad_norm_var": 12.9228515625, + "learning_rate": 0.0001, + "loss": 7.3888, + "loss/crossentropy": 2.10543007850647, + "loss/hidden": 3.29765625, + "loss/jsd": 0.0, + "loss/logits": 0.18028822876513004, + "step": 11490 + }, + { + "epoch": 0.2875, + "grad_norm": 30.5, + "grad_norm_var": 0.9973307291666667, + "learning_rate": 0.0001, + "loss": 7.2278, + "loss/crossentropy": 1.9690312303602695, + "loss/hidden": 3.35390625, + "loss/jsd": 0.0, + "loss/logits": 0.19321996718645096, + "step": 11500 + }, + { + "epoch": 0.28775, + "grad_norm": 29.5, + "grad_norm_var": 7.462239583333333, + "learning_rate": 0.0001, + "loss": 7.4485, + "loss/crossentropy": 2.0997009545564653, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.1947010463103652, + "step": 11510 + }, + { + "epoch": 0.288, + "grad_norm": 30.625, + "grad_norm_var": 6.814583333333333, + "learning_rate": 0.0001, + "loss": 7.4014, + "loss/crossentropy": 2.041436542570591, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.17614111984148623, + "step": 11520 + }, + { + "epoch": 0.28825, + "grad_norm": 29.625, + "grad_norm_var": 1.7247395833333334, + "learning_rate": 0.0001, + "loss": 7.3231, + "loss/crossentropy": 2.056125995516777, + "loss/hidden": 3.475, + "loss/jsd": 0.0, + "loss/logits": 0.20333750136196613, + "step": 11530 + }, + { + "epoch": 0.2885, + "grad_norm": 30.5, + "grad_norm_var": 3.0014973958333333, + "learning_rate": 0.0001, + "loss": 7.2735, + "loss/crossentropy": 1.9806349158287049, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.17432715664617718, + "step": 11540 + }, + { + "epoch": 0.28875, + "grad_norm": 32.75, + "grad_norm_var": 1.6973307291666666, + "learning_rate": 0.0001, + "loss": 7.3842, + "loss/crossentropy": 2.0468244731426237, + "loss/hidden": 3.506640625, + "loss/jsd": 0.0, + "loss/logits": 0.19939990881830455, + "step": 11550 + }, + { + "epoch": 0.289, + "grad_norm": 31.625, + "grad_norm_var": 3.546875, + "learning_rate": 0.0001, + "loss": 7.4212, + "loss/crossentropy": 2.1170118719339373, + "loss/hidden": 3.452734375, + "loss/jsd": 0.0, + "loss/logits": 0.1938545262441039, + "step": 11560 + }, + { + "epoch": 0.28925, + "grad_norm": 29.0, + "grad_norm_var": 6.987239583333333, + "learning_rate": 0.0001, + "loss": 7.3068, + "loss/crossentropy": 1.9378852248191833, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.18525508288294076, + "step": 11570 + }, + { + "epoch": 0.2895, + "grad_norm": 32.25, + "grad_norm_var": 7.7650390625, + "learning_rate": 0.0001, + "loss": 7.3939, + "loss/crossentropy": 2.094913274049759, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.19857972972095012, + "step": 11580 + }, + { + "epoch": 0.28975, + "grad_norm": 31.0, + "grad_norm_var": 2.3934895833333334, + "learning_rate": 0.0001, + "loss": 7.3634, + "loss/crossentropy": 2.112175312638283, + "loss/hidden": 3.391796875, + "loss/jsd": 0.0, + "loss/logits": 0.1923027027398348, + "step": 11590 + }, + { + "epoch": 0.29, + "grad_norm": 29.75, + "grad_norm_var": 2.8676432291666667, + "learning_rate": 0.0001, + "loss": 7.4788, + "loss/crossentropy": 2.347763030230999, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.20385736282914876, + "step": 11600 + }, + { + "epoch": 0.29025, + "grad_norm": 36.0, + "grad_norm_var": 2.1060661210680481e+18, + "learning_rate": 0.0001, + "loss": 7.482, + "loss/crossentropy": 2.1438632771372794, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.1947392811998725, + "step": 11610 + }, + { + "epoch": 0.2905, + "grad_norm": 31.5, + "grad_norm_var": 2.1060661214006216e+18, + "learning_rate": 0.0001, + "loss": 7.2853, + "loss/crossentropy": 2.080791215598583, + "loss/hidden": 3.2875, + "loss/jsd": 0.0, + "loss/logits": 0.19497475158423186, + "step": 11620 + }, + { + "epoch": 0.29075, + "grad_norm": 30.0, + "grad_norm_var": 1.5931640625, + "learning_rate": 0.0001, + "loss": 7.2971, + "loss/crossentropy": 2.16863374710083, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.18860027231276036, + "step": 11630 + }, + { + "epoch": 0.291, + "grad_norm": 31.625, + "grad_norm_var": 1.5994140625, + "learning_rate": 0.0001, + "loss": 7.3835, + "loss/crossentropy": 2.061967647075653, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.18620339650660753, + "step": 11640 + }, + { + "epoch": 0.29125, + "grad_norm": 33.0, + "grad_norm_var": 3.690207280948122e+18, + "learning_rate": 0.0001, + "loss": 7.3738, + "loss/crossentropy": 2.0998649850487707, + "loss/hidden": 3.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.18567133340984582, + "step": 11650 + }, + { + "epoch": 0.2915, + "grad_norm": 29.0, + "grad_norm_var": 29.6228515625, + "learning_rate": 0.0001, + "loss": 7.2405, + "loss/crossentropy": 2.1259921073913572, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.18609032463282346, + "step": 11660 + }, + { + "epoch": 0.29175, + "grad_norm": 31.0, + "grad_norm_var": 14.421809895833333, + "learning_rate": 0.0001, + "loss": 7.4326, + "loss/crossentropy": 2.066218351200223, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.1920524787157774, + "step": 11670 + }, + { + "epoch": 0.292, + "grad_norm": 27.875, + "grad_norm_var": 49.305989583333336, + "learning_rate": 0.0001, + "loss": 7.2881, + "loss/crossentropy": 2.07202163413167, + "loss/hidden": 3.35, + "loss/jsd": 0.0, + "loss/logits": 0.18048244724050164, + "step": 11680 + }, + { + "epoch": 0.29225, + "grad_norm": 31.875, + "grad_norm_var": 4.995247395833333, + "learning_rate": 0.0001, + "loss": 7.2687, + "loss/crossentropy": 2.045185898244381, + "loss/hidden": 3.275, + "loss/jsd": 0.0, + "loss/logits": 0.17395208198577167, + "step": 11690 + }, + { + "epoch": 0.2925, + "grad_norm": 30.0, + "grad_norm_var": 5.696875, + "learning_rate": 0.0001, + "loss": 7.3259, + "loss/crossentropy": 2.3408665537834166, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.20136666856706142, + "step": 11700 + }, + { + "epoch": 0.29275, + "grad_norm": 30.0, + "grad_norm_var": 3.7421223958333334, + "learning_rate": 0.0001, + "loss": 7.2137, + "loss/crossentropy": 1.9074330985546113, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.1704694928601384, + "step": 11710 + }, + { + "epoch": 0.293, + "grad_norm": 31.5, + "grad_norm_var": 5.803125, + "learning_rate": 0.0001, + "loss": 7.2817, + "loss/crossentropy": 2.290114316344261, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.20061923637986184, + "step": 11720 + }, + { + "epoch": 0.29325, + "grad_norm": 31.125, + "grad_norm_var": 2.4936848958333333, + "learning_rate": 0.0001, + "loss": 7.4136, + "loss/crossentropy": 2.2652835667133333, + "loss/hidden": 3.29375, + "loss/jsd": 0.0, + "loss/logits": 0.18269798178225755, + "step": 11730 + }, + { + "epoch": 0.2935, + "grad_norm": 31.875, + "grad_norm_var": 2.06640625, + "learning_rate": 0.0001, + "loss": 7.3219, + "loss/crossentropy": 2.047122722864151, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.18665554728358985, + "step": 11740 + }, + { + "epoch": 0.29375, + "grad_norm": 29.875, + "grad_norm_var": 11.484309895833333, + "learning_rate": 0.0001, + "loss": 7.4668, + "loss/crossentropy": 2.0893666088581084, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.18700389545410873, + "step": 11750 + }, + { + "epoch": 0.294, + "grad_norm": 33.25, + "grad_norm_var": 2.412239583333333, + "learning_rate": 0.0001, + "loss": 7.3389, + "loss/crossentropy": 2.0646077781915664, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.1896628426387906, + "step": 11760 + }, + { + "epoch": 0.29425, + "grad_norm": 36.75, + "grad_norm_var": 15.843489583333334, + "learning_rate": 0.0001, + "loss": 7.3537, + "loss/crossentropy": 2.14951953291893, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.19772466979920864, + "step": 11770 + }, + { + "epoch": 0.2945, + "grad_norm": 34.0, + "grad_norm_var": 15.808333333333334, + "learning_rate": 0.0001, + "loss": 7.2266, + "loss/crossentropy": 1.9084580048918725, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.17457483559846879, + "step": 11780 + }, + { + "epoch": 0.29475, + "grad_norm": 30.5, + "grad_norm_var": 17.133268229166667, + "learning_rate": 0.0001, + "loss": 7.4198, + "loss/crossentropy": 2.1059816129505635, + "loss/hidden": 3.2609375, + "loss/jsd": 0.0, + "loss/logits": 0.19060559011995792, + "step": 11790 + }, + { + "epoch": 0.295, + "grad_norm": 28.875, + "grad_norm_var": 15.191666666666666, + "learning_rate": 0.0001, + "loss": 7.3353, + "loss/crossentropy": 2.0854270696640014, + "loss/hidden": 3.340625, + "loss/jsd": 0.0, + "loss/logits": 0.18200992513448, + "step": 11800 + }, + { + "epoch": 0.29525, + "grad_norm": 33.0, + "grad_norm_var": 5.77890625, + "learning_rate": 0.0001, + "loss": 7.338, + "loss/crossentropy": 2.0409404814243315, + "loss/hidden": 3.3640625, + "loss/jsd": 0.0, + "loss/logits": 0.18492084443569184, + "step": 11810 + }, + { + "epoch": 0.2955, + "grad_norm": 28.25, + "grad_norm_var": 4.9556640625, + "learning_rate": 0.0001, + "loss": 7.2394, + "loss/crossentropy": 2.1242421194911003, + "loss/hidden": 3.318359375, + "loss/jsd": 0.0, + "loss/logits": 0.17592983674257995, + "step": 11820 + }, + { + "epoch": 0.29575, + "grad_norm": 32.75, + "grad_norm_var": 12.3994140625, + "learning_rate": 0.0001, + "loss": 7.3172, + "loss/crossentropy": 2.135970714688301, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.1905512258410454, + "step": 11830 + }, + { + "epoch": 0.296, + "grad_norm": 31.125, + "grad_norm_var": 13.479166666666666, + "learning_rate": 0.0001, + "loss": 7.3928, + "loss/crossentropy": 2.0788447827100756, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.18583104945719242, + "step": 11840 + }, + { + "epoch": 0.29625, + "grad_norm": 29.375, + "grad_norm_var": 5.16640625, + "learning_rate": 0.0001, + "loss": 7.2575, + "loss/crossentropy": 2.0609687596559523, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.1885878125205636, + "step": 11850 + }, + { + "epoch": 0.2965, + "grad_norm": 33.5, + "grad_norm_var": 4.534309895833333, + "learning_rate": 0.0001, + "loss": 7.2397, + "loss/crossentropy": 2.113059785962105, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.19279801957309245, + "step": 11860 + }, + { + "epoch": 0.29675, + "grad_norm": 28.0, + "grad_norm_var": 2.833268229166667, + "learning_rate": 0.0001, + "loss": 7.4813, + "loss/crossentropy": 2.2016214139759542, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.19246475584805012, + "step": 11870 + }, + { + "epoch": 0.297, + "grad_norm": 35.5, + "grad_norm_var": 5.829622395833334, + "learning_rate": 0.0001, + "loss": 7.4107, + "loss/crossentropy": 2.1937885224819182, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.20546972285956144, + "step": 11880 + }, + { + "epoch": 0.29725, + "grad_norm": 32.25, + "grad_norm_var": 5.101041666666666, + "learning_rate": 0.0001, + "loss": 7.4133, + "loss/crossentropy": 2.112873890995979, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.19512166157364846, + "step": 11890 + }, + { + "epoch": 0.2975, + "grad_norm": 29.0, + "grad_norm_var": 4.71875, + "learning_rate": 0.0001, + "loss": 7.3659, + "loss/crossentropy": 2.068368895351887, + "loss/hidden": 3.427734375, + "loss/jsd": 0.0, + "loss/logits": 0.22229769406840205, + "step": 11900 + }, + { + "epoch": 0.29775, + "grad_norm": 28.25, + "grad_norm_var": 30.575, + "learning_rate": 0.0001, + "loss": 7.3459, + "loss/crossentropy": 1.9633590430021286, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.1919392876327038, + "step": 11910 + }, + { + "epoch": 0.298, + "grad_norm": 31.5, + "grad_norm_var": 25.169205729166666, + "learning_rate": 0.0001, + "loss": 7.2575, + "loss/crossentropy": 2.0932082675397394, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.18578519038856028, + "step": 11920 + }, + { + "epoch": 0.29825, + "grad_norm": 29.375, + "grad_norm_var": 1.9197916666666666, + "learning_rate": 0.0001, + "loss": 7.3554, + "loss/crossentropy": 2.0443739868700503, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.18646586760878564, + "step": 11930 + }, + { + "epoch": 0.2985, + "grad_norm": 29.125, + "grad_norm_var": 1.1629557291666666, + "learning_rate": 0.0001, + "loss": 7.3341, + "loss/crossentropy": 2.0645432278513907, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.20268544293940066, + "step": 11940 + }, + { + "epoch": 0.29875, + "grad_norm": 29.25, + "grad_norm_var": 4.085416666666666, + "learning_rate": 0.0001, + "loss": 7.2596, + "loss/crossentropy": 1.8821386724710465, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.17970443107187747, + "step": 11950 + }, + { + "epoch": 0.299, + "grad_norm": 29.75, + "grad_norm_var": 2.4072265625, + "learning_rate": 0.0001, + "loss": 7.3139, + "loss/crossentropy": 2.1848059430718423, + "loss/hidden": 3.311328125, + "loss/jsd": 0.0, + "loss/logits": 0.19258468672633172, + "step": 11960 + }, + { + "epoch": 0.29925, + "grad_norm": 28.625, + "grad_norm_var": 1.49140625, + "learning_rate": 0.0001, + "loss": 7.3064, + "loss/crossentropy": 1.9953081101179122, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.17766128927469255, + "step": 11970 + }, + { + "epoch": 0.2995, + "grad_norm": 30.625, + "grad_norm_var": 3.527083333333333, + "learning_rate": 0.0001, + "loss": 7.3213, + "loss/crossentropy": 1.9512401312589644, + "loss/hidden": 3.257421875, + "loss/jsd": 0.0, + "loss/logits": 0.16765224877744914, + "step": 11980 + }, + { + "epoch": 0.29975, + "grad_norm": 31.875, + "grad_norm_var": 2.8322265625, + "learning_rate": 0.0001, + "loss": 7.3373, + "loss/crossentropy": 2.134880567342043, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.19845206197351217, + "step": 11990 + }, + { + "epoch": 0.3, + "grad_norm": 29.0, + "grad_norm_var": 1.88515625, + "learning_rate": 0.0001, + "loss": 7.3355, + "loss/crossentropy": 2.084690621495247, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.18881614562124013, + "step": 12000 + }, + { + "epoch": 0.30025, + "grad_norm": 29.375, + "grad_norm_var": 1.678125, + "learning_rate": 0.0001, + "loss": 7.2797, + "loss/crossentropy": 2.0142040476202965, + "loss/hidden": 3.277734375, + "loss/jsd": 0.0, + "loss/logits": 0.17630351725965737, + "step": 12010 + }, + { + "epoch": 0.3005, + "grad_norm": 30.125, + "grad_norm_var": 1.0702473958333334, + "learning_rate": 0.0001, + "loss": 7.3524, + "loss/crossentropy": 2.0099969252943994, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.17516886200755835, + "step": 12020 + }, + { + "epoch": 0.30075, + "grad_norm": 30.125, + "grad_norm_var": 2.1509765625, + "learning_rate": 0.0001, + "loss": 7.4081, + "loss/crossentropy": 2.0550965458154677, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.18450612868182362, + "step": 12030 + }, + { + "epoch": 0.301, + "grad_norm": 29.375, + "grad_norm_var": 2.088997395833333, + "learning_rate": 0.0001, + "loss": 7.3732, + "loss/crossentropy": 2.015381334722042, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.19866329655051232, + "step": 12040 + }, + { + "epoch": 0.30125, + "grad_norm": 32.5, + "grad_norm_var": 28.125455729166667, + "learning_rate": 0.0001, + "loss": 7.3689, + "loss/crossentropy": 2.0619105845689774, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.19213079251348972, + "step": 12050 + }, + { + "epoch": 0.3015, + "grad_norm": 29.5, + "grad_norm_var": 2.5869140625, + "learning_rate": 0.0001, + "loss": 7.3533, + "loss/crossentropy": 2.059162912517786, + "loss/hidden": 3.3625, + "loss/jsd": 0.0, + "loss/logits": 0.1831585830077529, + "step": 12060 + }, + { + "epoch": 0.30175, + "grad_norm": 30.375, + "grad_norm_var": 6.6150390625, + "learning_rate": 0.0001, + "loss": 7.339, + "loss/crossentropy": 2.0344214349985124, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.18243852034211158, + "step": 12070 + }, + { + "epoch": 0.302, + "grad_norm": 31.375, + "grad_norm_var": 10.05390625, + "learning_rate": 0.0001, + "loss": 7.3457, + "loss/crossentropy": 2.222717672586441, + "loss/hidden": 3.28671875, + "loss/jsd": 0.0, + "loss/logits": 0.19160165078938007, + "step": 12080 + }, + { + "epoch": 0.30225, + "grad_norm": 30.375, + "grad_norm_var": 8.060416666666667, + "learning_rate": 0.0001, + "loss": 7.2928, + "loss/crossentropy": 1.9902059100568295, + "loss/hidden": 3.3921875, + "loss/jsd": 0.0, + "loss/logits": 0.1817794761620462, + "step": 12090 + }, + { + "epoch": 0.3025, + "grad_norm": 29.0, + "grad_norm_var": 4.048958333333333, + "learning_rate": 0.0001, + "loss": 7.403, + "loss/crossentropy": 1.9990896947681904, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.17872856128960848, + "step": 12100 + }, + { + "epoch": 0.30275, + "grad_norm": 32.0, + "grad_norm_var": 14.486393229166667, + "learning_rate": 0.0001, + "loss": 7.3774, + "loss/crossentropy": 2.178261125087738, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.19976076446473598, + "step": 12110 + }, + { + "epoch": 0.303, + "grad_norm": 29.0, + "grad_norm_var": 25.053059895833332, + "learning_rate": 0.0001, + "loss": 7.3852, + "loss/crossentropy": 2.1738795921206475, + "loss/hidden": 3.21171875, + "loss/jsd": 0.0, + "loss/logits": 0.1802441133186221, + "step": 12120 + }, + { + "epoch": 0.30325, + "grad_norm": 32.5, + "grad_norm_var": 27.863541666666666, + "learning_rate": 0.0001, + "loss": 7.3119, + "loss/crossentropy": 2.017075891792774, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.1819239752367139, + "step": 12130 + }, + { + "epoch": 0.3035, + "grad_norm": 5704253440.0, + "grad_norm_var": 2.033656683216328e+18, + "learning_rate": 0.0001, + "loss": 7.3187, + "loss/crossentropy": 2.1378798320889474, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.18621215373277664, + "step": 12140 + }, + { + "epoch": 0.30375, + "grad_norm": 28.5, + "grad_norm_var": 2.0336566834599473e+18, + "learning_rate": 0.0001, + "loss": 7.3375, + "loss/crossentropy": 2.0977103441953657, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.18937066961079835, + "step": 12150 + }, + { + "epoch": 0.304, + "grad_norm": 30.0, + "grad_norm_var": 25.7119140625, + "learning_rate": 0.0001, + "loss": 7.2813, + "loss/crossentropy": 2.0422945946455, + "loss/hidden": 3.29453125, + "loss/jsd": 0.0, + "loss/logits": 0.17747708465903997, + "step": 12160 + }, + { + "epoch": 0.30425, + "grad_norm": 29.375, + "grad_norm_var": 1.8973307291666666, + "learning_rate": 0.0001, + "loss": 7.2982, + "loss/crossentropy": 2.1438152998685838, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.19225314557552337, + "step": 12170 + }, + { + "epoch": 0.3045, + "grad_norm": 32.5, + "grad_norm_var": 6.688997395833334, + "learning_rate": 0.0001, + "loss": 7.2886, + "loss/crossentropy": 2.033204630762339, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.1741345826536417, + "step": 12180 + }, + { + "epoch": 0.30475, + "grad_norm": 28.5, + "grad_norm_var": 7.262434895833334, + "learning_rate": 0.0001, + "loss": 7.3812, + "loss/crossentropy": 1.9149722829461098, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.16993715493008493, + "step": 12190 + }, + { + "epoch": 0.305, + "grad_norm": 30.625, + "grad_norm_var": 2.315625, + "learning_rate": 0.0001, + "loss": 7.355, + "loss/crossentropy": 2.200256870687008, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.1957419654354453, + "step": 12200 + }, + { + "epoch": 0.30525, + "grad_norm": 32.25, + "grad_norm_var": 1.8291666666666666, + "learning_rate": 0.0001, + "loss": 7.2689, + "loss/crossentropy": 1.9441036701202392, + "loss/hidden": 3.450390625, + "loss/jsd": 0.0, + "loss/logits": 0.19158205185085536, + "step": 12210 + }, + { + "epoch": 0.3055, + "grad_norm": 32.5, + "grad_norm_var": 224.99993489583332, + "learning_rate": 0.0001, + "loss": 7.4205, + "loss/crossentropy": 2.21477717012167, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.19972570687532426, + "step": 12220 + }, + { + "epoch": 0.30575, + "grad_norm": 77.0, + "grad_norm_var": 161.4994140625, + "learning_rate": 0.0001, + "loss": 7.3221, + "loss/crossentropy": 2.243621972203255, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.20230026841163634, + "step": 12230 + }, + { + "epoch": 0.306, + "grad_norm": 29.125, + "grad_norm_var": 136.2509765625, + "learning_rate": 0.0001, + "loss": 7.2544, + "loss/crossentropy": 2.1267666652798654, + "loss/hidden": 3.30078125, + "loss/jsd": 0.0, + "loss/logits": 0.18361538834869862, + "step": 12240 + }, + { + "epoch": 0.30625, + "grad_norm": 43.0, + "grad_norm_var": 15.1384765625, + "learning_rate": 0.0001, + "loss": 7.3457, + "loss/crossentropy": 2.097319718450308, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.1941752176731825, + "step": 12250 + }, + { + "epoch": 0.3065, + "grad_norm": 27.125, + "grad_norm_var": 45.91875, + "learning_rate": 0.0001, + "loss": 7.1913, + "loss/crossentropy": 2.024158325791359, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.18036661259829997, + "step": 12260 + }, + { + "epoch": 0.30675, + "grad_norm": 32.5, + "grad_norm_var": 34.94765625, + "learning_rate": 0.0001, + "loss": 7.2604, + "loss/crossentropy": 2.05818811878562, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.19272202141582967, + "step": 12270 + }, + { + "epoch": 0.307, + "grad_norm": 31.625, + "grad_norm_var": 1.7958333333333334, + "learning_rate": 0.0001, + "loss": 7.3478, + "loss/crossentropy": 2.058871729671955, + "loss/hidden": 3.316015625, + "loss/jsd": 0.0, + "loss/logits": 0.1805756026878953, + "step": 12280 + }, + { + "epoch": 0.30725, + "grad_norm": 37.0, + "grad_norm_var": 4.418489583333334, + "learning_rate": 0.0001, + "loss": 7.3202, + "loss/crossentropy": 1.9400940239429474, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.19434008933603764, + "step": 12290 + }, + { + "epoch": 0.3075, + "grad_norm": 30.5, + "grad_norm_var": 30.1572265625, + "learning_rate": 0.0001, + "loss": 7.2939, + "loss/crossentropy": 2.069047340750694, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.20558738969266416, + "step": 12300 + }, + { + "epoch": 0.30775, + "grad_norm": 33.0, + "grad_norm_var": 11.9666015625, + "learning_rate": 0.0001, + "loss": 7.4109, + "loss/crossentropy": 2.227694994211197, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.18980047646909953, + "step": 12310 + }, + { + "epoch": 0.308, + "grad_norm": 29.625, + "grad_norm_var": 4.90390625, + "learning_rate": 0.0001, + "loss": 7.3686, + "loss/crossentropy": 2.0309864141047003, + "loss/hidden": 3.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1816682495176792, + "step": 12320 + }, + { + "epoch": 0.30825, + "grad_norm": 30.125, + "grad_norm_var": 1.7249348958333333, + "learning_rate": 0.0001, + "loss": 7.4105, + "loss/crossentropy": 2.2505668699741364, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.19684594608843325, + "step": 12330 + }, + { + "epoch": 0.3085, + "grad_norm": 30.75, + "grad_norm_var": 1.9535807291666667, + "learning_rate": 0.0001, + "loss": 7.3679, + "loss/crossentropy": 2.0422501020133494, + "loss/hidden": 3.223828125, + "loss/jsd": 0.0, + "loss/logits": 0.1723391016945243, + "step": 12340 + }, + { + "epoch": 0.30875, + "grad_norm": 30.25, + "grad_norm_var": 1.9186848958333333, + "learning_rate": 0.0001, + "loss": 7.3186, + "loss/crossentropy": 2.190096604824066, + "loss/hidden": 3.299609375, + "loss/jsd": 0.0, + "loss/logits": 0.1794669708237052, + "step": 12350 + }, + { + "epoch": 0.309, + "grad_norm": 30.0, + "grad_norm_var": 1.7926432291666667, + "learning_rate": 0.0001, + "loss": 7.2068, + "loss/crossentropy": 2.011892383173108, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.17626657225191594, + "step": 12360 + }, + { + "epoch": 0.30925, + "grad_norm": 32.25, + "grad_norm_var": 3.3900390625, + "learning_rate": 0.0001, + "loss": 7.3963, + "loss/crossentropy": 2.1367972552776338, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.19975323602557182, + "step": 12370 + }, + { + "epoch": 0.3095, + "grad_norm": 30.875, + "grad_norm_var": 2.9785807291666666, + "learning_rate": 0.0001, + "loss": 7.3586, + "loss/crossentropy": 2.1065737903118134, + "loss/hidden": 3.3296875, + "loss/jsd": 0.0, + "loss/logits": 0.1861647253856063, + "step": 12380 + }, + { + "epoch": 0.30975, + "grad_norm": 34.0, + "grad_norm_var": 49.94583333333333, + "learning_rate": 0.0001, + "loss": 7.4425, + "loss/crossentropy": 2.047655486315489, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.19645441174507142, + "step": 12390 + }, + { + "epoch": 0.31, + "grad_norm": 30.75, + "grad_norm_var": 55.521809895833336, + "learning_rate": 0.0001, + "loss": 7.3384, + "loss/crossentropy": 2.1421022772789002, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.21594423688948156, + "step": 12400 + }, + { + "epoch": 0.31025, + "grad_norm": 28.625, + "grad_norm_var": 3.630989583333333, + "learning_rate": 0.0001, + "loss": 7.4915, + "loss/crossentropy": 1.9852606005966664, + "loss/hidden": 3.3875, + "loss/jsd": 0.0, + "loss/logits": 0.1970728723332286, + "step": 12410 + }, + { + "epoch": 0.3105, + "grad_norm": 30.375, + "grad_norm_var": 13.821875, + "learning_rate": 0.0001, + "loss": 7.316, + "loss/crossentropy": 2.0221208080649378, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.18263351675122977, + "step": 12420 + }, + { + "epoch": 0.31075, + "grad_norm": 27.75, + "grad_norm_var": 3.7905598958333333, + "learning_rate": 0.0001, + "loss": 7.4569, + "loss/crossentropy": 2.2214868292212486, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.20803299229592084, + "step": 12430 + }, + { + "epoch": 0.311, + "grad_norm": 28.0, + "grad_norm_var": 5.039322916666666, + "learning_rate": 0.0001, + "loss": 7.2246, + "loss/crossentropy": 1.968916893005371, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.1823066620156169, + "step": 12440 + }, + { + "epoch": 0.31125, + "grad_norm": 32.25, + "grad_norm_var": 3.011458333333333, + "learning_rate": 0.0001, + "loss": 7.2986, + "loss/crossentropy": 2.102560856938362, + "loss/hidden": 3.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.19421595316380263, + "step": 12450 + }, + { + "epoch": 0.3115, + "grad_norm": 31.625, + "grad_norm_var": 1.7122395833333333, + "learning_rate": 0.0001, + "loss": 7.4102, + "loss/crossentropy": 1.8904587842524052, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.16765992902219296, + "step": 12460 + }, + { + "epoch": 0.31175, + "grad_norm": 32.75, + "grad_norm_var": 1.3614583333333334, + "learning_rate": 0.0001, + "loss": 7.317, + "loss/crossentropy": 2.2456528916954994, + "loss/hidden": 3.3265625, + "loss/jsd": 0.0, + "loss/logits": 0.18445245549082756, + "step": 12470 + }, + { + "epoch": 0.312, + "grad_norm": 32.5, + "grad_norm_var": 2.9613932291666667, + "learning_rate": 0.0001, + "loss": 7.2527, + "loss/crossentropy": 1.9254849448800087, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.17210696106776596, + "step": 12480 + }, + { + "epoch": 0.31225, + "grad_norm": 29.0, + "grad_norm_var": 2.503059895833333, + "learning_rate": 0.0001, + "loss": 7.3589, + "loss/crossentropy": 1.963121373206377, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.17580699175596237, + "step": 12490 + }, + { + "epoch": 0.3125, + "grad_norm": 27.375, + "grad_norm_var": 2.6510416666666665, + "learning_rate": 0.0001, + "loss": 7.3302, + "loss/crossentropy": 2.0668979212641716, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.1756731817498803, + "step": 12500 + }, + { + "epoch": 0.31275, + "grad_norm": 32.5, + "grad_norm_var": 9.658072916666667, + "learning_rate": 0.0001, + "loss": 7.396, + "loss/crossentropy": 2.0963846892118454, + "loss/hidden": 3.37578125, + "loss/jsd": 0.0, + "loss/logits": 0.19187606330960988, + "step": 12510 + }, + { + "epoch": 0.313, + "grad_norm": 28.125, + "grad_norm_var": 8.387239583333333, + "learning_rate": 0.0001, + "loss": 7.291, + "loss/crossentropy": 2.1238081738352776, + "loss/hidden": 3.319921875, + "loss/jsd": 0.0, + "loss/logits": 0.18899311302229763, + "step": 12520 + }, + { + "epoch": 0.31325, + "grad_norm": 30.625, + "grad_norm_var": 2.2712890625, + "learning_rate": 0.0001, + "loss": 7.2927, + "loss/crossentropy": 1.8629853092133999, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.18793845577165486, + "step": 12530 + }, + { + "epoch": 0.3135, + "grad_norm": 31.25, + "grad_norm_var": 29.926041666666666, + "learning_rate": 0.0001, + "loss": 7.334, + "loss/crossentropy": 2.1922558069229128, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.18994415253400804, + "step": 12540 + }, + { + "epoch": 0.31375, + "grad_norm": 33.0, + "grad_norm_var": 19.626822916666665, + "learning_rate": 0.0001, + "loss": 7.4246, + "loss/crossentropy": 2.0747738771140574, + "loss/hidden": 3.463671875, + "loss/jsd": 0.0, + "loss/logits": 0.1956952316686511, + "step": 12550 + }, + { + "epoch": 0.314, + "grad_norm": 31.0, + "grad_norm_var": 5.9822265625, + "learning_rate": 0.0001, + "loss": 7.3921, + "loss/crossentropy": 2.168559101223946, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.19871533028781413, + "step": 12560 + }, + { + "epoch": 0.31425, + "grad_norm": 29.875, + "grad_norm_var": 23.478125, + "learning_rate": 0.0001, + "loss": 7.381, + "loss/crossentropy": 2.1687012270092962, + "loss/hidden": 3.2828125, + "loss/jsd": 0.0, + "loss/logits": 0.17819978184998037, + "step": 12570 + }, + { + "epoch": 0.3145, + "grad_norm": 30.5, + "grad_norm_var": 24.369791666666668, + "learning_rate": 0.0001, + "loss": 7.2273, + "loss/crossentropy": 2.0223761543631555, + "loss/hidden": 3.2953125, + "loss/jsd": 0.0, + "loss/logits": 0.17995049208402633, + "step": 12580 + }, + { + "epoch": 0.31475, + "grad_norm": 31.125, + "grad_norm_var": 3.8067057291666666, + "learning_rate": 0.0001, + "loss": 7.3672, + "loss/crossentropy": 2.056264813989401, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.19224987253546716, + "step": 12590 + }, + { + "epoch": 0.315, + "grad_norm": 30.125, + "grad_norm_var": 3.3018229166666666, + "learning_rate": 0.0001, + "loss": 7.2404, + "loss/crossentropy": 2.0127123326063154, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.17083216030150652, + "step": 12600 + }, + { + "epoch": 0.31525, + "grad_norm": 31.25, + "grad_norm_var": 2.94375, + "learning_rate": 0.0001, + "loss": 7.3918, + "loss/crossentropy": 2.0961378000676634, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.18820952698588372, + "step": 12610 + }, + { + "epoch": 0.3155, + "grad_norm": 30.75, + "grad_norm_var": 3.910416666666667, + "learning_rate": 0.0001, + "loss": 7.2715, + "loss/crossentropy": 2.1737264052033423, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.1752359176054597, + "step": 12620 + }, + { + "epoch": 0.31575, + "grad_norm": 28.5, + "grad_norm_var": 12.914322916666666, + "learning_rate": 0.0001, + "loss": 7.2866, + "loss/crossentropy": 2.04836600497365, + "loss/hidden": 3.308984375, + "loss/jsd": 0.0, + "loss/logits": 0.1814719710499048, + "step": 12630 + }, + { + "epoch": 0.316, + "grad_norm": 32.5, + "grad_norm_var": 5.895572916666667, + "learning_rate": 0.0001, + "loss": 7.4359, + "loss/crossentropy": 2.0633833378553392, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.1844972724094987, + "step": 12640 + }, + { + "epoch": 0.31625, + "grad_norm": 29.5, + "grad_norm_var": 4.556705729166667, + "learning_rate": 0.0001, + "loss": 7.3404, + "loss/crossentropy": 2.1031613536179066, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.1890261113177985, + "step": 12650 + }, + { + "epoch": 0.3165, + "grad_norm": 33.25, + "grad_norm_var": 7.43515625, + "learning_rate": 0.0001, + "loss": 7.3116, + "loss/crossentropy": 2.0913070663809776, + "loss/hidden": 3.48515625, + "loss/jsd": 0.0, + "loss/logits": 0.19625448603183032, + "step": 12660 + }, + { + "epoch": 0.31675, + "grad_norm": 30.0, + "grad_norm_var": 24.72265625, + "learning_rate": 0.0001, + "loss": 7.309, + "loss/crossentropy": 2.1766112834215163, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.19453570377081633, + "step": 12670 + }, + { + "epoch": 0.317, + "grad_norm": 28.0, + "grad_norm_var": 3.5518229166666666, + "learning_rate": 0.0001, + "loss": 7.2193, + "loss/crossentropy": 2.1292213678359984, + "loss/hidden": 3.312890625, + "loss/jsd": 0.0, + "loss/logits": 0.17923556556925177, + "step": 12680 + }, + { + "epoch": 0.31725, + "grad_norm": 33.75, + "grad_norm_var": 3.2041666666666666, + "learning_rate": 0.0001, + "loss": 7.3568, + "loss/crossentropy": 2.0966979548335076, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.1796438904479146, + "step": 12690 + }, + { + "epoch": 0.3175, + "grad_norm": 30.125, + "grad_norm_var": 21.9853515625, + "learning_rate": 0.0001, + "loss": 7.3281, + "loss/crossentropy": 2.2050976656377315, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.18696724623441696, + "step": 12700 + }, + { + "epoch": 0.31775, + "grad_norm": 31.25, + "grad_norm_var": 20.936458333333334, + "learning_rate": 0.0001, + "loss": 7.2583, + "loss/crossentropy": 2.0703148849308493, + "loss/hidden": 3.267578125, + "loss/jsd": 0.0, + "loss/logits": 0.17689838781952857, + "step": 12710 + }, + { + "epoch": 0.318, + "grad_norm": 31.75, + "grad_norm_var": 1.9395833333333334, + "learning_rate": 0.0001, + "loss": 7.3007, + "loss/crossentropy": 1.9791635520756246, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.17443582694977522, + "step": 12720 + }, + { + "epoch": 0.31825, + "grad_norm": 30.25, + "grad_norm_var": 2.5155598958333334, + "learning_rate": 0.0001, + "loss": 7.2557, + "loss/crossentropy": 2.0753975957632065, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.18456004988402128, + "step": 12730 + }, + { + "epoch": 0.3185, + "grad_norm": 29.125, + "grad_norm_var": 1.3518229166666667, + "learning_rate": 0.0001, + "loss": 7.2658, + "loss/crossentropy": 2.0314756602048876, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.18754468094557525, + "step": 12740 + }, + { + "epoch": 0.31875, + "grad_norm": 29.25, + "grad_norm_var": 1.6634765625, + "learning_rate": 0.0001, + "loss": 7.2545, + "loss/crossentropy": 2.027373602986336, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.18138464018702508, + "step": 12750 + }, + { + "epoch": 0.319, + "grad_norm": 33.0, + "grad_norm_var": 2.2837890625, + "learning_rate": 0.0001, + "loss": 7.2711, + "loss/crossentropy": 2.0996690608561037, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.19384829625487326, + "step": 12760 + }, + { + "epoch": 0.31925, + "grad_norm": 29.5, + "grad_norm_var": 1.7580729166666667, + "learning_rate": 0.0001, + "loss": 7.3448, + "loss/crossentropy": 2.081255576014519, + "loss/hidden": 3.48515625, + "loss/jsd": 0.0, + "loss/logits": 0.2054756512865424, + "step": 12770 + }, + { + "epoch": 0.3195, + "grad_norm": 30.375, + "grad_norm_var": 1.6337890625, + "learning_rate": 0.0001, + "loss": 7.3614, + "loss/crossentropy": 2.109497997164726, + "loss/hidden": 3.2984375, + "loss/jsd": 0.0, + "loss/logits": 0.1826931856572628, + "step": 12780 + }, + { + "epoch": 0.31975, + "grad_norm": 29.125, + "grad_norm_var": 2.9125, + "learning_rate": 0.0001, + "loss": 7.3569, + "loss/crossentropy": 2.071355660259724, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.18764556515961886, + "step": 12790 + }, + { + "epoch": 0.32, + "grad_norm": 31.0, + "grad_norm_var": 5.68125, + "learning_rate": 0.0001, + "loss": 7.3399, + "loss/crossentropy": 1.9978457435965538, + "loss/hidden": 3.365625, + "loss/jsd": 0.0, + "loss/logits": 0.17209083335474135, + "step": 12800 + }, + { + "epoch": 0.32025, + "grad_norm": 28.75, + "grad_norm_var": 20.754166666666666, + "learning_rate": 0.0001, + "loss": 7.3016, + "loss/crossentropy": 1.984939170628786, + "loss/hidden": 3.50625, + "loss/jsd": 0.0, + "loss/logits": 0.19444225933402776, + "step": 12810 + }, + { + "epoch": 0.3205, + "grad_norm": 28.875, + "grad_norm_var": 3.3942057291666665, + "learning_rate": 0.0001, + "loss": 7.3066, + "loss/crossentropy": 1.8981015786528588, + "loss/hidden": 3.41171875, + "loss/jsd": 0.0, + "loss/logits": 0.18696952695026994, + "step": 12820 + }, + { + "epoch": 0.32075, + "grad_norm": 30.25, + "grad_norm_var": 3.9218098958333334, + "learning_rate": 0.0001, + "loss": 7.3974, + "loss/crossentropy": 2.1415200501680376, + "loss/hidden": 3.434375, + "loss/jsd": 0.0, + "loss/logits": 0.20154954344034196, + "step": 12830 + }, + { + "epoch": 0.321, + "grad_norm": 31.375, + "grad_norm_var": 35.8400390625, + "learning_rate": 0.0001, + "loss": 7.3048, + "loss/crossentropy": 2.1483262166380883, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.19059547781944275, + "step": 12840 + }, + { + "epoch": 0.32125, + "grad_norm": 31.0, + "grad_norm_var": 2.561393229166667, + "learning_rate": 0.0001, + "loss": 7.4233, + "loss/crossentropy": 2.1605855494737627, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.19197470052167773, + "step": 12850 + }, + { + "epoch": 0.3215, + "grad_norm": 29.375, + "grad_norm_var": 1.98125, + "learning_rate": 0.0001, + "loss": 7.3826, + "loss/crossentropy": 2.1228017687797545, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.21111836601048709, + "step": 12860 + }, + { + "epoch": 0.32175, + "grad_norm": 32.75, + "grad_norm_var": 2.6666666666666665, + "learning_rate": 0.0001, + "loss": 7.317, + "loss/crossentropy": 1.9638956546783448, + "loss/hidden": 3.3625, + "loss/jsd": 0.0, + "loss/logits": 0.18815859649330377, + "step": 12870 + }, + { + "epoch": 0.322, + "grad_norm": 30.875, + "grad_norm_var": 2.5927083333333334, + "learning_rate": 0.0001, + "loss": 7.3412, + "loss/crossentropy": 2.15369998216629, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.19398804232478142, + "step": 12880 + }, + { + "epoch": 0.32225, + "grad_norm": 28.875, + "grad_norm_var": 4.522916666666666, + "learning_rate": 0.0001, + "loss": 7.3785, + "loss/crossentropy": 2.134490595757961, + "loss/hidden": 3.367578125, + "loss/jsd": 0.0, + "loss/logits": 0.19743212331086396, + "step": 12890 + }, + { + "epoch": 0.3225, + "grad_norm": 28.125, + "grad_norm_var": 4.824739583333334, + "learning_rate": 0.0001, + "loss": 7.2795, + "loss/crossentropy": 2.05883831679821, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.1887126075103879, + "step": 12900 + }, + { + "epoch": 0.32275, + "grad_norm": 31.125, + "grad_norm_var": 1.6257994456428096e+18, + "learning_rate": 0.0001, + "loss": 7.3637, + "loss/crossentropy": 1.865815930068493, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.16476531345397233, + "step": 12910 + }, + { + "epoch": 0.323, + "grad_norm": 27.375, + "grad_norm_var": 1.6257994456056202e+18, + "learning_rate": 0.0001, + "loss": 7.3231, + "loss/crossentropy": 2.1335863292217256, + "loss/hidden": 3.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.18054731655865908, + "step": 12920 + }, + { + "epoch": 0.32325, + "grad_norm": 30.5, + "grad_norm_var": 1.6822265625, + "learning_rate": 0.0001, + "loss": 7.2992, + "loss/crossentropy": 2.029070366919041, + "loss/hidden": 3.292578125, + "loss/jsd": 0.0, + "loss/logits": 0.1656917490065098, + "step": 12930 + }, + { + "epoch": 0.3235, + "grad_norm": 29.875, + "grad_norm_var": 1.9139973958333334, + "learning_rate": 0.0001, + "loss": 7.3529, + "loss/crossentropy": 2.0367226876318454, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.18601053059101105, + "step": 12940 + }, + { + "epoch": 0.32375, + "grad_norm": 27.875, + "grad_norm_var": 2.1228515625, + "learning_rate": 0.0001, + "loss": 7.146, + "loss/crossentropy": 2.008280509710312, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.17591428142040968, + "step": 12950 + }, + { + "epoch": 0.324, + "grad_norm": 31.5, + "grad_norm_var": 1.8893229166666667, + "learning_rate": 0.0001, + "loss": 7.3616, + "loss/crossentropy": 2.1186923176050185, + "loss/hidden": 3.476171875, + "loss/jsd": 0.0, + "loss/logits": 0.20130661278963088, + "step": 12960 + }, + { + "epoch": 0.32425, + "grad_norm": 28.0, + "grad_norm_var": 1.4749348958333333, + "learning_rate": 0.0001, + "loss": 7.3933, + "loss/crossentropy": 2.1367026805877685, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.19593327604234217, + "step": 12970 + }, + { + "epoch": 0.3245, + "grad_norm": 33.5, + "grad_norm_var": 2.5009765625, + "learning_rate": 0.0001, + "loss": 7.36, + "loss/crossentropy": 1.9709487736225129, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.18522804593667389, + "step": 12980 + }, + { + "epoch": 0.32475, + "grad_norm": 32.25, + "grad_norm_var": 3.037955729166667, + "learning_rate": 0.0001, + "loss": 7.3995, + "loss/crossentropy": 2.182401825487614, + "loss/hidden": 3.288671875, + "loss/jsd": 0.0, + "loss/logits": 0.1944254267960787, + "step": 12990 + }, + { + "epoch": 0.325, + "grad_norm": 31.875, + "grad_norm_var": 1.8259765625, + "learning_rate": 0.0001, + "loss": 7.297, + "loss/crossentropy": 2.144014260172844, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.18785431496798993, + "step": 13000 + }, + { + "epoch": 0.32525, + "grad_norm": 29.125, + "grad_norm_var": 1.5858723958333334, + "learning_rate": 0.0001, + "loss": 7.4172, + "loss/crossentropy": 2.185704696178436, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.21230035796761512, + "step": 13010 + }, + { + "epoch": 0.3255, + "grad_norm": 33.75, + "grad_norm_var": 2.0989583333333335, + "learning_rate": 0.0001, + "loss": 7.3542, + "loss/crossentropy": 2.169140038639307, + "loss/hidden": 3.367578125, + "loss/jsd": 0.0, + "loss/logits": 0.20137128187343478, + "step": 13020 + }, + { + "epoch": 0.32575, + "grad_norm": 29.0, + "grad_norm_var": 1.7613932291666667, + "learning_rate": 0.0001, + "loss": 7.3018, + "loss/crossentropy": 2.0429045438766478, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.1801679054275155, + "step": 13030 + }, + { + "epoch": 0.326, + "grad_norm": 28.25, + "grad_norm_var": 20.3603515625, + "learning_rate": 0.0001, + "loss": 7.3076, + "loss/crossentropy": 2.090936814248562, + "loss/hidden": 3.363671875, + "loss/jsd": 0.0, + "loss/logits": 0.18894913028925658, + "step": 13040 + }, + { + "epoch": 0.32625, + "grad_norm": 29.5, + "grad_norm_var": 21.5759765625, + "learning_rate": 0.0001, + "loss": 7.3032, + "loss/crossentropy": 2.0933708012104035, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.18349476121366023, + "step": 13050 + }, + { + "epoch": 0.3265, + "grad_norm": 31.125, + "grad_norm_var": 23.737434895833335, + "learning_rate": 0.0001, + "loss": 7.4007, + "loss/crossentropy": 1.9636264845728875, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.17653203681111335, + "step": 13060 + }, + { + "epoch": 0.32675, + "grad_norm": 30.875, + "grad_norm_var": 26.170833333333334, + "learning_rate": 0.0001, + "loss": 7.2685, + "loss/crossentropy": 2.0459864370524885, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.18335657650604845, + "step": 13070 + }, + { + "epoch": 0.327, + "grad_norm": 32.25, + "grad_norm_var": 13.033072916666667, + "learning_rate": 0.0001, + "loss": 7.3321, + "loss/crossentropy": 2.2077785804867744, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.1968739289790392, + "step": 13080 + }, + { + "epoch": 0.32725, + "grad_norm": 29.625, + "grad_norm_var": 8.8634765625, + "learning_rate": 0.0001, + "loss": 7.2613, + "loss/crossentropy": 2.039609357714653, + "loss/hidden": 3.45078125, + "loss/jsd": 0.0, + "loss/logits": 0.19310947302728892, + "step": 13090 + }, + { + "epoch": 0.3275, + "grad_norm": 29.875, + "grad_norm_var": 8.334309895833334, + "learning_rate": 0.0001, + "loss": 7.3587, + "loss/crossentropy": 2.0822102136909963, + "loss/hidden": 3.26640625, + "loss/jsd": 0.0, + "loss/logits": 0.1863661216571927, + "step": 13100 + }, + { + "epoch": 0.32775, + "grad_norm": 34.25, + "grad_norm_var": 8.694205729166667, + "learning_rate": 0.0001, + "loss": 7.3602, + "loss/crossentropy": 2.167416235804558, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.19521966725587844, + "step": 13110 + }, + { + "epoch": 0.328, + "grad_norm": 29.125, + "grad_norm_var": 7.191080729166667, + "learning_rate": 0.0001, + "loss": 7.3767, + "loss/crossentropy": 2.07031906619668, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.22052818778902292, + "step": 13120 + }, + { + "epoch": 0.32825, + "grad_norm": 35.5, + "grad_norm_var": 5.821809895833334, + "learning_rate": 0.0001, + "loss": 7.5034, + "loss/crossentropy": 2.1318026900291445, + "loss/hidden": 3.4046875, + "loss/jsd": 0.0, + "loss/logits": 0.19695397429168224, + "step": 13130 + }, + { + "epoch": 0.3285, + "grad_norm": 29.5, + "grad_norm_var": 4.738541666666666, + "learning_rate": 0.0001, + "loss": 7.2907, + "loss/crossentropy": 1.893085064738989, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.17454516123980285, + "step": 13140 + }, + { + "epoch": 0.32875, + "grad_norm": 33.25, + "grad_norm_var": 4.96640625, + "learning_rate": 0.0001, + "loss": 7.3911, + "loss/crossentropy": 2.0709325544536115, + "loss/hidden": 3.405078125, + "loss/jsd": 0.0, + "loss/logits": 0.19108830243349076, + "step": 13150 + }, + { + "epoch": 0.329, + "grad_norm": 30.375, + "grad_norm_var": 6.593489583333334, + "learning_rate": 0.0001, + "loss": 7.278, + "loss/crossentropy": 2.0809618443250657, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.1867564545944333, + "step": 13160 + }, + { + "epoch": 0.32925, + "grad_norm": 28.875, + "grad_norm_var": 5.355143229166667, + "learning_rate": 0.0001, + "loss": 7.1865, + "loss/crossentropy": 1.86175979077816, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.18658734802156687, + "step": 13170 + }, + { + "epoch": 0.3295, + "grad_norm": 32.5, + "grad_norm_var": 7.297916666666667, + "learning_rate": 0.0001, + "loss": 7.2993, + "loss/crossentropy": 2.0688765406608582, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.1887798959389329, + "step": 13180 + }, + { + "epoch": 0.32975, + "grad_norm": 32.0, + "grad_norm_var": 3.1462890625, + "learning_rate": 0.0001, + "loss": 7.396, + "loss/crossentropy": 2.1349074259400367, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.1891916124150157, + "step": 13190 + }, + { + "epoch": 0.33, + "grad_norm": 29.375, + "grad_norm_var": 3.2080729166666666, + "learning_rate": 0.0001, + "loss": 7.2194, + "loss/crossentropy": 2.0701663225889204, + "loss/hidden": 3.290625, + "loss/jsd": 0.0, + "loss/logits": 0.19238610491156577, + "step": 13200 + }, + { + "epoch": 0.33025, + "grad_norm": 29.875, + "grad_norm_var": 1.3447265625, + "learning_rate": 0.0001, + "loss": 7.3476, + "loss/crossentropy": 2.126928760111332, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.18552705701440572, + "step": 13210 + }, + { + "epoch": 0.3305, + "grad_norm": 29.875, + "grad_norm_var": 2.4916666666666667, + "learning_rate": 0.0001, + "loss": 7.3275, + "loss/crossentropy": 2.0960667990148067, + "loss/hidden": 3.3546875, + "loss/jsd": 0.0, + "loss/logits": 0.18840519580990076, + "step": 13220 + }, + { + "epoch": 0.33075, + "grad_norm": 29.0, + "grad_norm_var": 31.097916666666666, + "learning_rate": 0.0001, + "loss": 7.3449, + "loss/crossentropy": 1.9724566139280797, + "loss/hidden": 3.401171875, + "loss/jsd": 0.0, + "loss/logits": 0.17257819082587958, + "step": 13230 + }, + { + "epoch": 0.331, + "grad_norm": 30.25, + "grad_norm_var": 29.516080729166667, + "learning_rate": 0.0001, + "loss": 7.5214, + "loss/crossentropy": 2.144184100627899, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.19250041600316764, + "step": 13240 + }, + { + "epoch": 0.33125, + "grad_norm": 28.25, + "grad_norm_var": 6.55, + "learning_rate": 0.0001, + "loss": 7.3245, + "loss/crossentropy": 1.9724821582436562, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.18454758413136005, + "step": 13250 + }, + { + "epoch": 0.3315, + "grad_norm": 35.75, + "grad_norm_var": 10.574739583333333, + "learning_rate": 0.0001, + "loss": 7.2067, + "loss/crossentropy": 2.0341859996318816, + "loss/hidden": 3.466015625, + "loss/jsd": 0.0, + "loss/logits": 0.19355686828494073, + "step": 13260 + }, + { + "epoch": 0.33175, + "grad_norm": 31.875, + "grad_norm_var": 6.093489583333334, + "learning_rate": 0.0001, + "loss": 7.3386, + "loss/crossentropy": 2.1593449860811234, + "loss/hidden": 3.282421875, + "loss/jsd": 0.0, + "loss/logits": 0.18228702070191502, + "step": 13270 + }, + { + "epoch": 0.332, + "grad_norm": 32.25, + "grad_norm_var": 4.518489583333333, + "learning_rate": 0.0001, + "loss": 7.2584, + "loss/crossentropy": 2.021310421824455, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.193563433829695, + "step": 13280 + }, + { + "epoch": 0.33225, + "grad_norm": 33.25, + "grad_norm_var": 3.008072916666667, + "learning_rate": 0.0001, + "loss": 7.3654, + "loss/crossentropy": 2.010246267169714, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.18375244587659836, + "step": 13290 + }, + { + "epoch": 0.3325, + "grad_norm": 33.5, + "grad_norm_var": 1.8309895833333334, + "learning_rate": 0.0001, + "loss": 7.4013, + "loss/crossentropy": 2.1324703454971314, + "loss/hidden": 3.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.19651575423777104, + "step": 13300 + }, + { + "epoch": 0.33275, + "grad_norm": 31.625, + "grad_norm_var": 9.018489583333333, + "learning_rate": 0.0001, + "loss": 7.4123, + "loss/crossentropy": 2.2024870067834854, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.19117040373384953, + "step": 13310 + }, + { + "epoch": 0.333, + "grad_norm": 29.5, + "grad_norm_var": 54.362239583333334, + "learning_rate": 0.0001, + "loss": 7.3561, + "loss/crossentropy": 1.9291939452290534, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.18055410776287317, + "step": 13320 + }, + { + "epoch": 0.33325, + "grad_norm": 30.5, + "grad_norm_var": 53.271875, + "learning_rate": 0.0001, + "loss": 7.3417, + "loss/crossentropy": 1.9812588825821877, + "loss/hidden": 3.246875, + "loss/jsd": 0.0, + "loss/logits": 0.17277830513194203, + "step": 13330 + }, + { + "epoch": 0.3335, + "grad_norm": 30.625, + "grad_norm_var": 2.0332682291666666, + "learning_rate": 0.0001, + "loss": 7.4792, + "loss/crossentropy": 2.1468162171542646, + "loss/hidden": 3.29453125, + "loss/jsd": 0.0, + "loss/logits": 0.1815652133896947, + "step": 13340 + }, + { + "epoch": 0.33375, + "grad_norm": 30.5, + "grad_norm_var": 1.8942057291666667, + "learning_rate": 0.0001, + "loss": 7.3437, + "loss/crossentropy": 1.9899487346410751, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.1736677044071257, + "step": 13350 + }, + { + "epoch": 0.334, + "grad_norm": 32.75, + "grad_norm_var": 3.374955311060432e+18, + "learning_rate": 0.0001, + "loss": 7.4592, + "loss/crossentropy": 2.0041578873991965, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.19686540886759757, + "step": 13360 + }, + { + "epoch": 0.33425, + "grad_norm": 29.375, + "grad_norm_var": 3.3749553110757407e+18, + "learning_rate": 0.0001, + "loss": 7.3521, + "loss/crossentropy": 2.050678627192974, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.1835683614946902, + "step": 13370 + }, + { + "epoch": 0.3345, + "grad_norm": 32.5, + "grad_norm_var": 1.4561848958333334, + "learning_rate": 0.0001, + "loss": 7.3463, + "loss/crossentropy": 2.255459001660347, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.2026721488684416, + "step": 13380 + }, + { + "epoch": 0.33475, + "grad_norm": 55.0, + "grad_norm_var": 37.805989583333336, + "learning_rate": 0.0001, + "loss": 7.432, + "loss/crossentropy": 2.092339722812176, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.20365451760590075, + "step": 13390 + }, + { + "epoch": 0.335, + "grad_norm": 29.5, + "grad_norm_var": 39.1962890625, + "learning_rate": 0.0001, + "loss": 7.3082, + "loss/crossentropy": 2.060484157502651, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.19208470694720745, + "step": 13400 + }, + { + "epoch": 0.33525, + "grad_norm": 30.875, + "grad_norm_var": 4.119205729166667, + "learning_rate": 0.0001, + "loss": 7.3159, + "loss/crossentropy": 2.0885482341051103, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.18701701434329152, + "step": 13410 + }, + { + "epoch": 0.3355, + "grad_norm": 34.75, + "grad_norm_var": 3.8354166666666667, + "learning_rate": 0.0001, + "loss": 7.3792, + "loss/crossentropy": 2.064309825748205, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.20586481597274542, + "step": 13420 + }, + { + "epoch": 0.33575, + "grad_norm": 29.75, + "grad_norm_var": 2.535416666666667, + "learning_rate": 0.0001, + "loss": 7.4193, + "loss/crossentropy": 1.8737250491976738, + "loss/hidden": 3.427734375, + "loss/jsd": 0.0, + "loss/logits": 0.19758351668715476, + "step": 13430 + }, + { + "epoch": 0.336, + "grad_norm": 30.25, + "grad_norm_var": 3.8686848958333333, + "learning_rate": 0.0001, + "loss": 7.4324, + "loss/crossentropy": 1.9396042831242084, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.18089414723217487, + "step": 13440 + }, + { + "epoch": 0.33625, + "grad_norm": 28.875, + "grad_norm_var": 4.540625, + "learning_rate": 0.0001, + "loss": 7.3798, + "loss/crossentropy": 2.004528859257698, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.22120596412569285, + "step": 13450 + }, + { + "epoch": 0.3365, + "grad_norm": 29.125, + "grad_norm_var": 1.69375, + "learning_rate": 0.0001, + "loss": 7.3708, + "loss/crossentropy": 2.170805335044861, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.18870076667517424, + "step": 13460 + }, + { + "epoch": 0.33675, + "grad_norm": 29.375, + "grad_norm_var": 2.9284656301175997e+18, + "learning_rate": 0.0001, + "loss": 7.3594, + "loss/crossentropy": 2.0726011991500854, + "loss/hidden": 3.310546875, + "loss/jsd": 0.0, + "loss/logits": 0.18316982481628657, + "step": 13470 + }, + { + "epoch": 0.337, + "grad_norm": 32.25, + "grad_norm_var": 3.999739583333333, + "learning_rate": 0.0001, + "loss": 7.3072, + "loss/crossentropy": 2.0701726540923118, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.18232779456302523, + "step": 13480 + }, + { + "epoch": 0.33725, + "grad_norm": 31.625, + "grad_norm_var": 1.01640625, + "learning_rate": 0.0001, + "loss": 7.3553, + "loss/crossentropy": 2.066875821352005, + "loss/hidden": 3.34765625, + "loss/jsd": 0.0, + "loss/logits": 0.18299774192273616, + "step": 13490 + }, + { + "epoch": 0.3375, + "grad_norm": 32.0, + "grad_norm_var": 1.4489583333333333, + "learning_rate": 0.0001, + "loss": 7.2708, + "loss/crossentropy": 2.110422171652317, + "loss/hidden": 3.353125, + "loss/jsd": 0.0, + "loss/logits": 0.19538786429911853, + "step": 13500 + }, + { + "epoch": 0.33775, + "grad_norm": 29.875, + "grad_norm_var": 1.06015625, + "learning_rate": 0.0001, + "loss": 7.3529, + "loss/crossentropy": 2.1061682522296907, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.19031856823712587, + "step": 13510 + }, + { + "epoch": 0.338, + "grad_norm": 32.5, + "grad_norm_var": 1.40390625, + "learning_rate": 0.0001, + "loss": 7.2473, + "loss/crossentropy": 2.0029176853597166, + "loss/hidden": 3.284375, + "loss/jsd": 0.0, + "loss/logits": 0.182035060133785, + "step": 13520 + }, + { + "epoch": 0.33825, + "grad_norm": 29.375, + "grad_norm_var": 9.417708333333334, + "learning_rate": 0.0001, + "loss": 7.3084, + "loss/crossentropy": 2.1238187912851574, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.188486148416996, + "step": 13530 + }, + { + "epoch": 0.3385, + "grad_norm": 30.25, + "grad_norm_var": 1.3983723958333334, + "learning_rate": 0.0001, + "loss": 7.2256, + "loss/crossentropy": 1.9766217768192291, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.17294846558943391, + "step": 13540 + }, + { + "epoch": 0.33875, + "grad_norm": 29.375, + "grad_norm_var": 3.0874348958333333, + "learning_rate": 0.0001, + "loss": 7.327, + "loss/crossentropy": 2.1272383123636245, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.18510236088186502, + "step": 13550 + }, + { + "epoch": 0.339, + "grad_norm": 31.0, + "grad_norm_var": 3.10390625, + "learning_rate": 0.0001, + "loss": 7.2885, + "loss/crossentropy": 2.0515687823295594, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.18967761769890784, + "step": 13560 + }, + { + "epoch": 0.33925, + "grad_norm": 29.625, + "grad_norm_var": 11.933072916666667, + "learning_rate": 0.0001, + "loss": 7.3573, + "loss/crossentropy": 2.0446967758238315, + "loss/hidden": 3.425, + "loss/jsd": 0.0, + "loss/logits": 0.18640230242162942, + "step": 13570 + }, + { + "epoch": 0.3395, + "grad_norm": 28.875, + "grad_norm_var": 1.84765625, + "learning_rate": 0.0001, + "loss": 7.2965, + "loss/crossentropy": 1.9837520524859429, + "loss/hidden": 3.3171875, + "loss/jsd": 0.0, + "loss/logits": 0.1916389312595129, + "step": 13580 + }, + { + "epoch": 0.33975, + "grad_norm": 30.5, + "grad_norm_var": 3.1791666666666667, + "learning_rate": 0.0001, + "loss": 7.3632, + "loss/crossentropy": 2.1806528866291046, + "loss/hidden": 3.337109375, + "loss/jsd": 0.0, + "loss/logits": 0.19690024815499782, + "step": 13590 + }, + { + "epoch": 0.34, + "grad_norm": 30.75, + "grad_norm_var": 1.3385416666666667, + "learning_rate": 0.0001, + "loss": 7.3761, + "loss/crossentropy": 2.215220719575882, + "loss/hidden": 3.312109375, + "loss/jsd": 0.0, + "loss/logits": 0.19034639187157154, + "step": 13600 + }, + { + "epoch": 0.34025, + "grad_norm": 27.375, + "grad_norm_var": 2.7129557291666666, + "learning_rate": 0.0001, + "loss": 7.3129, + "loss/crossentropy": 2.16262392103672, + "loss/hidden": 3.503125, + "loss/jsd": 0.0, + "loss/logits": 0.20174582321196793, + "step": 13610 + }, + { + "epoch": 0.3405, + "grad_norm": 30.5, + "grad_norm_var": 3.2363932291666666, + "learning_rate": 0.0001, + "loss": 7.3501, + "loss/crossentropy": 2.126379433274269, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.18432336021214724, + "step": 13620 + }, + { + "epoch": 0.34075, + "grad_norm": 30.875, + "grad_norm_var": 1.8176432291666667, + "learning_rate": 0.0001, + "loss": 7.3601, + "loss/crossentropy": 2.0324837125837805, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.18331200983375312, + "step": 13630 + }, + { + "epoch": 0.341, + "grad_norm": 29.0, + "grad_norm_var": 1.7942057291666667, + "learning_rate": 0.0001, + "loss": 7.3882, + "loss/crossentropy": 2.176556921005249, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.19637300558388232, + "step": 13640 + }, + { + "epoch": 0.34125, + "grad_norm": 29.875, + "grad_norm_var": 3.758072916666667, + "learning_rate": 0.0001, + "loss": 7.1992, + "loss/crossentropy": 1.9680285774171353, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.1837764661759138, + "step": 13650 + }, + { + "epoch": 0.3415, + "grad_norm": 29.125, + "grad_norm_var": 2.6936848958333335, + "learning_rate": 0.0001, + "loss": 7.361, + "loss/crossentropy": 2.1511219844222067, + "loss/hidden": 3.29140625, + "loss/jsd": 0.0, + "loss/logits": 0.1746565790846944, + "step": 13660 + }, + { + "epoch": 0.34175, + "grad_norm": 28.375, + "grad_norm_var": 2.312239583333333, + "learning_rate": 0.0001, + "loss": 7.3856, + "loss/crossentropy": 2.187084162980318, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.18925763312727212, + "step": 13670 + }, + { + "epoch": 0.342, + "grad_norm": 29.75, + "grad_norm_var": 2.4139973958333334, + "learning_rate": 0.0001, + "loss": 7.196, + "loss/crossentropy": 2.0472760550677775, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.17813790533691645, + "step": 13680 + }, + { + "epoch": 0.34225, + "grad_norm": 32.25, + "grad_norm_var": 1.6458333333333333, + "learning_rate": 0.0001, + "loss": 7.4184, + "loss/crossentropy": 2.0482982218265535, + "loss/hidden": 3.483203125, + "loss/jsd": 0.0, + "loss/logits": 0.1855379816144705, + "step": 13690 + }, + { + "epoch": 0.3425, + "grad_norm": 27.875, + "grad_norm_var": 98.41432291666666, + "learning_rate": 0.0001, + "loss": 7.2863, + "loss/crossentropy": 2.0204505778849127, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.17318440061062573, + "step": 13700 + }, + { + "epoch": 0.34275, + "grad_norm": 28.375, + "grad_norm_var": 33.1900390625, + "learning_rate": 0.0001, + "loss": 7.3409, + "loss/crossentropy": 2.0783301174640654, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.19091757852584124, + "step": 13710 + }, + { + "epoch": 0.343, + "grad_norm": 32.0, + "grad_norm_var": 3.7150390625, + "learning_rate": 0.0001, + "loss": 7.3412, + "loss/crossentropy": 2.0553587220609186, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.18813807256519793, + "step": 13720 + }, + { + "epoch": 0.34325, + "grad_norm": 34.25, + "grad_norm_var": 1586.7400390625, + "learning_rate": 0.0001, + "loss": 7.4918, + "loss/crossentropy": 2.14101425036788, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.1929340995848179, + "step": 13730 + }, + { + "epoch": 0.3435, + "grad_norm": 32.5, + "grad_norm_var": 4.633072916666666, + "learning_rate": 0.0001, + "loss": 7.3162, + "loss/crossentropy": 2.0531166955828666, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.18961485847830772, + "step": 13740 + }, + { + "epoch": 0.34375, + "grad_norm": 51.5, + "grad_norm_var": 29.780989583333334, + "learning_rate": 0.0001, + "loss": 7.3981, + "loss/crossentropy": 2.146902695298195, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.18639619778841734, + "step": 13750 + }, + { + "epoch": 0.344, + "grad_norm": 31.875, + "grad_norm_var": 27.141080729166667, + "learning_rate": 0.0001, + "loss": 7.3596, + "loss/crossentropy": 2.2127513885498047, + "loss/hidden": 3.407421875, + "loss/jsd": 0.0, + "loss/logits": 0.19791291914880277, + "step": 13760 + }, + { + "epoch": 0.34425, + "grad_norm": 32.25, + "grad_norm_var": 1.9809895833333333, + "learning_rate": 0.0001, + "loss": 7.3245, + "loss/crossentropy": 2.113487794995308, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.18752570655196904, + "step": 13770 + }, + { + "epoch": 0.3445, + "grad_norm": 28.875, + "grad_norm_var": 2.7666666666666666, + "learning_rate": 0.0001, + "loss": 7.3737, + "loss/crossentropy": 2.192716282606125, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1977760722860694, + "step": 13780 + }, + { + "epoch": 0.34475, + "grad_norm": 30.375, + "grad_norm_var": 1.4999348958333334, + "learning_rate": 0.0001, + "loss": 7.2308, + "loss/crossentropy": 2.1602986216545106, + "loss/hidden": 3.2796875, + "loss/jsd": 0.0, + "loss/logits": 0.1859057329595089, + "step": 13790 + }, + { + "epoch": 0.345, + "grad_norm": 45.25, + "grad_norm_var": 15.587434895833333, + "learning_rate": 0.0001, + "loss": 7.3689, + "loss/crossentropy": 2.042529730498791, + "loss/hidden": 3.34765625, + "loss/jsd": 0.0, + "loss/logits": 0.2055663662031293, + "step": 13800 + }, + { + "epoch": 0.34525, + "grad_norm": 31.375, + "grad_norm_var": 16.111458333333335, + "learning_rate": 0.0001, + "loss": 7.355, + "loss/crossentropy": 2.004579763114452, + "loss/hidden": 3.46171875, + "loss/jsd": 0.0, + "loss/logits": 0.18781258668750525, + "step": 13810 + }, + { + "epoch": 0.3455, + "grad_norm": 27.75, + "grad_norm_var": 2.9369140625, + "learning_rate": 0.0001, + "loss": 7.2946, + "loss/crossentropy": 1.9647000446915626, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.18860411625355483, + "step": 13820 + }, + { + "epoch": 0.34575, + "grad_norm": 28.625, + "grad_norm_var": 2.4830729166666665, + "learning_rate": 0.0001, + "loss": 7.3643, + "loss/crossentropy": 2.1840324103832245, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.18433275502175092, + "step": 13830 + }, + { + "epoch": 0.346, + "grad_norm": 31.0, + "grad_norm_var": 2.061393229166667, + "learning_rate": 0.0001, + "loss": 7.39, + "loss/crossentropy": 2.1660002395510674, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.18589738458395005, + "step": 13840 + }, + { + "epoch": 0.34625, + "grad_norm": 30.625, + "grad_norm_var": 3.237434895833333, + "learning_rate": 0.0001, + "loss": 7.2772, + "loss/crossentropy": 2.0304790273308755, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.19276629090309144, + "step": 13850 + }, + { + "epoch": 0.3465, + "grad_norm": 27.375, + "grad_norm_var": 4.35390625, + "learning_rate": 0.0001, + "loss": 7.2433, + "loss/crossentropy": 2.0913878247141837, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.17961858343333006, + "step": 13860 + }, + { + "epoch": 0.34675, + "grad_norm": 31.625, + "grad_norm_var": 1.9833333333333334, + "learning_rate": 0.0001, + "loss": 7.298, + "loss/crossentropy": 1.9085475612431764, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.1773257221095264, + "step": 13870 + }, + { + "epoch": 0.347, + "grad_norm": 31.75, + "grad_norm_var": 1.2061848958333334, + "learning_rate": 0.0001, + "loss": 7.3026, + "loss/crossentropy": 2.134579537808895, + "loss/hidden": 3.29375, + "loss/jsd": 0.0, + "loss/logits": 0.18533094711601733, + "step": 13880 + }, + { + "epoch": 0.34725, + "grad_norm": 29.75, + "grad_norm_var": 1.74140625, + "learning_rate": 0.0001, + "loss": 7.2397, + "loss/crossentropy": 2.0479501873254775, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.1840794005431235, + "step": 13890 + }, + { + "epoch": 0.3475, + "grad_norm": 30.625, + "grad_norm_var": 2.6400390625, + "learning_rate": 0.0001, + "loss": 7.2447, + "loss/crossentropy": 2.0175841793417932, + "loss/hidden": 3.29296875, + "loss/jsd": 0.0, + "loss/logits": 0.17613419592380525, + "step": 13900 + }, + { + "epoch": 0.34775, + "grad_norm": 32.5, + "grad_norm_var": 5.333072916666667, + "learning_rate": 0.0001, + "loss": 7.3573, + "loss/crossentropy": 2.248972457647324, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.19207954667508603, + "step": 13910 + }, + { + "epoch": 0.348, + "grad_norm": 31.0, + "grad_norm_var": 4.843489583333334, + "learning_rate": 0.0001, + "loss": 7.3649, + "loss/crossentropy": 2.215424671769142, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.19513901360332966, + "step": 13920 + }, + { + "epoch": 0.34825, + "grad_norm": 30.25, + "grad_norm_var": 2.004622395833333, + "learning_rate": 0.0001, + "loss": 7.4324, + "loss/crossentropy": 2.157894307374954, + "loss/hidden": 3.457421875, + "loss/jsd": 0.0, + "loss/logits": 0.20247049070894718, + "step": 13930 + }, + { + "epoch": 0.3485, + "grad_norm": 28.5, + "grad_norm_var": 2.4150390625, + "learning_rate": 0.0001, + "loss": 7.3422, + "loss/crossentropy": 2.14641355201602, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.1898975013755262, + "step": 13940 + }, + { + "epoch": 0.34875, + "grad_norm": 28.5, + "grad_norm_var": 6.0103515625, + "learning_rate": 0.0001, + "loss": 7.3309, + "loss/crossentropy": 2.0432268232107162, + "loss/hidden": 3.461328125, + "loss/jsd": 0.0, + "loss/logits": 0.19791582636535168, + "step": 13950 + }, + { + "epoch": 0.349, + "grad_norm": 28.875, + "grad_norm_var": 4.4744140625, + "learning_rate": 0.0001, + "loss": 7.3187, + "loss/crossentropy": 2.1648118555545808, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.1794663654640317, + "step": 13960 + }, + { + "epoch": 0.34925, + "grad_norm": 29.5, + "grad_norm_var": 3.1332682291666667, + "learning_rate": 0.0001, + "loss": 7.3122, + "loss/crossentropy": 2.0566162675619126, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.18033871669322252, + "step": 13970 + }, + { + "epoch": 0.3495, + "grad_norm": 27.875, + "grad_norm_var": 1.3686848958333333, + "learning_rate": 0.0001, + "loss": 7.2538, + "loss/crossentropy": 2.0309479638934134, + "loss/hidden": 3.316015625, + "loss/jsd": 0.0, + "loss/logits": 0.1834420131519437, + "step": 13980 + }, + { + "epoch": 0.34975, + "grad_norm": 30.0, + "grad_norm_var": 2.3510416666666667, + "learning_rate": 0.0001, + "loss": 7.3068, + "loss/crossentropy": 2.0843526370823384, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.18843147568404675, + "step": 13990 + }, + { + "epoch": 0.35, + "grad_norm": 34.25, + "grad_norm_var": 10.67265625, + "learning_rate": 0.0001, + "loss": 7.3839, + "loss/crossentropy": 2.0708370715379716, + "loss/hidden": 3.389453125, + "loss/jsd": 0.0, + "loss/logits": 0.19229107201099396, + "step": 14000 + }, + { + "epoch": 0.35025, + "grad_norm": 28.5, + "grad_norm_var": 11.178059895833334, + "learning_rate": 0.0001, + "loss": 7.3404, + "loss/crossentropy": 2.016050732135773, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.1993856718763709, + "step": 14010 + }, + { + "epoch": 0.3505, + "grad_norm": 31.375, + "grad_norm_var": 1.55625, + "learning_rate": 0.0001, + "loss": 7.4397, + "loss/crossentropy": 2.0941019743680953, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.1941377494484186, + "step": 14020 + }, + { + "epoch": 0.35075, + "grad_norm": 28.125, + "grad_norm_var": 1.7934895833333333, + "learning_rate": 0.0001, + "loss": 7.2787, + "loss/crossentropy": 2.175595435500145, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.1906340267509222, + "step": 14030 + }, + { + "epoch": 0.351, + "grad_norm": 29.625, + "grad_norm_var": 3.01015625, + "learning_rate": 0.0001, + "loss": 7.3322, + "loss/crossentropy": 1.945583702623844, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.17433239622041583, + "step": 14040 + }, + { + "epoch": 0.35125, + "grad_norm": 32.0, + "grad_norm_var": 11.308072916666667, + "learning_rate": 0.0001, + "loss": 7.4361, + "loss/crossentropy": 2.078453540802002, + "loss/hidden": 3.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.1803632376715541, + "step": 14050 + }, + { + "epoch": 0.3515, + "grad_norm": 31.125, + "grad_norm_var": 11.920833333333333, + "learning_rate": 0.0001, + "loss": 7.3917, + "loss/crossentropy": 2.2025377944111826, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.18862273804843427, + "step": 14060 + }, + { + "epoch": 0.35175, + "grad_norm": 26.875, + "grad_norm_var": 14.82890625, + "learning_rate": 0.0001, + "loss": 7.294, + "loss/crossentropy": 2.0608675971627237, + "loss/hidden": 3.229296875, + "loss/jsd": 0.0, + "loss/logits": 0.17895151115953922, + "step": 14070 + }, + { + "epoch": 0.352, + "grad_norm": 31.5, + "grad_norm_var": 16.512239583333333, + "learning_rate": 0.0001, + "loss": 7.2397, + "loss/crossentropy": 1.951252208650112, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.17488121166825293, + "step": 14080 + }, + { + "epoch": 0.35225, + "grad_norm": 29.25, + "grad_norm_var": 1.6044777159285975e+18, + "learning_rate": 0.0001, + "loss": 7.3104, + "loss/crossentropy": 2.1797886729240417, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.19085733741521835, + "step": 14090 + }, + { + "epoch": 0.3525, + "grad_norm": 30.625, + "grad_norm_var": 1.6044777150524774e+18, + "learning_rate": 0.0001, + "loss": 7.269, + "loss/crossentropy": 2.133794055879116, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.18788108974695206, + "step": 14100 + }, + { + "epoch": 0.35275, + "grad_norm": 31.125, + "grad_norm_var": 57.800455729166664, + "learning_rate": 0.0001, + "loss": 7.2859, + "loss/crossentropy": 2.0392952769994737, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.1854009686037898, + "step": 14110 + }, + { + "epoch": 0.353, + "grad_norm": 29.0, + "grad_norm_var": 2.729622395833333, + "learning_rate": 0.0001, + "loss": 7.3152, + "loss/crossentropy": 2.026089659333229, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.18226654808968307, + "step": 14120 + }, + { + "epoch": 0.35325, + "grad_norm": 30.375, + "grad_norm_var": 2.703125, + "learning_rate": 0.0001, + "loss": 7.4705, + "loss/crossentropy": 2.132002358883619, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.2102237056940794, + "step": 14130 + }, + { + "epoch": 0.3535, + "grad_norm": 29.0, + "grad_norm_var": 2.9497395833333333, + "learning_rate": 0.0001, + "loss": 7.3236, + "loss/crossentropy": 2.0131492763757706, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.19223638353869318, + "step": 14140 + }, + { + "epoch": 0.35375, + "grad_norm": 29.5, + "grad_norm_var": 2.7212890625, + "learning_rate": 0.0001, + "loss": 7.271, + "loss/crossentropy": 2.1130865029990673, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.1914577091112733, + "step": 14150 + }, + { + "epoch": 0.354, + "grad_norm": 31.125, + "grad_norm_var": 1.8254557291666667, + "learning_rate": 0.0001, + "loss": 7.4232, + "loss/crossentropy": 2.12569759786129, + "loss/hidden": 3.460546875, + "loss/jsd": 0.0, + "loss/logits": 0.19339151214808226, + "step": 14160 + }, + { + "epoch": 0.35425, + "grad_norm": 29.875, + "grad_norm_var": 1064.6176432291666, + "learning_rate": 0.0001, + "loss": 7.2083, + "loss/crossentropy": 2.0492417976260184, + "loss/hidden": 3.37109375, + "loss/jsd": 0.0, + "loss/logits": 0.18359060864895582, + "step": 14170 + }, + { + "epoch": 0.3545, + "grad_norm": 31.375, + "grad_norm_var": 9.59765625, + "learning_rate": 0.0001, + "loss": 7.3908, + "loss/crossentropy": 2.099221628904343, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.18481487538665534, + "step": 14180 + }, + { + "epoch": 0.35475, + "grad_norm": 44.25, + "grad_norm_var": 12.892708333333333, + "learning_rate": 0.0001, + "loss": 7.3495, + "loss/crossentropy": 2.04138702750206, + "loss/hidden": 3.405078125, + "loss/jsd": 0.0, + "loss/logits": 0.1797080934047699, + "step": 14190 + }, + { + "epoch": 0.355, + "grad_norm": 29.875, + "grad_norm_var": 15.040559895833333, + "learning_rate": 0.0001, + "loss": 7.3724, + "loss/crossentropy": 2.0216434367001055, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.19053793977946043, + "step": 14200 + }, + { + "epoch": 0.35525, + "grad_norm": 28.25, + "grad_norm_var": 13.096809895833333, + "learning_rate": 0.0001, + "loss": 7.299, + "loss/crossentropy": 2.160589988529682, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.1827318999916315, + "step": 14210 + }, + { + "epoch": 0.3555, + "grad_norm": 31.625, + "grad_norm_var": 9.878059895833333, + "learning_rate": 0.0001, + "loss": 7.3381, + "loss/crossentropy": 1.7803485259413718, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.16204789485782384, + "step": 14220 + }, + { + "epoch": 0.35575, + "grad_norm": 28.625, + "grad_norm_var": 5.543489583333334, + "learning_rate": 0.0001, + "loss": 7.2506, + "loss/crossentropy": 1.8810015477240085, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.17873666901141405, + "step": 14230 + }, + { + "epoch": 0.356, + "grad_norm": 30.375, + "grad_norm_var": 4.208268229166666, + "learning_rate": 0.0001, + "loss": 7.4941, + "loss/crossentropy": 2.016931130737066, + "loss/hidden": 3.300390625, + "loss/jsd": 0.0, + "loss/logits": 0.17651305962353944, + "step": 14240 + }, + { + "epoch": 0.35625, + "grad_norm": 29.75, + "grad_norm_var": 2.067122395833333, + "learning_rate": 0.0001, + "loss": 7.3659, + "loss/crossentropy": 2.1511716455221177, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.18431759886443616, + "step": 14250 + }, + { + "epoch": 0.3565, + "grad_norm": 31.5, + "grad_norm_var": 14.267708333333333, + "learning_rate": 0.0001, + "loss": 7.4284, + "loss/crossentropy": 2.0923829920589925, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.20195957068353892, + "step": 14260 + }, + { + "epoch": 0.35675, + "grad_norm": 32.75, + "grad_norm_var": 2.780989583333333, + "learning_rate": 0.0001, + "loss": 7.3516, + "loss/crossentropy": 2.1890153646469117, + "loss/hidden": 3.407421875, + "loss/jsd": 0.0, + "loss/logits": 0.19206568598747253, + "step": 14270 + }, + { + "epoch": 0.357, + "grad_norm": 26.875, + "grad_norm_var": 3.0858723958333334, + "learning_rate": 0.0001, + "loss": 7.2643, + "loss/crossentropy": 2.109933242201805, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.1951975781470537, + "step": 14280 + }, + { + "epoch": 0.35725, + "grad_norm": 28.875, + "grad_norm_var": 1.7747395833333333, + "learning_rate": 0.0001, + "loss": 7.2549, + "loss/crossentropy": 2.104505704343319, + "loss/hidden": 3.449609375, + "loss/jsd": 0.0, + "loss/logits": 0.19365676920861005, + "step": 14290 + }, + { + "epoch": 0.3575, + "grad_norm": 32.25, + "grad_norm_var": 11.114322916666667, + "learning_rate": 0.0001, + "loss": 7.247, + "loss/crossentropy": 2.108228546380997, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.18031150791794062, + "step": 14300 + }, + { + "epoch": 0.35775, + "grad_norm": 33.5, + "grad_norm_var": 4.084375, + "learning_rate": 0.0001, + "loss": 7.2652, + "loss/crossentropy": 2.11306362003088, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.19340378735214472, + "step": 14310 + }, + { + "epoch": 0.358, + "grad_norm": 31.625, + "grad_norm_var": 2.2712890625, + "learning_rate": 0.0001, + "loss": 7.5154, + "loss/crossentropy": 2.073298954963684, + "loss/hidden": 3.4765625, + "loss/jsd": 0.0, + "loss/logits": 0.20225481800734996, + "step": 14320 + }, + { + "epoch": 0.35825, + "grad_norm": 33.0, + "grad_norm_var": 7.609309895833333, + "learning_rate": 0.0001, + "loss": 7.3172, + "loss/crossentropy": 2.068447032570839, + "loss/hidden": 3.360546875, + "loss/jsd": 0.0, + "loss/logits": 0.1848990023136139, + "step": 14330 + }, + { + "epoch": 0.3585, + "grad_norm": 28.625, + "grad_norm_var": 9.287434895833334, + "learning_rate": 0.0001, + "loss": 7.341, + "loss/crossentropy": 2.0122443050146104, + "loss/hidden": 3.270703125, + "loss/jsd": 0.0, + "loss/logits": 0.16750977858901023, + "step": 14340 + }, + { + "epoch": 0.35875, + "grad_norm": 31.625, + "grad_norm_var": 1.5587890625, + "learning_rate": 0.0001, + "loss": 7.4114, + "loss/crossentropy": 1.9815908901393413, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.19162300042808056, + "step": 14350 + }, + { + "epoch": 0.359, + "grad_norm": 34.5, + "grad_norm_var": 1.75390625, + "learning_rate": 0.0001, + "loss": 7.4096, + "loss/crossentropy": 2.0974107921123504, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.18985395636409522, + "step": 14360 + }, + { + "epoch": 0.35925, + "grad_norm": 29.875, + "grad_norm_var": 27.2291015625, + "learning_rate": 0.0001, + "loss": 7.2838, + "loss/crossentropy": 2.18302740752697, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.19919114038348198, + "step": 14370 + }, + { + "epoch": 0.3595, + "grad_norm": 29.0, + "grad_norm_var": 2.393684895833333, + "learning_rate": 0.0001, + "loss": 7.3591, + "loss/crossentropy": 2.1828642159700395, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.1880602626129985, + "step": 14380 + }, + { + "epoch": 0.35975, + "grad_norm": 28.25, + "grad_norm_var": 1.8875, + "learning_rate": 0.0001, + "loss": 7.4149, + "loss/crossentropy": 2.0250351071357726, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.21426690481603144, + "step": 14390 + }, + { + "epoch": 0.36, + "grad_norm": 36.0, + "grad_norm_var": 4.823893229166667, + "learning_rate": 0.0001, + "loss": 7.3843, + "loss/crossentropy": 1.9927750542759894, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.18537398371845484, + "step": 14400 + }, + { + "epoch": 0.36025, + "grad_norm": 48.0, + "grad_norm_var": 20.270247395833334, + "learning_rate": 0.0001, + "loss": 7.3442, + "loss/crossentropy": 2.2051343381404878, + "loss/hidden": 3.337109375, + "loss/jsd": 0.0, + "loss/logits": 0.18966466654092073, + "step": 14410 + }, + { + "epoch": 0.3605, + "grad_norm": 26.75, + "grad_norm_var": 32.15598958333333, + "learning_rate": 0.0001, + "loss": 7.3599, + "loss/crossentropy": 2.050907912105322, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.18331205267459155, + "step": 14420 + }, + { + "epoch": 0.36075, + "grad_norm": 32.0, + "grad_norm_var": 15.5244140625, + "learning_rate": 0.0001, + "loss": 7.4431, + "loss/crossentropy": 1.9177762359380721, + "loss/hidden": 3.507421875, + "loss/jsd": 0.0, + "loss/logits": 0.18733049537986518, + "step": 14430 + }, + { + "epoch": 0.361, + "grad_norm": 31.625, + "grad_norm_var": 3.370247395833333, + "learning_rate": 0.0001, + "loss": 7.3748, + "loss/crossentropy": 2.1186861246824265, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.1846270103007555, + "step": 14440 + }, + { + "epoch": 0.36125, + "grad_norm": 31.125, + "grad_norm_var": 2.378059895833333, + "learning_rate": 0.0001, + "loss": 7.4001, + "loss/crossentropy": 2.1338623888790607, + "loss/hidden": 3.42890625, + "loss/jsd": 0.0, + "loss/logits": 0.1889139147475362, + "step": 14450 + }, + { + "epoch": 0.3615, + "grad_norm": 29.125, + "grad_norm_var": 5.045572916666667, + "learning_rate": 0.0001, + "loss": 7.3355, + "loss/crossentropy": 1.9586334988474845, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.1830888209864497, + "step": 14460 + }, + { + "epoch": 0.36175, + "grad_norm": 29.375, + "grad_norm_var": 11.797916666666667, + "learning_rate": 0.0001, + "loss": 7.4646, + "loss/crossentropy": 2.0122861742973326, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.2004035959020257, + "step": 14470 + }, + { + "epoch": 0.362, + "grad_norm": 32.0, + "grad_norm_var": 11.684309895833334, + "learning_rate": 0.0001, + "loss": 7.4154, + "loss/crossentropy": 2.1907495334744453, + "loss/hidden": 3.424609375, + "loss/jsd": 0.0, + "loss/logits": 0.2014760635793209, + "step": 14480 + }, + { + "epoch": 0.36225, + "grad_norm": 32.75, + "grad_norm_var": 14.495247395833333, + "learning_rate": 0.0001, + "loss": 7.3255, + "loss/crossentropy": 2.0414177522063257, + "loss/hidden": 3.33515625, + "loss/jsd": 0.0, + "loss/logits": 0.17212325502187015, + "step": 14490 + }, + { + "epoch": 0.3625, + "grad_norm": 28.875, + "grad_norm_var": 9.85390625, + "learning_rate": 0.0001, + "loss": 7.4188, + "loss/crossentropy": 1.875846453011036, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.18002864755690098, + "step": 14500 + }, + { + "epoch": 0.36275, + "grad_norm": 32.5, + "grad_norm_var": 1.7400390625, + "learning_rate": 0.0001, + "loss": 7.3819, + "loss/crossentropy": 2.2282170712947846, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.20617649108171462, + "step": 14510 + }, + { + "epoch": 0.363, + "grad_norm": 29.0, + "grad_norm_var": 26.622330729166666, + "learning_rate": 0.0001, + "loss": 7.3494, + "loss/crossentropy": 2.1207677230238913, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.18537767957895995, + "step": 14520 + }, + { + "epoch": 0.36325, + "grad_norm": 28.5, + "grad_norm_var": 1.15, + "learning_rate": 0.0001, + "loss": 7.2906, + "loss/crossentropy": 2.1179309889674185, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.18408581465482712, + "step": 14530 + }, + { + "epoch": 0.3635, + "grad_norm": 31.125, + "grad_norm_var": 2.851822916666667, + "learning_rate": 0.0001, + "loss": 7.3786, + "loss/crossentropy": 2.0486052967607975, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.17906265445053576, + "step": 14540 + }, + { + "epoch": 0.36375, + "grad_norm": 33.0, + "grad_norm_var": 3.184830729166667, + "learning_rate": 0.0001, + "loss": 7.3423, + "loss/crossentropy": 2.301698251068592, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.19077361291274428, + "step": 14550 + }, + { + "epoch": 0.364, + "grad_norm": 33.25, + "grad_norm_var": 2.2134765625, + "learning_rate": 0.0001, + "loss": 7.3047, + "loss/crossentropy": 2.074369618296623, + "loss/hidden": 3.293359375, + "loss/jsd": 0.0, + "loss/logits": 0.17446096520870924, + "step": 14560 + }, + { + "epoch": 0.36425, + "grad_norm": 32.5, + "grad_norm_var": 2.6910807291666665, + "learning_rate": 0.0001, + "loss": 7.412, + "loss/crossentropy": 2.070549990236759, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.2114204354584217, + "step": 14570 + }, + { + "epoch": 0.3645, + "grad_norm": 31.875, + "grad_norm_var": 31.843489583333334, + "learning_rate": 0.0001, + "loss": 7.4461, + "loss/crossentropy": 2.2021229028701783, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.1946948293596506, + "step": 14580 + }, + { + "epoch": 0.36475, + "grad_norm": 28.875, + "grad_norm_var": 32.17916666666667, + "learning_rate": 0.0001, + "loss": 7.33, + "loss/crossentropy": 2.07574619948864, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.199923849850893, + "step": 14590 + }, + { + "epoch": 0.365, + "grad_norm": 29.625, + "grad_norm_var": 1.3020833333333333, + "learning_rate": 0.0001, + "loss": 7.3285, + "loss/crossentropy": 2.236181080341339, + "loss/hidden": 3.295703125, + "loss/jsd": 0.0, + "loss/logits": 0.18559221122413874, + "step": 14600 + }, + { + "epoch": 0.36525, + "grad_norm": 27.875, + "grad_norm_var": 1.6979166666666667, + "learning_rate": 0.0001, + "loss": 7.4122, + "loss/crossentropy": 2.029696011543274, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.18932779133319855, + "step": 14610 + }, + { + "epoch": 0.3655, + "grad_norm": 29.375, + "grad_norm_var": 3.17265625, + "learning_rate": 0.0001, + "loss": 7.3296, + "loss/crossentropy": 2.0676317110657694, + "loss/hidden": 3.308984375, + "loss/jsd": 0.0, + "loss/logits": 0.1811735849827528, + "step": 14620 + }, + { + "epoch": 0.36575, + "grad_norm": 31.5, + "grad_norm_var": 3.8629557291666665, + "learning_rate": 0.0001, + "loss": 7.3105, + "loss/crossentropy": 2.1635128699243067, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.19529270604252816, + "step": 14630 + }, + { + "epoch": 0.366, + "grad_norm": 28.25, + "grad_norm_var": 3.3955729166666666, + "learning_rate": 0.0001, + "loss": 7.3913, + "loss/crossentropy": 2.0359733670949938, + "loss/hidden": 3.5, + "loss/jsd": 0.0, + "loss/logits": 0.20129154790192844, + "step": 14640 + }, + { + "epoch": 0.36625, + "grad_norm": 30.5, + "grad_norm_var": 3.72265625, + "learning_rate": 0.0001, + "loss": 7.3418, + "loss/crossentropy": 1.895400796085596, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.17521814415231346, + "step": 14650 + }, + { + "epoch": 0.3665, + "grad_norm": 30.375, + "grad_norm_var": 1.7302083333333333, + "learning_rate": 0.0001, + "loss": 7.3042, + "loss/crossentropy": 2.0394834615290165, + "loss/hidden": 3.21953125, + "loss/jsd": 0.0, + "loss/logits": 0.17233089366927742, + "step": 14660 + }, + { + "epoch": 0.36675, + "grad_norm": 28.25, + "grad_norm_var": 36.44479166666667, + "learning_rate": 0.0001, + "loss": 7.3335, + "loss/crossentropy": 1.8397274687886238, + "loss/hidden": 3.4515625, + "loss/jsd": 0.0, + "loss/logits": 0.17788148634135723, + "step": 14670 + }, + { + "epoch": 0.367, + "grad_norm": 27.875, + "grad_norm_var": 38.53639322916667, + "learning_rate": 0.0001, + "loss": 7.3786, + "loss/crossentropy": 2.2247194588184356, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.19322688207030297, + "step": 14680 + }, + { + "epoch": 0.36725, + "grad_norm": 40.0, + "grad_norm_var": 13.45390625, + "learning_rate": 0.0001, + "loss": 7.3179, + "loss/crossentropy": 1.9937927357852459, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.17893323805183173, + "step": 14690 + }, + { + "epoch": 0.3675, + "grad_norm": 32.75, + "grad_norm_var": 11.25, + "learning_rate": 0.0001, + "loss": 7.2325, + "loss/crossentropy": 2.0396489590406417, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1912191865965724, + "step": 14700 + }, + { + "epoch": 0.36775, + "grad_norm": 33.0, + "grad_norm_var": 7.471875, + "learning_rate": 0.0001, + "loss": 7.4509, + "loss/crossentropy": 2.1277743950486183, + "loss/hidden": 3.449609375, + "loss/jsd": 0.0, + "loss/logits": 0.20408784411847591, + "step": 14710 + }, + { + "epoch": 0.368, + "grad_norm": 30.75, + "grad_norm_var": 8.546875, + "learning_rate": 0.0001, + "loss": 7.3855, + "loss/crossentropy": 2.2228190809488297, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.1884877322241664, + "step": 14720 + }, + { + "epoch": 0.36825, + "grad_norm": 28.625, + "grad_norm_var": 15.142708333333333, + "learning_rate": 0.0001, + "loss": 7.4076, + "loss/crossentropy": 2.1207469016313554, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.17780339773744344, + "step": 14730 + }, + { + "epoch": 0.3685, + "grad_norm": 31.125, + "grad_norm_var": 3.004622395833333, + "learning_rate": 0.0001, + "loss": 7.3209, + "loss/crossentropy": 2.0066810354590414, + "loss/hidden": 3.519921875, + "loss/jsd": 0.0, + "loss/logits": 0.20999562088400126, + "step": 14740 + }, + { + "epoch": 0.36875, + "grad_norm": 38.25, + "grad_norm_var": 15.4259765625, + "learning_rate": 0.0001, + "loss": 7.4197, + "loss/crossentropy": 2.0302488803863525, + "loss/hidden": 3.491796875, + "loss/jsd": 0.0, + "loss/logits": 0.1997592320665717, + "step": 14750 + }, + { + "epoch": 0.369, + "grad_norm": 32.5, + "grad_norm_var": 22.669791666666665, + "learning_rate": 0.0001, + "loss": 7.3414, + "loss/crossentropy": 2.006557123363018, + "loss/hidden": 3.333203125, + "loss/jsd": 0.0, + "loss/logits": 0.18148684445768595, + "step": 14760 + }, + { + "epoch": 0.36925, + "grad_norm": 29.5, + "grad_norm_var": 22.362434895833335, + "learning_rate": 0.0001, + "loss": 7.347, + "loss/crossentropy": 2.07253782749176, + "loss/hidden": 3.274609375, + "loss/jsd": 0.0, + "loss/logits": 0.17341973297297955, + "step": 14770 + }, + { + "epoch": 0.3695, + "grad_norm": 30.75, + "grad_norm_var": 17.227018229166667, + "learning_rate": 0.0001, + "loss": 7.3839, + "loss/crossentropy": 2.056871312856674, + "loss/hidden": 3.367578125, + "loss/jsd": 0.0, + "loss/logits": 0.21048452276736498, + "step": 14780 + }, + { + "epoch": 0.36975, + "grad_norm": 27.875, + "grad_norm_var": 38.5947265625, + "learning_rate": 0.0001, + "loss": 7.4979, + "loss/crossentropy": 2.1910998940467836, + "loss/hidden": 3.480859375, + "loss/jsd": 0.0, + "loss/logits": 0.20320441015064716, + "step": 14790 + }, + { + "epoch": 0.37, + "grad_norm": 31.25, + "grad_norm_var": 2.081788902243793e+18, + "learning_rate": 0.0001, + "loss": 7.3396, + "loss/crossentropy": 2.0854848250746727, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.19095470141619444, + "step": 14800 + }, + { + "epoch": 0.37025, + "grad_norm": 34.75, + "grad_norm_var": 2.0817889032778286e+18, + "learning_rate": 0.0001, + "loss": 7.349, + "loss/crossentropy": 2.024295690655708, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.2073504414409399, + "step": 14810 + }, + { + "epoch": 0.3705, + "grad_norm": 30.375, + "grad_norm_var": 8.116080729166667, + "learning_rate": 0.0001, + "loss": 7.1984, + "loss/crossentropy": 1.9738368421792984, + "loss/hidden": 3.352734375, + "loss/jsd": 0.0, + "loss/logits": 0.175398519821465, + "step": 14820 + }, + { + "epoch": 0.37075, + "grad_norm": 34.0, + "grad_norm_var": 3.95390625, + "learning_rate": 0.0001, + "loss": 7.3269, + "loss/crossentropy": 2.0109583623707294, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.1902273640036583, + "step": 14830 + }, + { + "epoch": 0.371, + "grad_norm": 30.25, + "grad_norm_var": 3.410416666666667, + "learning_rate": 0.0001, + "loss": 7.3309, + "loss/crossentropy": 2.0899639569222925, + "loss/hidden": 3.444921875, + "loss/jsd": 0.0, + "loss/logits": 0.18828139845281838, + "step": 14840 + }, + { + "epoch": 0.37125, + "grad_norm": 31.375, + "grad_norm_var": 51.5087890625, + "learning_rate": 0.0001, + "loss": 7.2761, + "loss/crossentropy": 2.0561746567487718, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.17576684867963194, + "step": 14850 + }, + { + "epoch": 0.3715, + "grad_norm": 32.25, + "grad_norm_var": 54.873372395833336, + "learning_rate": 0.0001, + "loss": 7.2875, + "loss/crossentropy": 2.1231576301157475, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.1990961253643036, + "step": 14860 + }, + { + "epoch": 0.37175, + "grad_norm": 34.25, + "grad_norm_var": 29.984309895833334, + "learning_rate": 0.0001, + "loss": 7.3424, + "loss/crossentropy": 2.1051357120275496, + "loss/hidden": 3.29453125, + "loss/jsd": 0.0, + "loss/logits": 0.17979057859629394, + "step": 14870 + }, + { + "epoch": 0.372, + "grad_norm": 28.25, + "grad_norm_var": 19850.327018229167, + "learning_rate": 0.0001, + "loss": 7.3823, + "loss/crossentropy": 2.3242007076740263, + "loss/hidden": 3.49609375, + "loss/jsd": 0.0, + "loss/logits": 0.2740444682538509, + "step": 14880 + }, + { + "epoch": 0.37225, + "grad_norm": 27.25, + "grad_norm_var": 35.0625, + "learning_rate": 0.0001, + "loss": 7.2273, + "loss/crossentropy": 2.143503928184509, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.19005000293254853, + "step": 14890 + }, + { + "epoch": 0.3725, + "grad_norm": 34.0, + "grad_norm_var": 38.23125, + "learning_rate": 0.0001, + "loss": 7.2655, + "loss/crossentropy": 2.1128626547753813, + "loss/hidden": 3.40703125, + "loss/jsd": 0.0, + "loss/logits": 0.18479138296097516, + "step": 14900 + }, + { + "epoch": 0.37275, + "grad_norm": 31.0, + "grad_norm_var": 26.7478515625, + "learning_rate": 0.0001, + "loss": 7.3153, + "loss/crossentropy": 1.860854334384203, + "loss/hidden": 3.255859375, + "loss/jsd": 0.0, + "loss/logits": 0.1592037882655859, + "step": 14910 + }, + { + "epoch": 0.373, + "grad_norm": 29.625, + "grad_norm_var": 30.737239583333334, + "learning_rate": 0.0001, + "loss": 7.287, + "loss/crossentropy": 2.067621612548828, + "loss/hidden": 3.468359375, + "loss/jsd": 0.0, + "loss/logits": 0.1897974604740739, + "step": 14920 + }, + { + "epoch": 0.37325, + "grad_norm": 29.0, + "grad_norm_var": 51.01223958333333, + "learning_rate": 0.0001, + "loss": 7.4038, + "loss/crossentropy": 2.0471679329872132, + "loss/hidden": 3.41484375, + "loss/jsd": 0.0, + "loss/logits": 0.18869173359125851, + "step": 14930 + }, + { + "epoch": 0.3735, + "grad_norm": 40.25, + "grad_norm_var": 42.828059895833334, + "learning_rate": 0.0001, + "loss": 7.2638, + "loss/crossentropy": 2.0969312518835066, + "loss/hidden": 3.308984375, + "loss/jsd": 0.0, + "loss/logits": 0.19103059088811278, + "step": 14940 + }, + { + "epoch": 0.37375, + "grad_norm": 27.625, + "grad_norm_var": 16.3625, + "learning_rate": 0.0001, + "loss": 7.3022, + "loss/crossentropy": 2.208647185564041, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.1936396975070238, + "step": 14950 + }, + { + "epoch": 0.374, + "grad_norm": 28.625, + "grad_norm_var": 16.837239583333332, + "learning_rate": 0.0001, + "loss": 7.2414, + "loss/crossentropy": 2.1974997609853744, + "loss/hidden": 3.340625, + "loss/jsd": 0.0, + "loss/logits": 0.19469811543822288, + "step": 14960 + }, + { + "epoch": 0.37425, + "grad_norm": 29.625, + "grad_norm_var": 14.108072916666666, + "learning_rate": 0.0001, + "loss": 7.2849, + "loss/crossentropy": 2.134792809188366, + "loss/hidden": 3.20625, + "loss/jsd": 0.0, + "loss/logits": 0.17098608147352934, + "step": 14970 + }, + { + "epoch": 0.3745, + "grad_norm": 33.25, + "grad_norm_var": 27.1181640625, + "learning_rate": 0.0001, + "loss": 7.3014, + "loss/crossentropy": 2.1276439666748046, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.18990135621279478, + "step": 14980 + }, + { + "epoch": 0.37475, + "grad_norm": 29.75, + "grad_norm_var": 16.033072916666665, + "learning_rate": 0.0001, + "loss": 7.3335, + "loss/crossentropy": 2.0856949634850026, + "loss/hidden": 3.3265625, + "loss/jsd": 0.0, + "loss/logits": 0.18325317203998565, + "step": 14990 + }, + { + "epoch": 0.375, + "grad_norm": 31.625, + "grad_norm_var": 3.321875, + "learning_rate": 0.0001, + "loss": 7.3642, + "loss/crossentropy": 2.1136090487241743, + "loss/hidden": 3.523828125, + "loss/jsd": 0.0, + "loss/logits": 0.20829910095781087, + "step": 15000 + }, + { + "epoch": 0.37525, + "grad_norm": 33.25, + "grad_norm_var": 4.076497395833333, + "learning_rate": 0.0001, + "loss": 7.3349, + "loss/crossentropy": 2.007338209450245, + "loss/hidden": 3.323828125, + "loss/jsd": 0.0, + "loss/logits": 0.17481161141768098, + "step": 15010 + }, + { + "epoch": 0.3755, + "grad_norm": 30.125, + "grad_norm_var": 7.48515625, + "learning_rate": 0.0001, + "loss": 7.3839, + "loss/crossentropy": 2.1611456774175166, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.1868826244957745, + "step": 15020 + }, + { + "epoch": 0.37575, + "grad_norm": 30.0, + "grad_norm_var": 5.459375, + "learning_rate": 0.0001, + "loss": 7.2878, + "loss/crossentropy": 2.0543340116739275, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.1872962446883321, + "step": 15030 + }, + { + "epoch": 0.376, + "grad_norm": 34.5, + "grad_norm_var": 4.847916666666666, + "learning_rate": 0.0001, + "loss": 7.2701, + "loss/crossentropy": 2.079775569587946, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.19110171888023614, + "step": 15040 + }, + { + "epoch": 0.37625, + "grad_norm": 29.5, + "grad_norm_var": 4.3212890625, + "learning_rate": 0.0001, + "loss": 7.3025, + "loss/crossentropy": 2.267373038828373, + "loss/hidden": 3.31015625, + "loss/jsd": 0.0, + "loss/logits": 0.1936658274382353, + "step": 15050 + }, + { + "epoch": 0.3765, + "grad_norm": 31.875, + "grad_norm_var": 3.746875, + "learning_rate": 0.0001, + "loss": 7.3107, + "loss/crossentropy": 2.1074527725577354, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.18790427446365357, + "step": 15060 + }, + { + "epoch": 0.37675, + "grad_norm": 29.125, + "grad_norm_var": 5.162955729166667, + "learning_rate": 0.0001, + "loss": 7.3127, + "loss/crossentropy": 2.1202156826853753, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.18663588212803006, + "step": 15070 + }, + { + "epoch": 0.377, + "grad_norm": 32.5, + "grad_norm_var": 4.164583333333334, + "learning_rate": 0.0001, + "loss": 7.302, + "loss/crossentropy": 1.978425008058548, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.17640805542469024, + "step": 15080 + }, + { + "epoch": 0.37725, + "grad_norm": 29.75, + "grad_norm_var": 5.9962890625, + "learning_rate": 0.0001, + "loss": 7.2746, + "loss/crossentropy": 1.9541339822113515, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.18182314159348606, + "step": 15090 + }, + { + "epoch": 0.3775, + "grad_norm": 31.375, + "grad_norm_var": 5.4822265625, + "learning_rate": 0.0001, + "loss": 7.1614, + "loss/crossentropy": 2.26050483584404, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.18672147747129203, + "step": 15100 + }, + { + "epoch": 0.37775, + "grad_norm": 35.75, + "grad_norm_var": 10.6431640625, + "learning_rate": 0.0001, + "loss": 7.4291, + "loss/crossentropy": 1.9852021753787994, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.1927258798852563, + "step": 15110 + }, + { + "epoch": 0.378, + "grad_norm": 30.875, + "grad_norm_var": 6.601041666666666, + "learning_rate": 0.0001, + "loss": 7.2829, + "loss/crossentropy": 2.067719455063343, + "loss/hidden": 3.293359375, + "loss/jsd": 0.0, + "loss/logits": 0.1875240265391767, + "step": 15120 + }, + { + "epoch": 0.37825, + "grad_norm": 34.75, + "grad_norm_var": 8.2572265625, + "learning_rate": 0.0001, + "loss": 7.206, + "loss/crossentropy": 2.0955534875392914, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.18975083716213703, + "step": 15130 + }, + { + "epoch": 0.3785, + "grad_norm": 33.5, + "grad_norm_var": 4.514518229166667, + "learning_rate": 0.0001, + "loss": 7.3649, + "loss/crossentropy": 2.231545624136925, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.1933881167322397, + "step": 15140 + }, + { + "epoch": 0.37875, + "grad_norm": 31.25, + "grad_norm_var": 6.922916666666667, + "learning_rate": 0.0001, + "loss": 7.3186, + "loss/crossentropy": 2.1041683927178383, + "loss/hidden": 3.315625, + "loss/jsd": 0.0, + "loss/logits": 0.17568105049431323, + "step": 15150 + }, + { + "epoch": 0.379, + "grad_norm": 31.25, + "grad_norm_var": 8.064583333333333, + "learning_rate": 0.0001, + "loss": 7.1475, + "loss/crossentropy": 2.128729820251465, + "loss/hidden": 3.267578125, + "loss/jsd": 0.0, + "loss/logits": 0.18820853671059012, + "step": 15160 + }, + { + "epoch": 0.37925, + "grad_norm": 32.75, + "grad_norm_var": 54.0400390625, + "learning_rate": 0.0001, + "loss": 7.4073, + "loss/crossentropy": 2.0806841500103475, + "loss/hidden": 3.43515625, + "loss/jsd": 0.0, + "loss/logits": 0.19645702093839645, + "step": 15170 + }, + { + "epoch": 0.3795, + "grad_norm": 31.375, + "grad_norm_var": 53.652018229166664, + "learning_rate": 0.0001, + "loss": 7.1232, + "loss/crossentropy": 2.077859415113926, + "loss/hidden": 3.314453125, + "loss/jsd": 0.0, + "loss/logits": 0.18113754820078612, + "step": 15180 + }, + { + "epoch": 0.37975, + "grad_norm": 34.0, + "grad_norm_var": 2.986167996076768e+18, + "learning_rate": 0.0001, + "loss": 7.2362, + "loss/crossentropy": 1.911629044264555, + "loss/hidden": 3.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.17613004967570306, + "step": 15190 + }, + { + "epoch": 0.38, + "grad_norm": 30.0, + "grad_norm_var": 2.9861679996912794e+18, + "learning_rate": 0.0001, + "loss": 7.2782, + "loss/crossentropy": 2.120191100984812, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.1851762106642127, + "step": 15200 + }, + { + "epoch": 0.38025, + "grad_norm": 31.375, + "grad_norm_var": 2.6010416666666667, + "learning_rate": 0.0001, + "loss": 7.3305, + "loss/crossentropy": 2.084269215166569, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.1797555785626173, + "step": 15210 + }, + { + "epoch": 0.3805, + "grad_norm": 31.625, + "grad_norm_var": 10.604622395833333, + "learning_rate": 0.0001, + "loss": 7.2582, + "loss/crossentropy": 2.077724662423134, + "loss/hidden": 3.292578125, + "loss/jsd": 0.0, + "loss/logits": 0.17972503434866666, + "step": 15220 + }, + { + "epoch": 0.38075, + "grad_norm": 32.25, + "grad_norm_var": 10.440625, + "learning_rate": 0.0001, + "loss": 7.298, + "loss/crossentropy": 2.025072456151247, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.18922405112534763, + "step": 15230 + }, + { + "epoch": 0.381, + "grad_norm": 32.5, + "grad_norm_var": 3.56875, + "learning_rate": 0.0001, + "loss": 7.3076, + "loss/crossentropy": 2.1376619666814802, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.2057780459523201, + "step": 15240 + }, + { + "epoch": 0.38125, + "grad_norm": 30.25, + "grad_norm_var": 1.3330729166666666, + "learning_rate": 0.0001, + "loss": 7.2858, + "loss/crossentropy": 2.0143167987465858, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.19064239636063576, + "step": 15250 + }, + { + "epoch": 0.3815, + "grad_norm": 30.75, + "grad_norm_var": 1.2483723958333333, + "learning_rate": 0.0001, + "loss": 7.2522, + "loss/crossentropy": 2.0520439468324185, + "loss/hidden": 3.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.18599720411002635, + "step": 15260 + }, + { + "epoch": 0.38175, + "grad_norm": 31.0, + "grad_norm_var": 1.7712890625, + "learning_rate": 0.0001, + "loss": 7.1878, + "loss/crossentropy": 2.1704175978899003, + "loss/hidden": 3.32578125, + "loss/jsd": 0.0, + "loss/logits": 0.17961867786943914, + "step": 15270 + }, + { + "epoch": 0.382, + "grad_norm": 30.375, + "grad_norm_var": 7.103125, + "learning_rate": 0.0001, + "loss": 7.3041, + "loss/crossentropy": 2.081410513818264, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.18557187523692847, + "step": 15280 + }, + { + "epoch": 0.38225, + "grad_norm": 32.25, + "grad_norm_var": 6.169205729166666, + "learning_rate": 0.0001, + "loss": 7.2756, + "loss/crossentropy": 2.0742292061448095, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.18702635429799558, + "step": 15290 + }, + { + "epoch": 0.3825, + "grad_norm": 29.125, + "grad_norm_var": 5.061458333333333, + "learning_rate": 0.0001, + "loss": 7.3008, + "loss/crossentropy": 2.081205701082945, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.1858987484127283, + "step": 15300 + }, + { + "epoch": 0.38275, + "grad_norm": 31.625, + "grad_norm_var": 3.6405598958333334, + "learning_rate": 0.0001, + "loss": 7.2722, + "loss/crossentropy": 1.9639541752636434, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.18367136251181365, + "step": 15310 + }, + { + "epoch": 0.383, + "grad_norm": 29.375, + "grad_norm_var": 4.342708333333333, + "learning_rate": 0.0001, + "loss": 7.3282, + "loss/crossentropy": 1.8718928024172783, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.18070896286517382, + "step": 15320 + }, + { + "epoch": 0.38325, + "grad_norm": 29.75, + "grad_norm_var": 15.129166666666666, + "learning_rate": 0.0001, + "loss": 7.1724, + "loss/crossentropy": 2.0988180577754973, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.1885043691843748, + "step": 15330 + }, + { + "epoch": 0.3835, + "grad_norm": 28.5, + "grad_norm_var": 11.845833333333333, + "learning_rate": 0.0001, + "loss": 7.385, + "loss/crossentropy": 2.1939360052347183, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.1921105194836855, + "step": 15340 + }, + { + "epoch": 0.38375, + "grad_norm": 37.0, + "grad_norm_var": 9.03515625, + "learning_rate": 0.0001, + "loss": 7.3133, + "loss/crossentropy": 2.0929661616683006, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.17670788932591677, + "step": 15350 + }, + { + "epoch": 0.384, + "grad_norm": 30.625, + "grad_norm_var": 6.0259765625, + "learning_rate": 0.0001, + "loss": 7.2666, + "loss/crossentropy": 2.146226355433464, + "loss/hidden": 3.27578125, + "loss/jsd": 0.0, + "loss/logits": 0.18036798052489758, + "step": 15360 + }, + { + "epoch": 0.38425, + "grad_norm": 33.75, + "grad_norm_var": 6.095768229166667, + "learning_rate": 0.0001, + "loss": 7.2881, + "loss/crossentropy": 2.1441670656204224, + "loss/hidden": 3.29609375, + "loss/jsd": 0.0, + "loss/logits": 0.17735737133771182, + "step": 15370 + }, + { + "epoch": 0.3845, + "grad_norm": 31.0, + "grad_norm_var": 15.9291015625, + "learning_rate": 0.0001, + "loss": 7.2613, + "loss/crossentropy": 1.9772998243570328, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.1872927848249674, + "step": 15380 + }, + { + "epoch": 0.38475, + "grad_norm": 29.125, + "grad_norm_var": 23.395833333333332, + "learning_rate": 0.0001, + "loss": 7.3075, + "loss/crossentropy": 2.061436576396227, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.1824639480561018, + "step": 15390 + }, + { + "epoch": 0.385, + "grad_norm": 28.0, + "grad_norm_var": 40.42890625, + "learning_rate": 0.0001, + "loss": 7.307, + "loss/crossentropy": 2.222813582420349, + "loss/hidden": 3.35078125, + "loss/jsd": 0.0, + "loss/logits": 0.1908612387254834, + "step": 15400 + }, + { + "epoch": 0.38525, + "grad_norm": 33.0, + "grad_norm_var": 6.859309895833333, + "learning_rate": 0.0001, + "loss": 7.3366, + "loss/crossentropy": 1.905498656630516, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.19282873664051295, + "step": 15410 + }, + { + "epoch": 0.3855, + "grad_norm": 34.0, + "grad_norm_var": 2.953125, + "learning_rate": 0.0001, + "loss": 7.3019, + "loss/crossentropy": 2.1414296105504036, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.1922605263069272, + "step": 15420 + }, + { + "epoch": 0.38575, + "grad_norm": 32.25, + "grad_norm_var": 3.1556640625, + "learning_rate": 0.0001, + "loss": 7.3754, + "loss/crossentropy": 2.168923759460449, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.18643623348325492, + "step": 15430 + }, + { + "epoch": 0.386, + "grad_norm": 47.5, + "grad_norm_var": 21.223893229166666, + "learning_rate": 0.0001, + "loss": 7.311, + "loss/crossentropy": 2.0150568321347238, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.18107260223478078, + "step": 15440 + }, + { + "epoch": 0.38625, + "grad_norm": 30.125, + "grad_norm_var": 22.613541666666666, + "learning_rate": 0.0001, + "loss": 7.3546, + "loss/crossentropy": 2.000614331662655, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.1795403585769236, + "step": 15450 + }, + { + "epoch": 0.3865, + "grad_norm": 30.875, + "grad_norm_var": 4.323958333333334, + "learning_rate": 0.0001, + "loss": 7.2787, + "loss/crossentropy": 2.1310448944568634, + "loss/hidden": 3.3625, + "loss/jsd": 0.0, + "loss/logits": 0.20316302124410868, + "step": 15460 + }, + { + "epoch": 0.38675, + "grad_norm": 28.875, + "grad_norm_var": 2.8900390625, + "learning_rate": 0.0001, + "loss": 7.2955, + "loss/crossentropy": 2.261080901324749, + "loss/hidden": 3.283203125, + "loss/jsd": 0.0, + "loss/logits": 0.195007973536849, + "step": 15470 + }, + { + "epoch": 0.387, + "grad_norm": 34.5, + "grad_norm_var": 5.880208333333333, + "learning_rate": 0.0001, + "loss": 7.412, + "loss/crossentropy": 2.137118950486183, + "loss/hidden": 3.428125, + "loss/jsd": 0.0, + "loss/logits": 0.20062100179493428, + "step": 15480 + }, + { + "epoch": 0.38725, + "grad_norm": 31.5, + "grad_norm_var": 4.9634765625, + "learning_rate": 0.0001, + "loss": 7.3413, + "loss/crossentropy": 2.281751687824726, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.19415688924491406, + "step": 15490 + }, + { + "epoch": 0.3875, + "grad_norm": 31.875, + "grad_norm_var": 2.1134765625, + "learning_rate": 0.0001, + "loss": 7.3408, + "loss/crossentropy": 2.0828112423419953, + "loss/hidden": 3.463671875, + "loss/jsd": 0.0, + "loss/logits": 0.20367840733379125, + "step": 15500 + }, + { + "epoch": 0.38775, + "grad_norm": 28.75, + "grad_norm_var": 67.39765625, + "learning_rate": 0.0001, + "loss": 7.3114, + "loss/crossentropy": 1.944053754210472, + "loss/hidden": 3.2671875, + "loss/jsd": 0.0, + "loss/logits": 0.16787241958081722, + "step": 15510 + }, + { + "epoch": 0.388, + "grad_norm": 30.0, + "grad_norm_var": 69.334375, + "learning_rate": 0.0001, + "loss": 7.418, + "loss/crossentropy": 2.0586235135793687, + "loss/hidden": 3.3078125, + "loss/jsd": 0.0, + "loss/logits": 0.18477604109793902, + "step": 15520 + }, + { + "epoch": 0.38825, + "grad_norm": 30.25, + "grad_norm_var": 23.683333333333334, + "learning_rate": 0.0001, + "loss": 7.2656, + "loss/crossentropy": 2.1504869014024734, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.1810676084831357, + "step": 15530 + }, + { + "epoch": 0.3885, + "grad_norm": 30.0, + "grad_norm_var": 31.371875, + "learning_rate": 0.0001, + "loss": 7.2194, + "loss/crossentropy": 2.043111228942871, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.18173537645488977, + "step": 15540 + }, + { + "epoch": 0.38875, + "grad_norm": 27.75, + "grad_norm_var": 25.820247395833334, + "learning_rate": 0.0001, + "loss": 7.2142, + "loss/crossentropy": 1.9861251831054687, + "loss/hidden": 3.353125, + "loss/jsd": 0.0, + "loss/logits": 0.1864440616220236, + "step": 15550 + }, + { + "epoch": 0.389, + "grad_norm": 28.875, + "grad_norm_var": 31.534830729166668, + "learning_rate": 0.0001, + "loss": 7.303, + "loss/crossentropy": 2.0181840524077415, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.1844530213624239, + "step": 15560 + }, + { + "epoch": 0.38925, + "grad_norm": 33.0, + "grad_norm_var": 29.174739583333334, + "learning_rate": 0.0001, + "loss": 7.3188, + "loss/crossentropy": 2.0922836802899836, + "loss/hidden": 3.468359375, + "loss/jsd": 0.0, + "loss/logits": 0.18936851639300584, + "step": 15570 + }, + { + "epoch": 0.3895, + "grad_norm": 32.25, + "grad_norm_var": 13.853125, + "learning_rate": 0.0001, + "loss": 7.2795, + "loss/crossentropy": 2.0901282012462614, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.18346477318555116, + "step": 15580 + }, + { + "epoch": 0.38975, + "grad_norm": 50.0, + "grad_norm_var": 32.6509765625, + "learning_rate": 0.0001, + "loss": 7.2721, + "loss/crossentropy": 2.1004329577088354, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.1858608016744256, + "step": 15590 + }, + { + "epoch": 0.39, + "grad_norm": 28.5, + "grad_norm_var": 60.52389322916667, + "learning_rate": 0.0001, + "loss": 7.1135, + "loss/crossentropy": 2.058871814608574, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.18565063830465078, + "step": 15600 + }, + { + "epoch": 0.39025, + "grad_norm": 32.25, + "grad_norm_var": 45.546875, + "learning_rate": 0.0001, + "loss": 7.1575, + "loss/crossentropy": 1.9910513393580913, + "loss/hidden": 3.268359375, + "loss/jsd": 0.0, + "loss/logits": 0.17116884887218475, + "step": 15610 + }, + { + "epoch": 0.3905, + "grad_norm": 28.625, + "grad_norm_var": 13.693489583333333, + "learning_rate": 0.0001, + "loss": 7.1938, + "loss/crossentropy": 2.0928867653012277, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.1812552411109209, + "step": 15620 + }, + { + "epoch": 0.39075, + "grad_norm": 38.0, + "grad_norm_var": 17.55390625, + "learning_rate": 0.0001, + "loss": 7.2295, + "loss/crossentropy": 1.908367295563221, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.17530275397002698, + "step": 15630 + }, + { + "epoch": 0.391, + "grad_norm": 32.5, + "grad_norm_var": 16.020572916666666, + "learning_rate": 0.0001, + "loss": 7.3788, + "loss/crossentropy": 2.1054799772799013, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.18641111236065627, + "step": 15640 + }, + { + "epoch": 0.39125, + "grad_norm": 37.5, + "grad_norm_var": 17.2744140625, + "learning_rate": 0.0001, + "loss": 7.2224, + "loss/crossentropy": 2.0865116521716116, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.1855388769879937, + "step": 15650 + }, + { + "epoch": 0.3915, + "grad_norm": 30.5, + "grad_norm_var": 10.327018229166667, + "learning_rate": 0.0001, + "loss": 7.3264, + "loss/crossentropy": 2.0367104552686213, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.18481685388833285, + "step": 15660 + }, + { + "epoch": 0.39175, + "grad_norm": 35.25, + "grad_norm_var": 8.862239583333333, + "learning_rate": 0.0001, + "loss": 7.343, + "loss/crossentropy": 2.2644485861063, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.20473828464746474, + "step": 15670 + }, + { + "epoch": 0.392, + "grad_norm": 29.625, + "grad_norm_var": 9.986458333333333, + "learning_rate": 0.0001, + "loss": 7.3652, + "loss/crossentropy": 2.039817491173744, + "loss/hidden": 3.337890625, + "loss/jsd": 0.0, + "loss/logits": 0.1852146415039897, + "step": 15680 + }, + { + "epoch": 0.39225, + "grad_norm": 36.5, + "grad_norm_var": 11.512239583333333, + "learning_rate": 0.0001, + "loss": 7.2599, + "loss/crossentropy": 2.0308106377720834, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.19251971375197172, + "step": 15690 + }, + { + "epoch": 0.3925, + "grad_norm": 30.375, + "grad_norm_var": 9.9650390625, + "learning_rate": 0.0001, + "loss": 7.2512, + "loss/crossentropy": 1.980294729024172, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.17508508097380399, + "step": 15700 + }, + { + "epoch": 0.39275, + "grad_norm": 32.25, + "grad_norm_var": 8.9869140625, + "learning_rate": 0.0001, + "loss": 7.2101, + "loss/crossentropy": 1.817311129719019, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.17575446693226696, + "step": 15710 + }, + { + "epoch": 0.393, + "grad_norm": 32.0, + "grad_norm_var": 6.434375, + "learning_rate": 0.0001, + "loss": 7.3109, + "loss/crossentropy": 2.231336107850075, + "loss/hidden": 3.253515625, + "loss/jsd": 0.0, + "loss/logits": 0.186120249889791, + "step": 15720 + }, + { + "epoch": 0.39325, + "grad_norm": 27.5, + "grad_norm_var": 6.1375, + "learning_rate": 0.0001, + "loss": 7.2341, + "loss/crossentropy": 2.1372755438089373, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.19311708956956863, + "step": 15730 + }, + { + "epoch": 0.3935, + "grad_norm": 33.0, + "grad_norm_var": 12.706184895833333, + "learning_rate": 0.0001, + "loss": 7.4575, + "loss/crossentropy": 2.1330266147851944, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.19104793900623918, + "step": 15740 + }, + { + "epoch": 0.39375, + "grad_norm": 29.625, + "grad_norm_var": 14.846875, + "learning_rate": 0.0001, + "loss": 7.3186, + "loss/crossentropy": 2.090308539569378, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.20190660767257212, + "step": 15750 + }, + { + "epoch": 0.394, + "grad_norm": 30.0, + "grad_norm_var": 7.740559895833333, + "learning_rate": 0.0001, + "loss": 7.3049, + "loss/crossentropy": 2.075870428979397, + "loss/hidden": 3.344140625, + "loss/jsd": 0.0, + "loss/logits": 0.1769866080954671, + "step": 15760 + }, + { + "epoch": 0.39425, + "grad_norm": 29.625, + "grad_norm_var": 4.682291666666667, + "learning_rate": 0.0001, + "loss": 7.2318, + "loss/crossentropy": 2.072967618703842, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.18285961151123048, + "step": 15770 + }, + { + "epoch": 0.3945, + "grad_norm": 28.0, + "grad_norm_var": 16.6494140625, + "learning_rate": 0.0001, + "loss": 7.2528, + "loss/crossentropy": 2.099040774255991, + "loss/hidden": 3.35546875, + "loss/jsd": 0.0, + "loss/logits": 0.18526200093328954, + "step": 15780 + }, + { + "epoch": 0.39475, + "grad_norm": 30.75, + "grad_norm_var": 13.454622395833333, + "learning_rate": 0.0001, + "loss": 7.2255, + "loss/crossentropy": 2.0226017188280823, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.17934049041941763, + "step": 15790 + }, + { + "epoch": 0.395, + "grad_norm": 31.625, + "grad_norm_var": 2.4546223958333333, + "learning_rate": 0.0001, + "loss": 7.3046, + "loss/crossentropy": 2.1888489231467245, + "loss/hidden": 3.294921875, + "loss/jsd": 0.0, + "loss/logits": 0.18036039546132088, + "step": 15800 + }, + { + "epoch": 0.39525, + "grad_norm": 30.125, + "grad_norm_var": 2.6393229166666665, + "learning_rate": 0.0001, + "loss": 7.2127, + "loss/crossentropy": 1.9043663069605827, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.17555271480232476, + "step": 15810 + }, + { + "epoch": 0.3955, + "grad_norm": 31.75, + "grad_norm_var": 85.26223958333334, + "learning_rate": 0.0001, + "loss": 7.2355, + "loss/crossentropy": 1.9372950181365014, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.17982141822576522, + "step": 15820 + }, + { + "epoch": 0.39575, + "grad_norm": 30.375, + "grad_norm_var": 86.18795572916666, + "learning_rate": 0.0001, + "loss": 7.2905, + "loss/crossentropy": 2.2129892081022264, + "loss/hidden": 3.3265625, + "loss/jsd": 0.0, + "loss/logits": 0.19481155294924973, + "step": 15830 + }, + { + "epoch": 0.396, + "grad_norm": 31.75, + "grad_norm_var": 4.287434895833333, + "learning_rate": 0.0001, + "loss": 7.1819, + "loss/crossentropy": 1.9988259918987752, + "loss/hidden": 3.203515625, + "loss/jsd": 0.0, + "loss/logits": 0.16147571448236703, + "step": 15840 + }, + { + "epoch": 0.39625, + "grad_norm": 28.875, + "grad_norm_var": 4.6009765625, + "learning_rate": 0.0001, + "loss": 7.359, + "loss/crossentropy": 2.1142133861780166, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.18781689032912255, + "step": 15850 + }, + { + "epoch": 0.3965, + "grad_norm": 32.0, + "grad_norm_var": 3.5452473958333335, + "learning_rate": 0.0001, + "loss": 7.327, + "loss/crossentropy": 2.0223607540130617, + "loss/hidden": 3.265234375, + "loss/jsd": 0.0, + "loss/logits": 0.19158173706382514, + "step": 15860 + }, + { + "epoch": 0.39675, + "grad_norm": 28.625, + "grad_norm_var": 1.7729166666666667, + "learning_rate": 0.0001, + "loss": 7.3644, + "loss/crossentropy": 2.170432631671429, + "loss/hidden": 3.266015625, + "loss/jsd": 0.0, + "loss/logits": 0.18453630786389114, + "step": 15870 + }, + { + "epoch": 0.397, + "grad_norm": 32.0, + "grad_norm_var": 3.5559895833333335, + "learning_rate": 0.0001, + "loss": 7.3114, + "loss/crossentropy": 1.9615431509912014, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.18825806053355337, + "step": 15880 + }, + { + "epoch": 0.39725, + "grad_norm": 37.25, + "grad_norm_var": 71.86555989583333, + "learning_rate": 0.0001, + "loss": 7.3443, + "loss/crossentropy": 2.0923714101314546, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.18501388970762492, + "step": 15890 + }, + { + "epoch": 0.3975, + "grad_norm": 29.25, + "grad_norm_var": 75.43307291666666, + "learning_rate": 0.0001, + "loss": 7.2744, + "loss/crossentropy": 2.0929214730858803, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.17970609571784735, + "step": 15900 + }, + { + "epoch": 0.39775, + "grad_norm": 31.375, + "grad_norm_var": 4.829622395833334, + "learning_rate": 0.0001, + "loss": 7.2885, + "loss/crossentropy": 1.9244549985975028, + "loss/hidden": 3.2625, + "loss/jsd": 0.0, + "loss/logits": 0.16870631305500866, + "step": 15910 + }, + { + "epoch": 0.398, + "grad_norm": 30.5, + "grad_norm_var": 2.684309895833333, + "learning_rate": 0.0001, + "loss": 7.2604, + "loss/crossentropy": 2.024969767779112, + "loss/hidden": 3.30625, + "loss/jsd": 0.0, + "loss/logits": 0.17861165013164282, + "step": 15920 + }, + { + "epoch": 0.39825, + "grad_norm": 36.5, + "grad_norm_var": 13.322330729166667, + "learning_rate": 0.0001, + "loss": 7.2294, + "loss/crossentropy": 1.9006286635994911, + "loss/hidden": 3.35703125, + "loss/jsd": 0.0, + "loss/logits": 0.1897978626191616, + "step": 15930 + }, + { + "epoch": 0.3985, + "grad_norm": 41.25, + "grad_norm_var": 19.4478515625, + "learning_rate": 0.0001, + "loss": 7.3494, + "loss/crossentropy": 2.1528166741132737, + "loss/hidden": 3.35546875, + "loss/jsd": 0.0, + "loss/logits": 0.20025787688791752, + "step": 15940 + }, + { + "epoch": 0.39875, + "grad_norm": 30.125, + "grad_norm_var": 12.692122395833334, + "learning_rate": 0.0001, + "loss": 7.2807, + "loss/crossentropy": 2.159406080842018, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.19357968419790267, + "step": 15950 + }, + { + "epoch": 0.399, + "grad_norm": 28.375, + "grad_norm_var": 9.457291666666666, + "learning_rate": 0.0001, + "loss": 7.0891, + "loss/crossentropy": 2.0662672072649, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.1739374803379178, + "step": 15960 + }, + { + "epoch": 0.39925, + "grad_norm": 33.25, + "grad_norm_var": 17.795833333333334, + "learning_rate": 0.0001, + "loss": 7.3368, + "loss/crossentropy": 2.2619366616010668, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.19278527814894914, + "step": 15970 + }, + { + "epoch": 0.3995, + "grad_norm": 33.5, + "grad_norm_var": 32.524739583333336, + "learning_rate": 0.0001, + "loss": 7.2702, + "loss/crossentropy": 1.9777626052498818, + "loss/hidden": 3.27109375, + "loss/jsd": 0.0, + "loss/logits": 0.1701345544308424, + "step": 15980 + }, + { + "epoch": 0.39975, + "grad_norm": 31.0, + "grad_norm_var": 27.562955729166667, + "learning_rate": 0.0001, + "loss": 7.2636, + "loss/crossentropy": 1.984829319268465, + "loss/hidden": 3.278125, + "loss/jsd": 0.0, + "loss/logits": 0.1783665182068944, + "step": 15990 + }, + { + "epoch": 0.4, + "grad_norm": 30.25, + "grad_norm_var": 7.468489583333334, + "learning_rate": 0.0001, + "loss": 7.3522, + "loss/crossentropy": 2.027949205040932, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.20451073683798313, + "step": 16000 + }, + { + "epoch": 0.40025, + "grad_norm": 30.75, + "grad_norm_var": 3.1311848958333335, + "learning_rate": 0.0001, + "loss": 7.3003, + "loss/crossentropy": 2.0037057876586912, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.1825689303688705, + "step": 16010 + }, + { + "epoch": 0.4005, + "grad_norm": 28.625, + "grad_norm_var": 4.043684895833334, + "learning_rate": 0.0001, + "loss": 7.3601, + "loss/crossentropy": 2.20539278537035, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.19239765033125877, + "step": 16020 + }, + { + "epoch": 0.40075, + "grad_norm": 32.75, + "grad_norm_var": 3.1626528094295864e+18, + "learning_rate": 0.0001, + "loss": 7.2581, + "loss/crossentropy": 1.9106431234627963, + "loss/hidden": 3.466796875, + "loss/jsd": 0.0, + "loss/logits": 0.15896897157654166, + "step": 16030 + }, + { + "epoch": 0.401, + "grad_norm": 31.125, + "grad_norm_var": 9.828125, + "learning_rate": 0.0001, + "loss": 7.2905, + "loss/crossentropy": 1.9915092147886753, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.1976497994735837, + "step": 16040 + }, + { + "epoch": 0.40125, + "grad_norm": 31.0, + "grad_norm_var": 1.596875, + "learning_rate": 0.0001, + "loss": 7.2439, + "loss/crossentropy": 2.0657053992152212, + "loss/hidden": 3.266015625, + "loss/jsd": 0.0, + "loss/logits": 0.17077930886298417, + "step": 16050 + }, + { + "epoch": 0.4015, + "grad_norm": 28.625, + "grad_norm_var": 7.05390625, + "learning_rate": 0.0001, + "loss": 7.2314, + "loss/crossentropy": 1.9348336219787599, + "loss/hidden": 3.45390625, + "loss/jsd": 0.0, + "loss/logits": 0.18747655171900987, + "step": 16060 + }, + { + "epoch": 0.40175, + "grad_norm": 33.0, + "grad_norm_var": 10.48515625, + "learning_rate": 0.0001, + "loss": 7.406, + "loss/crossentropy": 2.0627846613526346, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.20208690427243708, + "step": 16070 + }, + { + "epoch": 0.402, + "grad_norm": 33.25, + "grad_norm_var": 16.162434895833332, + "learning_rate": 0.0001, + "loss": 7.3501, + "loss/crossentropy": 1.984161777049303, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.1927208350971341, + "step": 16080 + }, + { + "epoch": 0.40225, + "grad_norm": 32.25, + "grad_norm_var": 17.923958333333335, + "learning_rate": 0.0001, + "loss": 7.3793, + "loss/crossentropy": 2.074259965121746, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.18814291208982467, + "step": 16090 + }, + { + "epoch": 0.4025, + "grad_norm": 32.25, + "grad_norm_var": 3.3827473958333334, + "learning_rate": 0.0001, + "loss": 7.3529, + "loss/crossentropy": 2.1366620868444444, + "loss/hidden": 3.35, + "loss/jsd": 0.0, + "loss/logits": 0.1963860308751464, + "step": 16100 + }, + { + "epoch": 0.40275, + "grad_norm": 31.25, + "grad_norm_var": 2.5254557291666666, + "learning_rate": 0.0001, + "loss": 7.2352, + "loss/crossentropy": 1.9054784037172794, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.18159330673515797, + "step": 16110 + }, + { + "epoch": 0.403, + "grad_norm": 32.5, + "grad_norm_var": 29.687434895833334, + "learning_rate": 0.0001, + "loss": 7.4832, + "loss/crossentropy": 2.10203175842762, + "loss/hidden": 3.530859375, + "loss/jsd": 0.0, + "loss/logits": 0.21614400669932365, + "step": 16120 + }, + { + "epoch": 0.40325, + "grad_norm": 31.25, + "grad_norm_var": 29.977018229166667, + "learning_rate": 0.0001, + "loss": 7.1964, + "loss/crossentropy": 2.079006028920412, + "loss/hidden": 3.35078125, + "loss/jsd": 0.0, + "loss/logits": 0.1874436529353261, + "step": 16130 + }, + { + "epoch": 0.4035, + "grad_norm": 29.25, + "grad_norm_var": 3.064583333333333, + "learning_rate": 0.0001, + "loss": 7.2316, + "loss/crossentropy": 2.0732211872935293, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.1887400571256876, + "step": 16140 + }, + { + "epoch": 0.40375, + "grad_norm": 31.875, + "grad_norm_var": 6.433072916666666, + "learning_rate": 0.0001, + "loss": 7.2228, + "loss/crossentropy": 1.862874235212803, + "loss/hidden": 3.396484375, + "loss/jsd": 0.0, + "loss/logits": 0.17755231373012065, + "step": 16150 + }, + { + "epoch": 0.404, + "grad_norm": 31.625, + "grad_norm_var": 3.0809895833333334, + "learning_rate": 0.0001, + "loss": 7.307, + "loss/crossentropy": 2.0635001718997956, + "loss/hidden": 3.3296875, + "loss/jsd": 0.0, + "loss/logits": 0.17978991996496915, + "step": 16160 + }, + { + "epoch": 0.40425, + "grad_norm": 30.375, + "grad_norm_var": 13.473958333333334, + "learning_rate": 0.0001, + "loss": 7.2643, + "loss/crossentropy": 1.9896802924573422, + "loss/hidden": 3.23046875, + "loss/jsd": 0.0, + "loss/logits": 0.16587578747421503, + "step": 16170 + }, + { + "epoch": 0.4045, + "grad_norm": 33.75, + "grad_norm_var": 8.254622395833334, + "learning_rate": 0.0001, + "loss": 7.4306, + "loss/crossentropy": 2.244751125574112, + "loss/hidden": 3.3109375, + "loss/jsd": 0.0, + "loss/logits": 0.1836428429931402, + "step": 16180 + }, + { + "epoch": 0.40475, + "grad_norm": 31.875, + "grad_norm_var": 2.434477048521451e+18, + "learning_rate": 0.0001, + "loss": 7.318, + "loss/crossentropy": 1.8701960302889347, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.18736766083166004, + "step": 16190 + }, + { + "epoch": 0.405, + "grad_norm": 28.5, + "grad_norm_var": 2.434477048703484e+18, + "learning_rate": 0.0001, + "loss": 7.3965, + "loss/crossentropy": 2.3353544265031814, + "loss/hidden": 3.465625, + "loss/jsd": 0.0, + "loss/logits": 0.20016160774976016, + "step": 16200 + }, + { + "epoch": 0.40525, + "grad_norm": 28.375, + "grad_norm_var": 6.144205729166667, + "learning_rate": 0.0001, + "loss": 7.2603, + "loss/crossentropy": 2.022757910192013, + "loss/hidden": 3.4578125, + "loss/jsd": 0.0, + "loss/logits": 0.1913651939481497, + "step": 16210 + }, + { + "epoch": 0.4055, + "grad_norm": 30.75, + "grad_norm_var": 2.872916666666667, + "learning_rate": 0.0001, + "loss": 7.2299, + "loss/crossentropy": 2.0279084965586662, + "loss/hidden": 3.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.1943587277084589, + "step": 16220 + }, + { + "epoch": 0.40575, + "grad_norm": 30.875, + "grad_norm_var": 3.814322916666667, + "learning_rate": 0.0001, + "loss": 7.3063, + "loss/crossentropy": 1.9705725610256195, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.18895087838172914, + "step": 16230 + }, + { + "epoch": 0.406, + "grad_norm": 30.5, + "grad_norm_var": 4.430208333333334, + "learning_rate": 0.0001, + "loss": 7.3762, + "loss/crossentropy": 2.0867358654737473, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.19180330783128738, + "step": 16240 + }, + { + "epoch": 0.40625, + "grad_norm": 29.875, + "grad_norm_var": 3.9479166666666665, + "learning_rate": 0.0001, + "loss": 7.3468, + "loss/crossentropy": 1.9416721653193236, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.18548533669672906, + "step": 16250 + }, + { + "epoch": 0.4065, + "grad_norm": 29.75, + "grad_norm_var": 2.3525390625, + "learning_rate": 0.0001, + "loss": 7.1515, + "loss/crossentropy": 1.8505723856389522, + "loss/hidden": 3.32265625, + "loss/jsd": 0.0, + "loss/logits": 0.1714062148705125, + "step": 16260 + }, + { + "epoch": 0.40675, + "grad_norm": 32.5, + "grad_norm_var": 1.9811848958333333, + "learning_rate": 0.0001, + "loss": 7.408, + "loss/crossentropy": 2.1564977198839186, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.1916896834038198, + "step": 16270 + }, + { + "epoch": 0.407, + "grad_norm": 28.875, + "grad_norm_var": 24.1884765625, + "learning_rate": 0.0001, + "loss": 7.2915, + "loss/crossentropy": 2.106581533700228, + "loss/hidden": 3.444921875, + "loss/jsd": 0.0, + "loss/logits": 0.20620792759582401, + "step": 16280 + }, + { + "epoch": 0.40725, + "grad_norm": 32.5, + "grad_norm_var": 28.264322916666668, + "learning_rate": 0.0001, + "loss": 7.2376, + "loss/crossentropy": 1.9940311327576636, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.18459046920761465, + "step": 16290 + }, + { + "epoch": 0.4075, + "grad_norm": 32.0, + "grad_norm_var": 2.088997395833333, + "learning_rate": 0.0001, + "loss": 7.2199, + "loss/crossentropy": 1.866896914690733, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.17536403173580767, + "step": 16300 + }, + { + "epoch": 0.40775, + "grad_norm": 32.25, + "grad_norm_var": 2.9114583333333335, + "learning_rate": 0.0001, + "loss": 7.3328, + "loss/crossentropy": 2.12556598931551, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.1877920225262642, + "step": 16310 + }, + { + "epoch": 0.408, + "grad_norm": 31.375, + "grad_norm_var": 2.1035807291666666, + "learning_rate": 0.0001, + "loss": 7.3096, + "loss/crossentropy": 1.9810500107705593, + "loss/hidden": 3.475390625, + "loss/jsd": 0.0, + "loss/logits": 0.18928063567727804, + "step": 16320 + }, + { + "epoch": 0.40825, + "grad_norm": 29.25, + "grad_norm_var": 3.1254557291666667, + "learning_rate": 0.0001, + "loss": 7.5025, + "loss/crossentropy": 2.2035975247621535, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.199692689999938, + "step": 16330 + }, + { + "epoch": 0.4085, + "grad_norm": 29.5, + "grad_norm_var": 3.981184895833333, + "learning_rate": 0.0001, + "loss": 7.2389, + "loss/crossentropy": 2.0442154854536057, + "loss/hidden": 3.263671875, + "loss/jsd": 0.0, + "loss/logits": 0.17368101943284273, + "step": 16340 + }, + { + "epoch": 0.40875, + "grad_norm": 30.25, + "grad_norm_var": 1.8572265625, + "learning_rate": 0.0001, + "loss": 7.3269, + "loss/crossentropy": 2.0223391756415365, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.19969959184527397, + "step": 16350 + }, + { + "epoch": 0.409, + "grad_norm": 31.625, + "grad_norm_var": 15.11640625, + "learning_rate": 0.0001, + "loss": 7.3063, + "loss/crossentropy": 1.8919450908899307, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.16754893381148578, + "step": 16360 + }, + { + "epoch": 0.40925, + "grad_norm": 32.25, + "grad_norm_var": 10.230989583333333, + "learning_rate": 0.0001, + "loss": 7.2766, + "loss/crossentropy": 2.0608230382204056, + "loss/hidden": 3.45859375, + "loss/jsd": 0.0, + "loss/logits": 0.19851373862475158, + "step": 16370 + }, + { + "epoch": 0.4095, + "grad_norm": 30.375, + "grad_norm_var": 11.682747395833333, + "learning_rate": 0.0001, + "loss": 7.2895, + "loss/crossentropy": 2.0847655333578587, + "loss/hidden": 3.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.186351899523288, + "step": 16380 + }, + { + "epoch": 0.40975, + "grad_norm": 37.75, + "grad_norm_var": 14.268684895833333, + "learning_rate": 0.0001, + "loss": 7.2556, + "loss/crossentropy": 2.010029584169388, + "loss/hidden": 3.302734375, + "loss/jsd": 0.0, + "loss/logits": 0.17632390139624476, + "step": 16390 + }, + { + "epoch": 0.41, + "grad_norm": 29.5, + "grad_norm_var": 6.380208333333333, + "learning_rate": 0.0001, + "loss": 7.2839, + "loss/crossentropy": 2.061359938979149, + "loss/hidden": 3.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.18306618463248014, + "step": 16400 + }, + { + "epoch": 0.41025, + "grad_norm": 30.75, + "grad_norm_var": 3.081184895833333, + "learning_rate": 0.0001, + "loss": 7.1644, + "loss/crossentropy": 2.210022081434727, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.20350072477012873, + "step": 16410 + }, + { + "epoch": 0.4105, + "grad_norm": 30.125, + "grad_norm_var": 6.44140625, + "learning_rate": 0.0001, + "loss": 7.3499, + "loss/crossentropy": 2.2339374631643296, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.19218460842967033, + "step": 16420 + }, + { + "epoch": 0.41075, + "grad_norm": 34.5, + "grad_norm_var": 7.73515625, + "learning_rate": 0.0001, + "loss": 7.3014, + "loss/crossentropy": 2.259621387720108, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.19978805705904962, + "step": 16430 + }, + { + "epoch": 0.411, + "grad_norm": 30.875, + "grad_norm_var": 4.659309895833333, + "learning_rate": 0.0001, + "loss": 7.29, + "loss/crossentropy": 1.9416244141757488, + "loss/hidden": 3.282421875, + "loss/jsd": 0.0, + "loss/logits": 0.16338116079568862, + "step": 16440 + }, + { + "epoch": 0.41125, + "grad_norm": 30.625, + "grad_norm_var": 1.3218098958333333, + "learning_rate": 0.0001, + "loss": 7.2935, + "loss/crossentropy": 2.0271296739578246, + "loss/hidden": 3.449609375, + "loss/jsd": 0.0, + "loss/logits": 0.19065151941031216, + "step": 16450 + }, + { + "epoch": 0.4115, + "grad_norm": 30.25, + "grad_norm_var": 38.69973958333333, + "learning_rate": 0.0001, + "loss": 7.2814, + "loss/crossentropy": 1.9666823461651801, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.17561122979968785, + "step": 16460 + }, + { + "epoch": 0.41175, + "grad_norm": 39.75, + "grad_norm_var": 39.458333333333336, + "learning_rate": 0.0001, + "loss": 7.3983, + "loss/crossentropy": 2.104595158994198, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.18662185575813056, + "step": 16470 + }, + { + "epoch": 0.412, + "grad_norm": 29.25, + "grad_norm_var": 7.270768229166666, + "learning_rate": 0.0001, + "loss": 7.2818, + "loss/crossentropy": 2.118724799156189, + "loss/hidden": 3.308984375, + "loss/jsd": 0.0, + "loss/logits": 0.18437495529651643, + "step": 16480 + }, + { + "epoch": 0.41225, + "grad_norm": 29.0, + "grad_norm_var": 1.1733723958333333, + "learning_rate": 0.0001, + "loss": 7.3057, + "loss/crossentropy": 1.988658072054386, + "loss/hidden": 3.354296875, + "loss/jsd": 0.0, + "loss/logits": 0.188140376098454, + "step": 16490 + }, + { + "epoch": 0.4125, + "grad_norm": 45.0, + "grad_norm_var": 3.7225065361444265e+18, + "learning_rate": 0.0001, + "loss": 7.3181, + "loss/crossentropy": 1.9564834155142308, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.1983109924942255, + "step": 16500 + }, + { + "epoch": 0.41275, + "grad_norm": 32.25, + "grad_norm_var": 3.7225065349305247e+18, + "learning_rate": 0.0001, + "loss": 7.4573, + "loss/crossentropy": 2.13476774841547, + "loss/hidden": 3.314453125, + "loss/jsd": 0.0, + "loss/logits": 0.1941231820732355, + "step": 16510 + }, + { + "epoch": 0.413, + "grad_norm": 34.5, + "grad_norm_var": 20.303059895833332, + "learning_rate": 0.0001, + "loss": 7.3816, + "loss/crossentropy": 1.9623971730470657, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.186094144359231, + "step": 16520 + }, + { + "epoch": 0.41325, + "grad_norm": 30.25, + "grad_norm_var": 6.732747395833333, + "learning_rate": 0.0001, + "loss": 7.3583, + "loss/crossentropy": 2.065499635785818, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.17725836476311088, + "step": 16530 + }, + { + "epoch": 0.4135, + "grad_norm": 31.75, + "grad_norm_var": 19.949739583333333, + "learning_rate": 0.0001, + "loss": 7.3656, + "loss/crossentropy": 2.1521792262792587, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.2034964457154274, + "step": 16540 + }, + { + "epoch": 0.41375, + "grad_norm": 29.0, + "grad_norm_var": 34.993489583333336, + "learning_rate": 0.0001, + "loss": 7.2803, + "loss/crossentropy": 1.9985820479691028, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.1789230139926076, + "step": 16550 + }, + { + "epoch": 0.414, + "grad_norm": 30.75, + "grad_norm_var": 17.627018229166666, + "learning_rate": 0.0001, + "loss": 7.2555, + "loss/crossentropy": 2.094392439723015, + "loss/hidden": 3.303125, + "loss/jsd": 0.0, + "loss/logits": 0.18661210238933562, + "step": 16560 + }, + { + "epoch": 0.41425, + "grad_norm": 29.75, + "grad_norm_var": 12.305208333333333, + "learning_rate": 0.0001, + "loss": 7.1799, + "loss/crossentropy": 2.051650533825159, + "loss/hidden": 3.235546875, + "loss/jsd": 0.0, + "loss/logits": 0.1674873584881425, + "step": 16570 + }, + { + "epoch": 0.4145, + "grad_norm": 29.75, + "grad_norm_var": 1.2372395833333334, + "learning_rate": 0.0001, + "loss": 7.271, + "loss/crossentropy": 2.0924292795360087, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.18475022464990615, + "step": 16580 + }, + { + "epoch": 0.41475, + "grad_norm": 29.0, + "grad_norm_var": 10.5650390625, + "learning_rate": 0.0001, + "loss": 7.2485, + "loss/crossentropy": 2.0977213352918627, + "loss/hidden": 3.4390625, + "loss/jsd": 0.0, + "loss/logits": 0.19010566975921392, + "step": 16590 + }, + { + "epoch": 0.415, + "grad_norm": 34.0, + "grad_norm_var": 20.742122395833334, + "learning_rate": 0.0001, + "loss": 7.4038, + "loss/crossentropy": 2.0668084505945443, + "loss/hidden": 3.35078125, + "loss/jsd": 0.0, + "loss/logits": 0.1815639895386994, + "step": 16600 + }, + { + "epoch": 0.41525, + "grad_norm": 28.875, + "grad_norm_var": 10.555208333333333, + "learning_rate": 0.0001, + "loss": 7.1948, + "loss/crossentropy": 1.973907208442688, + "loss/hidden": 3.30078125, + "loss/jsd": 0.0, + "loss/logits": 0.17607561368495225, + "step": 16610 + }, + { + "epoch": 0.4155, + "grad_norm": 39.75, + "grad_norm_var": 12.46640625, + "learning_rate": 0.0001, + "loss": 7.2484, + "loss/crossentropy": 2.0116613686084746, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.17850100696086885, + "step": 16620 + }, + { + "epoch": 0.41575, + "grad_norm": 28.5, + "grad_norm_var": 13.439322916666667, + "learning_rate": 0.0001, + "loss": 7.2806, + "loss/crossentropy": 2.14372631162405, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.1865269223228097, + "step": 16630 + }, + { + "epoch": 0.416, + "grad_norm": 30.875, + "grad_norm_var": 41.07604166666667, + "learning_rate": 0.0001, + "loss": 7.3744, + "loss/crossentropy": 2.0837054416537284, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.17686022398993373, + "step": 16640 + }, + { + "epoch": 0.41625, + "grad_norm": 33.0, + "grad_norm_var": 4.229622395833333, + "learning_rate": 0.0001, + "loss": 7.2909, + "loss/crossentropy": 1.874044494330883, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.1757553458213806, + "step": 16650 + }, + { + "epoch": 0.4165, + "grad_norm": 33.5, + "grad_norm_var": 2.6275390625, + "learning_rate": 0.0001, + "loss": 7.1618, + "loss/crossentropy": 2.0809514716267588, + "loss/hidden": 3.323828125, + "loss/jsd": 0.0, + "loss/logits": 0.18690248318016528, + "step": 16660 + }, + { + "epoch": 0.41675, + "grad_norm": 30.25, + "grad_norm_var": 1.6587890625, + "learning_rate": 0.0001, + "loss": 7.5271, + "loss/crossentropy": 2.1389145731925963, + "loss/hidden": 3.408203125, + "loss/jsd": 0.0, + "loss/logits": 0.19053335022181273, + "step": 16670 + }, + { + "epoch": 0.417, + "grad_norm": 31.125, + "grad_norm_var": 1.4676432291666666, + "learning_rate": 0.0001, + "loss": 7.2686, + "loss/crossentropy": 1.9298078820109368, + "loss/hidden": 3.4265625, + "loss/jsd": 0.0, + "loss/logits": 0.1739122910425067, + "step": 16680 + }, + { + "epoch": 0.41725, + "grad_norm": 29.0, + "grad_norm_var": 1.7457682291666667, + "learning_rate": 0.0001, + "loss": 7.2261, + "loss/crossentropy": 1.8423747673630715, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.1881049584597349, + "step": 16690 + }, + { + "epoch": 0.4175, + "grad_norm": 29.25, + "grad_norm_var": 2.6733723958333333, + "learning_rate": 0.0001, + "loss": 7.3344, + "loss/crossentropy": 2.116480963677168, + "loss/hidden": 3.32734375, + "loss/jsd": 0.0, + "loss/logits": 0.20197081826627256, + "step": 16700 + }, + { + "epoch": 0.41775, + "grad_norm": 31.875, + "grad_norm_var": 1.7705729166666666, + "learning_rate": 0.0001, + "loss": 7.3253, + "loss/crossentropy": 2.0403254240751267, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.17736360374838114, + "step": 16710 + }, + { + "epoch": 0.418, + "grad_norm": 30.875, + "grad_norm_var": 3.7622395833333333, + "learning_rate": 0.0001, + "loss": 7.37, + "loss/crossentropy": 2.1124753206968307, + "loss/hidden": 3.284765625, + "loss/jsd": 0.0, + "loss/logits": 0.17344876211136578, + "step": 16720 + }, + { + "epoch": 0.41825, + "grad_norm": 30.25, + "grad_norm_var": 4.176497395833334, + "learning_rate": 0.0001, + "loss": 7.4396, + "loss/crossentropy": 2.163897532224655, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.20297291725873948, + "step": 16730 + }, + { + "epoch": 0.4185, + "grad_norm": 27.75, + "grad_norm_var": 6.505989583333333, + "learning_rate": 0.0001, + "loss": 7.1872, + "loss/crossentropy": 2.2656417414546013, + "loss/hidden": 3.297265625, + "loss/jsd": 0.0, + "loss/logits": 0.18971311207860708, + "step": 16740 + }, + { + "epoch": 0.41875, + "grad_norm": 29.25, + "grad_norm_var": 5.057291666666667, + "learning_rate": 0.0001, + "loss": 7.3375, + "loss/crossentropy": 2.0005149811506273, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.1829688923433423, + "step": 16750 + }, + { + "epoch": 0.419, + "grad_norm": 28.875, + "grad_norm_var": 2.914322916666667, + "learning_rate": 0.0001, + "loss": 7.2354, + "loss/crossentropy": 2.1137810915708544, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.19180109687149524, + "step": 16760 + }, + { + "epoch": 0.41925, + "grad_norm": 34.25, + "grad_norm_var": 3.4905598958333335, + "learning_rate": 0.0001, + "loss": 7.2489, + "loss/crossentropy": 2.1259815394878387, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.18057906962931156, + "step": 16770 + }, + { + "epoch": 0.4195, + "grad_norm": 32.25, + "grad_norm_var": 2.396875, + "learning_rate": 0.0001, + "loss": 7.2524, + "loss/crossentropy": 1.9959501296281814, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.18154409490525722, + "step": 16780 + }, + { + "epoch": 0.41975, + "grad_norm": 32.25, + "grad_norm_var": 3.814322916666667, + "learning_rate": 0.0001, + "loss": 7.1953, + "loss/crossentropy": 2.0803288385272025, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.17842694241553544, + "step": 16790 + }, + { + "epoch": 0.42, + "grad_norm": 34.0, + "grad_norm_var": 4.6087890625, + "learning_rate": 0.0001, + "loss": 7.3088, + "loss/crossentropy": 2.1200789496302606, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.18628805093467235, + "step": 16800 + }, + { + "epoch": 0.42025, + "grad_norm": 29.875, + "grad_norm_var": 2.8666666666666667, + "learning_rate": 0.0001, + "loss": 7.3824, + "loss/crossentropy": 2.1946793287992477, + "loss/hidden": 3.415234375, + "loss/jsd": 0.0, + "loss/logits": 0.18889498952776193, + "step": 16810 + }, + { + "epoch": 0.4205, + "grad_norm": 28.875, + "grad_norm_var": 2.343489583333333, + "learning_rate": 0.0001, + "loss": 7.2866, + "loss/crossentropy": 2.147686219215393, + "loss/hidden": 3.257421875, + "loss/jsd": 0.0, + "loss/logits": 0.17540164943784475, + "step": 16820 + }, + { + "epoch": 0.42075, + "grad_norm": 28.5, + "grad_norm_var": 2.631705729166667, + "learning_rate": 0.0001, + "loss": 7.3552, + "loss/crossentropy": 2.2283954128623007, + "loss/hidden": 3.301953125, + "loss/jsd": 0.0, + "loss/logits": 0.1934325246140361, + "step": 16830 + }, + { + "epoch": 0.421, + "grad_norm": 28.75, + "grad_norm_var": 1.9114583333333333, + "learning_rate": 0.0001, + "loss": 7.285, + "loss/crossentropy": 2.0364210322499274, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.18664334248751402, + "step": 16840 + }, + { + "epoch": 0.42125, + "grad_norm": 31.0, + "grad_norm_var": 1.8322916666666667, + "learning_rate": 0.0001, + "loss": 7.3212, + "loss/crossentropy": 2.166780227422714, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.18862007781863213, + "step": 16850 + }, + { + "epoch": 0.4215, + "grad_norm": 28.375, + "grad_norm_var": 1.6910807291666667, + "learning_rate": 0.0001, + "loss": 7.2594, + "loss/crossentropy": 1.9404787957668304, + "loss/hidden": 3.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.1732590550556779, + "step": 16860 + }, + { + "epoch": 0.42175, + "grad_norm": 31.875, + "grad_norm_var": 1.4895182291666667, + "learning_rate": 0.0001, + "loss": 7.2446, + "loss/crossentropy": 2.146845671534538, + "loss/hidden": 3.340625, + "loss/jsd": 0.0, + "loss/logits": 0.19646301120519638, + "step": 16870 + }, + { + "epoch": 0.422, + "grad_norm": 32.75, + "grad_norm_var": 3.0452473958333335, + "learning_rate": 0.0001, + "loss": 7.3155, + "loss/crossentropy": 2.0759237363934515, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.2006880532950163, + "step": 16880 + }, + { + "epoch": 0.42225, + "grad_norm": 33.75, + "grad_norm_var": 2.7358723958333333, + "learning_rate": 0.0001, + "loss": 7.2928, + "loss/crossentropy": 2.1153281182050705, + "loss/hidden": 3.35546875, + "loss/jsd": 0.0, + "loss/logits": 0.1919462438672781, + "step": 16890 + }, + { + "epoch": 0.4225, + "grad_norm": 34.0, + "grad_norm_var": 3.1927083333333335, + "learning_rate": 0.0001, + "loss": 7.3425, + "loss/crossentropy": 2.128311688452959, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.1898970402777195, + "step": 16900 + }, + { + "epoch": 0.42275, + "grad_norm": 31.5, + "grad_norm_var": 2.2884765625, + "learning_rate": 0.0001, + "loss": 7.3652, + "loss/crossentropy": 2.14291021078825, + "loss/hidden": 3.300390625, + "loss/jsd": 0.0, + "loss/logits": 0.1799526395276189, + "step": 16910 + }, + { + "epoch": 0.423, + "grad_norm": 35.75, + "grad_norm_var": 2.9853515625, + "learning_rate": 0.0001, + "loss": 7.374, + "loss/crossentropy": 2.172788438200951, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.19616848770529033, + "step": 16920 + }, + { + "epoch": 0.42325, + "grad_norm": 32.5, + "grad_norm_var": 3.1434895833333334, + "learning_rate": 0.0001, + "loss": 7.256, + "loss/crossentropy": 1.9609168127179146, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.19055727850645782, + "step": 16930 + }, + { + "epoch": 0.4235, + "grad_norm": 31.5, + "grad_norm_var": 1.7197265625, + "learning_rate": 0.0001, + "loss": 7.337, + "loss/crossentropy": 2.3514596074819565, + "loss/hidden": 3.26875, + "loss/jsd": 0.0, + "loss/logits": 0.18605006970465182, + "step": 16940 + }, + { + "epoch": 0.42375, + "grad_norm": 32.0, + "grad_norm_var": 2.0171223958333333, + "learning_rate": 0.0001, + "loss": 7.3496, + "loss/crossentropy": 2.125219741463661, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.19694697633385658, + "step": 16950 + }, + { + "epoch": 0.424, + "grad_norm": 30.875, + "grad_norm_var": 2.827018229166667, + "learning_rate": 0.0001, + "loss": 7.2494, + "loss/crossentropy": 2.1004397854208947, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.19005921352654695, + "step": 16960 + }, + { + "epoch": 0.42425, + "grad_norm": 31.625, + "grad_norm_var": 2.16875, + "learning_rate": 0.0001, + "loss": 7.3353, + "loss/crossentropy": 2.2350980192422867, + "loss/hidden": 3.389453125, + "loss/jsd": 0.0, + "loss/logits": 0.20827819555997848, + "step": 16970 + }, + { + "epoch": 0.4245, + "grad_norm": 31.125, + "grad_norm_var": 1.7139973958333334, + "learning_rate": 0.0001, + "loss": 7.2948, + "loss/crossentropy": 2.057795172929764, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.18584750667214395, + "step": 16980 + }, + { + "epoch": 0.42475, + "grad_norm": 33.5, + "grad_norm_var": 9.813541666666667, + "learning_rate": 0.0001, + "loss": 7.2737, + "loss/crossentropy": 1.9335192561149597, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.17950817625969648, + "step": 16990 + }, + { + "epoch": 0.425, + "grad_norm": 30.625, + "grad_norm_var": 8.45, + "learning_rate": 0.0001, + "loss": 7.2137, + "loss/crossentropy": 2.158860814571381, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.19252809565514326, + "step": 17000 + }, + { + "epoch": 0.42525, + "grad_norm": 34.75, + "grad_norm_var": 3.940559895833333, + "learning_rate": 0.0001, + "loss": 7.3198, + "loss/crossentropy": 1.9835775896906853, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.18570326836779713, + "step": 17010 + }, + { + "epoch": 0.4255, + "grad_norm": 30.625, + "grad_norm_var": 13.29140625, + "learning_rate": 0.0001, + "loss": 7.4143, + "loss/crossentropy": 1.9494660779833795, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.2034120644442737, + "step": 17020 + }, + { + "epoch": 0.42575, + "grad_norm": 33.25, + "grad_norm_var": 2.7135416666666665, + "learning_rate": 0.0001, + "loss": 7.2964, + "loss/crossentropy": 2.1227547653019427, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.18930197739973664, + "step": 17030 + }, + { + "epoch": 0.426, + "grad_norm": 31.0, + "grad_norm_var": 2.07890625, + "learning_rate": 0.0001, + "loss": 7.2105, + "loss/crossentropy": 1.9439098060131073, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.1776129573583603, + "step": 17040 + }, + { + "epoch": 0.42625, + "grad_norm": 43.25, + "grad_norm_var": 25.977018229166667, + "learning_rate": 0.0001, + "loss": 7.3636, + "loss/crossentropy": 2.1215348944067953, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.19092475529760122, + "step": 17050 + }, + { + "epoch": 0.4265, + "grad_norm": 32.75, + "grad_norm_var": 15.020572916666667, + "learning_rate": 0.0001, + "loss": 7.3927, + "loss/crossentropy": 2.0381774201989176, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.18921099193394184, + "step": 17060 + }, + { + "epoch": 0.42675, + "grad_norm": 34.0, + "grad_norm_var": 3.159309895833333, + "learning_rate": 0.0001, + "loss": 7.2753, + "loss/crossentropy": 2.053529141843319, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.18555910978466272, + "step": 17070 + }, + { + "epoch": 0.427, + "grad_norm": 28.875, + "grad_norm_var": 10.44140625, + "learning_rate": 0.0001, + "loss": 7.3331, + "loss/crossentropy": 2.0321754805743693, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.17106994595378638, + "step": 17080 + }, + { + "epoch": 0.42725, + "grad_norm": 35.75, + "grad_norm_var": 11.438541666666667, + "learning_rate": 0.0001, + "loss": 7.2905, + "loss/crossentropy": 1.9504701748490334, + "loss/hidden": 3.262890625, + "loss/jsd": 0.0, + "loss/logits": 0.17157490737736225, + "step": 17090 + }, + { + "epoch": 0.4275, + "grad_norm": 30.375, + "grad_norm_var": 3.651497395833333, + "learning_rate": 0.0001, + "loss": 7.4396, + "loss/crossentropy": 2.1505665600299837, + "loss/hidden": 3.490625, + "loss/jsd": 0.0, + "loss/logits": 0.20605666618794202, + "step": 17100 + }, + { + "epoch": 0.42775, + "grad_norm": 29.875, + "grad_norm_var": 1.6688651155212861e+18, + "learning_rate": 0.0001, + "loss": 7.3328, + "loss/crossentropy": 2.0673328816890715, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.17912605479359628, + "step": 17110 + }, + { + "epoch": 0.428, + "grad_norm": 31.0, + "grad_norm_var": 1.7436848958333333, + "learning_rate": 0.0001, + "loss": 7.3111, + "loss/crossentropy": 2.0934616670012476, + "loss/hidden": 3.43203125, + "loss/jsd": 0.0, + "loss/logits": 0.19241752084344627, + "step": 17120 + }, + { + "epoch": 0.42825, + "grad_norm": 30.375, + "grad_norm_var": 2.4580729166666666, + "learning_rate": 0.0001, + "loss": 7.2911, + "loss/crossentropy": 2.0029770001769065, + "loss/hidden": 3.325390625, + "loss/jsd": 0.0, + "loss/logits": 0.1792398665100336, + "step": 17130 + }, + { + "epoch": 0.4285, + "grad_norm": 31.375, + "grad_norm_var": 3.4184895833333333, + "learning_rate": 0.0001, + "loss": 7.3235, + "loss/crossentropy": 2.232249027490616, + "loss/hidden": 3.354296875, + "loss/jsd": 0.0, + "loss/logits": 0.1950904905796051, + "step": 17140 + }, + { + "epoch": 0.42875, + "grad_norm": 31.0, + "grad_norm_var": 5.405143229166667, + "learning_rate": 0.0001, + "loss": 7.3339, + "loss/crossentropy": 2.1042421609163284, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.18525240514427424, + "step": 17150 + }, + { + "epoch": 0.429, + "grad_norm": 29.625, + "grad_norm_var": 4.44765625, + "learning_rate": 0.0001, + "loss": 7.3841, + "loss/crossentropy": 2.0857191674411295, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.18139904439449311, + "step": 17160 + }, + { + "epoch": 0.42925, + "grad_norm": 26.75, + "grad_norm_var": 9.897330729166667, + "learning_rate": 0.0001, + "loss": 7.2916, + "loss/crossentropy": 1.9972345970571042, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.19361406620591878, + "step": 17170 + }, + { + "epoch": 0.4295, + "grad_norm": 33.0, + "grad_norm_var": 11.613997395833334, + "learning_rate": 0.0001, + "loss": 7.3454, + "loss/crossentropy": 2.0664907500147818, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.18069064132869245, + "step": 17180 + }, + { + "epoch": 0.42975, + "grad_norm": 31.375, + "grad_norm_var": 3.7225065363373645e+18, + "learning_rate": 0.0001, + "loss": 7.3382, + "loss/crossentropy": 2.174539531767368, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.1991014001891017, + "step": 17190 + }, + { + "epoch": 0.43, + "grad_norm": 31.75, + "grad_norm_var": 3.0708333333333333, + "learning_rate": 0.0001, + "loss": 7.3096, + "loss/crossentropy": 1.952410513907671, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.1856127019971609, + "step": 17200 + }, + { + "epoch": 0.43025, + "grad_norm": 38.75, + "grad_norm_var": 8.0275390625, + "learning_rate": 0.0001, + "loss": 7.4419, + "loss/crossentropy": 2.128667399287224, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.2061656942591071, + "step": 17210 + }, + { + "epoch": 0.4305, + "grad_norm": 30.125, + "grad_norm_var": 8.792708333333334, + "learning_rate": 0.0001, + "loss": 7.2925, + "loss/crossentropy": 1.969744972884655, + "loss/hidden": 3.486328125, + "loss/jsd": 0.0, + "loss/logits": 0.2011998776346445, + "step": 17220 + }, + { + "epoch": 0.43075, + "grad_norm": 49.75, + "grad_norm_var": 25.044791666666665, + "learning_rate": 0.0001, + "loss": 7.3512, + "loss/crossentropy": 2.09512839615345, + "loss/hidden": 3.272265625, + "loss/jsd": 0.0, + "loss/logits": 0.17890227846801282, + "step": 17230 + }, + { + "epoch": 0.431, + "grad_norm": 29.625, + "grad_norm_var": 25.836458333333333, + "learning_rate": 0.0001, + "loss": 7.3174, + "loss/crossentropy": 2.2069885596632957, + "loss/hidden": 3.38046875, + "loss/jsd": 0.0, + "loss/logits": 0.20973877012729644, + "step": 17240 + }, + { + "epoch": 0.43125, + "grad_norm": 31.875, + "grad_norm_var": 2.7309405650762245e+18, + "learning_rate": 0.0001, + "loss": 7.4689, + "loss/crossentropy": 2.197174680233002, + "loss/hidden": 3.248828125, + "loss/jsd": 0.0, + "loss/logits": 0.184923891723156, + "step": 17250 + }, + { + "epoch": 0.4315, + "grad_norm": 28.0, + "grad_norm_var": 2.730940565530678e+18, + "learning_rate": 0.0001, + "loss": 7.2989, + "loss/crossentropy": 2.08426301702857, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.18929961118847133, + "step": 17260 + }, + { + "epoch": 0.43175, + "grad_norm": 31.75, + "grad_norm_var": 9.408268229166667, + "learning_rate": 0.0001, + "loss": 7.3047, + "loss/crossentropy": 1.9928709417581558, + "loss/hidden": 3.44375, + "loss/jsd": 0.0, + "loss/logits": 0.19445511922240258, + "step": 17270 + }, + { + "epoch": 0.432, + "grad_norm": 41.25, + "grad_norm_var": 17.242708333333333, + "learning_rate": 0.0001, + "loss": 7.2872, + "loss/crossentropy": 1.9035924103111028, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.18102165032178164, + "step": 17280 + }, + { + "epoch": 0.43225, + "grad_norm": 28.125, + "grad_norm_var": 14.945768229166667, + "learning_rate": 0.0001, + "loss": 7.3079, + "loss/crossentropy": 2.2340116620063784, + "loss/hidden": 3.29921875, + "loss/jsd": 0.0, + "loss/logits": 0.19132067933678626, + "step": 17290 + }, + { + "epoch": 0.4325, + "grad_norm": 29.5, + "grad_norm_var": 8.06875, + "learning_rate": 0.0001, + "loss": 7.2915, + "loss/crossentropy": 2.0194656267762183, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.19431380424648523, + "step": 17300 + }, + { + "epoch": 0.43275, + "grad_norm": 30.875, + "grad_norm_var": 8.566666666666666, + "learning_rate": 0.0001, + "loss": 7.261, + "loss/crossentropy": 2.001611000299454, + "loss/hidden": 3.28828125, + "loss/jsd": 0.0, + "loss/logits": 0.17275465354323388, + "step": 17310 + }, + { + "epoch": 0.433, + "grad_norm": 35.25, + "grad_norm_var": 13.3775390625, + "learning_rate": 0.0001, + "loss": 7.3377, + "loss/crossentropy": 2.114110495150089, + "loss/hidden": 3.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.20012194942682981, + "step": 17320 + }, + { + "epoch": 0.43325, + "grad_norm": 39.0, + "grad_norm_var": 44.28958333333333, + "learning_rate": 0.0001, + "loss": 7.3729, + "loss/crossentropy": 2.2978450536727903, + "loss/hidden": 3.27734375, + "loss/jsd": 0.0, + "loss/logits": 0.18779626674950123, + "step": 17330 + }, + { + "epoch": 0.4335, + "grad_norm": 32.0, + "grad_norm_var": 33.8791015625, + "learning_rate": 0.0001, + "loss": 7.3487, + "loss/crossentropy": 2.0877898022532464, + "loss/hidden": 3.296484375, + "loss/jsd": 0.0, + "loss/logits": 0.18020253200083972, + "step": 17340 + }, + { + "epoch": 0.43375, + "grad_norm": 28.25, + "grad_norm_var": 6.508333333333334, + "learning_rate": 0.0001, + "loss": 7.2337, + "loss/crossentropy": 1.9320446044206618, + "loss/hidden": 3.407421875, + "loss/jsd": 0.0, + "loss/logits": 0.18643499026075006, + "step": 17350 + }, + { + "epoch": 0.434, + "grad_norm": 38.25, + "grad_norm_var": 9.514322916666666, + "learning_rate": 0.0001, + "loss": 7.3395, + "loss/crossentropy": 2.033699007332325, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.18971308488398791, + "step": 17360 + }, + { + "epoch": 0.43425, + "grad_norm": 30.0, + "grad_norm_var": 6.888997395833333, + "learning_rate": 0.0001, + "loss": 7.353, + "loss/crossentropy": 2.1282121136784555, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.1824582252651453, + "step": 17370 + }, + { + "epoch": 0.4345, + "grad_norm": 30.5, + "grad_norm_var": 4.592708333333333, + "learning_rate": 0.0001, + "loss": 7.2093, + "loss/crossentropy": 2.1514922633767126, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.19244836810976268, + "step": 17380 + }, + { + "epoch": 0.43475, + "grad_norm": 31.0, + "grad_norm_var": 8.587434895833333, + "learning_rate": 0.0001, + "loss": 7.28, + "loss/crossentropy": 1.9829683952033519, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.1997856667265296, + "step": 17390 + }, + { + "epoch": 0.435, + "grad_norm": 30.25, + "grad_norm_var": 12.732747395833334, + "learning_rate": 0.0001, + "loss": 7.327, + "loss/crossentropy": 2.0963875532150267, + "loss/hidden": 3.2953125, + "loss/jsd": 0.0, + "loss/logits": 0.18260720651596785, + "step": 17400 + }, + { + "epoch": 0.43525, + "grad_norm": 39.25, + "grad_norm_var": 13.623893229166667, + "learning_rate": 0.0001, + "loss": 7.2851, + "loss/crossentropy": 2.1481306463479997, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.18422619421035052, + "step": 17410 + }, + { + "epoch": 0.4355, + "grad_norm": 35.25, + "grad_norm_var": 33.15514322916667, + "learning_rate": 0.0001, + "loss": 7.2614, + "loss/crossentropy": 1.94967889636755, + "loss/hidden": 3.31015625, + "loss/jsd": 0.0, + "loss/logits": 0.16873782277107238, + "step": 17420 + }, + { + "epoch": 0.43575, + "grad_norm": 30.25, + "grad_norm_var": 19.037239583333335, + "learning_rate": 0.0001, + "loss": 7.2782, + "loss/crossentropy": 2.3510662406682967, + "loss/hidden": 3.228515625, + "loss/jsd": 0.0, + "loss/logits": 0.18572253845632075, + "step": 17430 + }, + { + "epoch": 0.436, + "grad_norm": 28.375, + "grad_norm_var": 1.3291666666666666, + "learning_rate": 0.0001, + "loss": 7.3387, + "loss/crossentropy": 2.1440586492419245, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.1921809207648039, + "step": 17440 + }, + { + "epoch": 0.43625, + "grad_norm": 30.0, + "grad_norm_var": 1.2837890625, + "learning_rate": 0.0001, + "loss": 7.2998, + "loss/crossentropy": 2.045154483616352, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.20081294253468512, + "step": 17450 + }, + { + "epoch": 0.4365, + "grad_norm": 35.75, + "grad_norm_var": 4.692643229166666, + "learning_rate": 0.0001, + "loss": 7.2213, + "loss/crossentropy": 1.9950920060276984, + "loss/hidden": 3.28828125, + "loss/jsd": 0.0, + "loss/logits": 0.18348362799733878, + "step": 17460 + }, + { + "epoch": 0.43675, + "grad_norm": 28.625, + "grad_norm_var": 47.77890625, + "learning_rate": 0.0001, + "loss": 7.2084, + "loss/crossentropy": 2.0935964420437814, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.1935350162908435, + "step": 17470 + }, + { + "epoch": 0.437, + "grad_norm": 30.375, + "grad_norm_var": 51.95462239583333, + "learning_rate": 0.0001, + "loss": 7.3469, + "loss/crossentropy": 2.2382303804159163, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.2008738409727812, + "step": 17480 + }, + { + "epoch": 0.43725, + "grad_norm": 30.75, + "grad_norm_var": 7.177083333333333, + "learning_rate": 0.0001, + "loss": 7.2088, + "loss/crossentropy": 2.0825397919863464, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.19918248979374767, + "step": 17490 + }, + { + "epoch": 0.4375, + "grad_norm": 31.0, + "grad_norm_var": 41.16015625, + "learning_rate": 0.0001, + "loss": 7.3124, + "loss/crossentropy": 2.1086046427488325, + "loss/hidden": 3.40703125, + "loss/jsd": 0.0, + "loss/logits": 0.2059457702562213, + "step": 17500 + }, + { + "epoch": 0.43775, + "grad_norm": 30.25, + "grad_norm_var": 41.1275390625, + "learning_rate": 0.0001, + "loss": 7.3142, + "loss/crossentropy": 2.1939937084913255, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.19669633246958257, + "step": 17510 + }, + { + "epoch": 0.438, + "grad_norm": 30.0, + "grad_norm_var": 5.426822916666667, + "learning_rate": 0.0001, + "loss": 7.4971, + "loss/crossentropy": 2.077922207117081, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.19905825443565844, + "step": 17520 + }, + { + "epoch": 0.43825, + "grad_norm": 35.0, + "grad_norm_var": 5.139518229166667, + "learning_rate": 0.0001, + "loss": 7.3389, + "loss/crossentropy": 2.2604495763778685, + "loss/hidden": 3.26796875, + "loss/jsd": 0.0, + "loss/logits": 0.18292145486921071, + "step": 17530 + }, + { + "epoch": 0.4385, + "grad_norm": 29.125, + "grad_norm_var": 1.9483723958333334, + "learning_rate": 0.0001, + "loss": 7.3261, + "loss/crossentropy": 2.033947843313217, + "loss/hidden": 3.444140625, + "loss/jsd": 0.0, + "loss/logits": 0.18323329892009496, + "step": 17540 + }, + { + "epoch": 0.43875, + "grad_norm": 30.625, + "grad_norm_var": 0.8455729166666667, + "learning_rate": 0.0001, + "loss": 7.2981, + "loss/crossentropy": 2.2359601676464083, + "loss/hidden": 3.284765625, + "loss/jsd": 0.0, + "loss/logits": 0.18464031405746936, + "step": 17550 + }, + { + "epoch": 0.439, + "grad_norm": 40.5, + "grad_norm_var": 2.3053504042993976e+18, + "learning_rate": 0.0001, + "loss": 7.3321, + "loss/crossentropy": 1.9767519772052764, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.17584269326180219, + "step": 17560 + }, + { + "epoch": 0.43925, + "grad_norm": 29.5, + "grad_norm_var": 2.3053504048877535e+18, + "learning_rate": 0.0001, + "loss": 7.2311, + "loss/crossentropy": 2.08755609691143, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.18783280439674854, + "step": 17570 + }, + { + "epoch": 0.4395, + "grad_norm": 30.875, + "grad_norm_var": 2.872330729166667, + "learning_rate": 0.0001, + "loss": 7.2714, + "loss/crossentropy": 2.1911805018782617, + "loss/hidden": 3.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.19903278667479754, + "step": 17580 + }, + { + "epoch": 0.43975, + "grad_norm": 28.0, + "grad_norm_var": 16.263997395833332, + "learning_rate": 0.0001, + "loss": 7.4407, + "loss/crossentropy": 2.1172087833285334, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.1971781937405467, + "step": 17590 + }, + { + "epoch": 0.44, + "grad_norm": 31.25, + "grad_norm_var": 5.95390625, + "learning_rate": 0.0001, + "loss": 7.4514, + "loss/crossentropy": 2.0933860525488854, + "loss/hidden": 3.54453125, + "loss/jsd": 0.0, + "loss/logits": 0.21392161287367345, + "step": 17600 + }, + { + "epoch": 0.44025, + "grad_norm": 30.375, + "grad_norm_var": 8.245833333333334, + "learning_rate": 0.0001, + "loss": 7.3118, + "loss/crossentropy": 2.1100318849086763, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.21126667950302364, + "step": 17610 + }, + { + "epoch": 0.4405, + "grad_norm": 29.0, + "grad_norm_var": 7.802083333333333, + "learning_rate": 0.0001, + "loss": 7.2395, + "loss/crossentropy": 1.9662518702447414, + "loss/hidden": 3.31640625, + "loss/jsd": 0.0, + "loss/logits": 0.17263798629865051, + "step": 17620 + }, + { + "epoch": 0.44075, + "grad_norm": 37.75, + "grad_norm_var": 21.355208333333334, + "learning_rate": 0.0001, + "loss": 7.3091, + "loss/crossentropy": 2.1022576719522474, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.18059359192848207, + "step": 17630 + }, + { + "epoch": 0.441, + "grad_norm": 29.0, + "grad_norm_var": 18.4228515625, + "learning_rate": 0.0001, + "loss": 7.2647, + "loss/crossentropy": 2.158071845769882, + "loss/hidden": 3.255078125, + "loss/jsd": 0.0, + "loss/logits": 0.17066741604357957, + "step": 17640 + }, + { + "epoch": 0.44125, + "grad_norm": 36.5, + "grad_norm_var": 21.584309895833332, + "learning_rate": 0.0001, + "loss": 7.3432, + "loss/crossentropy": 2.017947067320347, + "loss/hidden": 3.37109375, + "loss/jsd": 0.0, + "loss/logits": 0.1873489862307906, + "step": 17650 + }, + { + "epoch": 0.4415, + "grad_norm": 33.0, + "grad_norm_var": 6.862239583333333, + "learning_rate": 0.0001, + "loss": 7.2728, + "loss/crossentropy": 2.2292807698249817, + "loss/hidden": 3.309765625, + "loss/jsd": 0.0, + "loss/logits": 0.18392078932374717, + "step": 17660 + }, + { + "epoch": 0.44175, + "grad_norm": 31.125, + "grad_norm_var": 2.849739583333333, + "learning_rate": 0.0001, + "loss": 7.1737, + "loss/crossentropy": 2.1030597440898418, + "loss/hidden": 3.316796875, + "loss/jsd": 0.0, + "loss/logits": 0.18242286536842583, + "step": 17670 + }, + { + "epoch": 0.442, + "grad_norm": 30.0, + "grad_norm_var": 3.0385416666666667, + "learning_rate": 0.0001, + "loss": 7.3015, + "loss/crossentropy": 2.0946084149181843, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.1821262989193201, + "step": 17680 + }, + { + "epoch": 0.44225, + "grad_norm": 35.0, + "grad_norm_var": 3.60390625, + "learning_rate": 0.0001, + "loss": 7.2853, + "loss/crossentropy": 2.151443210244179, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.19624145627021788, + "step": 17690 + }, + { + "epoch": 0.4425, + "grad_norm": 34.5, + "grad_norm_var": 3.2958333333333334, + "learning_rate": 0.0001, + "loss": 7.4457, + "loss/crossentropy": 2.153268966078758, + "loss/hidden": 3.287890625, + "loss/jsd": 0.0, + "loss/logits": 0.1952244239859283, + "step": 17700 + }, + { + "epoch": 0.44275, + "grad_norm": 30.5, + "grad_norm_var": 2.9791015625, + "learning_rate": 0.0001, + "loss": 7.2833, + "loss/crossentropy": 2.0192960262298585, + "loss/hidden": 3.27578125, + "loss/jsd": 0.0, + "loss/logits": 0.1719641860574484, + "step": 17710 + }, + { + "epoch": 0.443, + "grad_norm": 29.5, + "grad_norm_var": 3.4875, + "learning_rate": 0.0001, + "loss": 7.3388, + "loss/crossentropy": 2.0337466023862363, + "loss/hidden": 3.32890625, + "loss/jsd": 0.0, + "loss/logits": 0.18310964982956648, + "step": 17720 + }, + { + "epoch": 0.44325, + "grad_norm": 27.5, + "grad_norm_var": 8.132747395833333, + "learning_rate": 0.0001, + "loss": 7.2481, + "loss/crossentropy": 2.083891141414642, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.1840117072686553, + "step": 17730 + }, + { + "epoch": 0.4435, + "grad_norm": 65.0, + "grad_norm_var": 79.64270833333333, + "learning_rate": 0.0001, + "loss": 7.2557, + "loss/crossentropy": 2.193793588876724, + "loss/hidden": 3.2828125, + "loss/jsd": 0.0, + "loss/logits": 0.18301294278353453, + "step": 17740 + }, + { + "epoch": 0.44375, + "grad_norm": 31.25, + "grad_norm_var": 79.53098958333334, + "learning_rate": 0.0001, + "loss": 7.3745, + "loss/crossentropy": 2.1639930069446565, + "loss/hidden": 3.329296875, + "loss/jsd": 0.0, + "loss/logits": 0.18582290820777417, + "step": 17750 + }, + { + "epoch": 0.444, + "grad_norm": 28.875, + "grad_norm_var": 4.594791666666667, + "learning_rate": 0.0001, + "loss": 7.4009, + "loss/crossentropy": 2.0524846225976945, + "loss/hidden": 3.290234375, + "loss/jsd": 0.0, + "loss/logits": 0.18291038312017918, + "step": 17760 + }, + { + "epoch": 0.44425, + "grad_norm": 29.25, + "grad_norm_var": 5.265559895833333, + "learning_rate": 0.0001, + "loss": 7.2015, + "loss/crossentropy": 2.0137507632374763, + "loss/hidden": 3.365625, + "loss/jsd": 0.0, + "loss/logits": 0.17316031139343976, + "step": 17770 + }, + { + "epoch": 0.4445, + "grad_norm": 32.0, + "grad_norm_var": 4.96015625, + "learning_rate": 0.0001, + "loss": 7.3163, + "loss/crossentropy": 2.172424706816673, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.19778120778501035, + "step": 17780 + }, + { + "epoch": 0.44475, + "grad_norm": 29.25, + "grad_norm_var": 217.27083333333334, + "learning_rate": 0.0001, + "loss": 7.1559, + "loss/crossentropy": 2.076275750249624, + "loss/hidden": 3.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.17568769995123149, + "step": 17790 + }, + { + "epoch": 0.445, + "grad_norm": 33.25, + "grad_norm_var": 223.44368489583334, + "learning_rate": 0.0001, + "loss": 7.3438, + "loss/crossentropy": 2.270902451872826, + "loss/hidden": 3.22265625, + "loss/jsd": 0.0, + "loss/logits": 0.1801173485815525, + "step": 17800 + }, + { + "epoch": 0.44525, + "grad_norm": 39.75, + "grad_norm_var": 12.383072916666666, + "learning_rate": 0.0001, + "loss": 7.2998, + "loss/crossentropy": 2.1936209172010424, + "loss/hidden": 3.49296875, + "loss/jsd": 0.0, + "loss/logits": 0.19792029969394206, + "step": 17810 + }, + { + "epoch": 0.4455, + "grad_norm": 29.5, + "grad_norm_var": 7.808268229166667, + "learning_rate": 0.0001, + "loss": 7.3011, + "loss/crossentropy": 2.1165911227464678, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.1850597882643342, + "step": 17820 + }, + { + "epoch": 0.44575, + "grad_norm": 29.5, + "grad_norm_var": 2.218489583333333, + "learning_rate": 0.0001, + "loss": 7.2305, + "loss/crossentropy": 2.0913813322782517, + "loss/hidden": 3.26171875, + "loss/jsd": 0.0, + "loss/logits": 0.17276749946177006, + "step": 17830 + }, + { + "epoch": 0.446, + "grad_norm": 48.25, + "grad_norm_var": 40.351497395833334, + "learning_rate": 0.0001, + "loss": 7.3693, + "loss/crossentropy": 2.014899070560932, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.2062089815735817, + "step": 17840 + }, + { + "epoch": 0.44625, + "grad_norm": 32.5, + "grad_norm_var": 31.93515625, + "learning_rate": 0.0001, + "loss": 7.3685, + "loss/crossentropy": 2.048551322519779, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.17983095441013575, + "step": 17850 + }, + { + "epoch": 0.4465, + "grad_norm": 28.75, + "grad_norm_var": 30.507291666666667, + "learning_rate": 0.0001, + "loss": 7.3944, + "loss/crossentropy": 2.094169969111681, + "loss/hidden": 3.36171875, + "loss/jsd": 0.0, + "loss/logits": 0.18028030041605234, + "step": 17860 + }, + { + "epoch": 0.44675, + "grad_norm": 46.0, + "grad_norm_var": 42.3962890625, + "learning_rate": 0.0001, + "loss": 7.2828, + "loss/crossentropy": 2.001505134999752, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.19261684585362673, + "step": 17870 + }, + { + "epoch": 0.447, + "grad_norm": 29.125, + "grad_norm_var": 16.0697265625, + "learning_rate": 0.0001, + "loss": 7.2928, + "loss/crossentropy": 2.034270917624235, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.17677650935947894, + "step": 17880 + }, + { + "epoch": 0.44725, + "grad_norm": 30.5, + "grad_norm_var": 7.384830729166667, + "learning_rate": 0.0001, + "loss": 7.2414, + "loss/crossentropy": 2.177957884967327, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.20077546443790198, + "step": 17890 + }, + { + "epoch": 0.4475, + "grad_norm": 26.5, + "grad_norm_var": 19.184309895833334, + "learning_rate": 0.0001, + "loss": 7.2819, + "loss/crossentropy": 2.130458039045334, + "loss/hidden": 3.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.19989439938217402, + "step": 17900 + }, + { + "epoch": 0.44775, + "grad_norm": 35.5, + "grad_norm_var": 44.08515625, + "learning_rate": 0.0001, + "loss": 7.1854, + "loss/crossentropy": 2.1698092609643935, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.18095692303031682, + "step": 17910 + }, + { + "epoch": 0.448, + "grad_norm": 38.25, + "grad_norm_var": 44.31041666666667, + "learning_rate": 0.0001, + "loss": 7.3364, + "loss/crossentropy": 2.06129896491766, + "loss/hidden": 3.30859375, + "loss/jsd": 0.0, + "loss/logits": 0.18485308829694985, + "step": 17920 + }, + { + "epoch": 0.44825, + "grad_norm": 37.25, + "grad_norm_var": 30.994205729166666, + "learning_rate": 0.0001, + "loss": 7.3095, + "loss/crossentropy": 1.9626867271959783, + "loss/hidden": 3.460546875, + "loss/jsd": 0.0, + "loss/logits": 0.18522640708833932, + "step": 17930 + }, + { + "epoch": 0.4485, + "grad_norm": 38.75, + "grad_norm_var": 14.283072916666667, + "learning_rate": 0.0001, + "loss": 7.3206, + "loss/crossentropy": 2.061312362551689, + "loss/hidden": 3.4265625, + "loss/jsd": 0.0, + "loss/logits": 0.18516826909035444, + "step": 17940 + }, + { + "epoch": 0.44875, + "grad_norm": 39.25, + "grad_norm_var": 12.702083333333333, + "learning_rate": 0.0001, + "loss": 7.2801, + "loss/crossentropy": 2.0153495140373705, + "loss/hidden": 3.389453125, + "loss/jsd": 0.0, + "loss/logits": 0.1914671964943409, + "step": 17950 + }, + { + "epoch": 0.449, + "grad_norm": 30.75, + "grad_norm_var": 10.4650390625, + "learning_rate": 0.0001, + "loss": 7.3371, + "loss/crossentropy": 2.0574869602918624, + "loss/hidden": 3.451953125, + "loss/jsd": 0.0, + "loss/logits": 0.20203660018742084, + "step": 17960 + }, + { + "epoch": 0.44925, + "grad_norm": 31.0, + "grad_norm_var": 7.4431640625, + "learning_rate": 0.0001, + "loss": 7.301, + "loss/crossentropy": 2.1529300838708876, + "loss/hidden": 3.304296875, + "loss/jsd": 0.0, + "loss/logits": 0.18314522188156843, + "step": 17970 + }, + { + "epoch": 0.4495, + "grad_norm": 29.375, + "grad_norm_var": 28.781705729166667, + "learning_rate": 0.0001, + "loss": 7.286, + "loss/crossentropy": 2.2260805390775205, + "loss/hidden": 3.29296875, + "loss/jsd": 0.0, + "loss/logits": 0.18513377662748098, + "step": 17980 + }, + { + "epoch": 0.44975, + "grad_norm": 29.5, + "grad_norm_var": 16.774739583333332, + "learning_rate": 0.0001, + "loss": 7.4066, + "loss/crossentropy": 1.9546913027763366, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.18548114076256753, + "step": 17990 + }, + { + "epoch": 0.45, + "grad_norm": 31.875, + "grad_norm_var": 14.9619140625, + "learning_rate": 0.0001, + "loss": 7.2574, + "loss/crossentropy": 2.0575848095119, + "loss/hidden": 3.268359375, + "loss/jsd": 0.0, + "loss/logits": 0.17614138405770063, + "step": 18000 + }, + { + "epoch": 0.45025, + "grad_norm": 36.5, + "grad_norm_var": 9.5603515625, + "learning_rate": 0.0001, + "loss": 7.3801, + "loss/crossentropy": 2.0923216179013253, + "loss/hidden": 3.265234375, + "loss/jsd": 0.0, + "loss/logits": 0.180926413834095, + "step": 18010 + }, + { + "epoch": 0.4505, + "grad_norm": 27.25, + "grad_norm_var": 13.5884765625, + "learning_rate": 0.0001, + "loss": 7.3101, + "loss/crossentropy": 2.0365143597126005, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.1913832610473037, + "step": 18020 + }, + { + "epoch": 0.45075, + "grad_norm": 31.875, + "grad_norm_var": 16.5962890625, + "learning_rate": 0.0001, + "loss": 7.2971, + "loss/crossentropy": 2.0645697891712187, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.19654379645362496, + "step": 18030 + }, + { + "epoch": 0.451, + "grad_norm": 29.5, + "grad_norm_var": 11.794205729166666, + "learning_rate": 0.0001, + "loss": 7.2328, + "loss/crossentropy": 2.0605416260659695, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.18139277547597885, + "step": 18040 + }, + { + "epoch": 0.45125, + "grad_norm": 32.0, + "grad_norm_var": 75.465625, + "learning_rate": 0.0001, + "loss": 7.2392, + "loss/crossentropy": 2.083356872946024, + "loss/hidden": 3.34296875, + "loss/jsd": 0.0, + "loss/logits": 0.2048046118579805, + "step": 18050 + }, + { + "epoch": 0.4515, + "grad_norm": 36.5, + "grad_norm_var": 35.063541666666666, + "learning_rate": 0.0001, + "loss": 7.2963, + "loss/crossentropy": 1.9495014771819115, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.17369234804064035, + "step": 18060 + }, + { + "epoch": 0.45175, + "grad_norm": 37.5, + "grad_norm_var": 10.093684895833333, + "learning_rate": 0.0001, + "loss": 7.2651, + "loss/crossentropy": 2.124495878815651, + "loss/hidden": 3.360546875, + "loss/jsd": 0.0, + "loss/logits": 0.19412798956036567, + "step": 18070 + }, + { + "epoch": 0.452, + "grad_norm": 26.5, + "grad_norm_var": 18.055208333333333, + "learning_rate": 0.0001, + "loss": 7.4276, + "loss/crossentropy": 2.1983793139457704, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.19716158043593168, + "step": 18080 + }, + { + "epoch": 0.45225, + "grad_norm": 29.0, + "grad_norm_var": 16.8509765625, + "learning_rate": 0.0001, + "loss": 7.2958, + "loss/crossentropy": 2.0635679766535757, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.19352352600544692, + "step": 18090 + }, + { + "epoch": 0.4525, + "grad_norm": 31.875, + "grad_norm_var": 4.707291666666666, + "learning_rate": 0.0001, + "loss": 7.3162, + "loss/crossentropy": 2.1419614136219023, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.1909960376098752, + "step": 18100 + }, + { + "epoch": 0.45275, + "grad_norm": 28.0, + "grad_norm_var": 3.0768229166666665, + "learning_rate": 0.0001, + "loss": 7.2917, + "loss/crossentropy": 2.0669834718108175, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.18134585060179234, + "step": 18110 + }, + { + "epoch": 0.453, + "grad_norm": 29.0, + "grad_norm_var": 4.955989583333333, + "learning_rate": 0.0001, + "loss": 7.3518, + "loss/crossentropy": 2.2067804619669915, + "loss/hidden": 3.304296875, + "loss/jsd": 0.0, + "loss/logits": 0.18476400952786207, + "step": 18120 + }, + { + "epoch": 0.45325, + "grad_norm": 30.625, + "grad_norm_var": 11.580989583333333, + "learning_rate": 0.0001, + "loss": 7.2172, + "loss/crossentropy": 2.027407840639353, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.17775150025263428, + "step": 18130 + }, + { + "epoch": 0.4535, + "grad_norm": 34.5, + "grad_norm_var": 12.3259765625, + "learning_rate": 0.0001, + "loss": 7.2658, + "loss/crossentropy": 2.2659630328416824, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.1918324466794729, + "step": 18140 + }, + { + "epoch": 0.45375, + "grad_norm": 28.75, + "grad_norm_var": 5.1869140625, + "learning_rate": 0.0001, + "loss": 7.2627, + "loss/crossentropy": 2.1039141535758974, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.18257359908893706, + "step": 18150 + }, + { + "epoch": 0.454, + "grad_norm": 28.125, + "grad_norm_var": 1.4462890625, + "learning_rate": 0.0001, + "loss": 7.2659, + "loss/crossentropy": 2.1540609017014503, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.19397172834724188, + "step": 18160 + }, + { + "epoch": 0.45425, + "grad_norm": 28.625, + "grad_norm_var": 3.134375, + "learning_rate": 0.0001, + "loss": 7.2332, + "loss/crossentropy": 2.0309683278203012, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.19702901802957057, + "step": 18170 + }, + { + "epoch": 0.4545, + "grad_norm": 27.875, + "grad_norm_var": 3.4368798057619784e+18, + "learning_rate": 0.0001, + "loss": 7.3245, + "loss/crossentropy": 1.9289076425135137, + "loss/hidden": 3.287890625, + "loss/jsd": 0.0, + "loss/logits": 0.162909129075706, + "step": 18180 + }, + { + "epoch": 0.45475, + "grad_norm": 29.75, + "grad_norm_var": 6.216666666666667, + "learning_rate": 0.0001, + "loss": 7.2699, + "loss/crossentropy": 2.1170002311468124, + "loss/hidden": 3.242578125, + "loss/jsd": 0.0, + "loss/logits": 0.17830445962026714, + "step": 18190 + }, + { + "epoch": 0.455, + "grad_norm": 31.5, + "grad_norm_var": 1.5801432291666666, + "learning_rate": 0.0001, + "loss": 7.3454, + "loss/crossentropy": 2.1520806990563868, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.193353005964309, + "step": 18200 + }, + { + "epoch": 0.45525, + "grad_norm": 30.5, + "grad_norm_var": 2.0270182291666665, + "learning_rate": 0.0001, + "loss": 7.2713, + "loss/crossentropy": 1.9926446735858918, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.19921180326491594, + "step": 18210 + }, + { + "epoch": 0.4555, + "grad_norm": 29.125, + "grad_norm_var": 22.040559895833333, + "learning_rate": 0.0001, + "loss": 7.3593, + "loss/crossentropy": 2.2070549339056016, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.19235619865357875, + "step": 18220 + }, + { + "epoch": 0.45575, + "grad_norm": 31.75, + "grad_norm_var": 1.6927083333333333, + "learning_rate": 0.0001, + "loss": 7.3017, + "loss/crossentropy": 2.088302744925022, + "loss/hidden": 3.290234375, + "loss/jsd": 0.0, + "loss/logits": 0.17863646000623704, + "step": 18230 + }, + { + "epoch": 0.456, + "grad_norm": 29.75, + "grad_norm_var": 2.7895182291666667, + "learning_rate": 0.0001, + "loss": 7.1981, + "loss/crossentropy": 2.076305481791496, + "loss/hidden": 3.296484375, + "loss/jsd": 0.0, + "loss/logits": 0.16893827533349395, + "step": 18240 + }, + { + "epoch": 0.45625, + "grad_norm": 31.125, + "grad_norm_var": 42.75390625, + "learning_rate": 0.0001, + "loss": 7.3307, + "loss/crossentropy": 1.8771976202726364, + "loss/hidden": 3.473046875, + "loss/jsd": 0.0, + "loss/logits": 0.18784729558974506, + "step": 18250 + }, + { + "epoch": 0.4565, + "grad_norm": 30.5, + "grad_norm_var": 40.77265625, + "learning_rate": 0.0001, + "loss": 7.2858, + "loss/crossentropy": 2.078951980918646, + "loss/hidden": 3.365625, + "loss/jsd": 0.0, + "loss/logits": 0.19046283271163703, + "step": 18260 + }, + { + "epoch": 0.45675, + "grad_norm": 30.25, + "grad_norm_var": 2.2643229166666665, + "learning_rate": 0.0001, + "loss": 7.296, + "loss/crossentropy": 2.1873993456363676, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.18323179706931114, + "step": 18270 + }, + { + "epoch": 0.457, + "grad_norm": 27.125, + "grad_norm_var": 2.7113932291666667, + "learning_rate": 0.0001, + "loss": 7.2805, + "loss/crossentropy": 2.1417489528656004, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.1756190422922373, + "step": 18280 + }, + { + "epoch": 0.45725, + "grad_norm": 30.25, + "grad_norm_var": 2.3395182291666665, + "learning_rate": 0.0001, + "loss": 7.3185, + "loss/crossentropy": 2.0876102939248087, + "loss/hidden": 3.3265625, + "loss/jsd": 0.0, + "loss/logits": 0.1938194077461958, + "step": 18290 + }, + { + "epoch": 0.4575, + "grad_norm": 31.75, + "grad_norm_var": 3.5551432291666667, + "learning_rate": 0.0001, + "loss": 7.3099, + "loss/crossentropy": 2.088182657957077, + "loss/hidden": 3.28984375, + "loss/jsd": 0.0, + "loss/logits": 0.18158297892659903, + "step": 18300 + }, + { + "epoch": 0.45775, + "grad_norm": 30.0, + "grad_norm_var": 2.09765625, + "learning_rate": 0.0001, + "loss": 7.3893, + "loss/crossentropy": 2.0980015411973, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.19810429476201535, + "step": 18310 + }, + { + "epoch": 0.458, + "grad_norm": 32.5, + "grad_norm_var": 14.725455729166667, + "learning_rate": 0.0001, + "loss": 7.3944, + "loss/crossentropy": 2.164955261349678, + "loss/hidden": 3.330078125, + "loss/jsd": 0.0, + "loss/logits": 0.19350772826001048, + "step": 18320 + }, + { + "epoch": 0.45825, + "grad_norm": 29.75, + "grad_norm_var": 14.030989583333334, + "learning_rate": 0.0001, + "loss": 7.3787, + "loss/crossentropy": 2.040371149778366, + "loss/hidden": 3.290625, + "loss/jsd": 0.0, + "loss/logits": 0.17534797079861164, + "step": 18330 + }, + { + "epoch": 0.4585, + "grad_norm": 30.0, + "grad_norm_var": 2.597916666666667, + "learning_rate": 0.0001, + "loss": 7.2343, + "loss/crossentropy": 1.948985606431961, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.17247573314234615, + "step": 18340 + }, + { + "epoch": 0.45875, + "grad_norm": 33.25, + "grad_norm_var": 4.514322916666667, + "learning_rate": 0.0001, + "loss": 7.2451, + "loss/crossentropy": 2.1069004431366922, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.2032563455402851, + "step": 18350 + }, + { + "epoch": 0.459, + "grad_norm": 29.5, + "grad_norm_var": 3.6384765625, + "learning_rate": 0.0001, + "loss": 7.3308, + "loss/crossentropy": 2.174188384413719, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.19028009474277496, + "step": 18360 + }, + { + "epoch": 0.45925, + "grad_norm": 31.375, + "grad_norm_var": 3.6384765625, + "learning_rate": 0.0001, + "loss": 7.3875, + "loss/crossentropy": 2.07789751291275, + "loss/hidden": 3.314453125, + "loss/jsd": 0.0, + "loss/logits": 0.1836570143699646, + "step": 18370 + }, + { + "epoch": 0.4595, + "grad_norm": 30.5, + "grad_norm_var": 9.654622395833334, + "learning_rate": 0.0001, + "loss": 7.2052, + "loss/crossentropy": 2.0727937802672387, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.18003296088427306, + "step": 18380 + }, + { + "epoch": 0.45975, + "grad_norm": 30.75, + "grad_norm_var": 10.531184895833333, + "learning_rate": 0.0001, + "loss": 7.23, + "loss/crossentropy": 2.0466937959194182, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.17429272253066302, + "step": 18390 + }, + { + "epoch": 0.46, + "grad_norm": 28.5, + "grad_norm_var": 5.357291666666667, + "learning_rate": 0.0001, + "loss": 7.2161, + "loss/crossentropy": 2.007898286730051, + "loss/hidden": 3.2984375, + "loss/jsd": 0.0, + "loss/logits": 0.17548899427056314, + "step": 18400 + }, + { + "epoch": 0.46025, + "grad_norm": 30.5, + "grad_norm_var": 4.42890625, + "learning_rate": 0.0001, + "loss": 7.3552, + "loss/crossentropy": 2.12602576315403, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.18963672500103712, + "step": 18410 + }, + { + "epoch": 0.4605, + "grad_norm": 31.875, + "grad_norm_var": 2.1228515625, + "learning_rate": 0.0001, + "loss": 7.2265, + "loss/crossentropy": 1.946645411849022, + "loss/hidden": 3.30234375, + "loss/jsd": 0.0, + "loss/logits": 0.1663317572325468, + "step": 18420 + }, + { + "epoch": 0.46075, + "grad_norm": 32.0, + "grad_norm_var": 2.312239583333333, + "learning_rate": 0.0001, + "loss": 7.3208, + "loss/crossentropy": 2.0005823224782944, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.1826445495709777, + "step": 18430 + }, + { + "epoch": 0.461, + "grad_norm": 30.375, + "grad_norm_var": 0.9931640625, + "learning_rate": 0.0001, + "loss": 7.3267, + "loss/crossentropy": 2.1357066452503206, + "loss/hidden": 3.40859375, + "loss/jsd": 0.0, + "loss/logits": 0.19712673518806695, + "step": 18440 + }, + { + "epoch": 0.46125, + "grad_norm": 29.5, + "grad_norm_var": 1.9497395833333333, + "learning_rate": 0.0001, + "loss": 7.3522, + "loss/crossentropy": 2.2581495225429533, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.20862563773989679, + "step": 18450 + }, + { + "epoch": 0.4615, + "grad_norm": 31.25, + "grad_norm_var": 2.3629557291666665, + "learning_rate": 0.0001, + "loss": 7.2707, + "loss/crossentropy": 1.8819428831338882, + "loss/hidden": 3.32890625, + "loss/jsd": 0.0, + "loss/logits": 0.17904695197939874, + "step": 18460 + }, + { + "epoch": 0.46175, + "grad_norm": 29.125, + "grad_norm_var": 2.0885416666666665, + "learning_rate": 0.0001, + "loss": 7.441, + "loss/crossentropy": 2.1469796687364577, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.20115426667034625, + "step": 18470 + }, + { + "epoch": 0.462, + "grad_norm": 29.375, + "grad_norm_var": 1.2333333333333334, + "learning_rate": 0.0001, + "loss": 7.337, + "loss/crossentropy": 2.054457852616906, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.18193845450878143, + "step": 18480 + }, + { + "epoch": 0.46225, + "grad_norm": 32.25, + "grad_norm_var": 1.3489583333333333, + "learning_rate": 0.0001, + "loss": 7.2604, + "loss/crossentropy": 2.0947310835123063, + "loss/hidden": 3.3, + "loss/jsd": 0.0, + "loss/logits": 0.18451766278594733, + "step": 18490 + }, + { + "epoch": 0.4625, + "grad_norm": 30.625, + "grad_norm_var": 1.9104166666666667, + "learning_rate": 0.0001, + "loss": 7.3593, + "loss/crossentropy": 2.1241430282592773, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.19629997350275516, + "step": 18500 + }, + { + "epoch": 0.46275, + "grad_norm": 27.75, + "grad_norm_var": 11.044791666666667, + "learning_rate": 0.0001, + "loss": 7.3238, + "loss/crossentropy": 1.99822306483984, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.17915242053568364, + "step": 18510 + }, + { + "epoch": 0.463, + "grad_norm": 28.25, + "grad_norm_var": 21.5275390625, + "learning_rate": 0.0001, + "loss": 7.3394, + "loss/crossentropy": 1.996624768525362, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.1680696103721857, + "step": 18520 + }, + { + "epoch": 0.46325, + "grad_norm": 31.875, + "grad_norm_var": 12.8447265625, + "learning_rate": 0.0001, + "loss": 7.2899, + "loss/crossentropy": 1.9818643540143968, + "loss/hidden": 3.38046875, + "loss/jsd": 0.0, + "loss/logits": 0.1780361386016011, + "step": 18530 + }, + { + "epoch": 0.4635, + "grad_norm": 28.125, + "grad_norm_var": 2.0306640625, + "learning_rate": 0.0001, + "loss": 7.3093, + "loss/crossentropy": 1.976102039217949, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.17688030060380697, + "step": 18540 + }, + { + "epoch": 0.46375, + "grad_norm": 30.875, + "grad_norm_var": 1.6103515625, + "learning_rate": 0.0001, + "loss": 7.4761, + "loss/crossentropy": 2.0188881896436213, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.19428470879793167, + "step": 18550 + }, + { + "epoch": 0.464, + "grad_norm": 28.5, + "grad_norm_var": 2.060416666666667, + "learning_rate": 0.0001, + "loss": 7.2272, + "loss/crossentropy": 1.997609743475914, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.179104876331985, + "step": 18560 + }, + { + "epoch": 0.46425, + "grad_norm": 29.875, + "grad_norm_var": 3.9645182291666665, + "learning_rate": 0.0001, + "loss": 7.4043, + "loss/crossentropy": 2.1501515746116637, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.20029870867729188, + "step": 18570 + }, + { + "epoch": 0.4645, + "grad_norm": 31.125, + "grad_norm_var": 3.39140625, + "learning_rate": 0.0001, + "loss": 7.3202, + "loss/crossentropy": 2.1886572629213332, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.18652867767959833, + "step": 18580 + }, + { + "epoch": 0.46475, + "grad_norm": 31.25, + "grad_norm_var": 2.2582682291666667, + "learning_rate": 0.0001, + "loss": 7.372, + "loss/crossentropy": 2.1719513699412345, + "loss/hidden": 3.3625, + "loss/jsd": 0.0, + "loss/logits": 0.1916284618899226, + "step": 18590 + }, + { + "epoch": 0.465, + "grad_norm": 30.75, + "grad_norm_var": 3.7514973958333333, + "learning_rate": 0.0001, + "loss": 7.2974, + "loss/crossentropy": 2.013065665960312, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.1881342137232423, + "step": 18600 + }, + { + "epoch": 0.46525, + "grad_norm": 29.375, + "grad_norm_var": 2.730208333333333, + "learning_rate": 0.0001, + "loss": 7.3544, + "loss/crossentropy": 2.241141265630722, + "loss/hidden": 3.438671875, + "loss/jsd": 0.0, + "loss/logits": 0.19869245793670415, + "step": 18610 + }, + { + "epoch": 0.4655, + "grad_norm": 28.75, + "grad_norm_var": 2.289322916666667, + "learning_rate": 0.0001, + "loss": 7.2523, + "loss/crossentropy": 2.1553829818964005, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.19479922130703925, + "step": 18620 + }, + { + "epoch": 0.46575, + "grad_norm": 32.75, + "grad_norm_var": 1.896875, + "learning_rate": 0.0001, + "loss": 7.4636, + "loss/crossentropy": 2.1676395773887633, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.19261010140180587, + "step": 18630 + }, + { + "epoch": 0.466, + "grad_norm": 31.5, + "grad_norm_var": 2.2739583333333333, + "learning_rate": 0.0001, + "loss": 7.3126, + "loss/crossentropy": 1.9635831721127033, + "loss/hidden": 3.440234375, + "loss/jsd": 0.0, + "loss/logits": 0.18405662197619677, + "step": 18640 + }, + { + "epoch": 0.46625, + "grad_norm": 30.5, + "grad_norm_var": 2.8603515625, + "learning_rate": 0.0001, + "loss": 7.4266, + "loss/crossentropy": 2.194899543747306, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.1942247728817165, + "step": 18650 + }, + { + "epoch": 0.4665, + "grad_norm": 28.25, + "grad_norm_var": 4.420572916666667, + "learning_rate": 0.0001, + "loss": 7.2987, + "loss/crossentropy": 2.054854319989681, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.17799932742491364, + "step": 18660 + }, + { + "epoch": 0.46675, + "grad_norm": 28.75, + "grad_norm_var": 2.7754557291666666, + "learning_rate": 0.0001, + "loss": 7.3463, + "loss/crossentropy": 2.0972565457224848, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.18171238731592892, + "step": 18670 + }, + { + "epoch": 0.467, + "grad_norm": 30.625, + "grad_norm_var": 2.254684908534436e+18, + "learning_rate": 0.0001, + "loss": 7.4225, + "loss/crossentropy": 2.2388961255550384, + "loss/hidden": 3.400390625, + "loss/jsd": 0.0, + "loss/logits": 0.20014150273054837, + "step": 18680 + }, + { + "epoch": 0.46725, + "grad_norm": 28.375, + "grad_norm_var": 8.96875, + "learning_rate": 0.0001, + "loss": 7.255, + "loss/crossentropy": 1.959997519850731, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.1796635765582323, + "step": 18690 + }, + { + "epoch": 0.4675, + "grad_norm": 30.875, + "grad_norm_var": 3.253125, + "learning_rate": 0.0001, + "loss": 7.4076, + "loss/crossentropy": 1.9209066323935986, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.18982454147189856, + "step": 18700 + }, + { + "epoch": 0.46775, + "grad_norm": 29.875, + "grad_norm_var": 4.195572916666666, + "learning_rate": 0.0001, + "loss": 7.2657, + "loss/crossentropy": 2.0889955282211305, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.17577018877491354, + "step": 18710 + }, + { + "epoch": 0.468, + "grad_norm": 31.125, + "grad_norm_var": 3.7997395833333334, + "learning_rate": 0.0001, + "loss": 7.3168, + "loss/crossentropy": 2.0014477796852588, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.18312612753361462, + "step": 18720 + }, + { + "epoch": 0.46825, + "grad_norm": 31.25, + "grad_norm_var": 3.222916666666667, + "learning_rate": 0.0001, + "loss": 7.4403, + "loss/crossentropy": 2.1561698734760286, + "loss/hidden": 3.262109375, + "loss/jsd": 0.0, + "loss/logits": 0.17687523942440747, + "step": 18730 + }, + { + "epoch": 0.4685, + "grad_norm": 30.5, + "grad_norm_var": 3.4395182291666666, + "learning_rate": 0.0001, + "loss": 7.3355, + "loss/crossentropy": 2.116148530691862, + "loss/hidden": 3.330078125, + "loss/jsd": 0.0, + "loss/logits": 0.20347614474594594, + "step": 18740 + }, + { + "epoch": 0.46875, + "grad_norm": 30.75, + "grad_norm_var": 3.3020833333333335, + "learning_rate": 0.0001, + "loss": 7.3463, + "loss/crossentropy": 2.244252935051918, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.19784990418702364, + "step": 18750 + }, + { + "epoch": 0.469, + "grad_norm": 30.125, + "grad_norm_var": 2.567122131243849e+18, + "learning_rate": 0.0001, + "loss": 7.2032, + "loss/crossentropy": 2.1832313030958175, + "loss/hidden": 3.41171875, + "loss/jsd": 0.0, + "loss/logits": 0.2140472937375307, + "step": 18760 + }, + { + "epoch": 0.46925, + "grad_norm": 28.5, + "grad_norm_var": 2.567122131550942e+18, + "learning_rate": 0.0001, + "loss": 7.336, + "loss/crossentropy": 2.181801524013281, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.18407561387866736, + "step": 18770 + }, + { + "epoch": 0.4695, + "grad_norm": 27.875, + "grad_norm_var": 1.78125, + "learning_rate": 0.0001, + "loss": 7.3641, + "loss/crossentropy": 2.1652345418930055, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.19351193793118, + "step": 18780 + }, + { + "epoch": 0.46975, + "grad_norm": 28.25, + "grad_norm_var": 1.56875, + "learning_rate": 0.0001, + "loss": 7.2025, + "loss/crossentropy": 2.159587188065052, + "loss/hidden": 3.314453125, + "loss/jsd": 0.0, + "loss/logits": 0.18127773366868496, + "step": 18790 + }, + { + "epoch": 0.47, + "grad_norm": 31.625, + "grad_norm_var": 2.247916666666667, + "learning_rate": 0.0001, + "loss": 7.3839, + "loss/crossentropy": 2.0050713911652567, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.1919370399788022, + "step": 18800 + }, + { + "epoch": 0.47025, + "grad_norm": 34.25, + "grad_norm_var": 3.3997395833333335, + "learning_rate": 0.0001, + "loss": 7.34, + "loss/crossentropy": 2.121124693751335, + "loss/hidden": 3.3640625, + "loss/jsd": 0.0, + "loss/logits": 0.18033649511635302, + "step": 18810 + }, + { + "epoch": 0.4705, + "grad_norm": 30.875, + "grad_norm_var": 3.4650390625, + "learning_rate": 0.0001, + "loss": 7.1365, + "loss/crossentropy": 1.9974480658769607, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.17458155732601882, + "step": 18820 + }, + { + "epoch": 0.47075, + "grad_norm": 27.875, + "grad_norm_var": 2.5473307291666667, + "learning_rate": 0.0001, + "loss": 7.3431, + "loss/crossentropy": 1.9829891242086888, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.18825765140354633, + "step": 18830 + }, + { + "epoch": 0.471, + "grad_norm": 30.875, + "grad_norm_var": 3.7457682291666665, + "learning_rate": 0.0001, + "loss": 7.3446, + "loss/crossentropy": 2.1416624397039414, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.19354026410728692, + "step": 18840 + }, + { + "epoch": 0.47125, + "grad_norm": 30.5, + "grad_norm_var": 11.280989583333334, + "learning_rate": 0.0001, + "loss": 7.3904, + "loss/crossentropy": 2.072100041806698, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.1879756365902722, + "step": 18850 + }, + { + "epoch": 0.4715, + "grad_norm": 28.5, + "grad_norm_var": 10.832291666666666, + "learning_rate": 0.0001, + "loss": 7.4158, + "loss/crossentropy": 2.196484336256981, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.19391732811927795, + "step": 18860 + }, + { + "epoch": 0.47175, + "grad_norm": 29.0, + "grad_norm_var": 4.468489583333334, + "learning_rate": 0.0001, + "loss": 7.21, + "loss/crossentropy": 2.0025278739631176, + "loss/hidden": 3.26015625, + "loss/jsd": 0.0, + "loss/logits": 0.1782412089407444, + "step": 18870 + }, + { + "epoch": 0.472, + "grad_norm": 29.25, + "grad_norm_var": 2.85, + "learning_rate": 0.0001, + "loss": 7.2742, + "loss/crossentropy": 1.8140784852206706, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.1594803395681083, + "step": 18880 + }, + { + "epoch": 0.47225, + "grad_norm": 31.0, + "grad_norm_var": 3.594153947490541e+18, + "learning_rate": 0.0001, + "loss": 7.4099, + "loss/crossentropy": 2.0685198314487936, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.17976643573492765, + "step": 18890 + }, + { + "epoch": 0.4725, + "grad_norm": 27.5, + "grad_norm_var": 3.5941539466769157e+18, + "learning_rate": 0.0001, + "loss": 7.3278, + "loss/crossentropy": 1.995526134967804, + "loss/hidden": 3.41484375, + "loss/jsd": 0.0, + "loss/logits": 0.20469516087323428, + "step": 18900 + }, + { + "epoch": 0.47275, + "grad_norm": 28.5, + "grad_norm_var": 3.9176432291666665, + "learning_rate": 0.0001, + "loss": 7.2844, + "loss/crossentropy": 2.081358040869236, + "loss/hidden": 3.293359375, + "loss/jsd": 0.0, + "loss/logits": 0.1750167841091752, + "step": 18910 + }, + { + "epoch": 0.473, + "grad_norm": 28.25, + "grad_norm_var": 1.0518229166666666, + "learning_rate": 0.0001, + "loss": 7.2403, + "loss/crossentropy": 2.08730803206563, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.1881487349048257, + "step": 18920 + }, + { + "epoch": 0.47325, + "grad_norm": 31.125, + "grad_norm_var": 2.4871128695672494e+18, + "learning_rate": 0.0001, + "loss": 7.3072, + "loss/crossentropy": 2.050686779618263, + "loss/hidden": 3.263671875, + "loss/jsd": 0.0, + "loss/logits": 0.18110219333320857, + "step": 18930 + }, + { + "epoch": 0.4735, + "grad_norm": 31.125, + "grad_norm_var": 2.48711286963296e+18, + "learning_rate": 0.0001, + "loss": 7.3354, + "loss/crossentropy": 2.116658928990364, + "loss/hidden": 3.318359375, + "loss/jsd": 0.0, + "loss/logits": 0.18662892282009125, + "step": 18940 + }, + { + "epoch": 0.47375, + "grad_norm": 30.5, + "grad_norm_var": 3.417122395833333, + "learning_rate": 0.0001, + "loss": 7.2869, + "loss/crossentropy": 2.181796707212925, + "loss/hidden": 3.31171875, + "loss/jsd": 0.0, + "loss/logits": 0.1841755596920848, + "step": 18950 + }, + { + "epoch": 0.474, + "grad_norm": 31.375, + "grad_norm_var": 4.01640625, + "learning_rate": 0.0001, + "loss": 7.238, + "loss/crossentropy": 2.096955654025078, + "loss/hidden": 3.31015625, + "loss/jsd": 0.0, + "loss/logits": 0.1859470222145319, + "step": 18960 + }, + { + "epoch": 0.47425, + "grad_norm": 29.625, + "grad_norm_var": 2.2020833333333334, + "learning_rate": 0.0001, + "loss": 7.2649, + "loss/crossentropy": 2.1101110115647317, + "loss/hidden": 3.2515625, + "loss/jsd": 0.0, + "loss/logits": 0.18113658875226973, + "step": 18970 + }, + { + "epoch": 0.4745, + "grad_norm": 31.5, + "grad_norm_var": 1.6952473958333334, + "learning_rate": 0.0001, + "loss": 7.218, + "loss/crossentropy": 1.9812964178621768, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.17663604663684965, + "step": 18980 + }, + { + "epoch": 0.47475, + "grad_norm": 31.875, + "grad_norm_var": 1.8926432291666666, + "learning_rate": 0.0001, + "loss": 7.464, + "loss/crossentropy": 2.2813973009586332, + "loss/hidden": 3.28203125, + "loss/jsd": 0.0, + "loss/logits": 0.19019655492156745, + "step": 18990 + }, + { + "epoch": 0.475, + "grad_norm": 32.25, + "grad_norm_var": 3.423958333333333, + "learning_rate": 0.0001, + "loss": 7.427, + "loss/crossentropy": 2.1718254670500756, + "loss/hidden": 3.2765625, + "loss/jsd": 0.0, + "loss/logits": 0.18812184669077398, + "step": 19000 + }, + { + "epoch": 0.47525, + "grad_norm": 31.0, + "grad_norm_var": 2.986168000216896e+18, + "learning_rate": 0.0001, + "loss": 7.4384, + "loss/crossentropy": 1.9550217375159264, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.17679040897637605, + "step": 19010 + }, + { + "epoch": 0.4755, + "grad_norm": 28.875, + "grad_norm_var": 2.9861680010449213e+18, + "learning_rate": 0.0001, + "loss": 7.3947, + "loss/crossentropy": 2.001023256778717, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.18076751604676247, + "step": 19020 + }, + { + "epoch": 0.47575, + "grad_norm": 27.375, + "grad_norm_var": 1.3268229166666667, + "learning_rate": 0.0001, + "loss": 7.2484, + "loss/crossentropy": 2.0497733250260355, + "loss/hidden": 3.19453125, + "loss/jsd": 0.0, + "loss/logits": 0.1572325760498643, + "step": 19030 + }, + { + "epoch": 0.476, + "grad_norm": 30.625, + "grad_norm_var": 1.796875, + "learning_rate": 0.0001, + "loss": 7.4071, + "loss/crossentropy": 2.1493683993816375, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.19892700631171464, + "step": 19040 + }, + { + "epoch": 0.47625, + "grad_norm": 35.0, + "grad_norm_var": 3.356705729166667, + "learning_rate": 0.0001, + "loss": 7.2913, + "loss/crossentropy": 2.0519852221012114, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.1828221082687378, + "step": 19050 + }, + { + "epoch": 0.4765, + "grad_norm": 36.0, + "grad_norm_var": 3.5624176427021107e+18, + "learning_rate": 0.0001, + "loss": 7.3334, + "loss/crossentropy": 2.201715832948685, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.20039877742528917, + "step": 19060 + }, + { + "epoch": 0.47675, + "grad_norm": 30.5, + "grad_norm_var": 3.562417643000955e+18, + "learning_rate": 0.0001, + "loss": 7.3979, + "loss/crossentropy": 2.0442496329545974, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.1853096466511488, + "step": 19070 + }, + { + "epoch": 0.477, + "grad_norm": 30.0, + "grad_norm_var": 1.5322265625, + "learning_rate": 0.0001, + "loss": 7.2213, + "loss/crossentropy": 2.0714069202542307, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.1765752835199237, + "step": 19080 + }, + { + "epoch": 0.47725, + "grad_norm": 28.0, + "grad_norm_var": 2.2462890625, + "learning_rate": 0.0001, + "loss": 7.2175, + "loss/crossentropy": 1.784161239117384, + "loss/hidden": 3.387890625, + "loss/jsd": 0.0, + "loss/logits": 0.19739716919139028, + "step": 19090 + }, + { + "epoch": 0.4775, + "grad_norm": 30.375, + "grad_norm_var": 3.033072916666667, + "learning_rate": 0.0001, + "loss": 7.258, + "loss/crossentropy": 2.0708198331296446, + "loss/hidden": 3.43515625, + "loss/jsd": 0.0, + "loss/logits": 0.18721713982522487, + "step": 19100 + }, + { + "epoch": 0.47775, + "grad_norm": 30.875, + "grad_norm_var": 3.3442041695808343e+18, + "learning_rate": 0.0001, + "loss": 7.363, + "loss/crossentropy": 2.105399575829506, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.19771641138941048, + "step": 19110 + }, + { + "epoch": 0.478, + "grad_norm": 31.25, + "grad_norm_var": 6.77890625, + "learning_rate": 0.0001, + "loss": 7.3457, + "loss/crossentropy": 2.0999079287052154, + "loss/hidden": 3.265234375, + "loss/jsd": 0.0, + "loss/logits": 0.1782015733420849, + "step": 19120 + }, + { + "epoch": 0.47825, + "grad_norm": 29.375, + "grad_norm_var": 2.011458333333333, + "learning_rate": 0.0001, + "loss": 7.2927, + "loss/crossentropy": 2.060526109486818, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.19682135116308927, + "step": 19130 + }, + { + "epoch": 0.4785, + "grad_norm": 31.375, + "grad_norm_var": 2.3447265625, + "learning_rate": 0.0001, + "loss": 7.1563, + "loss/crossentropy": 1.9666556693613528, + "loss/hidden": 3.39375, + "loss/jsd": 0.0, + "loss/logits": 0.18265628404915332, + "step": 19140 + }, + { + "epoch": 0.47875, + "grad_norm": 29.0, + "grad_norm_var": 1.0858723958333334, + "learning_rate": 0.0001, + "loss": 7.3782, + "loss/crossentropy": 2.233207347989082, + "loss/hidden": 3.305859375, + "loss/jsd": 0.0, + "loss/logits": 0.17987454514950513, + "step": 19150 + }, + { + "epoch": 0.479, + "grad_norm": 30.75, + "grad_norm_var": 32.65807291666667, + "learning_rate": 0.0001, + "loss": 7.2859, + "loss/crossentropy": 2.304356482625008, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.19401859547942876, + "step": 19160 + }, + { + "epoch": 0.47925, + "grad_norm": 26.75, + "grad_norm_var": 35.8134765625, + "learning_rate": 0.0001, + "loss": 7.1512, + "loss/crossentropy": 2.187753638625145, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.19042326286435127, + "step": 19170 + }, + { + "epoch": 0.4795, + "grad_norm": 27.875, + "grad_norm_var": 3.17890625, + "learning_rate": 0.0001, + "loss": 7.3497, + "loss/crossentropy": 2.25252815335989, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.1958579957485199, + "step": 19180 + }, + { + "epoch": 0.47975, + "grad_norm": 33.25, + "grad_norm_var": 3.4058471885172465e+18, + "learning_rate": 0.0001, + "loss": 7.2983, + "loss/crossentropy": 1.9786495432257651, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.19189926832914353, + "step": 19190 + }, + { + "epoch": 0.48, + "grad_norm": 30.75, + "grad_norm_var": 3.405847187102368e+18, + "learning_rate": 0.0001, + "loss": 7.2884, + "loss/crossentropy": 2.13499116525054, + "loss/hidden": 3.2828125, + "loss/jsd": 0.0, + "loss/logits": 0.18915282813832163, + "step": 19200 + }, + { + "epoch": 0.48025, + "grad_norm": 29.375, + "grad_norm_var": 7.429622395833333, + "learning_rate": 0.0001, + "loss": 7.3051, + "loss/crossentropy": 2.1838356882333754, + "loss/hidden": 3.423828125, + "loss/jsd": 0.0, + "loss/logits": 0.19484667405486106, + "step": 19210 + }, + { + "epoch": 0.4805, + "grad_norm": 35.0, + "grad_norm_var": 5.877018229166667, + "learning_rate": 0.0001, + "loss": 7.2957, + "loss/crossentropy": 2.1596252396702766, + "loss/hidden": 3.422265625, + "loss/jsd": 0.0, + "loss/logits": 0.19484965018928052, + "step": 19220 + }, + { + "epoch": 0.48075, + "grad_norm": 32.25, + "grad_norm_var": 6.5744140625, + "learning_rate": 0.0001, + "loss": 7.3458, + "loss/crossentropy": 1.9286774158477784, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.20619444027543068, + "step": 19230 + }, + { + "epoch": 0.481, + "grad_norm": 30.75, + "grad_norm_var": 2.6780598958333335, + "learning_rate": 0.0001, + "loss": 7.3249, + "loss/crossentropy": 2.0902815729379656, + "loss/hidden": 3.285546875, + "loss/jsd": 0.0, + "loss/logits": 0.17610749285668134, + "step": 19240 + }, + { + "epoch": 0.48125, + "grad_norm": 31.625, + "grad_norm_var": 3.191666666666667, + "learning_rate": 0.0001, + "loss": 7.3046, + "loss/crossentropy": 2.117246875911951, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.1708180472254753, + "step": 19250 + }, + { + "epoch": 0.4815, + "grad_norm": 29.875, + "grad_norm_var": 3.9801432291666665, + "learning_rate": 0.0001, + "loss": 7.285, + "loss/crossentropy": 1.9743897125124932, + "loss/hidden": 3.475, + "loss/jsd": 0.0, + "loss/logits": 0.18220343571156264, + "step": 19260 + }, + { + "epoch": 0.48175, + "grad_norm": 29.5, + "grad_norm_var": 1.52265625, + "learning_rate": 0.0001, + "loss": 7.416, + "loss/crossentropy": 2.128947339951992, + "loss/hidden": 3.32890625, + "loss/jsd": 0.0, + "loss/logits": 0.18609113954007625, + "step": 19270 + }, + { + "epoch": 0.482, + "grad_norm": 32.25, + "grad_norm_var": 2.0311848958333334, + "learning_rate": 0.0001, + "loss": 7.4237, + "loss/crossentropy": 1.9989797204732895, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.19045947063714266, + "step": 19280 + }, + { + "epoch": 0.48225, + "grad_norm": 30.25, + "grad_norm_var": 2.456184895833333, + "learning_rate": 0.0001, + "loss": 7.3041, + "loss/crossentropy": 2.0317627035081385, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.18456566017121076, + "step": 19290 + }, + { + "epoch": 0.4825, + "grad_norm": 29.875, + "grad_norm_var": 1.0416015625, + "learning_rate": 0.0001, + "loss": 7.2606, + "loss/crossentropy": 1.994530802220106, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.18545446768403054, + "step": 19300 + }, + { + "epoch": 0.48275, + "grad_norm": 30.75, + "grad_norm_var": 8.439518229166667, + "learning_rate": 0.0001, + "loss": 7.352, + "loss/crossentropy": 2.023553788661957, + "loss/hidden": 3.30703125, + "loss/jsd": 0.0, + "loss/logits": 0.17272286228835582, + "step": 19310 + }, + { + "epoch": 0.483, + "grad_norm": 27.625, + "grad_norm_var": 8.5462890625, + "learning_rate": 0.0001, + "loss": 7.3492, + "loss/crossentropy": 2.0078469097614287, + "loss/hidden": 3.45078125, + "loss/jsd": 0.0, + "loss/logits": 0.1847759123891592, + "step": 19320 + }, + { + "epoch": 0.48325, + "grad_norm": 31.5, + "grad_norm_var": 2.8020833333333335, + "learning_rate": 0.0001, + "loss": 7.2202, + "loss/crossentropy": 2.084314952790737, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.17277139481157064, + "step": 19330 + }, + { + "epoch": 0.4835, + "grad_norm": 27.875, + "grad_norm_var": 2.183333333333333, + "learning_rate": 0.0001, + "loss": 7.3691, + "loss/crossentropy": 2.0023327678442002, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.17878497913479804, + "step": 19340 + }, + { + "epoch": 0.48375, + "grad_norm": 29.375, + "grad_norm_var": 1.9556640625, + "learning_rate": 0.0001, + "loss": 7.2622, + "loss/crossentropy": 2.160814478993416, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.19914763923734427, + "step": 19350 + }, + { + "epoch": 0.484, + "grad_norm": 30.875, + "grad_norm_var": 1.22265625, + "learning_rate": 0.0001, + "loss": 7.4019, + "loss/crossentropy": 2.0562733739614485, + "loss/hidden": 3.521484375, + "loss/jsd": 0.0, + "loss/logits": 0.231661388091743, + "step": 19360 + }, + { + "epoch": 0.48425, + "grad_norm": 32.5, + "grad_norm_var": 5.6650390625, + "learning_rate": 0.0001, + "loss": 7.3196, + "loss/crossentropy": 2.1037007592618466, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.20012609604746104, + "step": 19370 + }, + { + "epoch": 0.4845, + "grad_norm": 30.25, + "grad_norm_var": 5.585872395833333, + "learning_rate": 0.0001, + "loss": 7.2967, + "loss/crossentropy": 2.0158598870038986, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.18958695326000452, + "step": 19380 + }, + { + "epoch": 0.48475, + "grad_norm": 30.0, + "grad_norm_var": 2.6494140625, + "learning_rate": 0.0001, + "loss": 7.3039, + "loss/crossentropy": 2.155204123258591, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.2042725382372737, + "step": 19390 + }, + { + "epoch": 0.485, + "grad_norm": 29.375, + "grad_norm_var": 2.001822916666667, + "learning_rate": 0.0001, + "loss": 7.3471, + "loss/crossentropy": 2.130756896734238, + "loss/hidden": 3.48984375, + "loss/jsd": 0.0, + "loss/logits": 0.21289011891931295, + "step": 19400 + }, + { + "epoch": 0.48525, + "grad_norm": 29.875, + "grad_norm_var": 1.2583333333333333, + "learning_rate": 0.0001, + "loss": 7.3385, + "loss/crossentropy": 1.9661409441381692, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.1758869204670191, + "step": 19410 + }, + { + "epoch": 0.4855, + "grad_norm": 31.25, + "grad_norm_var": 1.2416015625, + "learning_rate": 0.0001, + "loss": 7.3636, + "loss/crossentropy": 2.088372728228569, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.1817894596606493, + "step": 19420 + }, + { + "epoch": 0.48575, + "grad_norm": 31.875, + "grad_norm_var": 1.9983723958333333, + "learning_rate": 0.0001, + "loss": 7.2972, + "loss/crossentropy": 2.288235005736351, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.20123199094086885, + "step": 19430 + }, + { + "epoch": 0.486, + "grad_norm": 30.25, + "grad_norm_var": 0.8858723958333333, + "learning_rate": 0.0001, + "loss": 7.3093, + "loss/crossentropy": 2.1147582471370696, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.1947942255064845, + "step": 19440 + }, + { + "epoch": 0.48625, + "grad_norm": 34.0, + "grad_norm_var": 2.8509765625, + "learning_rate": 0.0001, + "loss": 7.3116, + "loss/crossentropy": 2.089379907399416, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.19097770117223262, + "step": 19450 + }, + { + "epoch": 0.4865, + "grad_norm": 29.25, + "grad_norm_var": 2.7372395833333334, + "learning_rate": 0.0001, + "loss": 7.3315, + "loss/crossentropy": 2.16123593300581, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.19270185381174088, + "step": 19460 + }, + { + "epoch": 0.48675, + "grad_norm": 31.0, + "grad_norm_var": 2.511458333333333, + "learning_rate": 0.0001, + "loss": 7.3166, + "loss/crossentropy": 1.9626320779323578, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.1842844920232892, + "step": 19470 + }, + { + "epoch": 0.487, + "grad_norm": 28.625, + "grad_norm_var": 2.5254557291666666, + "learning_rate": 0.0001, + "loss": 7.3752, + "loss/crossentropy": 2.0067977510392665, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.1906943406909704, + "step": 19480 + }, + { + "epoch": 0.48725, + "grad_norm": 30.25, + "grad_norm_var": 5.573372395833333, + "learning_rate": 0.0001, + "loss": 7.3081, + "loss/crossentropy": 1.9796999536454678, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.17342409659177066, + "step": 19490 + }, + { + "epoch": 0.4875, + "grad_norm": 31.25, + "grad_norm_var": 6.215559895833334, + "learning_rate": 0.0001, + "loss": 7.2416, + "loss/crossentropy": 2.0168533116579055, + "loss/hidden": 3.288671875, + "loss/jsd": 0.0, + "loss/logits": 0.17710478082299233, + "step": 19500 + }, + { + "epoch": 0.48775, + "grad_norm": 28.375, + "grad_norm_var": 5.944791666666666, + "learning_rate": 0.0001, + "loss": 7.2389, + "loss/crossentropy": 2.0261049672961233, + "loss/hidden": 3.309765625, + "loss/jsd": 0.0, + "loss/logits": 0.17297806199640037, + "step": 19510 + }, + { + "epoch": 0.488, + "grad_norm": 29.75, + "grad_norm_var": 2.620247395833333, + "learning_rate": 0.0001, + "loss": 7.3401, + "loss/crossentropy": 2.156434553861618, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.19226507674902676, + "step": 19520 + }, + { + "epoch": 0.48825, + "grad_norm": 31.625, + "grad_norm_var": 2.568489583333333, + "learning_rate": 0.0001, + "loss": 7.2962, + "loss/crossentropy": 2.0590639621019364, + "loss/hidden": 3.277734375, + "loss/jsd": 0.0, + "loss/logits": 0.17470778487622737, + "step": 19530 + }, + { + "epoch": 0.4885, + "grad_norm": 30.375, + "grad_norm_var": 1.6957682291666667, + "learning_rate": 0.0001, + "loss": 7.2644, + "loss/crossentropy": 2.2576445043087006, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.17541414052248, + "step": 19540 + }, + { + "epoch": 0.48875, + "grad_norm": 33.25, + "grad_norm_var": 5.232747395833333, + "learning_rate": 0.0001, + "loss": 7.2863, + "loss/crossentropy": 1.8561327636241913, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.17461737468838692, + "step": 19550 + }, + { + "epoch": 0.489, + "grad_norm": 30.25, + "grad_norm_var": 9.878125, + "learning_rate": 0.0001, + "loss": 7.2798, + "loss/crossentropy": 1.989323940873146, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.19638592712581157, + "step": 19560 + }, + { + "epoch": 0.48925, + "grad_norm": 32.5, + "grad_norm_var": 13.573893229166666, + "learning_rate": 0.0001, + "loss": 7.4564, + "loss/crossentropy": 2.173425105214119, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.19522506278008223, + "step": 19570 + }, + { + "epoch": 0.4895, + "grad_norm": 32.25, + "grad_norm_var": 13.3087890625, + "learning_rate": 0.0001, + "loss": 7.4398, + "loss/crossentropy": 2.1963840425014496, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.20581881813704966, + "step": 19580 + }, + { + "epoch": 0.48975, + "grad_norm": 31.375, + "grad_norm_var": 14.501497395833333, + "learning_rate": 0.0001, + "loss": 7.288, + "loss/crossentropy": 2.2206703200936317, + "loss/hidden": 3.310546875, + "loss/jsd": 0.0, + "loss/logits": 0.1854398850351572, + "step": 19590 + }, + { + "epoch": 0.49, + "grad_norm": 29.125, + "grad_norm_var": 6.174934895833333, + "learning_rate": 0.0001, + "loss": 7.3071, + "loss/crossentropy": 2.182942679524422, + "loss/hidden": 3.448046875, + "loss/jsd": 0.0, + "loss/logits": 0.20937685370445253, + "step": 19600 + }, + { + "epoch": 0.49025, + "grad_norm": 30.625, + "grad_norm_var": 5.620768229166667, + "learning_rate": 0.0001, + "loss": 7.2791, + "loss/crossentropy": 1.968775314092636, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.18280262742191553, + "step": 19610 + }, + { + "epoch": 0.4905, + "grad_norm": 31.0, + "grad_norm_var": 4.764322916666667, + "learning_rate": 0.0001, + "loss": 7.2851, + "loss/crossentropy": 2.1555724523961546, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.1888716220855713, + "step": 19620 + }, + { + "epoch": 0.49075, + "grad_norm": 36.5, + "grad_norm_var": 7.4150390625, + "learning_rate": 0.0001, + "loss": 7.2932, + "loss/crossentropy": 2.2111911468207834, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.18723924830555916, + "step": 19630 + }, + { + "epoch": 0.491, + "grad_norm": 31.25, + "grad_norm_var": 6.720572916666667, + "learning_rate": 0.0001, + "loss": 7.2567, + "loss/crossentropy": 1.9819117814302445, + "loss/hidden": 3.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.17056979294866323, + "step": 19640 + }, + { + "epoch": 0.49125, + "grad_norm": 28.0, + "grad_norm_var": 3.317643229166667, + "learning_rate": 0.0001, + "loss": 7.2356, + "loss/crossentropy": 2.142751504480839, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.17397481221705674, + "step": 19650 + }, + { + "epoch": 0.4915, + "grad_norm": 31.125, + "grad_norm_var": 5.001041666666667, + "learning_rate": 0.0001, + "loss": 7.3434, + "loss/crossentropy": 2.073681902885437, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.2028063558973372, + "step": 19660 + }, + { + "epoch": 0.49175, + "grad_norm": 29.375, + "grad_norm_var": 4.9697265625, + "learning_rate": 0.0001, + "loss": 7.2295, + "loss/crossentropy": 2.070267468690872, + "loss/hidden": 3.421484375, + "loss/jsd": 0.0, + "loss/logits": 0.18656076975166797, + "step": 19670 + }, + { + "epoch": 0.492, + "grad_norm": 32.25, + "grad_norm_var": 6.004166666666666, + "learning_rate": 0.0001, + "loss": 7.3869, + "loss/crossentropy": 2.0801706589758395, + "loss/hidden": 3.477734375, + "loss/jsd": 0.0, + "loss/logits": 0.2120389549061656, + "step": 19680 + }, + { + "epoch": 0.49225, + "grad_norm": 35.0, + "grad_norm_var": 4.446809895833334, + "learning_rate": 0.0001, + "loss": 7.4227, + "loss/crossentropy": 2.1620673507452013, + "loss/hidden": 3.35078125, + "loss/jsd": 0.0, + "loss/logits": 0.197611235268414, + "step": 19690 + }, + { + "epoch": 0.4925, + "grad_norm": 31.875, + "grad_norm_var": 3.01640625, + "learning_rate": 0.0001, + "loss": 7.3952, + "loss/crossentropy": 2.102504736185074, + "loss/hidden": 3.260546875, + "loss/jsd": 0.0, + "loss/logits": 0.18240329679101705, + "step": 19700 + }, + { + "epoch": 0.49275, + "grad_norm": 30.375, + "grad_norm_var": 3.0444333199307136e+18, + "learning_rate": 0.0001, + "loss": 7.2426, + "loss/crossentropy": 2.2651669278740885, + "loss/hidden": 3.295703125, + "loss/jsd": 0.0, + "loss/logits": 0.18710593543946744, + "step": 19710 + }, + { + "epoch": 0.493, + "grad_norm": 30.875, + "grad_norm_var": 3.044433320824939e+18, + "learning_rate": 0.0001, + "loss": 7.2411, + "loss/crossentropy": 2.185315527021885, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.1886447848752141, + "step": 19720 + }, + { + "epoch": 0.49325, + "grad_norm": 33.25, + "grad_norm_var": 2.78125, + "learning_rate": 0.0001, + "loss": 7.3084, + "loss/crossentropy": 2.066023611277342, + "loss/hidden": 3.35703125, + "loss/jsd": 0.0, + "loss/logits": 0.1871557403355837, + "step": 19730 + }, + { + "epoch": 0.4935, + "grad_norm": 31.375, + "grad_norm_var": 3.215625, + "learning_rate": 0.0001, + "loss": 7.2056, + "loss/crossentropy": 2.2202082961797713, + "loss/hidden": 3.274609375, + "loss/jsd": 0.0, + "loss/logits": 0.17992934715002776, + "step": 19740 + }, + { + "epoch": 0.49375, + "grad_norm": 29.25, + "grad_norm_var": 3.4452473958333334, + "learning_rate": 0.0001, + "loss": 7.3547, + "loss/crossentropy": 2.0926157012581825, + "loss/hidden": 3.322265625, + "loss/jsd": 0.0, + "loss/logits": 0.19089100752025842, + "step": 19750 + }, + { + "epoch": 0.494, + "grad_norm": 32.5, + "grad_norm_var": 2.0973307291666665, + "learning_rate": 0.0001, + "loss": 7.3392, + "loss/crossentropy": 2.0247383177280427, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.19218139182776212, + "step": 19760 + }, + { + "epoch": 0.49425, + "grad_norm": 6677331968.0, + "grad_norm_var": 2.786672612235455e+18, + "learning_rate": 0.0001, + "loss": 7.3587, + "loss/crossentropy": 1.9743504285812379, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.1963216718286276, + "step": 19770 + }, + { + "epoch": 0.4945, + "grad_norm": 31.25, + "grad_norm_var": 2.7866726115468554e+18, + "learning_rate": 0.0001, + "loss": 7.3528, + "loss/crossentropy": 2.065458830446005, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.1823502728715539, + "step": 19780 + }, + { + "epoch": 0.49475, + "grad_norm": 30.25, + "grad_norm_var": 2.9124348958333335, + "learning_rate": 0.0001, + "loss": 7.3247, + "loss/crossentropy": 2.108359482139349, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.2117020718753338, + "step": 19790 + }, + { + "epoch": 0.495, + "grad_norm": 28.375, + "grad_norm_var": 3.7059895833333334, + "learning_rate": 0.0001, + "loss": 7.3429, + "loss/crossentropy": 2.245875895023346, + "loss/hidden": 3.28046875, + "loss/jsd": 0.0, + "loss/logits": 0.1797666964121163, + "step": 19800 + }, + { + "epoch": 0.49525, + "grad_norm": 33.0, + "grad_norm_var": 2.463997395833333, + "learning_rate": 0.0001, + "loss": 7.3058, + "loss/crossentropy": 2.031013163924217, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.17521858643740415, + "step": 19810 + }, + { + "epoch": 0.4955, + "grad_norm": 29.625, + "grad_norm_var": 2.26875, + "learning_rate": 0.0001, + "loss": 7.2214, + "loss/crossentropy": 2.048213458806276, + "loss/hidden": 3.441796875, + "loss/jsd": 0.0, + "loss/logits": 0.1798891793936491, + "step": 19820 + }, + { + "epoch": 0.49575, + "grad_norm": 30.5, + "grad_norm_var": 18.353059895833333, + "learning_rate": 0.0001, + "loss": 7.3332, + "loss/crossentropy": 2.1515459649264814, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.1860925205051899, + "step": 19830 + }, + { + "epoch": 0.496, + "grad_norm": 29.125, + "grad_norm_var": 19.2275390625, + "learning_rate": 0.0001, + "loss": 7.3004, + "loss/crossentropy": 2.1721823632717134, + "loss/hidden": 3.360546875, + "loss/jsd": 0.0, + "loss/logits": 0.19930550809949638, + "step": 19840 + }, + { + "epoch": 0.49625, + "grad_norm": 30.875, + "grad_norm_var": 2.5072265625, + "learning_rate": 0.0001, + "loss": 7.2698, + "loss/crossentropy": 1.9935913749039174, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.17680493406951428, + "step": 19850 + }, + { + "epoch": 0.4965, + "grad_norm": 6945767424.0, + "grad_norm_var": 3.015230293460858e+18, + "learning_rate": 0.0001, + "loss": 7.2426, + "loss/crossentropy": 2.069664215296507, + "loss/hidden": 3.569921875, + "loss/jsd": 0.0, + "loss/logits": 0.18678983384743333, + "step": 19860 + }, + { + "epoch": 0.49675, + "grad_norm": 32.5, + "grad_norm_var": 3.015230291984882e+18, + "learning_rate": 0.0001, + "loss": 7.3594, + "loss/crossentropy": 2.0391472943127154, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.18784078639000654, + "step": 19870 + }, + { + "epoch": 0.497, + "grad_norm": 29.875, + "grad_norm_var": 2.3822916666666667, + "learning_rate": 0.0001, + "loss": 7.2527, + "loss/crossentropy": 2.046357312053442, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.17729539722204207, + "step": 19880 + }, + { + "epoch": 0.49725, + "grad_norm": 30.5, + "grad_norm_var": 1.7872395833333334, + "learning_rate": 0.0001, + "loss": 7.307, + "loss/crossentropy": 2.021935646235943, + "loss/hidden": 3.46015625, + "loss/jsd": 0.0, + "loss/logits": 0.20562903992831708, + "step": 19890 + }, + { + "epoch": 0.4975, + "grad_norm": 31.5, + "grad_norm_var": 2.1747395833333334, + "learning_rate": 0.0001, + "loss": 7.3574, + "loss/crossentropy": 2.0218880496919156, + "loss/hidden": 3.408203125, + "loss/jsd": 0.0, + "loss/logits": 0.1812274256721139, + "step": 19900 + }, + { + "epoch": 0.49775, + "grad_norm": 42.5, + "grad_norm_var": 11.66640625, + "learning_rate": 0.0001, + "loss": 7.2552, + "loss/crossentropy": 2.1163423866033555, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.18092286475002767, + "step": 19910 + }, + { + "epoch": 0.498, + "grad_norm": 31.0, + "grad_norm_var": 13.023893229166667, + "learning_rate": 0.0001, + "loss": 7.4151, + "loss/crossentropy": 2.25559261739254, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.19431090019643307, + "step": 19920 + }, + { + "epoch": 0.49825, + "grad_norm": 34.0, + "grad_norm_var": 9.103059895833333, + "learning_rate": 0.0001, + "loss": 7.33, + "loss/crossentropy": 2.1288936853408815, + "loss/hidden": 3.2796875, + "loss/jsd": 0.0, + "loss/logits": 0.185909984074533, + "step": 19930 + }, + { + "epoch": 0.4985, + "grad_norm": 30.625, + "grad_norm_var": 2.645768229166667, + "learning_rate": 0.0001, + "loss": 7.3463, + "loss/crossentropy": 2.1082677975296975, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.18483771663159132, + "step": 19940 + }, + { + "epoch": 0.49875, + "grad_norm": 29.625, + "grad_norm_var": 1.3114583333333334, + "learning_rate": 0.0001, + "loss": 7.3146, + "loss/crossentropy": 2.047964480519295, + "loss/hidden": 3.337109375, + "loss/jsd": 0.0, + "loss/logits": 0.18684354815632104, + "step": 19950 + }, + { + "epoch": 0.499, + "grad_norm": 30.25, + "grad_norm_var": 7.695833333333334, + "learning_rate": 0.0001, + "loss": 7.1954, + "loss/crossentropy": 2.0788462795317173, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.18402842283248902, + "step": 19960 + }, + { + "epoch": 0.49925, + "grad_norm": 31.375, + "grad_norm_var": 7.2056640625, + "learning_rate": 0.0001, + "loss": 7.3153, + "loss/crossentropy": 1.910679142177105, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.16827462129294873, + "step": 19970 + }, + { + "epoch": 0.4995, + "grad_norm": 31.5, + "grad_norm_var": 2.7434895833333335, + "learning_rate": 0.0001, + "loss": 7.2924, + "loss/crossentropy": 2.133229525387287, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.18238822668790816, + "step": 19980 + }, + { + "epoch": 0.49975, + "grad_norm": 32.0, + "grad_norm_var": 2.7212890625, + "learning_rate": 0.0001, + "loss": 7.3936, + "loss/crossentropy": 2.1271730199456216, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.19191278666257858, + "step": 19990 + }, + { + "epoch": 0.5, + "grad_norm": 31.125, + "grad_norm_var": 3.5551432291666667, + "learning_rate": 0.0001, + "loss": 7.2894, + "loss/crossentropy": 2.0791160173714163, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.1787281386554241, + "step": 20000 + }, + { + "epoch": 0.50025, + "grad_norm": 30.125, + "grad_norm_var": 4.286393229166666, + "learning_rate": 0.0001, + "loss": 7.3934, + "loss/crossentropy": 2.100822980701923, + "loss/hidden": 3.31640625, + "loss/jsd": 0.0, + "loss/logits": 0.1837709965184331, + "step": 20010 + }, + { + "epoch": 0.5005, + "grad_norm": 27.0, + "grad_norm_var": 6.8275390625, + "learning_rate": 0.0001, + "loss": 7.2903, + "loss/crossentropy": 2.0603928424417974, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.18236217414960265, + "step": 20020 + }, + { + "epoch": 0.50075, + "grad_norm": 31.125, + "grad_norm_var": 5.8509765625, + "learning_rate": 0.0001, + "loss": 7.2986, + "loss/crossentropy": 2.0794493168592454, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.17758788168430328, + "step": 20030 + }, + { + "epoch": 0.501, + "grad_norm": 31.5, + "grad_norm_var": 2.5884765625, + "learning_rate": 0.0001, + "loss": 7.3217, + "loss/crossentropy": 2.180893415212631, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.19812680166214705, + "step": 20040 + }, + { + "epoch": 0.50125, + "grad_norm": 29.25, + "grad_norm_var": 2.2718098958333335, + "learning_rate": 0.0001, + "loss": 7.2937, + "loss/crossentropy": 2.0710356786847113, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.18272985238581896, + "step": 20050 + }, + { + "epoch": 0.5015, + "grad_norm": 32.5, + "grad_norm_var": 3.61015625, + "learning_rate": 0.0001, + "loss": 7.3658, + "loss/crossentropy": 2.2894588097929955, + "loss/hidden": 3.30859375, + "loss/jsd": 0.0, + "loss/logits": 0.19785721227526665, + "step": 20060 + }, + { + "epoch": 0.50175, + "grad_norm": 33.0, + "grad_norm_var": 6.8572265625, + "learning_rate": 0.0001, + "loss": 7.3323, + "loss/crossentropy": 1.9864086501300335, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.18656586892902852, + "step": 20070 + }, + { + "epoch": 0.502, + "grad_norm": 28.875, + "grad_norm_var": 3.0302083333333334, + "learning_rate": 0.0001, + "loss": 7.2853, + "loss/crossentropy": 2.08188853263855, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.1811662282794714, + "step": 20080 + }, + { + "epoch": 0.50225, + "grad_norm": 28.75, + "grad_norm_var": 2.2728515625, + "learning_rate": 0.0001, + "loss": 7.1962, + "loss/crossentropy": 2.175425173342228, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.18717424534261226, + "step": 20090 + }, + { + "epoch": 0.5025, + "grad_norm": 31.875, + "grad_norm_var": 2.0770833333333334, + "learning_rate": 0.0001, + "loss": 7.2638, + "loss/crossentropy": 2.1691887110471724, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.19185122177004815, + "step": 20100 + }, + { + "epoch": 0.50275, + "grad_norm": 29.75, + "grad_norm_var": 2.1285807291666665, + "learning_rate": 0.0001, + "loss": 7.2232, + "loss/crossentropy": 2.2031719744205476, + "loss/hidden": 3.280859375, + "loss/jsd": 0.0, + "loss/logits": 0.18911925759166479, + "step": 20110 + }, + { + "epoch": 0.503, + "grad_norm": 29.75, + "grad_norm_var": 1.6791015625, + "learning_rate": 0.0001, + "loss": 7.344, + "loss/crossentropy": 1.9854541048407555, + "loss/hidden": 3.321484375, + "loss/jsd": 0.0, + "loss/logits": 0.17294510118663312, + "step": 20120 + }, + { + "epoch": 0.50325, + "grad_norm": 29.25, + "grad_norm_var": 1.9504557291666667, + "learning_rate": 0.0001, + "loss": 7.291, + "loss/crossentropy": 2.098982587456703, + "loss/hidden": 3.283203125, + "loss/jsd": 0.0, + "loss/logits": 0.18347709700465203, + "step": 20130 + }, + { + "epoch": 0.5035, + "grad_norm": 28.5, + "grad_norm_var": 8.94765625, + "learning_rate": 0.0001, + "loss": 7.3558, + "loss/crossentropy": 2.0379406858235596, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.20050731115043163, + "step": 20140 + }, + { + "epoch": 0.50375, + "grad_norm": 29.0, + "grad_norm_var": 10.076822916666666, + "learning_rate": 0.0001, + "loss": 7.4874, + "loss/crossentropy": 2.1478746205568315, + "loss/hidden": 3.316796875, + "loss/jsd": 0.0, + "loss/logits": 0.19442609827965499, + "step": 20150 + }, + { + "epoch": 0.504, + "grad_norm": 29.125, + "grad_norm_var": 18.79765625, + "learning_rate": 0.0001, + "loss": 7.2431, + "loss/crossentropy": 2.0215858951210977, + "loss/hidden": 3.337109375, + "loss/jsd": 0.0, + "loss/logits": 0.1762773571535945, + "step": 20160 + }, + { + "epoch": 0.50425, + "grad_norm": 31.75, + "grad_norm_var": 19.851041666666667, + "learning_rate": 0.0001, + "loss": 7.2569, + "loss/crossentropy": 2.053553320467472, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.18032009098678828, + "step": 20170 + }, + { + "epoch": 0.5045, + "grad_norm": 31.125, + "grad_norm_var": 1.47265625, + "learning_rate": 0.0001, + "loss": 7.3017, + "loss/crossentropy": 2.1755366683006288, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.19120553210377694, + "step": 20180 + }, + { + "epoch": 0.50475, + "grad_norm": 30.375, + "grad_norm_var": 2.2145182291666665, + "learning_rate": 0.0001, + "loss": 7.2499, + "loss/crossentropy": 2.0905803576111794, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.18027825355529786, + "step": 20190 + }, + { + "epoch": 0.505, + "grad_norm": 31.75, + "grad_norm_var": 2.0837890625, + "learning_rate": 0.0001, + "loss": 7.3299, + "loss/crossentropy": 2.1392801135778425, + "loss/hidden": 3.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.18984563760459422, + "step": 20200 + }, + { + "epoch": 0.50525, + "grad_norm": 32.25, + "grad_norm_var": 1.7705729166666666, + "learning_rate": 0.0001, + "loss": 7.3683, + "loss/crossentropy": 1.9904960945248604, + "loss/hidden": 3.34765625, + "loss/jsd": 0.0, + "loss/logits": 0.19267166014760734, + "step": 20210 + }, + { + "epoch": 0.5055, + "grad_norm": 29.375, + "grad_norm_var": 1.4369140625, + "learning_rate": 0.0001, + "loss": 7.4132, + "loss/crossentropy": 2.118674224615097, + "loss/hidden": 3.265625, + "loss/jsd": 0.0, + "loss/logits": 0.18962926007807254, + "step": 20220 + }, + { + "epoch": 0.50575, + "grad_norm": 31.125, + "grad_norm_var": 1.5275390625, + "learning_rate": 0.0001, + "loss": 7.2848, + "loss/crossentropy": 2.042106767743826, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.1888286828994751, + "step": 20230 + }, + { + "epoch": 0.506, + "grad_norm": 43.5, + "grad_norm_var": 11.646875, + "learning_rate": 0.0001, + "loss": 7.3466, + "loss/crossentropy": 2.01454746350646, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.17999137863516806, + "step": 20240 + }, + { + "epoch": 0.50625, + "grad_norm": 30.625, + "grad_norm_var": 16.970572916666665, + "learning_rate": 0.0001, + "loss": 7.3463, + "loss/crossentropy": 2.0871743500232696, + "loss/hidden": 3.298046875, + "loss/jsd": 0.0, + "loss/logits": 0.18978464882820845, + "step": 20250 + }, + { + "epoch": 0.5065, + "grad_norm": 29.75, + "grad_norm_var": 23.7556640625, + "learning_rate": 0.0001, + "loss": 7.3989, + "loss/crossentropy": 2.0043682172894477, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.20130981262773276, + "step": 20260 + }, + { + "epoch": 0.50675, + "grad_norm": 32.75, + "grad_norm_var": 2.066666666666667, + "learning_rate": 0.0001, + "loss": 7.3753, + "loss/crossentropy": 2.0674937814474106, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.1961617423221469, + "step": 20270 + }, + { + "epoch": 0.507, + "grad_norm": 29.375, + "grad_norm_var": 2.8934895833333334, + "learning_rate": 0.0001, + "loss": 7.2008, + "loss/crossentropy": 2.1054218977689745, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.1805409086868167, + "step": 20280 + }, + { + "epoch": 0.50725, + "grad_norm": 31.75, + "grad_norm_var": 2.3395182291666665, + "learning_rate": 0.0001, + "loss": 7.3738, + "loss/crossentropy": 2.1355152033269404, + "loss/hidden": 3.37578125, + "loss/jsd": 0.0, + "loss/logits": 0.18312898697331548, + "step": 20290 + }, + { + "epoch": 0.5075, + "grad_norm": 44.5, + "grad_norm_var": 14.501041666666667, + "learning_rate": 0.0001, + "loss": 7.3439, + "loss/crossentropy": 1.953713247179985, + "loss/hidden": 3.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.16738616386428476, + "step": 20300 + }, + { + "epoch": 0.50775, + "grad_norm": 30.0, + "grad_norm_var": 13.852083333333333, + "learning_rate": 0.0001, + "loss": 7.3785, + "loss/crossentropy": 2.1630780398845673, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.18321501482278107, + "step": 20310 + }, + { + "epoch": 0.508, + "grad_norm": 35.25, + "grad_norm_var": 3.4587890625, + "learning_rate": 0.0001, + "loss": 7.3335, + "loss/crossentropy": 1.973591250926256, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.1923765016719699, + "step": 20320 + }, + { + "epoch": 0.50825, + "grad_norm": 30.75, + "grad_norm_var": 2.640625, + "learning_rate": 0.0001, + "loss": 7.2324, + "loss/crossentropy": 1.98987924605608, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.18974843434989452, + "step": 20330 + }, + { + "epoch": 0.5085, + "grad_norm": 28.75, + "grad_norm_var": 6.96875, + "learning_rate": 0.0001, + "loss": 7.3109, + "loss/crossentropy": 2.277467969059944, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.19834709204733372, + "step": 20340 + }, + { + "epoch": 0.50875, + "grad_norm": 29.625, + "grad_norm_var": 11.067122395833334, + "learning_rate": 0.0001, + "loss": 7.2231, + "loss/crossentropy": 1.959492462873459, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1740871286019683, + "step": 20350 + }, + { + "epoch": 0.509, + "grad_norm": 38.0, + "grad_norm_var": 8.055989583333334, + "learning_rate": 0.0001, + "loss": 7.3223, + "loss/crossentropy": 2.1561495400965214, + "loss/hidden": 3.3921875, + "loss/jsd": 0.0, + "loss/logits": 0.18467604517936706, + "step": 20360 + }, + { + "epoch": 0.50925, + "grad_norm": 36.25, + "grad_norm_var": 6.133333333333334, + "learning_rate": 0.0001, + "loss": 7.3836, + "loss/crossentropy": 2.0499372258782387, + "loss/hidden": 3.3078125, + "loss/jsd": 0.0, + "loss/logits": 0.17811761405318977, + "step": 20370 + }, + { + "epoch": 0.5095, + "grad_norm": 36.75, + "grad_norm_var": 6.571809895833334, + "learning_rate": 0.0001, + "loss": 7.3418, + "loss/crossentropy": 2.0755091689527037, + "loss/hidden": 3.536328125, + "loss/jsd": 0.0, + "loss/logits": 0.2004254411906004, + "step": 20380 + }, + { + "epoch": 0.50975, + "grad_norm": 31.125, + "grad_norm_var": 2.4083702435510134e+18, + "learning_rate": 0.0001, + "loss": 7.4065, + "loss/crossentropy": 2.1109745904803274, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.18784465081989765, + "step": 20390 + }, + { + "epoch": 0.51, + "grad_norm": 34.25, + "grad_norm_var": 2.408370242716871e+18, + "learning_rate": 0.0001, + "loss": 7.3957, + "loss/crossentropy": 2.204335790872574, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.1780173137784004, + "step": 20400 + }, + { + "epoch": 0.51025, + "grad_norm": 32.0, + "grad_norm_var": 41.43723958333333, + "learning_rate": 0.0001, + "loss": 7.1806, + "loss/crossentropy": 1.9774738259613514, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.18556309957057238, + "step": 20410 + }, + { + "epoch": 0.5105, + "grad_norm": 27.625, + "grad_norm_var": 65.22473958333333, + "learning_rate": 0.0001, + "loss": 7.3462, + "loss/crossentropy": 2.167100024223328, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.18838558737188577, + "step": 20420 + }, + { + "epoch": 0.51075, + "grad_norm": 28.375, + "grad_norm_var": 36.87291666666667, + "learning_rate": 0.0001, + "loss": 7.3272, + "loss/crossentropy": 2.005172957479954, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.19127585394307972, + "step": 20430 + }, + { + "epoch": 0.511, + "grad_norm": 30.25, + "grad_norm_var": 37.78125, + "learning_rate": 0.0001, + "loss": 7.226, + "loss/crossentropy": 2.063726345449686, + "loss/hidden": 3.424609375, + "loss/jsd": 0.0, + "loss/logits": 0.19834220120683313, + "step": 20440 + }, + { + "epoch": 0.51125, + "grad_norm": 35.0, + "grad_norm_var": 12.229166666666666, + "learning_rate": 0.0001, + "loss": 7.2767, + "loss/crossentropy": 2.0135110549628736, + "loss/hidden": 3.295703125, + "loss/jsd": 0.0, + "loss/logits": 0.17450594212859868, + "step": 20450 + }, + { + "epoch": 0.5115, + "grad_norm": 31.875, + "grad_norm_var": 7.4228515625, + "learning_rate": 0.0001, + "loss": 7.3327, + "loss/crossentropy": 2.102003552019596, + "loss/hidden": 3.386328125, + "loss/jsd": 0.0, + "loss/logits": 0.19359097983688117, + "step": 20460 + }, + { + "epoch": 0.51175, + "grad_norm": 33.0, + "grad_norm_var": 5.968684895833333, + "learning_rate": 0.0001, + "loss": 7.3575, + "loss/crossentropy": 2.117770975828171, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.20843392983078957, + "step": 20470 + }, + { + "epoch": 0.512, + "grad_norm": 28.625, + "grad_norm_var": 3.80390625, + "learning_rate": 0.0001, + "loss": 7.294, + "loss/crossentropy": 2.048271709680557, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.19015705585479736, + "step": 20480 + }, + { + "epoch": 0.51225, + "grad_norm": 27.625, + "grad_norm_var": 4.162955729166667, + "learning_rate": 0.0001, + "loss": 7.2999, + "loss/crossentropy": 2.2269428968429565, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.18866086583584546, + "step": 20490 + }, + { + "epoch": 0.5125, + "grad_norm": 30.5, + "grad_norm_var": 6.625455729166666, + "learning_rate": 0.0001, + "loss": 7.2051, + "loss/crossentropy": 2.077080491185188, + "loss/hidden": 3.29921875, + "loss/jsd": 0.0, + "loss/logits": 0.18723124526441098, + "step": 20500 + }, + { + "epoch": 0.51275, + "grad_norm": 31.125, + "grad_norm_var": 2.943489583333333, + "learning_rate": 0.0001, + "loss": 7.3489, + "loss/crossentropy": 1.9472593426704408, + "loss/hidden": 3.421484375, + "loss/jsd": 0.0, + "loss/logits": 0.18634757827967405, + "step": 20510 + }, + { + "epoch": 0.513, + "grad_norm": 29.875, + "grad_norm_var": 4.5431640625, + "learning_rate": 0.0001, + "loss": 7.2731, + "loss/crossentropy": 2.0889667838811876, + "loss/hidden": 3.283203125, + "loss/jsd": 0.0, + "loss/logits": 0.17469800189137458, + "step": 20520 + }, + { + "epoch": 0.51325, + "grad_norm": 31.375, + "grad_norm_var": 3.168684895833333, + "learning_rate": 0.0001, + "loss": 7.3807, + "loss/crossentropy": 2.1357873290777207, + "loss/hidden": 3.406640625, + "loss/jsd": 0.0, + "loss/logits": 0.19474379923194646, + "step": 20530 + }, + { + "epoch": 0.5135, + "grad_norm": 28.625, + "grad_norm_var": 3.7177083333333334, + "learning_rate": 0.0001, + "loss": 7.3564, + "loss/crossentropy": 2.1814554005861284, + "loss/hidden": 3.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.19765354953706266, + "step": 20540 + }, + { + "epoch": 0.51375, + "grad_norm": 29.5, + "grad_norm_var": 7.0916015625, + "learning_rate": 0.0001, + "loss": 7.3209, + "loss/crossentropy": 1.9481236249208451, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.17467648405581712, + "step": 20550 + }, + { + "epoch": 0.514, + "grad_norm": 28.875, + "grad_norm_var": 2.842643229166667, + "learning_rate": 0.0001, + "loss": 7.2827, + "loss/crossentropy": 1.9511701211333274, + "loss/hidden": 3.479296875, + "loss/jsd": 0.0, + "loss/logits": 0.17900633215904235, + "step": 20560 + }, + { + "epoch": 0.51425, + "grad_norm": 30.0, + "grad_norm_var": 2.1205729166666667, + "learning_rate": 0.0001, + "loss": 7.3626, + "loss/crossentropy": 1.947856567800045, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.1868027526885271, + "step": 20570 + }, + { + "epoch": 0.5145, + "grad_norm": 28.625, + "grad_norm_var": 19.1875, + "learning_rate": 0.0001, + "loss": 7.2629, + "loss/crossentropy": 2.0266165107488634, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.17448003627359868, + "step": 20580 + }, + { + "epoch": 0.51475, + "grad_norm": 30.0, + "grad_norm_var": 0.7046223958333333, + "learning_rate": 0.0001, + "loss": 7.2807, + "loss/crossentropy": 2.015270346403122, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.18326627276837826, + "step": 20590 + }, + { + "epoch": 0.515, + "grad_norm": 31.875, + "grad_norm_var": 5.234309895833333, + "learning_rate": 0.0001, + "loss": 7.3668, + "loss/crossentropy": 2.034985066950321, + "loss/hidden": 3.3265625, + "loss/jsd": 0.0, + "loss/logits": 0.17759786024689675, + "step": 20600 + }, + { + "epoch": 0.51525, + "grad_norm": 30.5, + "grad_norm_var": 6.9494140625, + "learning_rate": 0.0001, + "loss": 7.3519, + "loss/crossentropy": 2.0228724449872972, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.18106682635843754, + "step": 20610 + }, + { + "epoch": 0.5155, + "grad_norm": 48.0, + "grad_norm_var": 41.123958333333334, + "learning_rate": 0.0001, + "loss": 7.4442, + "loss/crossentropy": 1.99410739839077, + "loss/hidden": 3.528515625, + "loss/jsd": 0.0, + "loss/logits": 0.18867430798709392, + "step": 20620 + }, + { + "epoch": 0.51575, + "grad_norm": 31.0, + "grad_norm_var": 112.25807291666666, + "learning_rate": 0.0001, + "loss": 7.3746, + "loss/crossentropy": 2.063592994213104, + "loss/hidden": 3.43671875, + "loss/jsd": 0.0, + "loss/logits": 0.20251007843762636, + "step": 20630 + }, + { + "epoch": 0.516, + "grad_norm": 30.5, + "grad_norm_var": 3.1122395833333334, + "learning_rate": 0.0001, + "loss": 7.3186, + "loss/crossentropy": 2.1162465453147887, + "loss/hidden": 3.2609375, + "loss/jsd": 0.0, + "loss/logits": 0.1863781152293086, + "step": 20640 + }, + { + "epoch": 0.51625, + "grad_norm": 29.375, + "grad_norm_var": 11.701041666666667, + "learning_rate": 0.0001, + "loss": 7.303, + "loss/crossentropy": 1.9705959290266037, + "loss/hidden": 3.4875, + "loss/jsd": 0.0, + "loss/logits": 0.189107047021389, + "step": 20650 + }, + { + "epoch": 0.5165, + "grad_norm": 42.5, + "grad_norm_var": 25.134830729166666, + "learning_rate": 0.0001, + "loss": 7.2953, + "loss/crossentropy": 2.04353059977293, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.18167578242719173, + "step": 20660 + }, + { + "epoch": 0.51675, + "grad_norm": 63.25, + "grad_norm_var": 90.03483072916667, + "learning_rate": 0.0001, + "loss": 7.1868, + "loss/crossentropy": 1.9600037515163422, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.1769449569284916, + "step": 20670 + }, + { + "epoch": 0.517, + "grad_norm": 28.75, + "grad_norm_var": 73.69264322916666, + "learning_rate": 0.0001, + "loss": 7.3364, + "loss/crossentropy": 2.235697329044342, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.19675933755934238, + "step": 20680 + }, + { + "epoch": 0.51725, + "grad_norm": 60.0, + "grad_norm_var": 67.38951822916667, + "learning_rate": 0.0001, + "loss": 7.2911, + "loss/crossentropy": 2.047742946445942, + "loss/hidden": 3.398828125, + "loss/jsd": 0.0, + "loss/logits": 0.1932275788858533, + "step": 20690 + }, + { + "epoch": 0.5175, + "grad_norm": 30.875, + "grad_norm_var": 66.7978515625, + "learning_rate": 0.0001, + "loss": 7.369, + "loss/crossentropy": 2.1426502510905268, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.19659061841666697, + "step": 20700 + }, + { + "epoch": 0.51775, + "grad_norm": 30.375, + "grad_norm_var": 2.9848307291666667, + "learning_rate": 0.0001, + "loss": 7.3526, + "loss/crossentropy": 2.0521486185491087, + "loss/hidden": 3.31484375, + "loss/jsd": 0.0, + "loss/logits": 0.18707139398902656, + "step": 20710 + }, + { + "epoch": 0.518, + "grad_norm": 30.5, + "grad_norm_var": 16.7634765625, + "learning_rate": 0.0001, + "loss": 7.363, + "loss/crossentropy": 2.0975049562752246, + "loss/hidden": 3.338671875, + "loss/jsd": 0.0, + "loss/logits": 0.190740005671978, + "step": 20720 + }, + { + "epoch": 0.51825, + "grad_norm": 34.5, + "grad_norm_var": 27.069205729166665, + "learning_rate": 0.0001, + "loss": 7.3643, + "loss/crossentropy": 2.1847240805625914, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.20393175166100264, + "step": 20730 + }, + { + "epoch": 0.5185, + "grad_norm": 28.5, + "grad_norm_var": 28.865625, + "learning_rate": 0.0001, + "loss": 7.3442, + "loss/crossentropy": 2.079416597634554, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.18700419086962938, + "step": 20740 + }, + { + "epoch": 0.51875, + "grad_norm": 30.0, + "grad_norm_var": 5.898893229166666, + "learning_rate": 0.0001, + "loss": 7.2968, + "loss/crossentropy": 2.099186307191849, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.19339050203561783, + "step": 20750 + }, + { + "epoch": 0.519, + "grad_norm": 33.5, + "grad_norm_var": 7.76640625, + "learning_rate": 0.0001, + "loss": 7.2292, + "loss/crossentropy": 1.9471840865910053, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.18490714272484182, + "step": 20760 + }, + { + "epoch": 0.51925, + "grad_norm": 30.625, + "grad_norm_var": 3.6176432291666667, + "learning_rate": 0.0001, + "loss": 7.2505, + "loss/crossentropy": 1.9791502140462398, + "loss/hidden": 3.315625, + "loss/jsd": 0.0, + "loss/logits": 0.1867075043730438, + "step": 20770 + }, + { + "epoch": 0.5195, + "grad_norm": 28.25, + "grad_norm_var": 1.8843098958333333, + "learning_rate": 0.0001, + "loss": 7.2881, + "loss/crossentropy": 2.228019216656685, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.19135302156209946, + "step": 20780 + }, + { + "epoch": 0.51975, + "grad_norm": 29.375, + "grad_norm_var": 1.9684895833333333, + "learning_rate": 0.0001, + "loss": 7.2552, + "loss/crossentropy": 2.0987790375947952, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.17536395322531462, + "step": 20790 + }, + { + "epoch": 0.52, + "grad_norm": 30.375, + "grad_norm_var": 1.2309895833333333, + "learning_rate": 0.0001, + "loss": 7.3921, + "loss/crossentropy": 2.0298918724060058, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.1898334242403507, + "step": 20800 + }, + { + "epoch": 0.52025, + "grad_norm": 28.875, + "grad_norm_var": 1.5083333333333333, + "learning_rate": 0.0001, + "loss": 7.234, + "loss/crossentropy": 1.9918959826231002, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.18577282279729843, + "step": 20810 + }, + { + "epoch": 0.5205, + "grad_norm": 34.25, + "grad_norm_var": 3.3785807291666665, + "learning_rate": 0.0001, + "loss": 7.3487, + "loss/crossentropy": 2.0936102479696275, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.19174492321908473, + "step": 20820 + }, + { + "epoch": 0.52075, + "grad_norm": 37.75, + "grad_norm_var": 1.9157890389214495e+18, + "learning_rate": 0.0001, + "loss": 7.2159, + "loss/crossentropy": 1.8458645723760128, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.1738974804058671, + "step": 20830 + }, + { + "epoch": 0.521, + "grad_norm": 31.0, + "grad_norm_var": 1.9157890383966372e+18, + "learning_rate": 0.0001, + "loss": 7.2727, + "loss/crossentropy": 1.9869740903377533, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.19819957707077265, + "step": 20840 + }, + { + "epoch": 0.52125, + "grad_norm": 31.75, + "grad_norm_var": 2.275, + "learning_rate": 0.0001, + "loss": 7.2681, + "loss/crossentropy": 2.0040796995162964, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.18156437613070012, + "step": 20850 + }, + { + "epoch": 0.5215, + "grad_norm": 29.375, + "grad_norm_var": 4.849934895833333, + "learning_rate": 0.0001, + "loss": 7.3494, + "loss/crossentropy": 2.165298455953598, + "loss/hidden": 3.26796875, + "loss/jsd": 0.0, + "loss/logits": 0.18115576189011334, + "step": 20860 + }, + { + "epoch": 0.52175, + "grad_norm": 30.25, + "grad_norm_var": 3.8848307291666666, + "learning_rate": 0.0001, + "loss": 7.3202, + "loss/crossentropy": 2.100299298763275, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.19552922286093236, + "step": 20870 + }, + { + "epoch": 0.522, + "grad_norm": 31.625, + "grad_norm_var": 2.60390625, + "learning_rate": 0.0001, + "loss": 7.2684, + "loss/crossentropy": 2.022940080612898, + "loss/hidden": 3.255859375, + "loss/jsd": 0.0, + "loss/logits": 0.1667741946876049, + "step": 20880 + }, + { + "epoch": 0.52225, + "grad_norm": 55.25, + "grad_norm_var": 40.665625, + "learning_rate": 0.0001, + "loss": 7.2855, + "loss/crossentropy": 2.102290881425142, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.1791887005791068, + "step": 20890 + }, + { + "epoch": 0.5225, + "grad_norm": 30.375, + "grad_norm_var": 44.436458333333334, + "learning_rate": 0.0001, + "loss": 7.1947, + "loss/crossentropy": 1.8682746514678001, + "loss/hidden": 3.37109375, + "loss/jsd": 0.0, + "loss/logits": 0.18195230644196272, + "step": 20900 + }, + { + "epoch": 0.52275, + "grad_norm": 32.0, + "grad_norm_var": 2.5416015625, + "learning_rate": 0.0001, + "loss": 7.3462, + "loss/crossentropy": 2.0893214523792265, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.19191989079117774, + "step": 20910 + }, + { + "epoch": 0.523, + "grad_norm": 31.375, + "grad_norm_var": 2.3520182291666667, + "learning_rate": 0.0001, + "loss": 7.347, + "loss/crossentropy": 2.0877039656043053, + "loss/hidden": 3.3640625, + "loss/jsd": 0.0, + "loss/logits": 0.1851141469553113, + "step": 20920 + }, + { + "epoch": 0.52325, + "grad_norm": 28.0, + "grad_norm_var": 2.8997395833333335, + "learning_rate": 0.0001, + "loss": 7.2357, + "loss/crossentropy": 2.1176202185451984, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.19145621228963136, + "step": 20930 + }, + { + "epoch": 0.5235, + "grad_norm": 30.75, + "grad_norm_var": 3.2186848958333334, + "learning_rate": 0.0001, + "loss": 7.2674, + "loss/crossentropy": 2.0402719154953957, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.1739909777417779, + "step": 20940 + }, + { + "epoch": 0.52375, + "grad_norm": 29.5, + "grad_norm_var": 3.0759765625, + "learning_rate": 0.0001, + "loss": 7.3178, + "loss/crossentropy": 2.030729368329048, + "loss/hidden": 3.30859375, + "loss/jsd": 0.0, + "loss/logits": 0.1842687962576747, + "step": 20950 + }, + { + "epoch": 0.524, + "grad_norm": 32.25, + "grad_norm_var": 3.3958333333333335, + "learning_rate": 0.0001, + "loss": 7.3384, + "loss/crossentropy": 2.0869288608431815, + "loss/hidden": 3.352734375, + "loss/jsd": 0.0, + "loss/logits": 0.18400898948311806, + "step": 20960 + }, + { + "epoch": 0.52425, + "grad_norm": 33.0, + "grad_norm_var": 2.9067057291666667, + "learning_rate": 0.0001, + "loss": 7.4186, + "loss/crossentropy": 2.11807075291872, + "loss/hidden": 3.391796875, + "loss/jsd": 0.0, + "loss/logits": 0.18031747452914715, + "step": 20970 + }, + { + "epoch": 0.5245, + "grad_norm": 29.75, + "grad_norm_var": 6.5244140625, + "learning_rate": 0.0001, + "loss": 7.2164, + "loss/crossentropy": 1.9718056872487069, + "loss/hidden": 3.26640625, + "loss/jsd": 0.0, + "loss/logits": 0.17341692205518483, + "step": 20980 + }, + { + "epoch": 0.52475, + "grad_norm": 30.625, + "grad_norm_var": 6.382747395833333, + "learning_rate": 0.0001, + "loss": 7.3702, + "loss/crossentropy": 2.0978681847453116, + "loss/hidden": 3.321484375, + "loss/jsd": 0.0, + "loss/logits": 0.17709601484239101, + "step": 20990 + }, + { + "epoch": 0.525, + "grad_norm": 31.5, + "grad_norm_var": 9.713997395833333, + "learning_rate": 0.0001, + "loss": 7.3043, + "loss/crossentropy": 2.034855252504349, + "loss/hidden": 3.491796875, + "loss/jsd": 0.0, + "loss/logits": 0.1906108744442463, + "step": 21000 + }, + { + "epoch": 0.52525, + "grad_norm": 28.625, + "grad_norm_var": 13.075455729166666, + "learning_rate": 0.0001, + "loss": 7.4447, + "loss/crossentropy": 2.2921934738755225, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.1889318484812975, + "step": 21010 + }, + { + "epoch": 0.5255, + "grad_norm": 29.5, + "grad_norm_var": 6.339322916666666, + "learning_rate": 0.0001, + "loss": 7.2976, + "loss/crossentropy": 1.8201854094862937, + "loss/hidden": 3.4828125, + "loss/jsd": 0.0, + "loss/logits": 0.18457184694707393, + "step": 21020 + }, + { + "epoch": 0.52575, + "grad_norm": 28.875, + "grad_norm_var": 29.380989583333335, + "learning_rate": 0.0001, + "loss": 7.3322, + "loss/crossentropy": 2.131899508088827, + "loss/hidden": 3.43515625, + "loss/jsd": 0.0, + "loss/logits": 0.2020415196195245, + "step": 21030 + }, + { + "epoch": 0.526, + "grad_norm": 31.875, + "grad_norm_var": 41.7431640625, + "learning_rate": 0.0001, + "loss": 7.2982, + "loss/crossentropy": 1.802038711309433, + "loss/hidden": 3.43203125, + "loss/jsd": 0.0, + "loss/logits": 0.1719184698536992, + "step": 21040 + }, + { + "epoch": 0.52625, + "grad_norm": 29.125, + "grad_norm_var": 2.9468098958333333, + "learning_rate": 0.0001, + "loss": 7.2365, + "loss/crossentropy": 1.9720843493938447, + "loss/hidden": 3.38046875, + "loss/jsd": 0.0, + "loss/logits": 0.18162036091089248, + "step": 21050 + }, + { + "epoch": 0.5265, + "grad_norm": 29.625, + "grad_norm_var": 8.392708333333333, + "learning_rate": 0.0001, + "loss": 7.2629, + "loss/crossentropy": 2.101500564068556, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.18104594042524697, + "step": 21060 + }, + { + "epoch": 0.52675, + "grad_norm": 31.75, + "grad_norm_var": 12.012239583333333, + "learning_rate": 0.0001, + "loss": 7.4139, + "loss/crossentropy": 2.192843732237816, + "loss/hidden": 3.43671875, + "loss/jsd": 0.0, + "loss/logits": 0.1871086286380887, + "step": 21070 + }, + { + "epoch": 0.527, + "grad_norm": 33.75, + "grad_norm_var": 7.6291015625, + "learning_rate": 0.0001, + "loss": 7.3464, + "loss/crossentropy": 1.935421773791313, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.18491943813860418, + "step": 21080 + }, + { + "epoch": 0.52725, + "grad_norm": 29.25, + "grad_norm_var": 8.599934895833334, + "learning_rate": 0.0001, + "loss": 7.3769, + "loss/crossentropy": 2.1220942765474318, + "loss/hidden": 3.29609375, + "loss/jsd": 0.0, + "loss/logits": 0.18080089893192053, + "step": 21090 + }, + { + "epoch": 0.5275, + "grad_norm": 32.5, + "grad_norm_var": 13.941666666666666, + "learning_rate": 0.0001, + "loss": 7.3172, + "loss/crossentropy": 2.1950094163417817, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.19560035392642022, + "step": 21100 + }, + { + "epoch": 0.52775, + "grad_norm": 29.125, + "grad_norm_var": 2.9431640625, + "learning_rate": 0.0001, + "loss": 7.2835, + "loss/crossentropy": 2.0231982678174973, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.1959500327706337, + "step": 21110 + }, + { + "epoch": 0.528, + "grad_norm": 34.75, + "grad_norm_var": 4.874934895833333, + "learning_rate": 0.0001, + "loss": 7.2968, + "loss/crossentropy": 2.014877498149872, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.18462858479470015, + "step": 21120 + }, + { + "epoch": 0.52825, + "grad_norm": 30.125, + "grad_norm_var": 3.410872395833333, + "learning_rate": 0.0001, + "loss": 7.2742, + "loss/crossentropy": 2.0856463953852655, + "loss/hidden": 3.391796875, + "loss/jsd": 0.0, + "loss/logits": 0.1750152625143528, + "step": 21130 + }, + { + "epoch": 0.5285, + "grad_norm": 30.5, + "grad_norm_var": 4.5125, + "learning_rate": 0.0001, + "loss": 7.3537, + "loss/crossentropy": 2.097464480996132, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.1879219425842166, + "step": 21140 + }, + { + "epoch": 0.52875, + "grad_norm": 30.625, + "grad_norm_var": 3.91875, + "learning_rate": 0.0001, + "loss": 7.2096, + "loss/crossentropy": 2.025162447988987, + "loss/hidden": 3.305859375, + "loss/jsd": 0.0, + "loss/logits": 0.18031715657562017, + "step": 21150 + }, + { + "epoch": 0.529, + "grad_norm": 31.0, + "grad_norm_var": 687.7858723958333, + "learning_rate": 0.0001, + "loss": 7.3077, + "loss/crossentropy": 2.0105077303946017, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.17551136016845703, + "step": 21160 + }, + { + "epoch": 0.52925, + "grad_norm": 27.875, + "grad_norm_var": 690.9580729166667, + "learning_rate": 0.0001, + "loss": 7.2711, + "loss/crossentropy": 1.984055256843567, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.18593314290046692, + "step": 21170 + }, + { + "epoch": 0.5295, + "grad_norm": 30.125, + "grad_norm_var": 6.970768229166667, + "learning_rate": 0.0001, + "loss": 7.2034, + "loss/crossentropy": 2.1591003805398943, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.18686197344213723, + "step": 21180 + }, + { + "epoch": 0.52975, + "grad_norm": 27.625, + "grad_norm_var": 4.736393229166667, + "learning_rate": 0.0001, + "loss": 7.4247, + "loss/crossentropy": 2.0217775389552117, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.1973855821415782, + "step": 21190 + }, + { + "epoch": 0.53, + "grad_norm": 28.75, + "grad_norm_var": 5.764322916666667, + "learning_rate": 0.0001, + "loss": 7.3758, + "loss/crossentropy": 2.0906185880303383, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.19189006537199021, + "step": 21200 + }, + { + "epoch": 0.53025, + "grad_norm": 31.375, + "grad_norm_var": 3.7018229166666665, + "learning_rate": 0.0001, + "loss": 7.3831, + "loss/crossentropy": 2.2920484393835068, + "loss/hidden": 3.34296875, + "loss/jsd": 0.0, + "loss/logits": 0.19543348867446184, + "step": 21210 + }, + { + "epoch": 0.5305, + "grad_norm": 32.5, + "grad_norm_var": 1.301540273547833e+18, + "learning_rate": 0.0001, + "loss": 7.3275, + "loss/crossentropy": 2.0418326549232004, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.1798579154536128, + "step": 21220 + }, + { + "epoch": 0.53075, + "grad_norm": 30.625, + "grad_norm_var": 17.545572916666668, + "learning_rate": 0.0001, + "loss": 7.3427, + "loss/crossentropy": 2.1924046859145165, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.1793345332145691, + "step": 21230 + }, + { + "epoch": 0.531, + "grad_norm": 30.75, + "grad_norm_var": 3.9702473958333333, + "learning_rate": 0.0001, + "loss": 7.1754, + "loss/crossentropy": 1.9836316756904124, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.18065028581768275, + "step": 21240 + }, + { + "epoch": 0.53125, + "grad_norm": 28.625, + "grad_norm_var": 1.4410807291666667, + "learning_rate": 0.0001, + "loss": 7.3525, + "loss/crossentropy": 1.9980021633207798, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.18613223265856504, + "step": 21250 + }, + { + "epoch": 0.5315, + "grad_norm": 32.0, + "grad_norm_var": 2.5712890625, + "learning_rate": 0.0001, + "loss": 7.3235, + "loss/crossentropy": 1.9790296763181687, + "loss/hidden": 3.471875, + "loss/jsd": 0.0, + "loss/logits": 0.19255476742982863, + "step": 21260 + }, + { + "epoch": 0.53175, + "grad_norm": 28.375, + "grad_norm_var": 3.1264973958333333, + "learning_rate": 0.0001, + "loss": 7.2533, + "loss/crossentropy": 2.0499105736613275, + "loss/hidden": 3.32890625, + "loss/jsd": 0.0, + "loss/logits": 0.17501375842839478, + "step": 21270 + }, + { + "epoch": 0.532, + "grad_norm": 31.625, + "grad_norm_var": 3.6997395833333333, + "learning_rate": 0.0001, + "loss": 7.2437, + "loss/crossentropy": 2.0615583658218384, + "loss/hidden": 3.273828125, + "loss/jsd": 0.0, + "loss/logits": 0.17125342395156623, + "step": 21280 + }, + { + "epoch": 0.53225, + "grad_norm": 31.25, + "grad_norm_var": 2.3228515625, + "learning_rate": 0.0001, + "loss": 7.2509, + "loss/crossentropy": 2.0179983586072923, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.17567140907049178, + "step": 21290 + }, + { + "epoch": 0.5325, + "grad_norm": 28.875, + "grad_norm_var": 0.8462890625, + "learning_rate": 0.0001, + "loss": 7.2822, + "loss/crossentropy": 2.044732092320919, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.17829035893082618, + "step": 21300 + }, + { + "epoch": 0.53275, + "grad_norm": 30.625, + "grad_norm_var": 6.9916015625, + "learning_rate": 0.0001, + "loss": 7.2366, + "loss/crossentropy": 2.108456004410982, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.1892499382607639, + "step": 21310 + }, + { + "epoch": 0.533, + "grad_norm": 30.75, + "grad_norm_var": 7.531184895833333, + "learning_rate": 0.0001, + "loss": 7.2825, + "loss/crossentropy": 2.0786417722702026, + "loss/hidden": 3.3546875, + "loss/jsd": 0.0, + "loss/logits": 0.1849788771942258, + "step": 21320 + }, + { + "epoch": 0.53325, + "grad_norm": 30.5, + "grad_norm_var": 2.8067057291666666, + "learning_rate": 0.0001, + "loss": 7.3622, + "loss/crossentropy": 2.0772973991930486, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.19109403509646655, + "step": 21330 + }, + { + "epoch": 0.5335, + "grad_norm": 29.625, + "grad_norm_var": 3.414322916666667, + "learning_rate": 0.0001, + "loss": 7.3283, + "loss/crossentropy": 2.2267979592084886, + "loss/hidden": 3.263671875, + "loss/jsd": 0.0, + "loss/logits": 0.17751519959419965, + "step": 21340 + }, + { + "epoch": 0.53375, + "grad_norm": 31.0, + "grad_norm_var": 7.658333333333333, + "learning_rate": 0.0001, + "loss": 7.3284, + "loss/crossentropy": 2.0170826002955438, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.17177296355366706, + "step": 21350 + }, + { + "epoch": 0.534, + "grad_norm": 29.0, + "grad_norm_var": 1.7, + "learning_rate": 0.0001, + "loss": 7.267, + "loss/crossentropy": 2.175469179451466, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.2005373438820243, + "step": 21360 + }, + { + "epoch": 0.53425, + "grad_norm": 30.625, + "grad_norm_var": 8.731705729166666, + "learning_rate": 0.0001, + "loss": 7.137, + "loss/crossentropy": 2.0817506462335587, + "loss/hidden": 3.30625, + "loss/jsd": 0.0, + "loss/logits": 0.17554874606430532, + "step": 21370 + }, + { + "epoch": 0.5345, + "grad_norm": 29.375, + "grad_norm_var": 10.517122395833333, + "learning_rate": 0.0001, + "loss": 7.3312, + "loss/crossentropy": 2.0762263566255568, + "loss/hidden": 3.244140625, + "loss/jsd": 0.0, + "loss/logits": 0.17292992006987334, + "step": 21380 + }, + { + "epoch": 0.53475, + "grad_norm": 29.375, + "grad_norm_var": 7.6447265625, + "learning_rate": 0.0001, + "loss": 7.279, + "loss/crossentropy": 2.073785340040922, + "loss/hidden": 3.2546875, + "loss/jsd": 0.0, + "loss/logits": 0.18681957013905048, + "step": 21390 + }, + { + "epoch": 0.535, + "grad_norm": 28.75, + "grad_norm_var": 2.1333333333333333, + "learning_rate": 0.0001, + "loss": 7.3367, + "loss/crossentropy": 2.16663718521595, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.18254186888225377, + "step": 21400 + }, + { + "epoch": 0.53525, + "grad_norm": 30.5, + "grad_norm_var": 2.27890625, + "learning_rate": 0.0001, + "loss": 7.3824, + "loss/crossentropy": 2.000949743390083, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.19326272774487735, + "step": 21410 + }, + { + "epoch": 0.5355, + "grad_norm": 30.625, + "grad_norm_var": 1.3202473958333334, + "learning_rate": 0.0001, + "loss": 7.3294, + "loss/crossentropy": 1.9899908214807511, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.19131155535578728, + "step": 21420 + }, + { + "epoch": 0.53575, + "grad_norm": 29.125, + "grad_norm_var": 1.3811848958333333, + "learning_rate": 0.0001, + "loss": 7.3148, + "loss/crossentropy": 2.1891008853912353, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.19048418197780848, + "step": 21430 + }, + { + "epoch": 0.536, + "grad_norm": 31.875, + "grad_norm_var": 1.6072916666666666, + "learning_rate": 0.0001, + "loss": 7.2268, + "loss/crossentropy": 2.1451114103198052, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.179903282225132, + "step": 21440 + }, + { + "epoch": 0.53625, + "grad_norm": 31.125, + "grad_norm_var": 5.038997395833333, + "learning_rate": 0.0001, + "loss": 7.3707, + "loss/crossentropy": 2.0621043920516966, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.2030002610757947, + "step": 21450 + }, + { + "epoch": 0.5365, + "grad_norm": 32.25, + "grad_norm_var": 1.1434895833333334, + "learning_rate": 0.0001, + "loss": 7.3413, + "loss/crossentropy": 2.0637945592403413, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.21434453465044498, + "step": 21460 + }, + { + "epoch": 0.53675, + "grad_norm": 30.5, + "grad_norm_var": 1.9046223958333333, + "learning_rate": 0.0001, + "loss": 7.331, + "loss/crossentropy": 2.100746136903763, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.20721383020281792, + "step": 21470 + }, + { + "epoch": 0.537, + "grad_norm": 31.625, + "grad_norm_var": 1.8875, + "learning_rate": 0.0001, + "loss": 7.266, + "loss/crossentropy": 2.085133180767298, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.19881708025932313, + "step": 21480 + }, + { + "epoch": 0.53725, + "grad_norm": 26.875, + "grad_norm_var": 4.8931640625, + "learning_rate": 0.0001, + "loss": 7.2806, + "loss/crossentropy": 1.9575376056134701, + "loss/hidden": 3.312890625, + "loss/jsd": 0.0, + "loss/logits": 0.17017574273049832, + "step": 21490 + }, + { + "epoch": 0.5375, + "grad_norm": 36.75, + "grad_norm_var": 27.210872395833334, + "learning_rate": 0.0001, + "loss": 7.3797, + "loss/crossentropy": 2.107981327176094, + "loss/hidden": 3.496484375, + "loss/jsd": 0.0, + "loss/logits": 0.20202290676534176, + "step": 21500 + }, + { + "epoch": 0.53775, + "grad_norm": 43.25, + "grad_norm_var": 217.90625, + "learning_rate": 0.0001, + "loss": 7.3549, + "loss/crossentropy": 2.0315024718642234, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.18244437091052532, + "step": 21510 + }, + { + "epoch": 0.538, + "grad_norm": 35.75, + "grad_norm_var": 209.64140625, + "learning_rate": 0.0001, + "loss": 7.2932, + "loss/crossentropy": 2.0473300129175187, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.1864867027848959, + "step": 21520 + }, + { + "epoch": 0.53825, + "grad_norm": 30.0, + "grad_norm_var": 28.510872395833335, + "learning_rate": 0.0001, + "loss": 7.2708, + "loss/crossentropy": 2.2367413520812987, + "loss/hidden": 3.365625, + "loss/jsd": 0.0, + "loss/logits": 0.19972213320434093, + "step": 21530 + }, + { + "epoch": 0.5385, + "grad_norm": 30.875, + "grad_norm_var": 30.212955729166666, + "learning_rate": 0.0001, + "loss": 7.2704, + "loss/crossentropy": 2.1623730801045893, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.1971738815307617, + "step": 21540 + }, + { + "epoch": 0.53875, + "grad_norm": 35.0, + "grad_norm_var": 7.9134765625, + "learning_rate": 0.0001, + "loss": 7.2673, + "loss/crossentropy": 2.1150405749678614, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.18175593428313733, + "step": 21550 + }, + { + "epoch": 0.539, + "grad_norm": 31.5, + "grad_norm_var": 2.928465630196033e+18, + "learning_rate": 0.0001, + "loss": 7.3405, + "loss/crossentropy": 2.0370446920394896, + "loss/hidden": 3.30703125, + "loss/jsd": 0.0, + "loss/logits": 0.1800082778558135, + "step": 21560 + }, + { + "epoch": 0.53925, + "grad_norm": 35.5, + "grad_norm_var": 2.928465629582826e+18, + "learning_rate": 0.0001, + "loss": 7.3808, + "loss/crossentropy": 2.1404812157154085, + "loss/hidden": 3.35703125, + "loss/jsd": 0.0, + "loss/logits": 0.18585823830217124, + "step": 21570 + }, + { + "epoch": 0.5395, + "grad_norm": 30.375, + "grad_norm_var": 4.48515625, + "learning_rate": 0.0001, + "loss": 7.3203, + "loss/crossentropy": 2.2115766972303392, + "loss/hidden": 3.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.19159688670188188, + "step": 21580 + }, + { + "epoch": 0.53975, + "grad_norm": 28.0, + "grad_norm_var": 6.1666015625, + "learning_rate": 0.0001, + "loss": 7.2824, + "loss/crossentropy": 2.1206958115100862, + "loss/hidden": 3.3875, + "loss/jsd": 0.0, + "loss/logits": 0.18236497584730388, + "step": 21590 + }, + { + "epoch": 0.54, + "grad_norm": 33.0, + "grad_norm_var": 8.221809895833333, + "learning_rate": 0.0001, + "loss": 7.1904, + "loss/crossentropy": 1.9439353846013545, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.16596114486455918, + "step": 21600 + }, + { + "epoch": 0.54025, + "grad_norm": 30.625, + "grad_norm_var": 4.26640625, + "learning_rate": 0.0001, + "loss": 7.2192, + "loss/crossentropy": 2.084056280553341, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.1889295268803835, + "step": 21610 + }, + { + "epoch": 0.5405, + "grad_norm": 30.75, + "grad_norm_var": 4.08515625, + "learning_rate": 0.0001, + "loss": 7.2147, + "loss/crossentropy": 1.9731713935732842, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.17223210986703635, + "step": 21620 + }, + { + "epoch": 0.54075, + "grad_norm": 30.875, + "grad_norm_var": 8.837239583333334, + "learning_rate": 0.0001, + "loss": 7.2933, + "loss/crossentropy": 2.049588477611542, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.18233908694237472, + "step": 21630 + }, + { + "epoch": 0.541, + "grad_norm": 29.625, + "grad_norm_var": 5.414322916666666, + "learning_rate": 0.0001, + "loss": 7.2083, + "loss/crossentropy": 2.045769859850407, + "loss/hidden": 3.2625, + "loss/jsd": 0.0, + "loss/logits": 0.17521222569048406, + "step": 21640 + }, + { + "epoch": 0.54125, + "grad_norm": 32.5, + "grad_norm_var": 2.04140625, + "learning_rate": 0.0001, + "loss": 7.297, + "loss/crossentropy": 2.1753006681799887, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.1932337900623679, + "step": 21650 + }, + { + "epoch": 0.5415, + "grad_norm": 27.875, + "grad_norm_var": 4.039583333333334, + "learning_rate": 0.0001, + "loss": 7.1617, + "loss/crossentropy": 2.1425692200660706, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.18889971654862164, + "step": 21660 + }, + { + "epoch": 0.54175, + "grad_norm": 34.25, + "grad_norm_var": 4.156184895833333, + "learning_rate": 0.0001, + "loss": 7.1944, + "loss/crossentropy": 2.0136908262968065, + "loss/hidden": 3.2953125, + "loss/jsd": 0.0, + "loss/logits": 0.1680521246045828, + "step": 21670 + }, + { + "epoch": 0.542, + "grad_norm": 30.875, + "grad_norm_var": 4.99140625, + "learning_rate": 0.0001, + "loss": 7.2897, + "loss/crossentropy": 2.148314264416695, + "loss/hidden": 3.24765625, + "loss/jsd": 0.0, + "loss/logits": 0.170760527998209, + "step": 21680 + }, + { + "epoch": 0.54225, + "grad_norm": 30.75, + "grad_norm_var": 2.8059895833333335, + "learning_rate": 0.0001, + "loss": 7.1925, + "loss/crossentropy": 2.055006366968155, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.1854418007656932, + "step": 21690 + }, + { + "epoch": 0.5425, + "grad_norm": 31.75, + "grad_norm_var": 3.6384765625, + "learning_rate": 0.0001, + "loss": 7.3447, + "loss/crossentropy": 2.0953226678073404, + "loss/hidden": 3.386328125, + "loss/jsd": 0.0, + "loss/logits": 0.1921064408496022, + "step": 21700 + }, + { + "epoch": 0.54275, + "grad_norm": 31.25, + "grad_norm_var": 3.0625, + "learning_rate": 0.0001, + "loss": 7.3351, + "loss/crossentropy": 2.075012197345495, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.18619187362492085, + "step": 21710 + }, + { + "epoch": 0.543, + "grad_norm": 30.125, + "grad_norm_var": 3.9306640625, + "learning_rate": 0.0001, + "loss": 7.2675, + "loss/crossentropy": 2.104280537366867, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.18965906854718922, + "step": 21720 + }, + { + "epoch": 0.54325, + "grad_norm": 28.625, + "grad_norm_var": 4.276041666666667, + "learning_rate": 0.0001, + "loss": 7.2077, + "loss/crossentropy": 1.8971405908465386, + "loss/hidden": 3.35390625, + "loss/jsd": 0.0, + "loss/logits": 0.17859434587880968, + "step": 21730 + }, + { + "epoch": 0.5435, + "grad_norm": 31.875, + "grad_norm_var": 3.109375, + "learning_rate": 0.0001, + "loss": 7.3256, + "loss/crossentropy": 2.1083227381110192, + "loss/hidden": 3.386328125, + "loss/jsd": 0.0, + "loss/logits": 0.1904505856335163, + "step": 21740 + }, + { + "epoch": 0.54375, + "grad_norm": 30.25, + "grad_norm_var": 3.4728515625, + "learning_rate": 0.0001, + "loss": 7.3875, + "loss/crossentropy": 2.3079893991351126, + "loss/hidden": 3.28046875, + "loss/jsd": 0.0, + "loss/logits": 0.18368349261581898, + "step": 21750 + }, + { + "epoch": 0.544, + "grad_norm": 30.125, + "grad_norm_var": 5.176822916666667, + "learning_rate": 0.0001, + "loss": 7.3763, + "loss/crossentropy": 2.098197969794273, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.2088245895691216, + "step": 21760 + }, + { + "epoch": 0.54425, + "grad_norm": 29.5, + "grad_norm_var": 7.164322916666666, + "learning_rate": 0.0001, + "loss": 7.3008, + "loss/crossentropy": 2.1331639789044856, + "loss/hidden": 3.353125, + "loss/jsd": 0.0, + "loss/logits": 0.17733938563615084, + "step": 21770 + }, + { + "epoch": 0.5445, + "grad_norm": 30.625, + "grad_norm_var": 165.75045572916667, + "learning_rate": 0.0001, + "loss": 7.2084, + "loss/crossentropy": 1.9805914640426636, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.18551983144134282, + "step": 21780 + }, + { + "epoch": 0.54475, + "grad_norm": 29.125, + "grad_norm_var": 32.90598958333333, + "learning_rate": 0.0001, + "loss": 7.2898, + "loss/crossentropy": 2.0330356270074845, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.18011851627379655, + "step": 21790 + }, + { + "epoch": 0.545, + "grad_norm": 37.25, + "grad_norm_var": 29.716666666666665, + "learning_rate": 0.0001, + "loss": 7.4356, + "loss/crossentropy": 2.1713989078998566, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.19549211151897908, + "step": 21800 + }, + { + "epoch": 0.54525, + "grad_norm": 29.75, + "grad_norm_var": 6.017122395833334, + "learning_rate": 0.0001, + "loss": 7.2726, + "loss/crossentropy": 2.2081006675958634, + "loss/hidden": 3.298828125, + "loss/jsd": 0.0, + "loss/logits": 0.1883497454226017, + "step": 21810 + }, + { + "epoch": 0.5455, + "grad_norm": 31.25, + "grad_norm_var": 4.859830729166666, + "learning_rate": 0.0001, + "loss": 7.3724, + "loss/crossentropy": 1.9547579608857633, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.1771907014772296, + "step": 21820 + }, + { + "epoch": 0.54575, + "grad_norm": 32.75, + "grad_norm_var": 5.364518229166666, + "learning_rate": 0.0001, + "loss": 7.3612, + "loss/crossentropy": 2.098206965625286, + "loss/hidden": 3.459765625, + "loss/jsd": 0.0, + "loss/logits": 0.19235289450734855, + "step": 21830 + }, + { + "epoch": 0.546, + "grad_norm": 31.5, + "grad_norm_var": 6.561393229166667, + "learning_rate": 0.0001, + "loss": 7.2737, + "loss/crossentropy": 2.1263977140188217, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.1888118337839842, + "step": 21840 + }, + { + "epoch": 0.54625, + "grad_norm": 33.0, + "grad_norm_var": 5.3603515625, + "learning_rate": 0.0001, + "loss": 7.2304, + "loss/crossentropy": 2.101663938164711, + "loss/hidden": 3.27265625, + "loss/jsd": 0.0, + "loss/logits": 0.17398083284497262, + "step": 21850 + }, + { + "epoch": 0.5465, + "grad_norm": 28.375, + "grad_norm_var": 4.398958333333334, + "learning_rate": 0.0001, + "loss": 7.1952, + "loss/crossentropy": 1.9339447408914565, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.16659772507846354, + "step": 21860 + }, + { + "epoch": 0.54675, + "grad_norm": 35.5, + "grad_norm_var": 4.6666015625, + "learning_rate": 0.0001, + "loss": 7.2218, + "loss/crossentropy": 1.9582382440567017, + "loss/hidden": 3.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.17305021807551385, + "step": 21870 + }, + { + "epoch": 0.547, + "grad_norm": 32.25, + "grad_norm_var": 3.503059895833333, + "learning_rate": 0.0001, + "loss": 7.1931, + "loss/crossentropy": 1.9935566574335097, + "loss/hidden": 3.27734375, + "loss/jsd": 0.0, + "loss/logits": 0.17071465943008662, + "step": 21880 + }, + { + "epoch": 0.54725, + "grad_norm": 29.625, + "grad_norm_var": 4.862955729166667, + "learning_rate": 0.0001, + "loss": 7.2374, + "loss/crossentropy": 2.036357142031193, + "loss/hidden": 3.424609375, + "loss/jsd": 0.0, + "loss/logits": 0.19538825107738375, + "step": 21890 + }, + { + "epoch": 0.5475, + "grad_norm": 32.25, + "grad_norm_var": 5.046875, + "learning_rate": 0.0001, + "loss": 7.2621, + "loss/crossentropy": 1.92364434376359, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.17744148215278982, + "step": 21900 + }, + { + "epoch": 0.54775, + "grad_norm": 29.875, + "grad_norm_var": 3.564583333333333, + "learning_rate": 0.0001, + "loss": 7.285, + "loss/crossentropy": 2.0298509031534193, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.17162161748856306, + "step": 21910 + }, + { + "epoch": 0.548, + "grad_norm": 31.0, + "grad_norm_var": 5.1619140625, + "learning_rate": 0.0001, + "loss": 7.3575, + "loss/crossentropy": 2.175525971502066, + "loss/hidden": 3.298046875, + "loss/jsd": 0.0, + "loss/logits": 0.18408444514498115, + "step": 21920 + }, + { + "epoch": 0.54825, + "grad_norm": 30.25, + "grad_norm_var": 2.9114583333333335, + "learning_rate": 0.0001, + "loss": 7.1831, + "loss/crossentropy": 1.8858811140060425, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.1825306786224246, + "step": 21930 + }, + { + "epoch": 0.5485, + "grad_norm": 30.0, + "grad_norm_var": 2.62890625, + "learning_rate": 0.0001, + "loss": 7.2268, + "loss/crossentropy": 2.026767262816429, + "loss/hidden": 3.30234375, + "loss/jsd": 0.0, + "loss/logits": 0.17721015885472297, + "step": 21940 + }, + { + "epoch": 0.54875, + "grad_norm": 36.5, + "grad_norm_var": 5.305989583333333, + "learning_rate": 0.0001, + "loss": 7.2989, + "loss/crossentropy": 1.9967989712953567, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.1873170206323266, + "step": 21950 + }, + { + "epoch": 0.549, + "grad_norm": 28.375, + "grad_norm_var": 239.72180989583333, + "learning_rate": 0.0001, + "loss": 7.3301, + "loss/crossentropy": 2.163920529931784, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.1893145816400647, + "step": 21960 + }, + { + "epoch": 0.54925, + "grad_norm": 29.0, + "grad_norm_var": 4.773372395833333, + "learning_rate": 0.0001, + "loss": 7.3758, + "loss/crossentropy": 2.1935864582657816, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.19310898408293725, + "step": 21970 + }, + { + "epoch": 0.5495, + "grad_norm": 36.0, + "grad_norm_var": 6.3072265625, + "learning_rate": 0.0001, + "loss": 7.2928, + "loss/crossentropy": 1.9404637925326824, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.18799830228090286, + "step": 21980 + }, + { + "epoch": 0.54975, + "grad_norm": 6174015488.0, + "grad_norm_var": 2.3824041793920087e+18, + "learning_rate": 0.0001, + "loss": 7.2622, + "loss/crossentropy": 2.111705342680216, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.2000083826482296, + "step": 21990 + }, + { + "epoch": 0.55, + "grad_norm": 29.625, + "grad_norm_var": 2.3824041760992005e+18, + "learning_rate": 0.0001, + "loss": 7.3763, + "loss/crossentropy": 2.0126213818788528, + "loss/hidden": 3.427734375, + "loss/jsd": 0.0, + "loss/logits": 0.21517845019698142, + "step": 22000 + }, + { + "epoch": 0.55025, + "grad_norm": 29.625, + "grad_norm_var": 11.349934895833334, + "learning_rate": 0.0001, + "loss": 7.3056, + "loss/crossentropy": 2.1753602206707, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.1973609359934926, + "step": 22010 + }, + { + "epoch": 0.5505, + "grad_norm": 29.875, + "grad_norm_var": 1.3613932291666666, + "learning_rate": 0.0001, + "loss": 7.2592, + "loss/crossentropy": 2.045438584685326, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.17302108742296696, + "step": 22020 + }, + { + "epoch": 0.55075, + "grad_norm": 31.75, + "grad_norm_var": 2.4893229166666666, + "learning_rate": 0.0001, + "loss": 7.2572, + "loss/crossentropy": 1.988423915207386, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.19198302906006576, + "step": 22030 + }, + { + "epoch": 0.551, + "grad_norm": 29.75, + "grad_norm_var": 2.018684895833333, + "learning_rate": 0.0001, + "loss": 7.3145, + "loss/crossentropy": 2.186215503513813, + "loss/hidden": 3.321484375, + "loss/jsd": 0.0, + "loss/logits": 0.188802495226264, + "step": 22040 + }, + { + "epoch": 0.55125, + "grad_norm": 35.5, + "grad_norm_var": 3.05, + "learning_rate": 0.0001, + "loss": 7.3018, + "loss/crossentropy": 2.0135563641786574, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.19467689003795385, + "step": 22050 + }, + { + "epoch": 0.5515, + "grad_norm": 34.5, + "grad_norm_var": 34.940625, + "learning_rate": 0.0001, + "loss": 7.2785, + "loss/crossentropy": 1.9395711533725262, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.1815367082133889, + "step": 22060 + }, + { + "epoch": 0.55175, + "grad_norm": 29.75, + "grad_norm_var": 23.817122395833334, + "learning_rate": 0.0001, + "loss": 7.3435, + "loss/crossentropy": 2.024968159198761, + "loss/hidden": 3.51484375, + "loss/jsd": 0.0, + "loss/logits": 0.20050131883472205, + "step": 22070 + }, + { + "epoch": 0.552, + "grad_norm": 27.375, + "grad_norm_var": 3.2143229166666667, + "learning_rate": 0.0001, + "loss": 7.3201, + "loss/crossentropy": 2.29172303378582, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.19257347993552684, + "step": 22080 + }, + { + "epoch": 0.55225, + "grad_norm": 28.875, + "grad_norm_var": 4.649739583333333, + "learning_rate": 0.0001, + "loss": 7.2483, + "loss/crossentropy": 1.9568318665027618, + "loss/hidden": 3.3109375, + "loss/jsd": 0.0, + "loss/logits": 0.17994072400033473, + "step": 22090 + }, + { + "epoch": 0.5525, + "grad_norm": 34.0, + "grad_norm_var": 4.592708333333333, + "learning_rate": 0.0001, + "loss": 7.2302, + "loss/crossentropy": 2.148341643810272, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.18362896144390106, + "step": 22100 + }, + { + "epoch": 0.55275, + "grad_norm": 31.375, + "grad_norm_var": 3.36640625, + "learning_rate": 0.0001, + "loss": 7.2849, + "loss/crossentropy": 1.9461228162050248, + "loss/hidden": 3.508203125, + "loss/jsd": 0.0, + "loss/logits": 0.1895364910364151, + "step": 22110 + }, + { + "epoch": 0.553, + "grad_norm": 33.0, + "grad_norm_var": 2.9497395833333333, + "learning_rate": 0.0001, + "loss": 7.4344, + "loss/crossentropy": 2.1932419329881667, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.19293667376041412, + "step": 22120 + }, + { + "epoch": 0.55325, + "grad_norm": 29.75, + "grad_norm_var": 7.080143229166667, + "learning_rate": 0.0001, + "loss": 7.3256, + "loss/crossentropy": 2.164583859592676, + "loss/hidden": 3.3296875, + "loss/jsd": 0.0, + "loss/logits": 0.1796217132359743, + "step": 22130 + }, + { + "epoch": 0.5535, + "grad_norm": 33.75, + "grad_norm_var": 4.024739583333333, + "learning_rate": 0.0001, + "loss": 7.3128, + "loss/crossentropy": 2.0828025907278063, + "loss/hidden": 3.292578125, + "loss/jsd": 0.0, + "loss/logits": 0.17759974133223294, + "step": 22140 + }, + { + "epoch": 0.55375, + "grad_norm": 31.875, + "grad_norm_var": 2.9150390625, + "learning_rate": 0.0001, + "loss": 7.2276, + "loss/crossentropy": 1.9275901369750499, + "loss/hidden": 3.31171875, + "loss/jsd": 0.0, + "loss/logits": 0.16787631679326295, + "step": 22150 + }, + { + "epoch": 0.554, + "grad_norm": 32.25, + "grad_norm_var": 3.190559895833333, + "learning_rate": 0.0001, + "loss": 7.327, + "loss/crossentropy": 2.0888339787721635, + "loss/hidden": 3.300390625, + "loss/jsd": 0.0, + "loss/logits": 0.17749339248985052, + "step": 22160 + }, + { + "epoch": 0.55425, + "grad_norm": 32.5, + "grad_norm_var": 2.7129557291666666, + "learning_rate": 0.0001, + "loss": 7.3324, + "loss/crossentropy": 1.9096436515450477, + "loss/hidden": 3.352734375, + "loss/jsd": 0.0, + "loss/logits": 0.16837584171444178, + "step": 22170 + }, + { + "epoch": 0.5545, + "grad_norm": 31.25, + "grad_norm_var": 2.463997395833333, + "learning_rate": 0.0001, + "loss": 7.2819, + "loss/crossentropy": 2.0638594791293143, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.1814063997939229, + "step": 22180 + }, + { + "epoch": 0.55475, + "grad_norm": 39.5, + "grad_norm_var": 7.167643229166667, + "learning_rate": 0.0001, + "loss": 7.2706, + "loss/crossentropy": 2.1528440028429032, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.19305985253304242, + "step": 22190 + }, + { + "epoch": 0.555, + "grad_norm": 34.5, + "grad_norm_var": 7.8875, + "learning_rate": 0.0001, + "loss": 7.3135, + "loss/crossentropy": 2.163337790966034, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.17552390675991775, + "step": 22200 + }, + { + "epoch": 0.55525, + "grad_norm": 30.125, + "grad_norm_var": 2.520247395833333, + "learning_rate": 0.0001, + "loss": 7.1905, + "loss/crossentropy": 2.0848088540136813, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.1875115931034088, + "step": 22210 + }, + { + "epoch": 0.5555, + "grad_norm": 31.875, + "grad_norm_var": 2.845833333333333, + "learning_rate": 0.0001, + "loss": 7.2472, + "loss/crossentropy": 2.200931230187416, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.20207254495471716, + "step": 22220 + }, + { + "epoch": 0.55575, + "grad_norm": 31.0, + "grad_norm_var": 3.02265625, + "learning_rate": 0.0001, + "loss": 7.3226, + "loss/crossentropy": 1.9815104238688945, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.1747302796691656, + "step": 22230 + }, + { + "epoch": 0.556, + "grad_norm": 30.125, + "grad_norm_var": 4.238997395833334, + "learning_rate": 0.0001, + "loss": 7.3566, + "loss/crossentropy": 2.1646396666765213, + "loss/hidden": 3.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.2087303439155221, + "step": 22240 + }, + { + "epoch": 0.55625, + "grad_norm": 31.875, + "grad_norm_var": 5.177018229166666, + "learning_rate": 0.0001, + "loss": 7.3546, + "loss/crossentropy": 2.0839816108345985, + "loss/hidden": 3.319921875, + "loss/jsd": 0.0, + "loss/logits": 0.18073585536330938, + "step": 22250 + }, + { + "epoch": 0.5565, + "grad_norm": 30.625, + "grad_norm_var": 3.723372395833333, + "learning_rate": 0.0001, + "loss": 7.3226, + "loss/crossentropy": 2.0152421653270722, + "loss/hidden": 3.298828125, + "loss/jsd": 0.0, + "loss/logits": 0.17794121894985437, + "step": 22260 + }, + { + "epoch": 0.55675, + "grad_norm": 28.5, + "grad_norm_var": 8.039518229166667, + "learning_rate": 0.0001, + "loss": 7.3145, + "loss/crossentropy": 2.0704862751066684, + "loss/hidden": 3.303125, + "loss/jsd": 0.0, + "loss/logits": 0.17498300932347774, + "step": 22270 + }, + { + "epoch": 0.557, + "grad_norm": 31.125, + "grad_norm_var": 10.777083333333334, + "learning_rate": 0.0001, + "loss": 7.2271, + "loss/crossentropy": 1.9257421031594277, + "loss/hidden": 3.4390625, + "loss/jsd": 0.0, + "loss/logits": 0.19484029989689589, + "step": 22280 + }, + { + "epoch": 0.55725, + "grad_norm": 30.5, + "grad_norm_var": 6.412239583333333, + "learning_rate": 0.0001, + "loss": 7.249, + "loss/crossentropy": 2.0617389529943466, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.18348366860300303, + "step": 22290 + }, + { + "epoch": 0.5575, + "grad_norm": 31.875, + "grad_norm_var": 3.5582682291666665, + "learning_rate": 0.0001, + "loss": 7.3245, + "loss/crossentropy": 2.0664080172777175, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.17801499664783477, + "step": 22300 + }, + { + "epoch": 0.55775, + "grad_norm": 28.375, + "grad_norm_var": 2.41015625, + "learning_rate": 0.0001, + "loss": 7.2956, + "loss/crossentropy": 2.009297924488783, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.18730297349393368, + "step": 22310 + }, + { + "epoch": 0.558, + "grad_norm": 29.0, + "grad_norm_var": 3.723893229166667, + "learning_rate": 0.0001, + "loss": 7.3288, + "loss/crossentropy": 2.138043949007988, + "loss/hidden": 3.44375, + "loss/jsd": 0.0, + "loss/logits": 0.19984424524009228, + "step": 22320 + }, + { + "epoch": 0.55825, + "grad_norm": 31.125, + "grad_norm_var": 9.493489583333334, + "learning_rate": 0.0001, + "loss": 7.2771, + "loss/crossentropy": 2.038010062277317, + "loss/hidden": 3.310546875, + "loss/jsd": 0.0, + "loss/logits": 0.1933336803689599, + "step": 22330 + }, + { + "epoch": 0.5585, + "grad_norm": 29.75, + "grad_norm_var": 2.6884765625, + "learning_rate": 0.0001, + "loss": 7.2633, + "loss/crossentropy": 2.2755305796861647, + "loss/hidden": 3.38046875, + "loss/jsd": 0.0, + "loss/logits": 0.19489618260413408, + "step": 22340 + }, + { + "epoch": 0.55875, + "grad_norm": 30.875, + "grad_norm_var": 51.78333333333333, + "learning_rate": 0.0001, + "loss": 7.3329, + "loss/crossentropy": 2.179186634719372, + "loss/hidden": 3.3546875, + "loss/jsd": 0.0, + "loss/logits": 0.1874596353620291, + "step": 22350 + }, + { + "epoch": 0.559, + "grad_norm": 36.25, + "grad_norm_var": 38.72057291666667, + "learning_rate": 0.0001, + "loss": 7.3535, + "loss/crossentropy": 1.9558674409985541, + "loss/hidden": 3.55546875, + "loss/jsd": 0.0, + "loss/logits": 0.19192036390304565, + "step": 22360 + }, + { + "epoch": 0.55925, + "grad_norm": 32.5, + "grad_norm_var": 52.962955729166666, + "learning_rate": 0.0001, + "loss": 7.4462, + "loss/crossentropy": 2.16518527418375, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.1972449893131852, + "step": 22370 + }, + { + "epoch": 0.5595, + "grad_norm": 33.75, + "grad_norm_var": 41.524739583333336, + "learning_rate": 0.0001, + "loss": 7.2722, + "loss/crossentropy": 2.1358800739049912, + "loss/hidden": 3.298046875, + "loss/jsd": 0.0, + "loss/logits": 0.18384176325052975, + "step": 22380 + }, + { + "epoch": 0.55975, + "grad_norm": 41.5, + "grad_norm_var": 23.659830729166668, + "learning_rate": 0.0001, + "loss": 7.3707, + "loss/crossentropy": 2.090507461130619, + "loss/hidden": 3.310546875, + "loss/jsd": 0.0, + "loss/logits": 0.17735959570854903, + "step": 22390 + }, + { + "epoch": 0.56, + "grad_norm": 32.5, + "grad_norm_var": 15.437239583333334, + "learning_rate": 0.0001, + "loss": 7.1671, + "loss/crossentropy": 2.0199001021683216, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.176216440834105, + "step": 22400 + }, + { + "epoch": 0.56025, + "grad_norm": 32.5, + "grad_norm_var": 12.668489583333333, + "learning_rate": 0.0001, + "loss": 7.3144, + "loss/crossentropy": 2.0520263105630874, + "loss/hidden": 3.526953125, + "loss/jsd": 0.0, + "loss/logits": 0.207816849835217, + "step": 22410 + }, + { + "epoch": 0.5605, + "grad_norm": 30.0, + "grad_norm_var": 10.243489583333334, + "learning_rate": 0.0001, + "loss": 7.251, + "loss/crossentropy": 2.191933420300484, + "loss/hidden": 3.293359375, + "loss/jsd": 0.0, + "loss/logits": 0.18148876912891865, + "step": 22420 + }, + { + "epoch": 0.56075, + "grad_norm": 27.625, + "grad_norm_var": 8.584830729166667, + "learning_rate": 0.0001, + "loss": 7.2872, + "loss/crossentropy": 2.0942600347101687, + "loss/hidden": 3.2859375, + "loss/jsd": 0.0, + "loss/logits": 0.1781538650393486, + "step": 22430 + }, + { + "epoch": 0.561, + "grad_norm": 28.375, + "grad_norm_var": 30.66640625, + "learning_rate": 0.0001, + "loss": 7.3342, + "loss/crossentropy": 2.0626624435186387, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.20761814154684544, + "step": 22440 + }, + { + "epoch": 0.56125, + "grad_norm": 28.875, + "grad_norm_var": 5.724739583333333, + "learning_rate": 0.0001, + "loss": 7.3023, + "loss/crossentropy": 2.153649944067001, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.2057520892471075, + "step": 22450 + }, + { + "epoch": 0.5615, + "grad_norm": 30.0, + "grad_norm_var": 8.825455729166666, + "learning_rate": 0.0001, + "loss": 7.4168, + "loss/crossentropy": 2.1032363295555117, + "loss/hidden": 3.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.17875799927860497, + "step": 22460 + }, + { + "epoch": 0.56175, + "grad_norm": 31.75, + "grad_norm_var": 6.794205729166666, + "learning_rate": 0.0001, + "loss": 7.2705, + "loss/crossentropy": 2.2260005325078964, + "loss/hidden": 3.319921875, + "loss/jsd": 0.0, + "loss/logits": 0.19685445744544267, + "step": 22470 + }, + { + "epoch": 0.562, + "grad_norm": 31.5, + "grad_norm_var": 4.7478515625, + "learning_rate": 0.0001, + "loss": 7.2752, + "loss/crossentropy": 2.0784255638718605, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.19737142305821181, + "step": 22480 + }, + { + "epoch": 0.56225, + "grad_norm": 34.75, + "grad_norm_var": 6.758072916666666, + "learning_rate": 0.0001, + "loss": 7.3435, + "loss/crossentropy": 2.143521362543106, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.17078035548329354, + "step": 22490 + }, + { + "epoch": 0.5625, + "grad_norm": 29.0, + "grad_norm_var": 7.628580729166667, + "learning_rate": 0.0001, + "loss": 7.3417, + "loss/crossentropy": 2.0641577050089834, + "loss/hidden": 3.555078125, + "loss/jsd": 0.0, + "loss/logits": 0.1941977996379137, + "step": 22500 + }, + { + "epoch": 0.56275, + "grad_norm": 29.875, + "grad_norm_var": 7.066666666666666, + "learning_rate": 0.0001, + "loss": 7.4512, + "loss/crossentropy": 2.1928183168172835, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.19421984851360322, + "step": 22510 + }, + { + "epoch": 0.563, + "grad_norm": 29.5, + "grad_norm_var": 3.2030598958333334, + "learning_rate": 0.0001, + "loss": 7.3443, + "loss/crossentropy": 2.068145313858986, + "loss/hidden": 3.468359375, + "loss/jsd": 0.0, + "loss/logits": 0.19339582826942206, + "step": 22520 + }, + { + "epoch": 0.56325, + "grad_norm": 31.25, + "grad_norm_var": 9.23515625, + "learning_rate": 0.0001, + "loss": 7.378, + "loss/crossentropy": 2.130034548044205, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.19611325915902852, + "step": 22530 + }, + { + "epoch": 0.5635, + "grad_norm": 29.625, + "grad_norm_var": 11.611458333333333, + "learning_rate": 0.0001, + "loss": 7.2822, + "loss/crossentropy": 1.9730725206434727, + "loss/hidden": 3.461328125, + "loss/jsd": 0.0, + "loss/logits": 0.2219389332458377, + "step": 22540 + }, + { + "epoch": 0.56375, + "grad_norm": 28.5, + "grad_norm_var": 3.5660807291666665, + "learning_rate": 0.0001, + "loss": 7.2197, + "loss/crossentropy": 2.150769717991352, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.20134973097592593, + "step": 22550 + }, + { + "epoch": 0.564, + "grad_norm": 31.375, + "grad_norm_var": 1.5135416666666666, + "learning_rate": 0.0001, + "loss": 7.2492, + "loss/crossentropy": 2.00289306640625, + "loss/hidden": 3.277734375, + "loss/jsd": 0.0, + "loss/logits": 0.17620294205844403, + "step": 22560 + }, + { + "epoch": 0.56425, + "grad_norm": 31.25, + "grad_norm_var": 1.3301432291666666, + "learning_rate": 0.0001, + "loss": 7.2268, + "loss/crossentropy": 2.106088588386774, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.20606033485382796, + "step": 22570 + }, + { + "epoch": 0.5645, + "grad_norm": 28.125, + "grad_norm_var": 3.089583333333333, + "learning_rate": 0.0001, + "loss": 7.2221, + "loss/crossentropy": 1.9868631184101104, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.17915018759667872, + "step": 22580 + }, + { + "epoch": 0.56475, + "grad_norm": 31.625, + "grad_norm_var": 2.951041666666667, + "learning_rate": 0.0001, + "loss": 7.4444, + "loss/crossentropy": 2.104344055056572, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.19927312359213828, + "step": 22590 + }, + { + "epoch": 0.565, + "grad_norm": 28.875, + "grad_norm_var": 18.667643229166668, + "learning_rate": 0.0001, + "loss": 7.3948, + "loss/crossentropy": 2.129189969599247, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.2167124640196562, + "step": 22600 + }, + { + "epoch": 0.56525, + "grad_norm": 31.625, + "grad_norm_var": 18.68515625, + "learning_rate": 0.0001, + "loss": 7.3995, + "loss/crossentropy": 2.0350073277950287, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.1885037964209914, + "step": 22610 + }, + { + "epoch": 0.5655, + "grad_norm": 35.25, + "grad_norm_var": 5.2375, + "learning_rate": 0.0001, + "loss": 7.1725, + "loss/crossentropy": 1.988342931866646, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.1710897535085678, + "step": 22620 + }, + { + "epoch": 0.56575, + "grad_norm": 29.75, + "grad_norm_var": 3.592643229166667, + "learning_rate": 0.0001, + "loss": 7.3575, + "loss/crossentropy": 2.1170627892017366, + "loss/hidden": 3.47109375, + "loss/jsd": 0.0, + "loss/logits": 0.20078443121165038, + "step": 22630 + }, + { + "epoch": 0.566, + "grad_norm": 29.25, + "grad_norm_var": 2.0160807291666667, + "learning_rate": 0.0001, + "loss": 7.2549, + "loss/crossentropy": 1.9923384763300418, + "loss/hidden": 3.40859375, + "loss/jsd": 0.0, + "loss/logits": 0.2009210099466145, + "step": 22640 + }, + { + "epoch": 0.56625, + "grad_norm": 30.125, + "grad_norm_var": 2.9759765625, + "learning_rate": 0.0001, + "loss": 7.3738, + "loss/crossentropy": 2.166072864830494, + "loss/hidden": 3.28828125, + "loss/jsd": 0.0, + "loss/logits": 0.1796887915581465, + "step": 22650 + }, + { + "epoch": 0.5665, + "grad_norm": 6274678784.0, + "grad_norm_var": 2.46072459061866e+18, + "learning_rate": 0.0001, + "loss": 7.3881, + "loss/crossentropy": 2.002020299434662, + "loss/hidden": 3.456640625, + "loss/jsd": 0.0, + "loss/logits": 0.2040075208991766, + "step": 22660 + }, + { + "epoch": 0.56675, + "grad_norm": 29.75, + "grad_norm_var": 2.460724590599052e+18, + "learning_rate": 0.0001, + "loss": 7.3138, + "loss/crossentropy": 2.0951256424188616, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.17585320938378574, + "step": 22670 + }, + { + "epoch": 0.567, + "grad_norm": 31.25, + "grad_norm_var": 0.7947265625, + "learning_rate": 0.0001, + "loss": 7.302, + "loss/crossentropy": 1.918744233250618, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.1783786091953516, + "step": 22680 + }, + { + "epoch": 0.56725, + "grad_norm": 29.375, + "grad_norm_var": 1.9080729166666666, + "learning_rate": 0.0001, + "loss": 7.3487, + "loss/crossentropy": 2.0890044644474983, + "loss/hidden": 3.31015625, + "loss/jsd": 0.0, + "loss/logits": 0.186820235196501, + "step": 22690 + }, + { + "epoch": 0.5675, + "grad_norm": 32.25, + "grad_norm_var": 2.16015625, + "learning_rate": 0.0001, + "loss": 7.3366, + "loss/crossentropy": 2.0049152970314026, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.18571792878210544, + "step": 22700 + }, + { + "epoch": 0.56775, + "grad_norm": 31.625, + "grad_norm_var": 1.2119140625, + "learning_rate": 0.0001, + "loss": 7.3629, + "loss/crossentropy": 2.188979035615921, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.18791910037398338, + "step": 22710 + }, + { + "epoch": 0.568, + "grad_norm": 28.125, + "grad_norm_var": 6.873958333333333, + "learning_rate": 0.0001, + "loss": 7.2721, + "loss/crossentropy": 1.9086629442870617, + "loss/hidden": 3.314453125, + "loss/jsd": 0.0, + "loss/logits": 0.1676910617388785, + "step": 22720 + }, + { + "epoch": 0.56825, + "grad_norm": 29.375, + "grad_norm_var": 4.77890625, + "learning_rate": 0.0001, + "loss": 7.2104, + "loss/crossentropy": 1.9729705944657325, + "loss/hidden": 3.31484375, + "loss/jsd": 0.0, + "loss/logits": 0.18355059083551167, + "step": 22730 + }, + { + "epoch": 0.5685, + "grad_norm": 29.375, + "grad_norm_var": 3.0416015625, + "learning_rate": 0.0001, + "loss": 7.3276, + "loss/crossentropy": 1.983503246307373, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.1911757795140147, + "step": 22740 + }, + { + "epoch": 0.56875, + "grad_norm": 30.5, + "grad_norm_var": 2.209830729166667, + "learning_rate": 0.0001, + "loss": 7.3981, + "loss/crossentropy": 2.0647961035370828, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.19169409601017834, + "step": 22750 + }, + { + "epoch": 0.569, + "grad_norm": 28.5, + "grad_norm_var": 2.794205729166667, + "learning_rate": 0.0001, + "loss": 7.3512, + "loss/crossentropy": 2.0892448581755163, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.18739675851538778, + "step": 22760 + }, + { + "epoch": 0.56925, + "grad_norm": 30.5, + "grad_norm_var": 4.69375, + "learning_rate": 0.0001, + "loss": 7.3437, + "loss/crossentropy": 2.052200788259506, + "loss/hidden": 3.51875, + "loss/jsd": 0.0, + "loss/logits": 0.19390411972999572, + "step": 22770 + }, + { + "epoch": 0.5695, + "grad_norm": 29.25, + "grad_norm_var": 3.9322265625, + "learning_rate": 0.0001, + "loss": 7.2241, + "loss/crossentropy": 2.2473602324724196, + "loss/hidden": 3.26875, + "loss/jsd": 0.0, + "loss/logits": 0.18824474215507508, + "step": 22780 + }, + { + "epoch": 0.56975, + "grad_norm": 30.0, + "grad_norm_var": 3.1973307291666666, + "learning_rate": 0.0001, + "loss": 7.4546, + "loss/crossentropy": 2.2174068212509157, + "loss/hidden": 3.44765625, + "loss/jsd": 0.0, + "loss/logits": 0.2089033018797636, + "step": 22790 + }, + { + "epoch": 0.57, + "grad_norm": 31.25, + "grad_norm_var": 2.8775390625, + "learning_rate": 0.0001, + "loss": 7.2814, + "loss/crossentropy": 2.098750987648964, + "loss/hidden": 3.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.18969560451805592, + "step": 22800 + }, + { + "epoch": 0.57025, + "grad_norm": 29.0, + "grad_norm_var": 4.448958333333334, + "learning_rate": 0.0001, + "loss": 7.3282, + "loss/crossentropy": 2.2710734084248543, + "loss/hidden": 3.3, + "loss/jsd": 0.0, + "loss/logits": 0.18953933827579023, + "step": 22810 + }, + { + "epoch": 0.5705, + "grad_norm": 27.75, + "grad_norm_var": 10.401041666666666, + "learning_rate": 0.0001, + "loss": 7.2824, + "loss/crossentropy": 2.08201609402895, + "loss/hidden": 3.215234375, + "loss/jsd": 0.0, + "loss/logits": 0.16761352298781276, + "step": 22820 + }, + { + "epoch": 0.57075, + "grad_norm": 28.75, + "grad_norm_var": 12.773893229166667, + "learning_rate": 0.0001, + "loss": 7.4861, + "loss/crossentropy": 2.1187492191791533, + "loss/hidden": 3.484765625, + "loss/jsd": 0.0, + "loss/logits": 0.19788370802998542, + "step": 22830 + }, + { + "epoch": 0.571, + "grad_norm": 28.75, + "grad_norm_var": 2.25625, + "learning_rate": 0.0001, + "loss": 7.34, + "loss/crossentropy": 2.141358491778374, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.17862968128174544, + "step": 22840 + }, + { + "epoch": 0.57125, + "grad_norm": 29.75, + "grad_norm_var": 1.4372395833333333, + "learning_rate": 0.0001, + "loss": 7.3556, + "loss/crossentropy": 2.159883065521717, + "loss/hidden": 3.427734375, + "loss/jsd": 0.0, + "loss/logits": 0.20434525702148676, + "step": 22850 + }, + { + "epoch": 0.5715, + "grad_norm": 29.375, + "grad_norm_var": 1.6311848958333333, + "learning_rate": 0.0001, + "loss": 7.3261, + "loss/crossentropy": 2.0141698867082596, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.17547854110598565, + "step": 22860 + }, + { + "epoch": 0.57175, + "grad_norm": 27.875, + "grad_norm_var": 1.8858723958333334, + "learning_rate": 0.0001, + "loss": 7.1736, + "loss/crossentropy": 2.070405501127243, + "loss/hidden": 3.28515625, + "loss/jsd": 0.0, + "loss/logits": 0.17701356951147318, + "step": 22870 + }, + { + "epoch": 0.572, + "grad_norm": 33.75, + "grad_norm_var": 57.8150390625, + "learning_rate": 0.0001, + "loss": 7.3726, + "loss/crossentropy": 2.1324386440217493, + "loss/hidden": 3.46328125, + "loss/jsd": 0.0, + "loss/logits": 0.20903213806450366, + "step": 22880 + }, + { + "epoch": 0.57225, + "grad_norm": 30.125, + "grad_norm_var": 57.978580729166666, + "learning_rate": 0.0001, + "loss": 7.2264, + "loss/crossentropy": 2.056166473031044, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.19125475976616144, + "step": 22890 + }, + { + "epoch": 0.5725, + "grad_norm": 29.0, + "grad_norm_var": 2.1166015625, + "learning_rate": 0.0001, + "loss": 7.3529, + "loss/crossentropy": 2.0231011360883713, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.19594160905107855, + "step": 22900 + }, + { + "epoch": 0.57275, + "grad_norm": 30.5, + "grad_norm_var": 2.2059895833333334, + "learning_rate": 0.0001, + "loss": 7.2611, + "loss/crossentropy": 2.1009394943714144, + "loss/hidden": 3.421875, + "loss/jsd": 0.0, + "loss/logits": 0.20553287640213966, + "step": 22910 + }, + { + "epoch": 0.573, + "grad_norm": 27.625, + "grad_norm_var": 2.99765625, + "learning_rate": 0.0001, + "loss": 7.2733, + "loss/crossentropy": 2.1342287354171274, + "loss/hidden": 3.35546875, + "loss/jsd": 0.0, + "loss/logits": 0.19136658031493425, + "step": 22920 + }, + { + "epoch": 0.57325, + "grad_norm": 30.375, + "grad_norm_var": 2.39765625, + "learning_rate": 0.0001, + "loss": 7.2991, + "loss/crossentropy": 2.0322689548134805, + "loss/hidden": 3.269921875, + "loss/jsd": 0.0, + "loss/logits": 0.17143328655511142, + "step": 22930 + }, + { + "epoch": 0.5735, + "grad_norm": 29.0, + "grad_norm_var": 1.94765625, + "learning_rate": 0.0001, + "loss": 7.278, + "loss/crossentropy": 2.1039293974637987, + "loss/hidden": 3.244140625, + "loss/jsd": 0.0, + "loss/logits": 0.1707957200706005, + "step": 22940 + }, + { + "epoch": 0.57375, + "grad_norm": 29.75, + "grad_norm_var": 3.2108723958333334, + "learning_rate": 0.0001, + "loss": 7.2094, + "loss/crossentropy": 2.0728151589632033, + "loss/hidden": 3.294921875, + "loss/jsd": 0.0, + "loss/logits": 0.17216427512466909, + "step": 22950 + }, + { + "epoch": 0.574, + "grad_norm": 32.5, + "grad_norm_var": 1.4358723958333333, + "learning_rate": 0.0001, + "loss": 7.3065, + "loss/crossentropy": 2.161167304217815, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.18787571005523204, + "step": 22960 + }, + { + "epoch": 0.57425, + "grad_norm": 29.375, + "grad_norm_var": 2.8697916666666665, + "learning_rate": 0.0001, + "loss": 7.3397, + "loss/crossentropy": 2.104795481264591, + "loss/hidden": 3.309765625, + "loss/jsd": 0.0, + "loss/logits": 0.1772844972088933, + "step": 22970 + }, + { + "epoch": 0.5745, + "grad_norm": 33.25, + "grad_norm_var": 2.2395833333333335, + "learning_rate": 0.0001, + "loss": 7.3098, + "loss/crossentropy": 2.170831048488617, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.1898819526657462, + "step": 22980 + }, + { + "epoch": 0.57475, + "grad_norm": 29.375, + "grad_norm_var": 1.8598307291666667, + "learning_rate": 0.0001, + "loss": 7.2474, + "loss/crossentropy": 2.072402949631214, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.20032873265445234, + "step": 22990 + }, + { + "epoch": 0.575, + "grad_norm": 29.5, + "grad_norm_var": 2.0864583333333333, + "learning_rate": 0.0001, + "loss": 7.1527, + "loss/crossentropy": 1.9359148509800435, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.17023193156346678, + "step": 23000 + }, + { + "epoch": 0.57525, + "grad_norm": 31.0, + "grad_norm_var": 10.963541666666666, + "learning_rate": 0.0001, + "loss": 7.3357, + "loss/crossentropy": 2.1682570412755013, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.20298131536692382, + "step": 23010 + }, + { + "epoch": 0.5755, + "grad_norm": 33.25, + "grad_norm_var": 9.428125, + "learning_rate": 0.0001, + "loss": 7.3109, + "loss/crossentropy": 2.045479938387871, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.18103279825299978, + "step": 23020 + }, + { + "epoch": 0.57575, + "grad_norm": 30.75, + "grad_norm_var": 2.339583333333333, + "learning_rate": 0.0001, + "loss": 7.3223, + "loss/crossentropy": 2.197012846916914, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.1997917301952839, + "step": 23030 + }, + { + "epoch": 0.576, + "grad_norm": 31.875, + "grad_norm_var": 1.3955729166666666, + "learning_rate": 0.0001, + "loss": 7.328, + "loss/crossentropy": 2.0420756086707117, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.19402425028383732, + "step": 23040 + }, + { + "epoch": 0.57625, + "grad_norm": 30.375, + "grad_norm_var": 1.7926432291666667, + "learning_rate": 0.0001, + "loss": 7.3225, + "loss/crossentropy": 2.227956184744835, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.20268909279257058, + "step": 23050 + }, + { + "epoch": 0.5765, + "grad_norm": 31.375, + "grad_norm_var": 2.319205729166667, + "learning_rate": 0.0001, + "loss": 7.3548, + "loss/crossentropy": 2.03707487732172, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.18702135644853116, + "step": 23060 + }, + { + "epoch": 0.57675, + "grad_norm": 31.5, + "grad_norm_var": 3.103261589190476e+18, + "learning_rate": 0.0001, + "loss": 7.3001, + "loss/crossentropy": 2.2075371608138084, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.18566503124311567, + "step": 23070 + }, + { + "epoch": 0.577, + "grad_norm": 36.25, + "grad_norm_var": 4.934309895833334, + "learning_rate": 0.0001, + "loss": 7.244, + "loss/crossentropy": 2.0198142036795614, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.18854798339307308, + "step": 23080 + }, + { + "epoch": 0.57725, + "grad_norm": 32.5, + "grad_norm_var": 6.170768229166667, + "learning_rate": 0.0001, + "loss": 7.3501, + "loss/crossentropy": 2.1881318643689154, + "loss/hidden": 3.32890625, + "loss/jsd": 0.0, + "loss/logits": 0.19292537569999696, + "step": 23090 + }, + { + "epoch": 0.5775, + "grad_norm": 30.375, + "grad_norm_var": 2.0249348958333333, + "learning_rate": 0.0001, + "loss": 7.2021, + "loss/crossentropy": 2.0514975383877756, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.1834509514272213, + "step": 23100 + }, + { + "epoch": 0.57775, + "grad_norm": 27.875, + "grad_norm_var": 2.409375, + "learning_rate": 0.0001, + "loss": 7.2787, + "loss/crossentropy": 1.8552511997520924, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.172595002502203, + "step": 23110 + }, + { + "epoch": 0.578, + "grad_norm": 30.0, + "grad_norm_var": 2.903580729166667, + "learning_rate": 0.0001, + "loss": 7.3178, + "loss/crossentropy": 2.0356574684381483, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.1825650293380022, + "step": 23120 + }, + { + "epoch": 0.57825, + "grad_norm": 29.125, + "grad_norm_var": 2.1791015625, + "learning_rate": 0.0001, + "loss": 7.2816, + "loss/crossentropy": 1.8349962562322617, + "loss/hidden": 3.39375, + "loss/jsd": 0.0, + "loss/logits": 0.18128557410091162, + "step": 23130 + }, + { + "epoch": 0.5785, + "grad_norm": 30.5, + "grad_norm_var": 1.8479166666666667, + "learning_rate": 0.0001, + "loss": 7.435, + "loss/crossentropy": 1.9793405070900918, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.18810991439968347, + "step": 23140 + }, + { + "epoch": 0.57875, + "grad_norm": 27.75, + "grad_norm_var": 1.3958333333333333, + "learning_rate": 0.0001, + "loss": 7.3526, + "loss/crossentropy": 2.2002530977129937, + "loss/hidden": 3.287890625, + "loss/jsd": 0.0, + "loss/logits": 0.1851093016564846, + "step": 23150 + }, + { + "epoch": 0.579, + "grad_norm": 30.375, + "grad_norm_var": 9.785872395833334, + "learning_rate": 0.0001, + "loss": 7.2364, + "loss/crossentropy": 1.8952147908508779, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.17190355882048608, + "step": 23160 + }, + { + "epoch": 0.57925, + "grad_norm": 29.875, + "grad_norm_var": 1.5212890625, + "learning_rate": 0.0001, + "loss": 7.3738, + "loss/crossentropy": 2.0228386133909226, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.20162178240716458, + "step": 23170 + }, + { + "epoch": 0.5795, + "grad_norm": 31.5, + "grad_norm_var": 2.876822916666667, + "learning_rate": 0.0001, + "loss": 7.3244, + "loss/crossentropy": 2.1140632264316084, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.18904146291315554, + "step": 23180 + }, + { + "epoch": 0.57975, + "grad_norm": 31.875, + "grad_norm_var": 2.46015625, + "learning_rate": 0.0001, + "loss": 7.3871, + "loss/crossentropy": 2.046115532517433, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.1814727798104286, + "step": 23190 + }, + { + "epoch": 0.58, + "grad_norm": 31.375, + "grad_norm_var": 1.8806640625, + "learning_rate": 0.0001, + "loss": 7.2065, + "loss/crossentropy": 2.09497309550643, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.18063673991709947, + "step": 23200 + }, + { + "epoch": 0.58025, + "grad_norm": 30.875, + "grad_norm_var": 1.7184895833333333, + "learning_rate": 0.0001, + "loss": 7.2516, + "loss/crossentropy": 2.120602674782276, + "loss/hidden": 3.3296875, + "loss/jsd": 0.0, + "loss/logits": 0.17760662753134965, + "step": 23210 + }, + { + "epoch": 0.5805, + "grad_norm": 29.375, + "grad_norm_var": 12.099739583333333, + "learning_rate": 0.0001, + "loss": 7.3189, + "loss/crossentropy": 1.8746614530682564, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.17366855386644603, + "step": 23220 + }, + { + "epoch": 0.58075, + "grad_norm": 34.0, + "grad_norm_var": 8.370572916666667, + "learning_rate": 0.0001, + "loss": 7.219, + "loss/crossentropy": 1.9041770923882724, + "loss/hidden": 3.337890625, + "loss/jsd": 0.0, + "loss/logits": 0.17128421002998948, + "step": 23230 + }, + { + "epoch": 0.581, + "grad_norm": 29.375, + "grad_norm_var": 2.69765625, + "learning_rate": 0.0001, + "loss": 7.2582, + "loss/crossentropy": 2.141513818502426, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.19048179611563681, + "step": 23240 + }, + { + "epoch": 0.58125, + "grad_norm": 30.375, + "grad_norm_var": 3.5434895833333333, + "learning_rate": 0.0001, + "loss": 7.2658, + "loss/crossentropy": 2.0868393763899804, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.19045372307300568, + "step": 23250 + }, + { + "epoch": 0.5815, + "grad_norm": 29.375, + "grad_norm_var": 5.077018229166667, + "learning_rate": 0.0001, + "loss": 7.2014, + "loss/crossentropy": 2.101007029414177, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.1854638984426856, + "step": 23260 + }, + { + "epoch": 0.58175, + "grad_norm": 31.125, + "grad_norm_var": 218.71920572916667, + "learning_rate": 0.0001, + "loss": 7.3693, + "loss/crossentropy": 1.9861895442008972, + "loss/hidden": 3.498046875, + "loss/jsd": 0.0, + "loss/logits": 0.20660532703623175, + "step": 23270 + }, + { + "epoch": 0.582, + "grad_norm": 28.625, + "grad_norm_var": 225.99140625, + "learning_rate": 0.0001, + "loss": 7.2359, + "loss/crossentropy": 1.9705702036619186, + "loss/hidden": 3.25625, + "loss/jsd": 0.0, + "loss/logits": 0.1714084181934595, + "step": 23280 + }, + { + "epoch": 0.58225, + "grad_norm": 33.25, + "grad_norm_var": 1.9921223958333334, + "learning_rate": 0.0001, + "loss": 7.2894, + "loss/crossentropy": 1.9414188079535961, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.18114634826779366, + "step": 23290 + }, + { + "epoch": 0.5825, + "grad_norm": 29.625, + "grad_norm_var": 3.2875, + "learning_rate": 0.0001, + "loss": 7.1979, + "loss/crossentropy": 2.05298957452178, + "loss/hidden": 3.316015625, + "loss/jsd": 0.0, + "loss/logits": 0.18091321904212238, + "step": 23300 + }, + { + "epoch": 0.58275, + "grad_norm": 29.375, + "grad_norm_var": 3.949934895833333, + "learning_rate": 0.0001, + "loss": 7.2295, + "loss/crossentropy": 2.0978086851537228, + "loss/hidden": 3.2453125, + "loss/jsd": 0.0, + "loss/logits": 0.17559153838083147, + "step": 23310 + }, + { + "epoch": 0.583, + "grad_norm": 30.5, + "grad_norm_var": 3.1143229166666666, + "learning_rate": 0.0001, + "loss": 7.3858, + "loss/crossentropy": 2.1657110869884493, + "loss/hidden": 3.28515625, + "loss/jsd": 0.0, + "loss/logits": 0.17203837148845197, + "step": 23320 + }, + { + "epoch": 0.58325, + "grad_norm": 30.75, + "grad_norm_var": 1.0768229166666667, + "learning_rate": 0.0001, + "loss": 7.3152, + "loss/crossentropy": 2.141623441874981, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.19364709984511136, + "step": 23330 + }, + { + "epoch": 0.5835, + "grad_norm": 30.625, + "grad_norm_var": 4.070572916666666, + "learning_rate": 0.0001, + "loss": 7.2853, + "loss/crossentropy": 2.1005776453763247, + "loss/hidden": 3.31640625, + "loss/jsd": 0.0, + "loss/logits": 0.1824392621871084, + "step": 23340 + }, + { + "epoch": 0.58375, + "grad_norm": 30.5, + "grad_norm_var": 2.6770833333333335, + "learning_rate": 0.0001, + "loss": 7.2691, + "loss/crossentropy": 2.182777139544487, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.19071639832109213, + "step": 23350 + }, + { + "epoch": 0.584, + "grad_norm": 29.375, + "grad_norm_var": 2.8061848958333333, + "learning_rate": 0.0001, + "loss": 7.2273, + "loss/crossentropy": 2.005091509968042, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.17102911770343782, + "step": 23360 + }, + { + "epoch": 0.58425, + "grad_norm": 31.125, + "grad_norm_var": 3.6129557291666665, + "learning_rate": 0.0001, + "loss": 7.3946, + "loss/crossentropy": 1.958120694756508, + "loss/hidden": 3.47890625, + "loss/jsd": 0.0, + "loss/logits": 0.20584547519683838, + "step": 23370 + }, + { + "epoch": 0.5845, + "grad_norm": 33.75, + "grad_norm_var": 3.59765625, + "learning_rate": 0.0001, + "loss": 7.4152, + "loss/crossentropy": 2.119162234663963, + "loss/hidden": 3.400390625, + "loss/jsd": 0.0, + "loss/logits": 0.19411879703402518, + "step": 23380 + }, + { + "epoch": 0.58475, + "grad_norm": 30.75, + "grad_norm_var": 4.620572916666666, + "learning_rate": 0.0001, + "loss": 7.3097, + "loss/crossentropy": 2.1464238077402116, + "loss/hidden": 3.4796875, + "loss/jsd": 0.0, + "loss/logits": 0.19650665801018477, + "step": 23390 + }, + { + "epoch": 0.585, + "grad_norm": 30.0, + "grad_norm_var": 20.2478515625, + "learning_rate": 0.0001, + "loss": 7.4, + "loss/crossentropy": 2.039510624855757, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.18683098405599594, + "step": 23400 + }, + { + "epoch": 0.58525, + "grad_norm": 32.25, + "grad_norm_var": 41.15, + "learning_rate": 0.0001, + "loss": 7.3072, + "loss/crossentropy": 2.182946425676346, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.19105008877813817, + "step": 23410 + }, + { + "epoch": 0.5855, + "grad_norm": 35.25, + "grad_norm_var": 4.7994140625, + "learning_rate": 0.0001, + "loss": 7.3148, + "loss/crossentropy": 2.0956947550177576, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.20151016414165496, + "step": 23420 + }, + { + "epoch": 0.58575, + "grad_norm": 34.5, + "grad_norm_var": 4.302018229166666, + "learning_rate": 0.0001, + "loss": 7.3171, + "loss/crossentropy": 2.0624319493770598, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.17940568849444388, + "step": 23430 + }, + { + "epoch": 0.586, + "grad_norm": 33.75, + "grad_norm_var": 2.205208333333333, + "learning_rate": 0.0001, + "loss": 7.2999, + "loss/crossentropy": 2.1612589955329895, + "loss/hidden": 3.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.18243170958012342, + "step": 23440 + }, + { + "epoch": 0.58625, + "grad_norm": 43.0, + "grad_norm_var": 16.95, + "learning_rate": 0.0001, + "loss": 7.3768, + "loss/crossentropy": 2.119496448338032, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.21236374396830798, + "step": 23450 + }, + { + "epoch": 0.5865, + "grad_norm": 30.5, + "grad_norm_var": 16.997330729166666, + "learning_rate": 0.0001, + "loss": 7.2713, + "loss/crossentropy": 2.226336379349232, + "loss/hidden": 3.37109375, + "loss/jsd": 0.0, + "loss/logits": 0.18839403800666332, + "step": 23460 + }, + { + "epoch": 0.58675, + "grad_norm": 30.25, + "grad_norm_var": 5.237239583333333, + "learning_rate": 0.0001, + "loss": 7.1908, + "loss/crossentropy": 2.0912389412522314, + "loss/hidden": 3.446875, + "loss/jsd": 0.0, + "loss/logits": 0.18238936755806207, + "step": 23470 + }, + { + "epoch": 0.587, + "grad_norm": 36.75, + "grad_norm_var": 5.435416666666667, + "learning_rate": 0.0001, + "loss": 7.1305, + "loss/crossentropy": 2.014119653403759, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.16768602933734655, + "step": 23480 + }, + { + "epoch": 0.58725, + "grad_norm": 31.875, + "grad_norm_var": 11.8509765625, + "learning_rate": 0.0001, + "loss": 7.2135, + "loss/crossentropy": 2.193871647119522, + "loss/hidden": 3.266796875, + "loss/jsd": 0.0, + "loss/logits": 0.1774727776646614, + "step": 23490 + }, + { + "epoch": 0.5875, + "grad_norm": 30.5, + "grad_norm_var": 8.875, + "learning_rate": 0.0001, + "loss": 7.2883, + "loss/crossentropy": 1.9861755840480328, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.18203947991132735, + "step": 23500 + }, + { + "epoch": 0.58775, + "grad_norm": 34.25, + "grad_norm_var": 3.1697265625, + "learning_rate": 0.0001, + "loss": 7.3764, + "loss/crossentropy": 2.1041073873639107, + "loss/hidden": 3.298046875, + "loss/jsd": 0.0, + "loss/logits": 0.1823105582036078, + "step": 23510 + }, + { + "epoch": 0.588, + "grad_norm": 28.75, + "grad_norm_var": 6.472916666666666, + "learning_rate": 0.0001, + "loss": 7.2363, + "loss/crossentropy": 2.0386205092072487, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.18798258919268845, + "step": 23520 + }, + { + "epoch": 0.58825, + "grad_norm": 29.5, + "grad_norm_var": 4.173958333333333, + "learning_rate": 0.0001, + "loss": 7.2541, + "loss/crossentropy": 1.9665139004588128, + "loss/hidden": 3.41640625, + "loss/jsd": 0.0, + "loss/logits": 0.19156214147806166, + "step": 23530 + }, + { + "epoch": 0.5885, + "grad_norm": 28.375, + "grad_norm_var": 2.6264973958333333, + "learning_rate": 0.0001, + "loss": 7.1821, + "loss/crossentropy": 2.100816609710455, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.18041261453181506, + "step": 23540 + }, + { + "epoch": 0.58875, + "grad_norm": 28.25, + "grad_norm_var": 6.0681640625, + "learning_rate": 0.0001, + "loss": 7.2194, + "loss/crossentropy": 2.160541406273842, + "loss/hidden": 3.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.18264771308749914, + "step": 23550 + }, + { + "epoch": 0.589, + "grad_norm": 41.5, + "grad_norm_var": 13.784309895833333, + "learning_rate": 0.0001, + "loss": 7.3172, + "loss/crossentropy": 2.0907914683222772, + "loss/hidden": 3.416796875, + "loss/jsd": 0.0, + "loss/logits": 0.19286628402769565, + "step": 23560 + }, + { + "epoch": 0.58925, + "grad_norm": 26.25, + "grad_norm_var": 33.94576822916667, + "learning_rate": 0.0001, + "loss": 7.0992, + "loss/crossentropy": 2.035450255870819, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.17769902311265467, + "step": 23570 + }, + { + "epoch": 0.5895, + "grad_norm": 29.375, + "grad_norm_var": 31.033268229166666, + "learning_rate": 0.0001, + "loss": 7.2749, + "loss/crossentropy": 2.011720988154411, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.18213979490101337, + "step": 23580 + }, + { + "epoch": 0.58975, + "grad_norm": 28.0, + "grad_norm_var": 39.16451822916667, + "learning_rate": 0.0001, + "loss": 7.2706, + "loss/crossentropy": 2.0319022722542286, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.1784214310348034, + "step": 23590 + }, + { + "epoch": 0.59, + "grad_norm": 43.25, + "grad_norm_var": 43.842122395833336, + "learning_rate": 0.0001, + "loss": 7.2368, + "loss/crossentropy": 2.042397302389145, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.17687461860477924, + "step": 23600 + }, + { + "epoch": 0.59025, + "grad_norm": 32.75, + "grad_norm_var": 13.026822916666667, + "learning_rate": 0.0001, + "loss": 7.2755, + "loss/crossentropy": 1.9421588897705078, + "loss/hidden": 3.371875, + "loss/jsd": 0.0, + "loss/logits": 0.1788268475793302, + "step": 23610 + }, + { + "epoch": 0.5905, + "grad_norm": 32.25, + "grad_norm_var": 7.690625, + "learning_rate": 0.0001, + "loss": 7.1561, + "loss/crossentropy": 2.0542794331908225, + "loss/hidden": 3.26796875, + "loss/jsd": 0.0, + "loss/logits": 0.16889538057148457, + "step": 23620 + }, + { + "epoch": 0.59075, + "grad_norm": 33.5, + "grad_norm_var": 10.685416666666667, + "learning_rate": 0.0001, + "loss": 7.1786, + "loss/crossentropy": 2.0298443526029586, + "loss/hidden": 3.36171875, + "loss/jsd": 0.0, + "loss/logits": 0.1826344771310687, + "step": 23630 + }, + { + "epoch": 0.591, + "grad_norm": 39.5, + "grad_norm_var": 11.2806640625, + "learning_rate": 0.0001, + "loss": 7.2082, + "loss/crossentropy": 2.119662294536829, + "loss/hidden": 3.30234375, + "loss/jsd": 0.0, + "loss/logits": 0.18428312726318835, + "step": 23640 + }, + { + "epoch": 0.59125, + "grad_norm": 29.75, + "grad_norm_var": 8.54765625, + "learning_rate": 0.0001, + "loss": 7.1784, + "loss/crossentropy": 1.9878482483327389, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.18119903113692998, + "step": 23650 + }, + { + "epoch": 0.5915, + "grad_norm": 30.75, + "grad_norm_var": 2.0336566844344238e+18, + "learning_rate": 0.0001, + "loss": 7.3489, + "loss/crossentropy": 1.8514883562922477, + "loss/hidden": 3.5328125, + "loss/jsd": 0.0, + "loss/logits": 0.21164753548800946, + "step": 23660 + }, + { + "epoch": 0.59175, + "grad_norm": 31.0, + "grad_norm_var": 11.344791666666667, + "learning_rate": 0.0001, + "loss": 7.3203, + "loss/crossentropy": 2.245069167017937, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.17927178423851728, + "step": 23670 + }, + { + "epoch": 0.592, + "grad_norm": 29.0, + "grad_norm_var": 413.53020833333335, + "learning_rate": 0.0001, + "loss": 7.2885, + "loss/crossentropy": 2.0498727947473525, + "loss/hidden": 3.408203125, + "loss/jsd": 0.0, + "loss/logits": 0.18411201229318977, + "step": 23680 + }, + { + "epoch": 0.59225, + "grad_norm": 27.375, + "grad_norm_var": 25.974934895833332, + "learning_rate": 0.0001, + "loss": 7.1401, + "loss/crossentropy": 1.9942969411611557, + "loss/hidden": 3.302734375, + "loss/jsd": 0.0, + "loss/logits": 0.16860720347613095, + "step": 23690 + }, + { + "epoch": 0.5925, + "grad_norm": 29.875, + "grad_norm_var": 8.026822916666667, + "learning_rate": 0.0001, + "loss": 7.2296, + "loss/crossentropy": 2.0294051103293897, + "loss/hidden": 3.26875, + "loss/jsd": 0.0, + "loss/logits": 0.18335335925221444, + "step": 23700 + }, + { + "epoch": 0.59275, + "grad_norm": 28.5, + "grad_norm_var": 9.404166666666667, + "learning_rate": 0.0001, + "loss": 7.272, + "loss/crossentropy": 2.1226597487926484, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.17993064355105162, + "step": 23710 + }, + { + "epoch": 0.593, + "grad_norm": 27.125, + "grad_norm_var": 14.909375, + "learning_rate": 0.0001, + "loss": 7.2318, + "loss/crossentropy": 2.154828441143036, + "loss/hidden": 3.29375, + "loss/jsd": 0.0, + "loss/logits": 0.18905305583029985, + "step": 23720 + }, + { + "epoch": 0.59325, + "grad_norm": 31.0, + "grad_norm_var": 9.435416666666667, + "learning_rate": 0.0001, + "loss": 7.2639, + "loss/crossentropy": 2.0010774195194245, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.1794521464034915, + "step": 23730 + }, + { + "epoch": 0.5935, + "grad_norm": 31.875, + "grad_norm_var": 7.265625, + "learning_rate": 0.0001, + "loss": 7.2917, + "loss/crossentropy": 2.1218893982470037, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.17976281177252532, + "step": 23740 + }, + { + "epoch": 0.59375, + "grad_norm": 29.875, + "grad_norm_var": 5.7837890625, + "learning_rate": 0.0001, + "loss": 7.2556, + "loss/crossentropy": 2.0276944383978845, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.1864795895293355, + "step": 23750 + }, + { + "epoch": 0.594, + "grad_norm": 34.0, + "grad_norm_var": 7.016666666666667, + "learning_rate": 0.0001, + "loss": 7.2928, + "loss/crossentropy": 1.929387104511261, + "loss/hidden": 3.30703125, + "loss/jsd": 0.0, + "loss/logits": 0.17239041812717915, + "step": 23760 + }, + { + "epoch": 0.59425, + "grad_norm": 31.0, + "grad_norm_var": 7.468489583333334, + "learning_rate": 0.0001, + "loss": 7.3058, + "loss/crossentropy": 2.0659641668200495, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.1967785032466054, + "step": 23770 + }, + { + "epoch": 0.5945, + "grad_norm": 28.375, + "grad_norm_var": 5.722330729166667, + "learning_rate": 0.0001, + "loss": 7.2596, + "loss/crossentropy": 2.0173033215105534, + "loss/hidden": 3.3078125, + "loss/jsd": 0.0, + "loss/logits": 0.17501826863735914, + "step": 23780 + }, + { + "epoch": 0.59475, + "grad_norm": 27.75, + "grad_norm_var": 5.6806640625, + "learning_rate": 0.0001, + "loss": 7.3387, + "loss/crossentropy": 2.063326106220484, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.181632025167346, + "step": 23790 + }, + { + "epoch": 0.595, + "grad_norm": 29.0, + "grad_norm_var": 5.265559895833333, + "learning_rate": 0.0001, + "loss": 7.3309, + "loss/crossentropy": 2.200699530541897, + "loss/hidden": 3.30859375, + "loss/jsd": 0.0, + "loss/logits": 0.1817481342703104, + "step": 23800 + }, + { + "epoch": 0.59525, + "grad_norm": 40.5, + "grad_norm_var": 3.8861138646518113e+18, + "learning_rate": 0.0001, + "loss": 7.3693, + "loss/crossentropy": 2.1953633323311808, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.18916522301733493, + "step": 23810 + }, + { + "epoch": 0.5955, + "grad_norm": 29.125, + "grad_norm_var": 1.1892317594268467e+18, + "learning_rate": 0.0001, + "loss": 7.1476, + "loss/crossentropy": 2.100045497715473, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.18449472673237324, + "step": 23820 + }, + { + "epoch": 0.59575, + "grad_norm": 29.875, + "grad_norm_var": 7.113541666666666, + "learning_rate": 0.0001, + "loss": 7.2694, + "loss/crossentropy": 2.13639819920063, + "loss/hidden": 3.260546875, + "loss/jsd": 0.0, + "loss/logits": 0.18081072345376015, + "step": 23830 + }, + { + "epoch": 0.596, + "grad_norm": 32.75, + "grad_norm_var": 10.190625, + "learning_rate": 0.0001, + "loss": 7.4218, + "loss/crossentropy": 2.0917629063129426, + "loss/hidden": 3.3, + "loss/jsd": 0.0, + "loss/logits": 0.20855254549533128, + "step": 23840 + }, + { + "epoch": 0.59625, + "grad_norm": 35.25, + "grad_norm_var": 7.001822916666667, + "learning_rate": 0.0001, + "loss": 7.3759, + "loss/crossentropy": 2.1656418085098266, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.1982706617563963, + "step": 23850 + }, + { + "epoch": 0.5965, + "grad_norm": 30.625, + "grad_norm_var": 14.1806640625, + "learning_rate": 0.0001, + "loss": 7.3419, + "loss/crossentropy": 1.978961955010891, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.19264845587313176, + "step": 23860 + }, + { + "epoch": 0.59675, + "grad_norm": 28.75, + "grad_norm_var": 9.461458333333333, + "learning_rate": 0.0001, + "loss": 7.1425, + "loss/crossentropy": 2.064087450504303, + "loss/hidden": 3.2859375, + "loss/jsd": 0.0, + "loss/logits": 0.18434563372284174, + "step": 23870 + }, + { + "epoch": 0.597, + "grad_norm": 32.0, + "grad_norm_var": 3.3979166666666667, + "learning_rate": 0.0001, + "loss": 7.2846, + "loss/crossentropy": 2.2514883071184157, + "loss/hidden": 3.32578125, + "loss/jsd": 0.0, + "loss/logits": 0.1978050796315074, + "step": 23880 + }, + { + "epoch": 0.59725, + "grad_norm": 29.375, + "grad_norm_var": 2.5010416666666666, + "learning_rate": 0.0001, + "loss": 7.3878, + "loss/crossentropy": 2.1330555945634844, + "loss/hidden": 3.3546875, + "loss/jsd": 0.0, + "loss/logits": 0.1861652435734868, + "step": 23890 + }, + { + "epoch": 0.5975, + "grad_norm": 33.25, + "grad_norm_var": 5.234309895833333, + "learning_rate": 0.0001, + "loss": 7.2809, + "loss/crossentropy": 2.15719972550869, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.18762489072978497, + "step": 23900 + }, + { + "epoch": 0.59775, + "grad_norm": 28.125, + "grad_norm_var": 9.90625, + "learning_rate": 0.0001, + "loss": 7.3282, + "loss/crossentropy": 2.2501418024301527, + "loss/hidden": 3.288671875, + "loss/jsd": 0.0, + "loss/logits": 0.18364315424114466, + "step": 23910 + }, + { + "epoch": 0.598, + "grad_norm": 7885291520.0, + "grad_norm_var": 3.8861138654567685e+18, + "learning_rate": 0.0001, + "loss": 7.4436, + "loss/crossentropy": 1.849592723697424, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.1732720138505101, + "step": 23920 + }, + { + "epoch": 0.59825, + "grad_norm": 35.25, + "grad_norm_var": 3.886113865095359e+18, + "learning_rate": 0.0001, + "loss": 7.3624, + "loss/crossentropy": 2.0290102303028106, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.19378960439935328, + "step": 23930 + }, + { + "epoch": 0.5985, + "grad_norm": 28.25, + "grad_norm_var": 4.639322916666667, + "learning_rate": 0.0001, + "loss": 7.2436, + "loss/crossentropy": 1.921845416724682, + "loss/hidden": 3.34140625, + "loss/jsd": 0.0, + "loss/logits": 0.19508706871420145, + "step": 23940 + }, + { + "epoch": 0.59875, + "grad_norm": 33.25, + "grad_norm_var": 3.9879557291666665, + "learning_rate": 0.0001, + "loss": 7.2391, + "loss/crossentropy": 2.113616520166397, + "loss/hidden": 3.190234375, + "loss/jsd": 0.0, + "loss/logits": 0.1806779010221362, + "step": 23950 + }, + { + "epoch": 0.599, + "grad_norm": 31.5, + "grad_norm_var": 4.765559895833333, + "learning_rate": 0.0001, + "loss": 7.2259, + "loss/crossentropy": 2.0889094918966293, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.19335521198809147, + "step": 23960 + }, + { + "epoch": 0.59925, + "grad_norm": 29.25, + "grad_norm_var": 4.7759765625, + "learning_rate": 0.0001, + "loss": 7.3169, + "loss/crossentropy": 2.0757755756378176, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.18353872746229172, + "step": 23970 + }, + { + "epoch": 0.5995, + "grad_norm": 30.625, + "grad_norm_var": 3.417122395833333, + "learning_rate": 0.0001, + "loss": 7.354, + "loss/crossentropy": 2.151854282617569, + "loss/hidden": 3.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.18792435489594936, + "step": 23980 + }, + { + "epoch": 0.59975, + "grad_norm": 30.75, + "grad_norm_var": 15.67890625, + "learning_rate": 0.0001, + "loss": 7.3461, + "loss/crossentropy": 1.9950793728232383, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.20234323088079692, + "step": 23990 + }, + { + "epoch": 0.6, + "grad_norm": 29.875, + "grad_norm_var": 20.184309895833334, + "learning_rate": 0.0001, + "loss": 7.2263, + "loss/crossentropy": 1.9956952631473541, + "loss/hidden": 3.4046875, + "loss/jsd": 0.0, + "loss/logits": 0.18217481905594468, + "step": 24000 + }, + { + "epoch": 0.60025, + "grad_norm": 38.0, + "grad_norm_var": 9.999739583333334, + "learning_rate": 0.0001, + "loss": 7.3267, + "loss/crossentropy": 2.0959785729646683, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.1935113796964288, + "step": 24010 + }, + { + "epoch": 0.6005, + "grad_norm": 32.75, + "grad_norm_var": 5.863997395833334, + "learning_rate": 0.0001, + "loss": 7.2052, + "loss/crossentropy": 2.023172339051962, + "loss/hidden": 3.32890625, + "loss/jsd": 0.0, + "loss/logits": 0.17665521912276744, + "step": 24020 + }, + { + "epoch": 0.60075, + "grad_norm": 30.0, + "grad_norm_var": 2.0645182291666666, + "learning_rate": 0.0001, + "loss": 7.3997, + "loss/crossentropy": 2.044221018254757, + "loss/hidden": 3.3171875, + "loss/jsd": 0.0, + "loss/logits": 0.17454885710030793, + "step": 24030 + }, + { + "epoch": 0.601, + "grad_norm": 29.375, + "grad_norm_var": 3.1791015625, + "learning_rate": 0.0001, + "loss": 7.4056, + "loss/crossentropy": 2.017246203124523, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.19033315852284433, + "step": 24040 + }, + { + "epoch": 0.60125, + "grad_norm": 34.0, + "grad_norm_var": 7304.769791666667, + "learning_rate": 0.0001, + "loss": 7.3027, + "loss/crossentropy": 2.0873162485659122, + "loss/hidden": 3.305078125, + "loss/jsd": 0.0, + "loss/logits": 0.1803006211295724, + "step": 24050 + }, + { + "epoch": 0.6015, + "grad_norm": 42.0, + "grad_norm_var": 7222.07265625, + "learning_rate": 0.0001, + "loss": 7.3271, + "loss/crossentropy": 2.089706966280937, + "loss/hidden": 3.2859375, + "loss/jsd": 0.0, + "loss/logits": 0.17721448224037886, + "step": 24060 + }, + { + "epoch": 0.60175, + "grad_norm": 28.5, + "grad_norm_var": 17.274934895833333, + "learning_rate": 0.0001, + "loss": 7.2893, + "loss/crossentropy": 2.1399046301841738, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.17965221870690584, + "step": 24070 + }, + { + "epoch": 0.602, + "grad_norm": 27.875, + "grad_norm_var": 3.4098307291666665, + "learning_rate": 0.0001, + "loss": 7.2298, + "loss/crossentropy": 2.232226923108101, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.17638658490031958, + "step": 24080 + }, + { + "epoch": 0.60225, + "grad_norm": 29.375, + "grad_norm_var": 2.467643229166667, + "learning_rate": 0.0001, + "loss": 7.2752, + "loss/crossentropy": 2.0909645952284337, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.18019966538995505, + "step": 24090 + }, + { + "epoch": 0.6025, + "grad_norm": 33.25, + "grad_norm_var": 4.315559895833333, + "learning_rate": 0.0001, + "loss": 7.1836, + "loss/crossentropy": 2.043725144863129, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.1770002892240882, + "step": 24100 + }, + { + "epoch": 0.60275, + "grad_norm": 32.75, + "grad_norm_var": 2.894791666666667, + "learning_rate": 0.0001, + "loss": 7.2902, + "loss/crossentropy": 2.09500589966774, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.18369378615170717, + "step": 24110 + }, + { + "epoch": 0.603, + "grad_norm": 27.125, + "grad_norm_var": 5.116080729166667, + "learning_rate": 0.0001, + "loss": 7.3256, + "loss/crossentropy": 1.9551431708037854, + "loss/hidden": 3.280859375, + "loss/jsd": 0.0, + "loss/logits": 0.19141522590070964, + "step": 24120 + }, + { + "epoch": 0.60325, + "grad_norm": 28.75, + "grad_norm_var": 5.28125, + "learning_rate": 0.0001, + "loss": 7.248, + "loss/crossentropy": 1.9168051473796368, + "loss/hidden": 3.31875, + "loss/jsd": 0.0, + "loss/logits": 0.17059860788285733, + "step": 24130 + }, + { + "epoch": 0.6035, + "grad_norm": 28.25, + "grad_norm_var": 4.403059895833334, + "learning_rate": 0.0001, + "loss": 7.432, + "loss/crossentropy": 1.9579944089055061, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.20447726584970952, + "step": 24140 + }, + { + "epoch": 0.60375, + "grad_norm": 29.125, + "grad_norm_var": 5.737955729166667, + "learning_rate": 0.0001, + "loss": 7.4361, + "loss/crossentropy": 2.150589424371719, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.20431780871003866, + "step": 24150 + }, + { + "epoch": 0.604, + "grad_norm": 29.875, + "grad_norm_var": 8.734309895833333, + "learning_rate": 0.0001, + "loss": 7.3029, + "loss/crossentropy": 2.104359980672598, + "loss/hidden": 3.314453125, + "loss/jsd": 0.0, + "loss/logits": 0.19141491586342455, + "step": 24160 + }, + { + "epoch": 0.60425, + "grad_norm": 32.25, + "grad_norm_var": 4.3416015625, + "learning_rate": 0.0001, + "loss": 7.3517, + "loss/crossentropy": 2.0446281746029853, + "loss/hidden": 3.35078125, + "loss/jsd": 0.0, + "loss/logits": 0.18326206738129258, + "step": 24170 + }, + { + "epoch": 0.6045, + "grad_norm": 34.5, + "grad_norm_var": 35.74791666666667, + "learning_rate": 0.0001, + "loss": 7.1912, + "loss/crossentropy": 1.9954920992255212, + "loss/hidden": 3.32578125, + "loss/jsd": 0.0, + "loss/logits": 0.17927296683192254, + "step": 24180 + }, + { + "epoch": 0.60475, + "grad_norm": 326.0, + "grad_norm_var": 5441.927018229167, + "learning_rate": 0.0001, + "loss": 7.3115, + "loss/crossentropy": 1.9973294004797935, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.20168818794190885, + "step": 24190 + }, + { + "epoch": 0.605, + "grad_norm": 30.25, + "grad_norm_var": 5427.784309895834, + "learning_rate": 0.0001, + "loss": 7.3283, + "loss/crossentropy": 2.075428619980812, + "loss/hidden": 3.2828125, + "loss/jsd": 0.0, + "loss/logits": 0.1890948511660099, + "step": 24200 + }, + { + "epoch": 0.60525, + "grad_norm": 37.5, + "grad_norm_var": 18.364583333333332, + "learning_rate": 0.0001, + "loss": 7.3213, + "loss/crossentropy": 2.072233888506889, + "loss/hidden": 3.25, + "loss/jsd": 0.0, + "loss/logits": 0.17045267168432474, + "step": 24210 + }, + { + "epoch": 0.6055, + "grad_norm": 28.5, + "grad_norm_var": 6.415559895833334, + "learning_rate": 0.0001, + "loss": 7.3117, + "loss/crossentropy": 2.1279248766601087, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.17065610643476248, + "step": 24220 + }, + { + "epoch": 0.60575, + "grad_norm": 27.375, + "grad_norm_var": 1.4393229166666666, + "learning_rate": 0.0001, + "loss": 7.1783, + "loss/crossentropy": 2.0654909342527388, + "loss/hidden": 3.29921875, + "loss/jsd": 0.0, + "loss/logits": 0.1788815937936306, + "step": 24230 + }, + { + "epoch": 0.606, + "grad_norm": 28.75, + "grad_norm_var": 2.3931640625, + "learning_rate": 0.0001, + "loss": 7.2818, + "loss/crossentropy": 1.8843905299901962, + "loss/hidden": 3.484765625, + "loss/jsd": 0.0, + "loss/logits": 0.20363699235022067, + "step": 24240 + }, + { + "epoch": 0.60625, + "grad_norm": 30.5, + "grad_norm_var": 1.3238932291666667, + "learning_rate": 0.0001, + "loss": 7.2077, + "loss/crossentropy": 2.0136945739388468, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.169308259524405, + "step": 24250 + }, + { + "epoch": 0.6065, + "grad_norm": 30.25, + "grad_norm_var": 2.241080729166667, + "learning_rate": 0.0001, + "loss": 7.3067, + "loss/crossentropy": 2.06818515509367, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.18866120781749487, + "step": 24260 + }, + { + "epoch": 0.60675, + "grad_norm": 30.25, + "grad_norm_var": 2.923958333333333, + "learning_rate": 0.0001, + "loss": 7.35, + "loss/crossentropy": 2.1705613493919373, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.18546785078942776, + "step": 24270 + }, + { + "epoch": 0.607, + "grad_norm": 29.75, + "grad_norm_var": 17.083333333333332, + "learning_rate": 0.0001, + "loss": 7.2704, + "loss/crossentropy": 2.1217471458017827, + "loss/hidden": 3.42890625, + "loss/jsd": 0.0, + "loss/logits": 0.20033492222428323, + "step": 24280 + }, + { + "epoch": 0.60725, + "grad_norm": 32.5, + "grad_norm_var": 14.017643229166667, + "learning_rate": 0.0001, + "loss": 7.3065, + "loss/crossentropy": 2.013144116103649, + "loss/hidden": 3.31015625, + "loss/jsd": 0.0, + "loss/logits": 0.18037521606311202, + "step": 24290 + }, + { + "epoch": 0.6075, + "grad_norm": 31.5, + "grad_norm_var": 41.54765625, + "learning_rate": 0.0001, + "loss": 7.3413, + "loss/crossentropy": 1.9536602176725864, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.1871596833691001, + "step": 24300 + }, + { + "epoch": 0.60775, + "grad_norm": 29.25, + "grad_norm_var": 45.2791015625, + "learning_rate": 0.0001, + "loss": 7.3009, + "loss/crossentropy": 2.122793934494257, + "loss/hidden": 3.352734375, + "loss/jsd": 0.0, + "loss/logits": 0.18271594103425742, + "step": 24310 + }, + { + "epoch": 0.608, + "grad_norm": 32.0, + "grad_norm_var": 1.7455729166666667, + "learning_rate": 0.0001, + "loss": 7.3036, + "loss/crossentropy": 1.9508492700755595, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.1780675446614623, + "step": 24320 + }, + { + "epoch": 0.60825, + "grad_norm": 30.0, + "grad_norm_var": 3.0426432291666665, + "learning_rate": 0.0001, + "loss": 7.3479, + "loss/crossentropy": 2.091268266737461, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.1819396572187543, + "step": 24330 + }, + { + "epoch": 0.6085, + "grad_norm": 31.0, + "grad_norm_var": 2.6900390625, + "learning_rate": 0.0001, + "loss": 7.3685, + "loss/crossentropy": 2.1921914756298064, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.19144053254276513, + "step": 24340 + }, + { + "epoch": 0.60875, + "grad_norm": 29.875, + "grad_norm_var": 3547.4488932291665, + "learning_rate": 0.0001, + "loss": 7.32, + "loss/crossentropy": 2.0454428791999817, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.19868660382926465, + "step": 24350 + }, + { + "epoch": 0.609, + "grad_norm": 31.5, + "grad_norm_var": 1.9535807291666667, + "learning_rate": 0.0001, + "loss": 7.2958, + "loss/crossentropy": 1.978618311882019, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.1801746640354395, + "step": 24360 + }, + { + "epoch": 0.60925, + "grad_norm": 29.625, + "grad_norm_var": 3.381705729166667, + "learning_rate": 0.0001, + "loss": 7.2396, + "loss/crossentropy": 2.011453700065613, + "loss/hidden": 3.285546875, + "loss/jsd": 0.0, + "loss/logits": 0.17533374354243278, + "step": 24370 + }, + { + "epoch": 0.6095, + "grad_norm": 34.5, + "grad_norm_var": 4.911393229166666, + "learning_rate": 0.0001, + "loss": 7.2557, + "loss/crossentropy": 2.1272184163331986, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.19443935044109822, + "step": 24380 + }, + { + "epoch": 0.60975, + "grad_norm": 29.5, + "grad_norm_var": 3.5712890625, + "learning_rate": 0.0001, + "loss": 7.3372, + "loss/crossentropy": 2.0613644808530807, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.1952214989811182, + "step": 24390 + }, + { + "epoch": 0.61, + "grad_norm": 30.875, + "grad_norm_var": 2.2291666666666665, + "learning_rate": 0.0001, + "loss": 7.3664, + "loss/crossentropy": 2.082505702972412, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.1836231516674161, + "step": 24400 + }, + { + "epoch": 0.61025, + "grad_norm": 32.0, + "grad_norm_var": 1.94140625, + "learning_rate": 0.0001, + "loss": 7.4019, + "loss/crossentropy": 1.964651158452034, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.18479474298655987, + "step": 24410 + }, + { + "epoch": 0.6105, + "grad_norm": 30.625, + "grad_norm_var": 1.1181640625, + "learning_rate": 0.0001, + "loss": 7.3918, + "loss/crossentropy": 2.0536778286099433, + "loss/hidden": 3.47734375, + "loss/jsd": 0.0, + "loss/logits": 0.2254286851733923, + "step": 24420 + }, + { + "epoch": 0.61075, + "grad_norm": 28.625, + "grad_norm_var": 1.8384765625, + "learning_rate": 0.0001, + "loss": 7.2519, + "loss/crossentropy": 2.0931551665067674, + "loss/hidden": 3.461328125, + "loss/jsd": 0.0, + "loss/logits": 0.19350009728223086, + "step": 24430 + }, + { + "epoch": 0.611, + "grad_norm": 28.625, + "grad_norm_var": 2.3259765625, + "learning_rate": 0.0001, + "loss": 7.2784, + "loss/crossentropy": 2.1479686103761195, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.17983189076185227, + "step": 24440 + }, + { + "epoch": 0.61125, + "grad_norm": 32.75, + "grad_norm_var": 7.439322916666667, + "learning_rate": 0.0001, + "loss": 7.4267, + "loss/crossentropy": 2.171735206246376, + "loss/hidden": 3.4703125, + "loss/jsd": 0.0, + "loss/logits": 0.20030937865376472, + "step": 24450 + }, + { + "epoch": 0.6115, + "grad_norm": 30.375, + "grad_norm_var": 2.9629557291666666, + "learning_rate": 0.0001, + "loss": 7.2518, + "loss/crossentropy": 1.9000327825546264, + "loss/hidden": 3.44375, + "loss/jsd": 0.0, + "loss/logits": 0.1778967533260584, + "step": 24460 + }, + { + "epoch": 0.61175, + "grad_norm": 29.0, + "grad_norm_var": 3.87265625, + "learning_rate": 0.0001, + "loss": 7.3163, + "loss/crossentropy": 2.1398353300988675, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.1760717548429966, + "step": 24470 + }, + { + "epoch": 0.612, + "grad_norm": 32.25, + "grad_norm_var": 4.654166666666667, + "learning_rate": 0.0001, + "loss": 7.3402, + "loss/crossentropy": 2.0892323054373265, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.18320874478667976, + "step": 24480 + }, + { + "epoch": 0.61225, + "grad_norm": 28.25, + "grad_norm_var": 26.684375, + "learning_rate": 0.0001, + "loss": 7.3876, + "loss/crossentropy": 1.9319149151444435, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.18836453650146723, + "step": 24490 + }, + { + "epoch": 0.6125, + "grad_norm": 28.625, + "grad_norm_var": 33.487239583333334, + "learning_rate": 0.0001, + "loss": 7.3939, + "loss/crossentropy": 2.1875485301017763, + "loss/hidden": 3.393359375, + "loss/jsd": 0.0, + "loss/logits": 0.17974788155406712, + "step": 24500 + }, + { + "epoch": 0.61275, + "grad_norm": 28.0, + "grad_norm_var": 16.7, + "learning_rate": 0.0001, + "loss": 7.2364, + "loss/crossentropy": 2.050992714613676, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.1952893292531371, + "step": 24510 + }, + { + "epoch": 0.613, + "grad_norm": 30.75, + "grad_norm_var": 9.8212890625, + "learning_rate": 0.0001, + "loss": 7.3393, + "loss/crossentropy": 2.1732106573879717, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.1879525838419795, + "step": 24520 + }, + { + "epoch": 0.61325, + "grad_norm": 31.375, + "grad_norm_var": 18.480989583333333, + "learning_rate": 0.0001, + "loss": 7.3735, + "loss/crossentropy": 2.038769894838333, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.18006683513522148, + "step": 24530 + }, + { + "epoch": 0.6135, + "grad_norm": 52.25, + "grad_norm_var": 3.436879802679899e+18, + "learning_rate": 0.0001, + "loss": 7.2147, + "loss/crossentropy": 2.0876556798815726, + "loss/hidden": 3.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.19918184336274863, + "step": 24540 + }, + { + "epoch": 0.61375, + "grad_norm": 29.625, + "grad_norm_var": 3.4368798033364823e+18, + "learning_rate": 0.0001, + "loss": 7.3947, + "loss/crossentropy": 2.115431872010231, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.19660551697015763, + "step": 24550 + }, + { + "epoch": 0.614, + "grad_norm": 29.875, + "grad_norm_var": 10.852018229166667, + "learning_rate": 0.0001, + "loss": 7.1815, + "loss/crossentropy": 2.01566364467144, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.1908690959215164, + "step": 24560 + }, + { + "epoch": 0.61425, + "grad_norm": 29.25, + "grad_norm_var": 10.713541666666666, + "learning_rate": 0.0001, + "loss": 7.2148, + "loss/crossentropy": 1.9971563063561917, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.19316924586892129, + "step": 24570 + }, + { + "epoch": 0.6145, + "grad_norm": 28.75, + "grad_norm_var": 5.447330729166667, + "learning_rate": 0.0001, + "loss": 7.295, + "loss/crossentropy": 2.1087226420640945, + "loss/hidden": 3.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1953347373753786, + "step": 24580 + }, + { + "epoch": 0.61475, + "grad_norm": 31.75, + "grad_norm_var": 3.9184895833333333, + "learning_rate": 0.0001, + "loss": 7.2813, + "loss/crossentropy": 2.0959045618772505, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.183117344789207, + "step": 24590 + }, + { + "epoch": 0.615, + "grad_norm": 31.0, + "grad_norm_var": 4.040559895833334, + "learning_rate": 0.0001, + "loss": 7.2361, + "loss/crossentropy": 1.9878082022070884, + "loss/hidden": 3.31875, + "loss/jsd": 0.0, + "loss/logits": 0.17381655257195233, + "step": 24600 + }, + { + "epoch": 0.61525, + "grad_norm": 30.875, + "grad_norm_var": 5.760416666666667, + "learning_rate": 0.0001, + "loss": 7.3343, + "loss/crossentropy": 1.9321263507008553, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.1903281847015023, + "step": 24610 + }, + { + "epoch": 0.6155, + "grad_norm": 31.25, + "grad_norm_var": 5.926822916666667, + "learning_rate": 0.0001, + "loss": 7.2588, + "loss/crossentropy": 2.2268902391195295, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.19283902421593666, + "step": 24620 + }, + { + "epoch": 0.61575, + "grad_norm": 30.125, + "grad_norm_var": 3.1416015625, + "learning_rate": 0.0001, + "loss": 7.3226, + "loss/crossentropy": 2.0439958930015565, + "loss/hidden": 3.393359375, + "loss/jsd": 0.0, + "loss/logits": 0.18890973944216966, + "step": 24630 + }, + { + "epoch": 0.616, + "grad_norm": 34.0, + "grad_norm_var": 2.653059895833333, + "learning_rate": 0.0001, + "loss": 7.2599, + "loss/crossentropy": 1.945370616018772, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.17533454354852437, + "step": 24640 + }, + { + "epoch": 0.61625, + "grad_norm": 29.625, + "grad_norm_var": 6.738541666666666, + "learning_rate": 0.0001, + "loss": 7.2148, + "loss/crossentropy": 1.991739486157894, + "loss/hidden": 3.35390625, + "loss/jsd": 0.0, + "loss/logits": 0.18211280852556228, + "step": 24650 + }, + { + "epoch": 0.6165, + "grad_norm": 29.25, + "grad_norm_var": 14.560416666666667, + "learning_rate": 0.0001, + "loss": 7.3611, + "loss/crossentropy": 2.075421932339668, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.19871473647654056, + "step": 24660 + }, + { + "epoch": 0.61675, + "grad_norm": 29.75, + "grad_norm_var": 12.080989583333333, + "learning_rate": 0.0001, + "loss": 7.239, + "loss/crossentropy": 2.008017046749592, + "loss/hidden": 3.27265625, + "loss/jsd": 0.0, + "loss/logits": 0.17111360877752305, + "step": 24670 + }, + { + "epoch": 0.617, + "grad_norm": 28.25, + "grad_norm_var": 8.795833333333333, + "learning_rate": 0.0001, + "loss": 7.2497, + "loss/crossentropy": 1.9377702206373215, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.17999718375504017, + "step": 24680 + }, + { + "epoch": 0.61725, + "grad_norm": 30.5, + "grad_norm_var": 2.385872395833333, + "learning_rate": 0.0001, + "loss": 7.3279, + "loss/crossentropy": 2.1406032904982566, + "loss/hidden": 3.281640625, + "loss/jsd": 0.0, + "loss/logits": 0.1813998430967331, + "step": 24690 + }, + { + "epoch": 0.6175, + "grad_norm": 31.125, + "grad_norm_var": 1.7212890625, + "learning_rate": 0.0001, + "loss": 7.3508, + "loss/crossentropy": 2.074493780732155, + "loss/hidden": 3.39375, + "loss/jsd": 0.0, + "loss/logits": 0.1942134676501155, + "step": 24700 + }, + { + "epoch": 0.61775, + "grad_norm": 31.5, + "grad_norm_var": 1.9275390625, + "learning_rate": 0.0001, + "loss": 7.3579, + "loss/crossentropy": 2.235245580971241, + "loss/hidden": 3.337109375, + "loss/jsd": 0.0, + "loss/logits": 0.1957023985683918, + "step": 24710 + }, + { + "epoch": 0.618, + "grad_norm": 29.375, + "grad_norm_var": 2.6817057291666666, + "learning_rate": 0.0001, + "loss": 7.3896, + "loss/crossentropy": 2.0386796444654465, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.19206191059201955, + "step": 24720 + }, + { + "epoch": 0.61825, + "grad_norm": 29.625, + "grad_norm_var": 1.7705729166666666, + "learning_rate": 0.0001, + "loss": 7.3513, + "loss/crossentropy": 2.0323100224137307, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.20894596744328736, + "step": 24730 + }, + { + "epoch": 0.6185, + "grad_norm": 31.0, + "grad_norm_var": 1.4322265625, + "learning_rate": 0.0001, + "loss": 7.2488, + "loss/crossentropy": 2.0485980585217476, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.18473935313522816, + "step": 24740 + }, + { + "epoch": 0.61875, + "grad_norm": 34.0, + "grad_norm_var": 1.8436848958333334, + "learning_rate": 0.0001, + "loss": 7.3125, + "loss/crossentropy": 2.12614818662405, + "loss/hidden": 3.460546875, + "loss/jsd": 0.0, + "loss/logits": 0.18453964591026306, + "step": 24750 + }, + { + "epoch": 0.619, + "grad_norm": 35.25, + "grad_norm_var": 3.7080729166666666, + "learning_rate": 0.0001, + "loss": 7.307, + "loss/crossentropy": 2.0480948239564896, + "loss/hidden": 3.367578125, + "loss/jsd": 0.0, + "loss/logits": 0.18478762162849308, + "step": 24760 + }, + { + "epoch": 0.61925, + "grad_norm": 30.625, + "grad_norm_var": 2.8223307291666666, + "learning_rate": 0.0001, + "loss": 7.2388, + "loss/crossentropy": 2.007586943730712, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.1709001273382455, + "step": 24770 + }, + { + "epoch": 0.6195, + "grad_norm": 28.5, + "grad_norm_var": 2.1497395833333335, + "learning_rate": 0.0001, + "loss": 7.2203, + "loss/crossentropy": 1.9536567956209183, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.18609143160283564, + "step": 24780 + }, + { + "epoch": 0.61975, + "grad_norm": 30.875, + "grad_norm_var": 2.158333333333333, + "learning_rate": 0.0001, + "loss": 7.4193, + "loss/crossentropy": 2.1475788801908493, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.19103976357728242, + "step": 24790 + }, + { + "epoch": 0.62, + "grad_norm": 29.375, + "grad_norm_var": 1.4150390625, + "learning_rate": 0.0001, + "loss": 7.2597, + "loss/crossentropy": 1.93576335683465, + "loss/hidden": 3.26015625, + "loss/jsd": 0.0, + "loss/logits": 0.1649831486865878, + "step": 24800 + }, + { + "epoch": 0.62025, + "grad_norm": 29.75, + "grad_norm_var": 2.064322916666667, + "learning_rate": 0.0001, + "loss": 7.264, + "loss/crossentropy": 2.000751170516014, + "loss/hidden": 3.35, + "loss/jsd": 0.0, + "loss/logits": 0.17869363045319914, + "step": 24810 + }, + { + "epoch": 0.6205, + "grad_norm": 31.5, + "grad_norm_var": 4.509830729166667, + "learning_rate": 0.0001, + "loss": 7.3578, + "loss/crossentropy": 2.100572241842747, + "loss/hidden": 3.370703125, + "loss/jsd": 0.0, + "loss/logits": 0.19480398446321487, + "step": 24820 + }, + { + "epoch": 0.62075, + "grad_norm": 31.0, + "grad_norm_var": 3.252795170612926e+18, + "learning_rate": 0.0001, + "loss": 7.3303, + "loss/crossentropy": 2.0044652953743936, + "loss/hidden": 3.444921875, + "loss/jsd": 0.0, + "loss/logits": 0.19480419661849738, + "step": 24830 + }, + { + "epoch": 0.621, + "grad_norm": 30.0, + "grad_norm_var": 3.0885416666666665, + "learning_rate": 0.0001, + "loss": 7.3171, + "loss/crossentropy": 2.163808681815863, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.18681660266593098, + "step": 24840 + }, + { + "epoch": 0.62125, + "grad_norm": 30.25, + "grad_norm_var": 1.4400390625, + "learning_rate": 0.0001, + "loss": 7.333, + "loss/crossentropy": 2.03867654427886, + "loss/hidden": 3.288671875, + "loss/jsd": 0.0, + "loss/logits": 0.1776962866075337, + "step": 24850 + }, + { + "epoch": 0.6215, + "grad_norm": 28.5, + "grad_norm_var": 2.17265625, + "learning_rate": 0.0001, + "loss": 7.3692, + "loss/crossentropy": 2.0644729137420654, + "loss/hidden": 3.509375, + "loss/jsd": 0.0, + "loss/logits": 0.1982646532356739, + "step": 24860 + }, + { + "epoch": 0.62175, + "grad_norm": 27.625, + "grad_norm_var": 3.06015625, + "learning_rate": 0.0001, + "loss": 7.3453, + "loss/crossentropy": 2.171038728952408, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.19105625264346598, + "step": 24870 + }, + { + "epoch": 0.622, + "grad_norm": 31.625, + "grad_norm_var": 2.09140625, + "learning_rate": 0.0001, + "loss": 7.3096, + "loss/crossentropy": 2.0491355784237384, + "loss/hidden": 3.387890625, + "loss/jsd": 0.0, + "loss/logits": 0.19366521965712308, + "step": 24880 + }, + { + "epoch": 0.62225, + "grad_norm": 29.375, + "grad_norm_var": 2.001822916666667, + "learning_rate": 0.0001, + "loss": 7.2684, + "loss/crossentropy": 2.185651781409979, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.1791619287803769, + "step": 24890 + }, + { + "epoch": 0.6225, + "grad_norm": 28.25, + "grad_norm_var": 2.0056640625, + "learning_rate": 0.0001, + "loss": 7.2828, + "loss/crossentropy": 2.1073651291429996, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.19452255237847565, + "step": 24900 + }, + { + "epoch": 0.62275, + "grad_norm": 28.25, + "grad_norm_var": 2.814583333333333, + "learning_rate": 0.0001, + "loss": 7.324, + "loss/crossentropy": 2.1819552317261697, + "loss/hidden": 3.47734375, + "loss/jsd": 0.0, + "loss/logits": 0.19712742660194635, + "step": 24910 + }, + { + "epoch": 0.623, + "grad_norm": 31.5, + "grad_norm_var": 1.340625, + "learning_rate": 0.0001, + "loss": 7.2135, + "loss/crossentropy": 2.0284355215728285, + "loss/hidden": 3.304296875, + "loss/jsd": 0.0, + "loss/logits": 0.17623181734234095, + "step": 24920 + }, + { + "epoch": 0.62325, + "grad_norm": 33.0, + "grad_norm_var": 3.7405598958333335, + "learning_rate": 0.0001, + "loss": 7.3556, + "loss/crossentropy": 1.9424869917333125, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.16103002466261387, + "step": 24930 + }, + { + "epoch": 0.6235, + "grad_norm": 30.125, + "grad_norm_var": 2.687239583333333, + "learning_rate": 0.0001, + "loss": 7.4098, + "loss/crossentropy": 2.1412764593958853, + "loss/hidden": 3.426953125, + "loss/jsd": 0.0, + "loss/logits": 0.18787063639611007, + "step": 24940 + }, + { + "epoch": 0.62375, + "grad_norm": 30.375, + "grad_norm_var": 2.4374348958333334, + "learning_rate": 0.0001, + "loss": 7.423, + "loss/crossentropy": 2.1100604355335237, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.1824411964043975, + "step": 24950 + }, + { + "epoch": 0.624, + "grad_norm": 28.75, + "grad_norm_var": 2.1405598958333334, + "learning_rate": 0.0001, + "loss": 7.2604, + "loss/crossentropy": 2.173184335231781, + "loss/hidden": 3.3046875, + "loss/jsd": 0.0, + "loss/logits": 0.18387555023655294, + "step": 24960 + }, + { + "epoch": 0.62425, + "grad_norm": 33.25, + "grad_norm_var": 2.775, + "learning_rate": 0.0001, + "loss": 7.4108, + "loss/crossentropy": 2.2970198541879654, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.1978620221838355, + "step": 24970 + }, + { + "epoch": 0.6245, + "grad_norm": 30.25, + "grad_norm_var": 6.2869140625, + "learning_rate": 0.0001, + "loss": 7.3759, + "loss/crossentropy": 2.098717086017132, + "loss/hidden": 3.2953125, + "loss/jsd": 0.0, + "loss/logits": 0.1739401025697589, + "step": 24980 + }, + { + "epoch": 0.62475, + "grad_norm": 30.75, + "grad_norm_var": 1.34765625, + "learning_rate": 0.0001, + "loss": 7.1911, + "loss/crossentropy": 2.0788547694683075, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.18489116076380013, + "step": 24990 + }, + { + "epoch": 0.625, + "grad_norm": 29.125, + "grad_norm_var": 2.3181640625, + "learning_rate": 0.0001, + "loss": 7.289, + "loss/crossentropy": 2.05898377597332, + "loss/hidden": 3.396484375, + "loss/jsd": 0.0, + "loss/logits": 0.19787134062498807, + "step": 25000 + }, + { + "epoch": 0.62525, + "grad_norm": 30.5, + "grad_norm_var": 5.060416666666667, + "learning_rate": 0.0001, + "loss": 7.3426, + "loss/crossentropy": 2.191389924287796, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.1947077002376318, + "step": 25010 + }, + { + "epoch": 0.6255, + "grad_norm": 29.75, + "grad_norm_var": 3.7660807291666667, + "learning_rate": 0.0001, + "loss": 7.3552, + "loss/crossentropy": 2.1716764003038405, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.18750451393425466, + "step": 25020 + }, + { + "epoch": 0.62575, + "grad_norm": 28.75, + "grad_norm_var": 3.3934895833333334, + "learning_rate": 0.0001, + "loss": 7.2251, + "loss/crossentropy": 2.0290740236639975, + "loss/hidden": 3.44296875, + "loss/jsd": 0.0, + "loss/logits": 0.2046494733542204, + "step": 25030 + }, + { + "epoch": 0.626, + "grad_norm": 29.125, + "grad_norm_var": 15.915559895833333, + "learning_rate": 0.0001, + "loss": 7.3018, + "loss/crossentropy": 2.0449746288359165, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.17823896156623958, + "step": 25040 + }, + { + "epoch": 0.62625, + "grad_norm": 30.875, + "grad_norm_var": 2.39140625, + "learning_rate": 0.0001, + "loss": 7.2551, + "loss/crossentropy": 2.0032063253223895, + "loss/hidden": 3.443359375, + "loss/jsd": 0.0, + "loss/logits": 0.18292871192097665, + "step": 25050 + }, + { + "epoch": 0.6265, + "grad_norm": 34.25, + "grad_norm_var": 2.574934895833333, + "learning_rate": 0.0001, + "loss": 7.3934, + "loss/crossentropy": 1.9866469122469426, + "loss/hidden": 3.444921875, + "loss/jsd": 0.0, + "loss/logits": 0.18698836211115122, + "step": 25060 + }, + { + "epoch": 0.62675, + "grad_norm": 30.625, + "grad_norm_var": 1.9202473958333333, + "learning_rate": 0.0001, + "loss": 7.3007, + "loss/crossentropy": 2.0594868302345275, + "loss/hidden": 3.386328125, + "loss/jsd": 0.0, + "loss/logits": 0.18997351210564375, + "step": 25070 + }, + { + "epoch": 0.627, + "grad_norm": 34.5, + "grad_norm_var": 3.217643229166667, + "learning_rate": 0.0001, + "loss": 7.2211, + "loss/crossentropy": 2.0622723668813707, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.18301580119878053, + "step": 25080 + }, + { + "epoch": 0.62725, + "grad_norm": 30.875, + "grad_norm_var": 6.513541666666667, + "learning_rate": 0.0001, + "loss": 7.2187, + "loss/crossentropy": 2.19646572843194, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.21105999127030373, + "step": 25090 + }, + { + "epoch": 0.6275, + "grad_norm": 31.0, + "grad_norm_var": 3.287239583333333, + "learning_rate": 0.0001, + "loss": 7.3144, + "loss/crossentropy": 2.1136497229337694, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.18730370607227087, + "step": 25100 + }, + { + "epoch": 0.62775, + "grad_norm": 29.125, + "grad_norm_var": 1.6087890625, + "learning_rate": 0.0001, + "loss": 7.4126, + "loss/crossentropy": 2.2990898922085763, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.192084638774395, + "step": 25110 + }, + { + "epoch": 0.628, + "grad_norm": 32.5, + "grad_norm_var": 2.9275390625, + "learning_rate": 0.0001, + "loss": 7.3276, + "loss/crossentropy": 1.8526959542185069, + "loss/hidden": 3.478125, + "loss/jsd": 0.0, + "loss/logits": 0.19008365646004677, + "step": 25120 + }, + { + "epoch": 0.62825, + "grad_norm": 29.125, + "grad_norm_var": 2.4181640625, + "learning_rate": 0.0001, + "loss": 7.1824, + "loss/crossentropy": 1.9762011893093585, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.170592157356441, + "step": 25130 + }, + { + "epoch": 0.6285, + "grad_norm": 30.75, + "grad_norm_var": 3.67890625, + "learning_rate": 0.0001, + "loss": 7.3663, + "loss/crossentropy": 2.0375132709741592, + "loss/hidden": 3.42265625, + "loss/jsd": 0.0, + "loss/logits": 0.19559677615761756, + "step": 25140 + }, + { + "epoch": 0.62875, + "grad_norm": 5268045824.0, + "grad_norm_var": 1.7345191550355768e+18, + "learning_rate": 0.0001, + "loss": 7.2744, + "loss/crossentropy": 2.116751839220524, + "loss/hidden": 3.64453125, + "loss/jsd": 0.0, + "loss/logits": 0.20043763648718596, + "step": 25150 + }, + { + "epoch": 0.629, + "grad_norm": 29.625, + "grad_norm_var": 1.7345191549916764e+18, + "learning_rate": 0.0001, + "loss": 7.1578, + "loss/crossentropy": 1.8911869063973428, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.17289240173995496, + "step": 25160 + }, + { + "epoch": 0.62925, + "grad_norm": 32.25, + "grad_norm_var": 2.628125, + "learning_rate": 0.0001, + "loss": 7.2708, + "loss/crossentropy": 1.9820346303284169, + "loss/hidden": 3.387109375, + "loss/jsd": 0.0, + "loss/logits": 0.17856419319286942, + "step": 25170 + }, + { + "epoch": 0.6295, + "grad_norm": 32.25, + "grad_norm_var": 3.124739583333333, + "learning_rate": 0.0001, + "loss": 7.2472, + "loss/crossentropy": 2.166484069824219, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.1979134999215603, + "step": 25180 + }, + { + "epoch": 0.62975, + "grad_norm": 29.875, + "grad_norm_var": 15.1025390625, + "learning_rate": 0.0001, + "loss": 7.2973, + "loss/crossentropy": 2.0940047293901443, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.19144841339439153, + "step": 25190 + }, + { + "epoch": 0.63, + "grad_norm": 30.625, + "grad_norm_var": 2.3934895833333334, + "learning_rate": 0.0001, + "loss": 7.2884, + "loss/crossentropy": 2.0762594789266586, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.1935945412144065, + "step": 25200 + }, + { + "epoch": 0.63025, + "grad_norm": 30.75, + "grad_norm_var": 3.083268229166667, + "learning_rate": 0.0001, + "loss": 7.3037, + "loss/crossentropy": 2.1710478901863097, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.19344222452491522, + "step": 25210 + }, + { + "epoch": 0.6305, + "grad_norm": 29.625, + "grad_norm_var": 3.512434895833333, + "learning_rate": 0.0001, + "loss": 7.3412, + "loss/crossentropy": 2.1715824723243715, + "loss/hidden": 3.367578125, + "loss/jsd": 0.0, + "loss/logits": 0.18924503941088916, + "step": 25220 + }, + { + "epoch": 0.63075, + "grad_norm": 28.125, + "grad_norm_var": 3.0830729166666666, + "learning_rate": 0.0001, + "loss": 7.2856, + "loss/crossentropy": 2.1482178628444673, + "loss/hidden": 3.275, + "loss/jsd": 0.0, + "loss/logits": 0.1806714640930295, + "step": 25230 + }, + { + "epoch": 0.631, + "grad_norm": 31.25, + "grad_norm_var": 4.3875, + "learning_rate": 0.0001, + "loss": 7.3432, + "loss/crossentropy": 1.8948672696948052, + "loss/hidden": 3.459375, + "loss/jsd": 0.0, + "loss/logits": 0.18248430043458938, + "step": 25240 + }, + { + "epoch": 0.63125, + "grad_norm": 30.625, + "grad_norm_var": 5.961393229166666, + "learning_rate": 0.0001, + "loss": 7.307, + "loss/crossentropy": 1.979700568318367, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.18111934289336204, + "step": 25250 + }, + { + "epoch": 0.6315, + "grad_norm": 30.75, + "grad_norm_var": 2.0884765625, + "learning_rate": 0.0001, + "loss": 7.3091, + "loss/crossentropy": 2.1454955518245695, + "loss/hidden": 3.433203125, + "loss/jsd": 0.0, + "loss/logits": 0.20133222881704568, + "step": 25260 + }, + { + "epoch": 0.63175, + "grad_norm": 32.25, + "grad_norm_var": 11.745247395833333, + "learning_rate": 0.0001, + "loss": 7.3298, + "loss/crossentropy": 2.052339327335358, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.1785123085603118, + "step": 25270 + }, + { + "epoch": 0.632, + "grad_norm": 30.25, + "grad_norm_var": 16.828580729166667, + "learning_rate": 0.0001, + "loss": 7.4661, + "loss/crossentropy": 2.0869883239269256, + "loss/hidden": 3.40859375, + "loss/jsd": 0.0, + "loss/logits": 0.2130020596086979, + "step": 25280 + }, + { + "epoch": 0.63225, + "grad_norm": 30.25, + "grad_norm_var": 2.2916015625, + "learning_rate": 0.0001, + "loss": 7.2918, + "loss/crossentropy": 1.994824843108654, + "loss/hidden": 3.207421875, + "loss/jsd": 0.0, + "loss/logits": 0.17445887122303247, + "step": 25290 + }, + { + "epoch": 0.6325, + "grad_norm": 35.0, + "grad_norm_var": 116.28098958333334, + "learning_rate": 0.0001, + "loss": 7.2917, + "loss/crossentropy": 2.073824466764927, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.18123935330659152, + "step": 25300 + }, + { + "epoch": 0.63275, + "grad_norm": 30.625, + "grad_norm_var": 195.25774739583332, + "learning_rate": 0.0001, + "loss": 7.293, + "loss/crossentropy": 2.102488835155964, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.1843341593630612, + "step": 25310 + }, + { + "epoch": 0.633, + "grad_norm": 30.125, + "grad_norm_var": 103.74973958333334, + "learning_rate": 0.0001, + "loss": 7.2951, + "loss/crossentropy": 1.9738748401403428, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.19301892705261708, + "step": 25320 + }, + { + "epoch": 0.63325, + "grad_norm": 30.25, + "grad_norm_var": 1.2635411529232722e+18, + "learning_rate": 0.0001, + "loss": 7.3154, + "loss/crossentropy": 2.1203658998012545, + "loss/hidden": 3.274609375, + "loss/jsd": 0.0, + "loss/logits": 0.1767484113574028, + "step": 25330 + }, + { + "epoch": 0.6335, + "grad_norm": 28.0, + "grad_norm_var": 1.2635411512184276e+18, + "learning_rate": 0.0001, + "loss": 7.3058, + "loss/crossentropy": 2.0557747781276703, + "loss/hidden": 3.344140625, + "loss/jsd": 0.0, + "loss/logits": 0.18306177966296672, + "step": 25340 + }, + { + "epoch": 0.63375, + "grad_norm": 28.625, + "grad_norm_var": 34.753125, + "learning_rate": 0.0001, + "loss": 7.0454, + "loss/crossentropy": 2.0685575902462006, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.1882263720035553, + "step": 25350 + }, + { + "epoch": 0.634, + "grad_norm": 28.875, + "grad_norm_var": 22.106705729166666, + "learning_rate": 0.0001, + "loss": 7.2534, + "loss/crossentropy": 2.208861267566681, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.197300866805017, + "step": 25360 + }, + { + "epoch": 0.63425, + "grad_norm": 28.125, + "grad_norm_var": 10.518489583333333, + "learning_rate": 0.0001, + "loss": 7.1867, + "loss/crossentropy": 2.133610498905182, + "loss/hidden": 3.48203125, + "loss/jsd": 0.0, + "loss/logits": 0.1941371016204357, + "step": 25370 + }, + { + "epoch": 0.6345, + "grad_norm": 28.875, + "grad_norm_var": 21.129166666666666, + "learning_rate": 0.0001, + "loss": 7.2715, + "loss/crossentropy": 2.063799539208412, + "loss/hidden": 3.273828125, + "loss/jsd": 0.0, + "loss/logits": 0.1706739742308855, + "step": 25380 + }, + { + "epoch": 0.63475, + "grad_norm": 38.25, + "grad_norm_var": 14.7416015625, + "learning_rate": 0.0001, + "loss": 7.0581, + "loss/crossentropy": 2.0165546610951424, + "loss/hidden": 3.141796875, + "loss/jsd": 0.0, + "loss/logits": 0.15498318960890173, + "step": 25390 + }, + { + "epoch": 0.635, + "grad_norm": 37.75, + "grad_norm_var": 14.0353515625, + "learning_rate": 0.0001, + "loss": 7.2961, + "loss/crossentropy": 2.240743267536163, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.18806628789752722, + "step": 25400 + }, + { + "epoch": 0.63525, + "grad_norm": 33.25, + "grad_norm_var": 19.630989583333335, + "learning_rate": 0.0001, + "loss": 7.1944, + "loss/crossentropy": 2.1175243996083735, + "loss/hidden": 3.28046875, + "loss/jsd": 0.0, + "loss/logits": 0.18017748612910509, + "step": 25410 + }, + { + "epoch": 0.6355, + "grad_norm": 30.375, + "grad_norm_var": 16.4025390625, + "learning_rate": 0.0001, + "loss": 7.2159, + "loss/crossentropy": 2.11346255838871, + "loss/hidden": 3.270703125, + "loss/jsd": 0.0, + "loss/logits": 0.1687102049589157, + "step": 25420 + }, + { + "epoch": 0.63575, + "grad_norm": 31.75, + "grad_norm_var": 41.88014322916667, + "learning_rate": 0.0001, + "loss": 7.2323, + "loss/crossentropy": 2.2356766797602177, + "loss/hidden": 3.31484375, + "loss/jsd": 0.0, + "loss/logits": 0.18139579724520444, + "step": 25430 + }, + { + "epoch": 0.636, + "grad_norm": 30.75, + "grad_norm_var": 13.415559895833333, + "learning_rate": 0.0001, + "loss": 7.1997, + "loss/crossentropy": 1.9421189457178116, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.17150872349739074, + "step": 25440 + }, + { + "epoch": 0.63625, + "grad_norm": 29.625, + "grad_norm_var": 5.9775390625, + "learning_rate": 0.0001, + "loss": 7.1837, + "loss/crossentropy": 2.0306406378746034, + "loss/hidden": 3.2375, + "loss/jsd": 0.0, + "loss/logits": 0.17749902671203016, + "step": 25450 + }, + { + "epoch": 0.6365, + "grad_norm": 30.0, + "grad_norm_var": 8156.041666666667, + "learning_rate": 0.0001, + "loss": 7.2467, + "loss/crossentropy": 2.0117203891277313, + "loss/hidden": 3.301953125, + "loss/jsd": 0.0, + "loss/logits": 0.17507544625550508, + "step": 25460 + }, + { + "epoch": 0.63675, + "grad_norm": 36.75, + "grad_norm_var": 18.21015625, + "learning_rate": 0.0001, + "loss": 7.2268, + "loss/crossentropy": 2.0377916976809503, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.17452282812446357, + "step": 25470 + }, + { + "epoch": 0.637, + "grad_norm": 29.625, + "grad_norm_var": 26.654166666666665, + "learning_rate": 0.0001, + "loss": 7.2038, + "loss/crossentropy": 2.0024905622005464, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.18442925848066807, + "step": 25480 + }, + { + "epoch": 0.63725, + "grad_norm": 27.125, + "grad_norm_var": 28.959309895833332, + "learning_rate": 0.0001, + "loss": 7.2125, + "loss/crossentropy": 2.0766751676797868, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.20771742928773165, + "step": 25490 + }, + { + "epoch": 0.6375, + "grad_norm": 34.75, + "grad_norm_var": 6.7650390625, + "learning_rate": 0.0001, + "loss": 7.1703, + "loss/crossentropy": 2.0064065635204313, + "loss/hidden": 3.289453125, + "loss/jsd": 0.0, + "loss/logits": 0.18250296991318465, + "step": 25500 + }, + { + "epoch": 0.63775, + "grad_norm": 30.75, + "grad_norm_var": 4.0697265625, + "learning_rate": 0.0001, + "loss": 7.2983, + "loss/crossentropy": 2.0614541321992874, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.2076763341203332, + "step": 25510 + }, + { + "epoch": 0.638, + "grad_norm": 29.5, + "grad_norm_var": 7.487239583333333, + "learning_rate": 0.0001, + "loss": 7.2289, + "loss/crossentropy": 1.9859767436981202, + "loss/hidden": 3.2984375, + "loss/jsd": 0.0, + "loss/logits": 0.17486365381628274, + "step": 25520 + }, + { + "epoch": 0.63825, + "grad_norm": 35.75, + "grad_norm_var": 17.522916666666667, + "learning_rate": 0.0001, + "loss": 7.2378, + "loss/crossentropy": 1.9697401717305183, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1799583438783884, + "step": 25530 + }, + { + "epoch": 0.6385, + "grad_norm": 28.375, + "grad_norm_var": 38.739322916666666, + "learning_rate": 0.0001, + "loss": 7.2664, + "loss/crossentropy": 2.1364367194473743, + "loss/hidden": 3.290625, + "loss/jsd": 0.0, + "loss/logits": 0.18297823830507695, + "step": 25540 + }, + { + "epoch": 0.63875, + "grad_norm": 30.125, + "grad_norm_var": 35.773372395833334, + "learning_rate": 0.0001, + "loss": 7.1437, + "loss/crossentropy": 2.0668285630643366, + "loss/hidden": 3.330078125, + "loss/jsd": 0.0, + "loss/logits": 0.1804805461317301, + "step": 25550 + }, + { + "epoch": 0.639, + "grad_norm": 28.625, + "grad_norm_var": 11.2619140625, + "learning_rate": 0.0001, + "loss": 7.2275, + "loss/crossentropy": 2.093304879963398, + "loss/hidden": 3.273046875, + "loss/jsd": 0.0, + "loss/logits": 0.18014004416763782, + "step": 25560 + }, + { + "epoch": 0.63925, + "grad_norm": 35.0, + "grad_norm_var": 10.3150390625, + "learning_rate": 0.0001, + "loss": 7.3097, + "loss/crossentropy": 2.20695668309927, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.19993221275508405, + "step": 25570 + }, + { + "epoch": 0.6395, + "grad_norm": 35.5, + "grad_norm_var": 6.303125, + "learning_rate": 0.0001, + "loss": 7.257, + "loss/crossentropy": 2.0658087216317655, + "loss/hidden": 3.303125, + "loss/jsd": 0.0, + "loss/logits": 0.18781778533011675, + "step": 25580 + }, + { + "epoch": 0.63975, + "grad_norm": 29.875, + "grad_norm_var": 4.42265625, + "learning_rate": 0.0001, + "loss": 7.2849, + "loss/crossentropy": 2.1931436568498612, + "loss/hidden": 3.33515625, + "loss/jsd": 0.0, + "loss/logits": 0.18365668430924414, + "step": 25590 + }, + { + "epoch": 0.64, + "grad_norm": 28.125, + "grad_norm_var": 4.941666666666666, + "learning_rate": 0.0001, + "loss": 7.3091, + "loss/crossentropy": 2.105237030982971, + "loss/hidden": 3.301953125, + "loss/jsd": 0.0, + "loss/logits": 0.18291805479675532, + "step": 25600 + }, + { + "epoch": 0.64025, + "grad_norm": 30.125, + "grad_norm_var": 1.7983723958333333, + "learning_rate": 0.0001, + "loss": 7.3204, + "loss/crossentropy": 1.995211163163185, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.17982382252812384, + "step": 25610 + }, + { + "epoch": 0.6405, + "grad_norm": 31.5, + "grad_norm_var": 1.4504557291666667, + "learning_rate": 0.0001, + "loss": 7.4159, + "loss/crossentropy": 2.0612260174006223, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.19814395047724248, + "step": 25620 + }, + { + "epoch": 0.64075, + "grad_norm": 29.25, + "grad_norm_var": 1.5655598958333334, + "learning_rate": 0.0001, + "loss": 7.3244, + "loss/crossentropy": 2.1825088411569595, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.17284797839820384, + "step": 25630 + }, + { + "epoch": 0.641, + "grad_norm": 30.625, + "grad_norm_var": 4.470572916666667, + "learning_rate": 0.0001, + "loss": 7.3127, + "loss/crossentropy": 2.123421373963356, + "loss/hidden": 3.304296875, + "loss/jsd": 0.0, + "loss/logits": 0.1951315425336361, + "step": 25640 + }, + { + "epoch": 0.64125, + "grad_norm": 29.25, + "grad_norm_var": 2.6211653257674097e+18, + "learning_rate": 0.0001, + "loss": 7.4566, + "loss/crossentropy": 1.9836766071617604, + "loss/hidden": 3.338671875, + "loss/jsd": 0.0, + "loss/logits": 0.18384753745049237, + "step": 25650 + }, + { + "epoch": 0.6415, + "grad_norm": 30.125, + "grad_norm_var": 10.3619140625, + "learning_rate": 0.0001, + "loss": 7.2435, + "loss/crossentropy": 2.024305807799101, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.18380997078493239, + "step": 25660 + }, + { + "epoch": 0.64175, + "grad_norm": 29.25, + "grad_norm_var": 6.990559895833333, + "learning_rate": 0.0001, + "loss": 7.3449, + "loss/crossentropy": 2.0873545318841935, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.1851763878017664, + "step": 25670 + }, + { + "epoch": 0.642, + "grad_norm": 29.75, + "grad_norm_var": 6.716080729166666, + "learning_rate": 0.0001, + "loss": 7.3962, + "loss/crossentropy": 1.953444430232048, + "loss/hidden": 3.45625, + "loss/jsd": 0.0, + "loss/logits": 0.18732787277549506, + "step": 25680 + }, + { + "epoch": 0.64225, + "grad_norm": 30.0, + "grad_norm_var": 4.412434895833333, + "learning_rate": 0.0001, + "loss": 7.3693, + "loss/crossentropy": 2.0779720529913903, + "loss/hidden": 3.45625, + "loss/jsd": 0.0, + "loss/logits": 0.19645520951598883, + "step": 25690 + }, + { + "epoch": 0.6425, + "grad_norm": 36.0, + "grad_norm_var": 6.120247395833333, + "learning_rate": 0.0001, + "loss": 7.2773, + "loss/crossentropy": 2.080344262719154, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.18719701357185842, + "step": 25700 + }, + { + "epoch": 0.64275, + "grad_norm": 31.75, + "grad_norm_var": 5.784309895833333, + "learning_rate": 0.0001, + "loss": 7.2415, + "loss/crossentropy": 2.1494850531220435, + "loss/hidden": 3.44375, + "loss/jsd": 0.0, + "loss/logits": 0.2048310289159417, + "step": 25710 + }, + { + "epoch": 0.643, + "grad_norm": 30.75, + "grad_norm_var": 1.9572916666666667, + "learning_rate": 0.0001, + "loss": 7.1719, + "loss/crossentropy": 2.06035181209445, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.19010064098984003, + "step": 25720 + }, + { + "epoch": 0.64325, + "grad_norm": 29.5, + "grad_norm_var": 1.4150390625, + "learning_rate": 0.0001, + "loss": 7.3085, + "loss/crossentropy": 1.9468969091773034, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.18584002749994397, + "step": 25730 + }, + { + "epoch": 0.6435, + "grad_norm": 33.25, + "grad_norm_var": 2.221875, + "learning_rate": 0.0001, + "loss": 7.2789, + "loss/crossentropy": 1.9196617759764194, + "loss/hidden": 3.416796875, + "loss/jsd": 0.0, + "loss/logits": 0.17771600587293507, + "step": 25740 + }, + { + "epoch": 0.64375, + "grad_norm": 30.25, + "grad_norm_var": 14.9853515625, + "learning_rate": 0.0001, + "loss": 7.2678, + "loss/crossentropy": 2.1726732790470122, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.18251333478838205, + "step": 25750 + }, + { + "epoch": 0.644, + "grad_norm": 30.25, + "grad_norm_var": 2.2059895833333334, + "learning_rate": 0.0001, + "loss": 7.3832, + "loss/crossentropy": 2.1877689242362974, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.19289560578763484, + "step": 25760 + }, + { + "epoch": 0.64425, + "grad_norm": 62.75, + "grad_norm_var": 69.259375, + "learning_rate": 0.0001, + "loss": 7.2773, + "loss/crossentropy": 2.070755659043789, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.17988401111215352, + "step": 25770 + }, + { + "epoch": 0.6445, + "grad_norm": 29.375, + "grad_norm_var": 113.71399739583333, + "learning_rate": 0.0001, + "loss": 7.3734, + "loss/crossentropy": 2.106437236815691, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.19078649543225765, + "step": 25780 + }, + { + "epoch": 0.64475, + "grad_norm": 30.625, + "grad_norm_var": 1.3952473958333333, + "learning_rate": 0.0001, + "loss": 7.3165, + "loss/crossentropy": 2.1690757513046264, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19081941563636065, + "step": 25790 + }, + { + "epoch": 0.645, + "grad_norm": 28.75, + "grad_norm_var": 1.6077473958333333, + "learning_rate": 0.0001, + "loss": 7.3326, + "loss/crossentropy": 2.1447702586650848, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.19072114508599042, + "step": 25800 + }, + { + "epoch": 0.64525, + "grad_norm": 30.5, + "grad_norm_var": 2.5410807291666666, + "learning_rate": 0.0001, + "loss": 7.3061, + "loss/crossentropy": 2.243308076262474, + "loss/hidden": 3.391796875, + "loss/jsd": 0.0, + "loss/logits": 0.19677081871777774, + "step": 25810 + }, + { + "epoch": 0.6455, + "grad_norm": 29.25, + "grad_norm_var": 2.2811848958333334, + "learning_rate": 0.0001, + "loss": 7.2598, + "loss/crossentropy": 2.1137851983308793, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.18019928000867366, + "step": 25820 + }, + { + "epoch": 0.64575, + "grad_norm": 28.875, + "grad_norm_var": 3.3067057291666666, + "learning_rate": 0.0001, + "loss": 7.2768, + "loss/crossentropy": 2.046476516127586, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.1881034353747964, + "step": 25830 + }, + { + "epoch": 0.646, + "grad_norm": 30.875, + "grad_norm_var": 7.928059895833333, + "learning_rate": 0.0001, + "loss": 7.4478, + "loss/crossentropy": 1.9755774058401585, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.18677257960662247, + "step": 25840 + }, + { + "epoch": 0.64625, + "grad_norm": 29.0, + "grad_norm_var": 10.15625, + "learning_rate": 0.0001, + "loss": 7.35, + "loss/crossentropy": 2.0693862795829774, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.18849658444523812, + "step": 25850 + }, + { + "epoch": 0.6465, + "grad_norm": 30.125, + "grad_norm_var": 10.2125, + "learning_rate": 0.0001, + "loss": 7.3352, + "loss/crossentropy": 2.121247774362564, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.20228977780789137, + "step": 25860 + }, + { + "epoch": 0.64675, + "grad_norm": 29.75, + "grad_norm_var": 3.5160807291666667, + "learning_rate": 0.0001, + "loss": 7.3408, + "loss/crossentropy": 2.0337157592177393, + "loss/hidden": 3.45390625, + "loss/jsd": 0.0, + "loss/logits": 0.18892905581742525, + "step": 25870 + }, + { + "epoch": 0.647, + "grad_norm": 29.75, + "grad_norm_var": 4.974739583333333, + "learning_rate": 0.0001, + "loss": 7.3046, + "loss/crossentropy": 1.9749686062335967, + "loss/hidden": 3.486328125, + "loss/jsd": 0.0, + "loss/logits": 0.19374344814568759, + "step": 25880 + }, + { + "epoch": 0.64725, + "grad_norm": 33.25, + "grad_norm_var": 5.467708333333333, + "learning_rate": 0.0001, + "loss": 7.3869, + "loss/crossentropy": 2.083652201294899, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.2009931992739439, + "step": 25890 + }, + { + "epoch": 0.6475, + "grad_norm": 34.5, + "grad_norm_var": 4.03515625, + "learning_rate": 0.0001, + "loss": 7.3113, + "loss/crossentropy": 2.0592153131961823, + "loss/hidden": 3.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.19190128184854985, + "step": 25900 + }, + { + "epoch": 0.64775, + "grad_norm": 28.625, + "grad_norm_var": 4.687955729166666, + "learning_rate": 0.0001, + "loss": 7.2909, + "loss/crossentropy": 1.9693334527313708, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.1817644027993083, + "step": 25910 + }, + { + "epoch": 0.648, + "grad_norm": 31.125, + "grad_norm_var": 2.124739583333333, + "learning_rate": 0.0001, + "loss": 7.3546, + "loss/crossentropy": 2.079821580648422, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.17972610034048558, + "step": 25920 + }, + { + "epoch": 0.64825, + "grad_norm": 28.375, + "grad_norm_var": 3.4934895833333335, + "learning_rate": 0.0001, + "loss": 7.2074, + "loss/crossentropy": 2.1161312617361547, + "loss/hidden": 3.34296875, + "loss/jsd": 0.0, + "loss/logits": 0.1969278533011675, + "step": 25930 + }, + { + "epoch": 0.6485, + "grad_norm": 27.25, + "grad_norm_var": 1.8212890625, + "learning_rate": 0.0001, + "loss": 7.3702, + "loss/crossentropy": 2.1124040342867376, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.18695534877479075, + "step": 25940 + }, + { + "epoch": 0.64875, + "grad_norm": 34.25, + "grad_norm_var": 4.521809895833333, + "learning_rate": 0.0001, + "loss": 7.4366, + "loss/crossentropy": 2.1193448536098005, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.18631769344210625, + "step": 25950 + }, + { + "epoch": 0.649, + "grad_norm": 30.875, + "grad_norm_var": 4.4697265625, + "learning_rate": 0.0001, + "loss": 7.4333, + "loss/crossentropy": 2.1864205464720725, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.19288004878908396, + "step": 25960 + }, + { + "epoch": 0.64925, + "grad_norm": 32.25, + "grad_norm_var": 2.65390625, + "learning_rate": 0.0001, + "loss": 7.3549, + "loss/crossentropy": 2.0109216958284377, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.18039658814668655, + "step": 25970 + }, + { + "epoch": 0.6495, + "grad_norm": 6677331968.0, + "grad_norm_var": 2.786672611783344e+18, + "learning_rate": 0.0001, + "loss": 7.3897, + "loss/crossentropy": 1.9869451895356178, + "loss/hidden": 3.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.17617017943412067, + "step": 25980 + }, + { + "epoch": 0.64975, + "grad_norm": 29.25, + "grad_norm_var": 2.7866726110599665e+18, + "learning_rate": 0.0001, + "loss": 7.2854, + "loss/crossentropy": 2.2265473932027815, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.2203309465199709, + "step": 25990 + }, + { + "epoch": 0.65, + "grad_norm": 30.5, + "grad_norm_var": 2.5822265625, + "learning_rate": 0.0001, + "loss": 7.3065, + "loss/crossentropy": 2.0998080484569073, + "loss/hidden": 3.4484375, + "loss/jsd": 0.0, + "loss/logits": 0.18724181046709418, + "step": 26000 + }, + { + "epoch": 0.65025, + "grad_norm": 29.0, + "grad_norm_var": 9.589518229166666, + "learning_rate": 0.0001, + "loss": 7.0924, + "loss/crossentropy": 1.897086238861084, + "loss/hidden": 3.268359375, + "loss/jsd": 0.0, + "loss/logits": 0.16815462270751597, + "step": 26010 + }, + { + "epoch": 0.6505, + "grad_norm": 28.75, + "grad_norm_var": 9.5869140625, + "learning_rate": 0.0001, + "loss": 7.2116, + "loss/crossentropy": 2.1296266965568065, + "loss/hidden": 3.293359375, + "loss/jsd": 0.0, + "loss/logits": 0.18028478175401688, + "step": 26020 + }, + { + "epoch": 0.65075, + "grad_norm": 28.75, + "grad_norm_var": 3.5872395833333335, + "learning_rate": 0.0001, + "loss": 7.2718, + "loss/crossentropy": 2.0453512027859686, + "loss/hidden": 3.26484375, + "loss/jsd": 0.0, + "loss/logits": 0.17293851375579833, + "step": 26030 + }, + { + "epoch": 0.651, + "grad_norm": 32.0, + "grad_norm_var": 24.023372395833334, + "learning_rate": 0.0001, + "loss": 7.3717, + "loss/crossentropy": 2.1030614957213403, + "loss/hidden": 3.29296875, + "loss/jsd": 0.0, + "loss/logits": 0.17731762621551753, + "step": 26040 + }, + { + "epoch": 0.65125, + "grad_norm": 28.75, + "grad_norm_var": 9.29140625, + "learning_rate": 0.0001, + "loss": 7.1228, + "loss/crossentropy": 2.021975876390934, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.18740149699151515, + "step": 26050 + }, + { + "epoch": 0.6515, + "grad_norm": 31.0, + "grad_norm_var": 2.4125, + "learning_rate": 0.0001, + "loss": 7.2849, + "loss/crossentropy": 2.0947029411792757, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.18155105784535408, + "step": 26060 + }, + { + "epoch": 0.65175, + "grad_norm": 31.25, + "grad_norm_var": 2.3160807291666665, + "learning_rate": 0.0001, + "loss": 7.3504, + "loss/crossentropy": 2.0333006374537943, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.21557488106191158, + "step": 26070 + }, + { + "epoch": 0.652, + "grad_norm": 66.0, + "grad_norm_var": 78.5587890625, + "learning_rate": 0.0001, + "loss": 7.2023, + "loss/crossentropy": 2.028635681420565, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.18349210238084196, + "step": 26080 + }, + { + "epoch": 0.65225, + "grad_norm": 29.875, + "grad_norm_var": 81.18723958333334, + "learning_rate": 0.0001, + "loss": 7.2882, + "loss/crossentropy": 2.110158285498619, + "loss/hidden": 3.305078125, + "loss/jsd": 0.0, + "loss/logits": 0.18722082171589136, + "step": 26090 + }, + { + "epoch": 0.6525, + "grad_norm": 29.375, + "grad_norm_var": 2.225, + "learning_rate": 0.0001, + "loss": 7.251, + "loss/crossentropy": 2.032511693239212, + "loss/hidden": 3.329296875, + "loss/jsd": 0.0, + "loss/logits": 0.20495209330692887, + "step": 26100 + }, + { + "epoch": 0.65275, + "grad_norm": 30.5, + "grad_norm_var": 1.6942057291666666, + "learning_rate": 0.0001, + "loss": 7.3307, + "loss/crossentropy": 2.1366796389222147, + "loss/hidden": 3.29453125, + "loss/jsd": 0.0, + "loss/logits": 0.18084107786417009, + "step": 26110 + }, + { + "epoch": 0.653, + "grad_norm": 33.0, + "grad_norm_var": 1.9559895833333334, + "learning_rate": 0.0001, + "loss": 7.2364, + "loss/crossentropy": 2.198021276295185, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.1900389553979039, + "step": 26120 + }, + { + "epoch": 0.65325, + "grad_norm": 30.125, + "grad_norm_var": 2.31640625, + "learning_rate": 0.0001, + "loss": 7.2938, + "loss/crossentropy": 2.0711926236748694, + "loss/hidden": 3.28203125, + "loss/jsd": 0.0, + "loss/logits": 0.17610593941062688, + "step": 26130 + }, + { + "epoch": 0.6535, + "grad_norm": 30.375, + "grad_norm_var": 2.4072916666666666, + "learning_rate": 0.0001, + "loss": 7.2918, + "loss/crossentropy": 2.1544741809368135, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.1978639720007777, + "step": 26140 + }, + { + "epoch": 0.65375, + "grad_norm": 28.75, + "grad_norm_var": 3.3955729166666666, + "learning_rate": 0.0001, + "loss": 7.2798, + "loss/crossentropy": 2.0444336414337156, + "loss/hidden": 3.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.19784381221979858, + "step": 26150 + }, + { + "epoch": 0.654, + "grad_norm": 28.875, + "grad_norm_var": 3.7181640625, + "learning_rate": 0.0001, + "loss": 7.1077, + "loss/crossentropy": 1.962803577259183, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.17380120740272104, + "step": 26160 + }, + { + "epoch": 0.65425, + "grad_norm": 28.75, + "grad_norm_var": 10.5541015625, + "learning_rate": 0.0001, + "loss": 7.2623, + "loss/crossentropy": 2.0290966272354125, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.1842226268723607, + "step": 26170 + }, + { + "epoch": 0.6545, + "grad_norm": 31.5, + "grad_norm_var": 8.968489583333334, + "learning_rate": 0.0001, + "loss": 7.2974, + "loss/crossentropy": 2.0671102195978164, + "loss/hidden": 3.2921875, + "loss/jsd": 0.0, + "loss/logits": 0.1779710568487644, + "step": 26180 + }, + { + "epoch": 0.65475, + "grad_norm": 29.125, + "grad_norm_var": 2.0238932291666667, + "learning_rate": 0.0001, + "loss": 7.1929, + "loss/crossentropy": 1.9803795799613, + "loss/hidden": 3.2546875, + "loss/jsd": 0.0, + "loss/logits": 0.16597625594586135, + "step": 26190 + }, + { + "epoch": 0.655, + "grad_norm": 31.75, + "grad_norm_var": 1.8580729166666667, + "learning_rate": 0.0001, + "loss": 7.2966, + "loss/crossentropy": 2.0811556324362757, + "loss/hidden": 3.396484375, + "loss/jsd": 0.0, + "loss/logits": 0.1854775669053197, + "step": 26200 + }, + { + "epoch": 0.65525, + "grad_norm": 30.125, + "grad_norm_var": 1.6707682291666666, + "learning_rate": 0.0001, + "loss": 7.2357, + "loss/crossentropy": 2.1400867104530334, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.186828719265759, + "step": 26210 + }, + { + "epoch": 0.6555, + "grad_norm": 29.625, + "grad_norm_var": 3.0072916666666667, + "learning_rate": 0.0001, + "loss": 7.3599, + "loss/crossentropy": 2.144706717133522, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.19139624712988734, + "step": 26220 + }, + { + "epoch": 0.65575, + "grad_norm": 30.25, + "grad_norm_var": 3.9947265625, + "learning_rate": 0.0001, + "loss": 7.3455, + "loss/crossentropy": 1.9842251062393188, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.18394090216606857, + "step": 26230 + }, + { + "epoch": 0.656, + "grad_norm": 30.75, + "grad_norm_var": 2.8645833333333335, + "learning_rate": 0.0001, + "loss": 7.2688, + "loss/crossentropy": 2.0962007105350495, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.19022967889904976, + "step": 26240 + }, + { + "epoch": 0.65625, + "grad_norm": 30.5, + "grad_norm_var": 1.3143229166666666, + "learning_rate": 0.0001, + "loss": 7.1723, + "loss/crossentropy": 2.0132046937942505, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.1787968127988279, + "step": 26250 + }, + { + "epoch": 0.6565, + "grad_norm": 30.5, + "grad_norm_var": 1.4431640625, + "learning_rate": 0.0001, + "loss": 7.3366, + "loss/crossentropy": 1.9615045070648194, + "loss/hidden": 3.530078125, + "loss/jsd": 0.0, + "loss/logits": 0.1971464239060879, + "step": 26260 + }, + { + "epoch": 0.65675, + "grad_norm": 32.75, + "grad_norm_var": 2.0830729166666666, + "learning_rate": 0.0001, + "loss": 7.1952, + "loss/crossentropy": 2.1030160948634147, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.19630551328882576, + "step": 26270 + }, + { + "epoch": 0.657, + "grad_norm": 29.625, + "grad_norm_var": 2.318489583333333, + "learning_rate": 0.0001, + "loss": 7.3209, + "loss/crossentropy": 2.2092444196343424, + "loss/hidden": 3.28046875, + "loss/jsd": 0.0, + "loss/logits": 0.18095392240211366, + "step": 26280 + }, + { + "epoch": 0.65725, + "grad_norm": 29.875, + "grad_norm_var": 2.3046223958333334, + "learning_rate": 0.0001, + "loss": 7.2758, + "loss/crossentropy": 2.183464777469635, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.18903474770486356, + "step": 26290 + }, + { + "epoch": 0.6575, + "grad_norm": 31.25, + "grad_norm_var": 1.9020182291666667, + "learning_rate": 0.0001, + "loss": 7.4311, + "loss/crossentropy": 2.121826934814453, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.18777972608804702, + "step": 26300 + }, + { + "epoch": 0.65775, + "grad_norm": 32.25, + "grad_norm_var": 1.5416015625, + "learning_rate": 0.0001, + "loss": 7.3132, + "loss/crossentropy": 1.9226845420897007, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.17548658936284484, + "step": 26310 + }, + { + "epoch": 0.658, + "grad_norm": 30.0, + "grad_norm_var": 3.8861138654567685e+18, + "learning_rate": 0.0001, + "loss": 7.347, + "loss/crossentropy": 2.1671392902731896, + "loss/hidden": 3.2453125, + "loss/jsd": 0.0, + "loss/logits": 0.18836958818137645, + "step": 26320 + }, + { + "epoch": 0.65825, + "grad_norm": 30.375, + "grad_norm_var": 5.1875, + "learning_rate": 0.0001, + "loss": 7.2642, + "loss/crossentropy": 2.178332243859768, + "loss/hidden": 3.2421875, + "loss/jsd": 0.0, + "loss/logits": 0.17817392963916062, + "step": 26330 + }, + { + "epoch": 0.6585, + "grad_norm": 27.875, + "grad_norm_var": 1.3184895833333334, + "learning_rate": 0.0001, + "loss": 7.2345, + "loss/crossentropy": 2.0184032306075097, + "loss/hidden": 3.3078125, + "loss/jsd": 0.0, + "loss/logits": 0.18142256420105696, + "step": 26340 + }, + { + "epoch": 0.65875, + "grad_norm": 29.875, + "grad_norm_var": 3.35390625, + "learning_rate": 0.0001, + "loss": 7.4358, + "loss/crossentropy": 2.070467638969421, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.18492308855056763, + "step": 26350 + }, + { + "epoch": 0.659, + "grad_norm": 30.5, + "grad_norm_var": 37.557291666666664, + "learning_rate": 0.0001, + "loss": 7.4061, + "loss/crossentropy": 2.0928609974682333, + "loss/hidden": 3.431640625, + "loss/jsd": 0.0, + "loss/logits": 0.19751368090510368, + "step": 26360 + }, + { + "epoch": 0.65925, + "grad_norm": 31.625, + "grad_norm_var": 48.10182291666667, + "learning_rate": 0.0001, + "loss": 7.3098, + "loss/crossentropy": 2.0748341269791126, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.19011560268700123, + "step": 26370 + }, + { + "epoch": 0.6595, + "grad_norm": 32.25, + "grad_norm_var": 8.820833333333333, + "learning_rate": 0.0001, + "loss": 7.3229, + "loss/crossentropy": 1.9827906161546707, + "loss/hidden": 3.305078125, + "loss/jsd": 0.0, + "loss/logits": 0.17519382825121282, + "step": 26380 + }, + { + "epoch": 0.65975, + "grad_norm": 30.375, + "grad_norm_var": 21.3150390625, + "learning_rate": 0.0001, + "loss": 7.3129, + "loss/crossentropy": 2.027328287810087, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.1863293987698853, + "step": 26390 + }, + { + "epoch": 0.66, + "grad_norm": 34.25, + "grad_norm_var": 11.154622395833334, + "learning_rate": 0.0001, + "loss": 7.3147, + "loss/crossentropy": 2.0382798433303835, + "loss/hidden": 3.2234375, + "loss/jsd": 0.0, + "loss/logits": 0.17127347458153963, + "step": 26400 + }, + { + "epoch": 0.66025, + "grad_norm": 29.125, + "grad_norm_var": 11.134309895833333, + "learning_rate": 0.0001, + "loss": 7.1528, + "loss/crossentropy": 2.187176838517189, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.18337102625519036, + "step": 26410 + }, + { + "epoch": 0.6605, + "grad_norm": 30.875, + "grad_norm_var": 2.5205729166666666, + "learning_rate": 0.0001, + "loss": 7.1994, + "loss/crossentropy": 1.8949346490204335, + "loss/hidden": 3.483984375, + "loss/jsd": 0.0, + "loss/logits": 0.1746540881693363, + "step": 26420 + }, + { + "epoch": 0.66075, + "grad_norm": 29.875, + "grad_norm_var": 6.1259765625, + "learning_rate": 0.0001, + "loss": 7.3588, + "loss/crossentropy": 2.1892473474144936, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.18937738612294197, + "step": 26430 + }, + { + "epoch": 0.661, + "grad_norm": 29.5, + "grad_norm_var": 5.56640625, + "learning_rate": 0.0001, + "loss": 7.3516, + "loss/crossentropy": 2.1834142342209817, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.19312054924666883, + "step": 26440 + }, + { + "epoch": 0.66125, + "grad_norm": 32.0, + "grad_norm_var": 2.439583333333333, + "learning_rate": 0.0001, + "loss": 7.3007, + "loss/crossentropy": 2.1448648869991302, + "loss/hidden": 3.483203125, + "loss/jsd": 0.0, + "loss/logits": 0.2111048873513937, + "step": 26450 + }, + { + "epoch": 0.6615, + "grad_norm": 31.0, + "grad_norm_var": 2.1797421966524677e+18, + "learning_rate": 0.0001, + "loss": 7.3607, + "loss/crossentropy": 1.9932497456669807, + "loss/hidden": 3.45390625, + "loss/jsd": 0.0, + "loss/logits": 0.2144283948466182, + "step": 26460 + }, + { + "epoch": 0.66175, + "grad_norm": 34.5, + "grad_norm_var": 5.677018229166666, + "learning_rate": 0.0001, + "loss": 7.306, + "loss/crossentropy": 2.0623308643698692, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.18320082779973745, + "step": 26470 + }, + { + "epoch": 0.662, + "grad_norm": 27.0, + "grad_norm_var": 4.14765625, + "learning_rate": 0.0001, + "loss": 7.3311, + "loss/crossentropy": 2.0493450671434403, + "loss/hidden": 3.4734375, + "loss/jsd": 0.0, + "loss/logits": 0.19104604441672562, + "step": 26480 + }, + { + "epoch": 0.66225, + "grad_norm": 29.75, + "grad_norm_var": 3.660872395833333, + "learning_rate": 0.0001, + "loss": 7.2653, + "loss/crossentropy": 2.031788290292025, + "loss/hidden": 3.29609375, + "loss/jsd": 0.0, + "loss/logits": 0.1821944302879274, + "step": 26490 + }, + { + "epoch": 0.6625, + "grad_norm": 30.125, + "grad_norm_var": 1.8124348958333334, + "learning_rate": 0.0001, + "loss": 7.4374, + "loss/crossentropy": 2.1484886214137076, + "loss/hidden": 3.4734375, + "loss/jsd": 0.0, + "loss/logits": 0.20224297530949115, + "step": 26500 + }, + { + "epoch": 0.66275, + "grad_norm": 29.0, + "grad_norm_var": 3.9587890625, + "learning_rate": 0.0001, + "loss": 7.2393, + "loss/crossentropy": 2.018560293316841, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.18759384863078593, + "step": 26510 + }, + { + "epoch": 0.663, + "grad_norm": 30.5, + "grad_norm_var": 2.620247395833333, + "learning_rate": 0.0001, + "loss": 7.3181, + "loss/crossentropy": 2.145674556493759, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.18435985408723354, + "step": 26520 + }, + { + "epoch": 0.66325, + "grad_norm": 31.125, + "grad_norm_var": 7.412239583333333, + "learning_rate": 0.0001, + "loss": 7.2507, + "loss/crossentropy": 2.0798817604780195, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.18773154616355897, + "step": 26530 + }, + { + "epoch": 0.6635, + "grad_norm": 29.5, + "grad_norm_var": 7.8681640625, + "learning_rate": 0.0001, + "loss": 7.2586, + "loss/crossentropy": 2.061510816216469, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.19057576302438975, + "step": 26540 + }, + { + "epoch": 0.66375, + "grad_norm": 31.125, + "grad_norm_var": 6.52890625, + "learning_rate": 0.0001, + "loss": 7.3062, + "loss/crossentropy": 2.072283536195755, + "loss/hidden": 3.26328125, + "loss/jsd": 0.0, + "loss/logits": 0.18644256051629782, + "step": 26550 + }, + { + "epoch": 0.664, + "grad_norm": 29.5, + "grad_norm_var": 7.605143229166667, + "learning_rate": 0.0001, + "loss": 7.2383, + "loss/crossentropy": 1.9341054081916809, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.16742698140442372, + "step": 26560 + }, + { + "epoch": 0.66425, + "grad_norm": 31.125, + "grad_norm_var": 2.6518229166666667, + "learning_rate": 0.0001, + "loss": 7.244, + "loss/crossentropy": 2.0679861828684807, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.17691615503281355, + "step": 26570 + }, + { + "epoch": 0.6645, + "grad_norm": 31.125, + "grad_norm_var": 4.612434895833333, + "learning_rate": 0.0001, + "loss": 7.3328, + "loss/crossentropy": 2.1904489412903785, + "loss/hidden": 3.303125, + "loss/jsd": 0.0, + "loss/logits": 0.1809101757593453, + "step": 26580 + }, + { + "epoch": 0.66475, + "grad_norm": 38.5, + "grad_norm_var": 7.606705729166666, + "learning_rate": 0.0001, + "loss": 7.2347, + "loss/crossentropy": 1.9506058931350707, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.19015372060239316, + "step": 26590 + }, + { + "epoch": 0.665, + "grad_norm": 30.0, + "grad_norm_var": 6.116080729166667, + "learning_rate": 0.0001, + "loss": 7.1566, + "loss/crossentropy": 1.8624847821891308, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.1748434605076909, + "step": 26600 + }, + { + "epoch": 0.66525, + "grad_norm": 30.375, + "grad_norm_var": 3.2030598958333334, + "learning_rate": 0.0001, + "loss": 7.2837, + "loss/crossentropy": 2.1139489620923997, + "loss/hidden": 3.26953125, + "loss/jsd": 0.0, + "loss/logits": 0.1822458105161786, + "step": 26610 + }, + { + "epoch": 0.6655, + "grad_norm": 28.875, + "grad_norm_var": 46.1978515625, + "learning_rate": 0.0001, + "loss": 7.1554, + "loss/crossentropy": 1.8614851802587509, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.16477091368287802, + "step": 26620 + }, + { + "epoch": 0.66575, + "grad_norm": 30.75, + "grad_norm_var": 2.6504557291666666, + "learning_rate": 0.0001, + "loss": 7.2609, + "loss/crossentropy": 2.2226543337106706, + "loss/hidden": 3.301953125, + "loss/jsd": 0.0, + "loss/logits": 0.1956513339653611, + "step": 26630 + }, + { + "epoch": 0.666, + "grad_norm": 31.125, + "grad_norm_var": 3.601822916666667, + "learning_rate": 0.0001, + "loss": 7.2908, + "loss/crossentropy": 2.0410634987056255, + "loss/hidden": 3.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.171825285628438, + "step": 26640 + }, + { + "epoch": 0.66625, + "grad_norm": 28.75, + "grad_norm_var": 19.392708333333335, + "learning_rate": 0.0001, + "loss": 7.2709, + "loss/crossentropy": 2.044853265583515, + "loss/hidden": 3.29921875, + "loss/jsd": 0.0, + "loss/logits": 0.18632043413817884, + "step": 26650 + }, + { + "epoch": 0.6665, + "grad_norm": 31.0, + "grad_norm_var": 4.559375, + "learning_rate": 0.0001, + "loss": 7.2649, + "loss/crossentropy": 2.1507553339004515, + "loss/hidden": 3.284375, + "loss/jsd": 0.0, + "loss/logits": 0.17937973625957965, + "step": 26660 + }, + { + "epoch": 0.66675, + "grad_norm": 33.25, + "grad_norm_var": 3.41875, + "learning_rate": 0.0001, + "loss": 7.3469, + "loss/crossentropy": 2.1353399351239206, + "loss/hidden": 3.253125, + "loss/jsd": 0.0, + "loss/logits": 0.17566792499274014, + "step": 26670 + }, + { + "epoch": 0.667, + "grad_norm": 28.125, + "grad_norm_var": 2.9119140625, + "learning_rate": 0.0001, + "loss": 7.2886, + "loss/crossentropy": 2.0442838847637175, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.1838838504627347, + "step": 26680 + }, + { + "epoch": 0.66725, + "grad_norm": 29.75, + "grad_norm_var": 3.5166015625, + "learning_rate": 0.0001, + "loss": 7.268, + "loss/crossentropy": 2.0146602764725685, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.189494782499969, + "step": 26690 + }, + { + "epoch": 0.6675, + "grad_norm": 31.375, + "grad_norm_var": 2.2447916666666665, + "learning_rate": 0.0001, + "loss": 7.2655, + "loss/crossentropy": 2.062611496448517, + "loss/hidden": 3.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.20330444928258656, + "step": 26700 + }, + { + "epoch": 0.66775, + "grad_norm": 28.375, + "grad_norm_var": 3.17265625, + "learning_rate": 0.0001, + "loss": 7.3095, + "loss/crossentropy": 2.1450613498687745, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.18933778218925, + "step": 26710 + }, + { + "epoch": 0.668, + "grad_norm": 30.25, + "grad_norm_var": 1.965625, + "learning_rate": 0.0001, + "loss": 7.2676, + "loss/crossentropy": 1.9853663839399815, + "loss/hidden": 3.250390625, + "loss/jsd": 0.0, + "loss/logits": 0.17568059861660004, + "step": 26720 + }, + { + "epoch": 0.66825, + "grad_norm": 29.75, + "grad_norm_var": 115517274069465.19, + "learning_rate": 0.0001, + "loss": 7.3871, + "loss/crossentropy": 2.2481682300567627, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.19613477736711502, + "step": 26730 + }, + { + "epoch": 0.6685, + "grad_norm": 29.875, + "grad_norm_var": 115517270039004.08, + "learning_rate": 0.0001, + "loss": 7.3836, + "loss/crossentropy": 2.221486309170723, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.19494929853826762, + "step": 26740 + }, + { + "epoch": 0.66875, + "grad_norm": 27.875, + "grad_norm_var": 1.67890625, + "learning_rate": 0.0001, + "loss": 7.2099, + "loss/crossentropy": 2.0120978243649006, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.16912673255428673, + "step": 26750 + }, + { + "epoch": 0.669, + "grad_norm": 28.0, + "grad_norm_var": 3.424934895833333, + "learning_rate": 0.0001, + "loss": 7.2854, + "loss/crossentropy": 2.1522657334804536, + "loss/hidden": 3.284375, + "loss/jsd": 0.0, + "loss/logits": 0.17450021374970676, + "step": 26760 + }, + { + "epoch": 0.66925, + "grad_norm": 32.5, + "grad_norm_var": 10.490625, + "learning_rate": 0.0001, + "loss": 7.4043, + "loss/crossentropy": 2.03169424161315, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.18973638992756606, + "step": 26770 + }, + { + "epoch": 0.6695, + "grad_norm": 40.5, + "grad_norm_var": 13.8400390625, + "learning_rate": 0.0001, + "loss": 7.3005, + "loss/crossentropy": 2.010363797843456, + "loss/hidden": 3.431640625, + "loss/jsd": 0.0, + "loss/logits": 0.2009760271757841, + "step": 26780 + }, + { + "epoch": 0.66975, + "grad_norm": 29.875, + "grad_norm_var": 9.314518229166667, + "learning_rate": 0.0001, + "loss": 7.3094, + "loss/crossentropy": 2.0870240703225136, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.19831706527620555, + "step": 26790 + }, + { + "epoch": 0.67, + "grad_norm": 34.0, + "grad_norm_var": 2.0208333333333335, + "learning_rate": 0.0001, + "loss": 7.3591, + "loss/crossentropy": 1.9903123408555985, + "loss/hidden": 3.4703125, + "loss/jsd": 0.0, + "loss/logits": 0.18768544010818006, + "step": 26800 + }, + { + "epoch": 0.67025, + "grad_norm": 30.75, + "grad_norm_var": 5.1962890625, + "learning_rate": 0.0001, + "loss": 7.3379, + "loss/crossentropy": 1.9887514635920525, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.18731831386685371, + "step": 26810 + }, + { + "epoch": 0.6705, + "grad_norm": 30.875, + "grad_norm_var": 5.4275390625, + "learning_rate": 0.0001, + "loss": 7.2099, + "loss/crossentropy": 2.2334194481372833, + "loss/hidden": 3.294921875, + "loss/jsd": 0.0, + "loss/logits": 0.18677575308829547, + "step": 26820 + }, + { + "epoch": 0.67075, + "grad_norm": 30.5, + "grad_norm_var": 2.9624348958333333, + "learning_rate": 0.0001, + "loss": 7.2166, + "loss/crossentropy": 2.09789634346962, + "loss/hidden": 3.352734375, + "loss/jsd": 0.0, + "loss/logits": 0.1792703490704298, + "step": 26830 + }, + { + "epoch": 0.671, + "grad_norm": 28.75, + "grad_norm_var": 1.0145833333333334, + "learning_rate": 0.0001, + "loss": 7.2917, + "loss/crossentropy": 2.083233179152012, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.1986743938177824, + "step": 26840 + }, + { + "epoch": 0.67125, + "grad_norm": 37.25, + "grad_norm_var": 23.931184895833333, + "learning_rate": 0.0001, + "loss": 7.2443, + "loss/crossentropy": 2.0056370086967945, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.1743487582542002, + "step": 26850 + }, + { + "epoch": 0.6715, + "grad_norm": 30.0, + "grad_norm_var": 21.345768229166666, + "learning_rate": 0.0001, + "loss": 7.2698, + "loss/crossentropy": 2.222350174188614, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.199802653118968, + "step": 26860 + }, + { + "epoch": 0.67175, + "grad_norm": 30.5, + "grad_norm_var": 106.80930989583334, + "learning_rate": 0.0001, + "loss": 7.3626, + "loss/crossentropy": 2.0834795638918875, + "loss/hidden": 3.463671875, + "loss/jsd": 0.0, + "loss/logits": 0.18940293360501528, + "step": 26870 + }, + { + "epoch": 0.672, + "grad_norm": 30.875, + "grad_norm_var": 2.638541666666667, + "learning_rate": 0.0001, + "loss": 7.3212, + "loss/crossentropy": 2.131199154257774, + "loss/hidden": 3.406640625, + "loss/jsd": 0.0, + "loss/logits": 0.19130074717104434, + "step": 26880 + }, + { + "epoch": 0.67225, + "grad_norm": 7314866176.0, + "grad_norm_var": 3.3442041705028127e+18, + "learning_rate": 0.0001, + "loss": 7.269, + "loss/crossentropy": 2.047175918519497, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.174586620926857, + "step": 26890 + }, + { + "epoch": 0.6725, + "grad_norm": 31.25, + "grad_norm_var": 3.344204168613139e+18, + "learning_rate": 0.0001, + "loss": 7.3649, + "loss/crossentropy": 2.140024873614311, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.1875780440866947, + "step": 26900 + }, + { + "epoch": 0.67275, + "grad_norm": 30.375, + "grad_norm_var": 19.369791666666668, + "learning_rate": 0.0001, + "loss": 7.2309, + "loss/crossentropy": 2.0859541945159434, + "loss/hidden": 3.3109375, + "loss/jsd": 0.0, + "loss/logits": 0.17924507465213538, + "step": 26910 + }, + { + "epoch": 0.673, + "grad_norm": 31.0, + "grad_norm_var": 2.1056640625, + "learning_rate": 0.0001, + "loss": 7.3053, + "loss/crossentropy": 1.9552275583148002, + "loss/hidden": 3.28203125, + "loss/jsd": 0.0, + "loss/logits": 0.16615418754518033, + "step": 26920 + }, + { + "epoch": 0.67325, + "grad_norm": 29.125, + "grad_norm_var": 9.1228515625, + "learning_rate": 0.0001, + "loss": 7.3161, + "loss/crossentropy": 2.051589508354664, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.1953272173181176, + "step": 26930 + }, + { + "epoch": 0.6735, + "grad_norm": 33.25, + "grad_norm_var": 12.4322265625, + "learning_rate": 0.0001, + "loss": 7.2163, + "loss/crossentropy": 2.039374630153179, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.1790797933936119, + "step": 26940 + }, + { + "epoch": 0.67375, + "grad_norm": 29.5, + "grad_norm_var": 10.328580729166667, + "learning_rate": 0.0001, + "loss": 7.2611, + "loss/crossentropy": 2.135941173136234, + "loss/hidden": 3.310546875, + "loss/jsd": 0.0, + "loss/logits": 0.18610633611679078, + "step": 26950 + }, + { + "epoch": 0.674, + "grad_norm": 28.5, + "grad_norm_var": 2.2393229166666666, + "learning_rate": 0.0001, + "loss": 7.2542, + "loss/crossentropy": 2.061223492026329, + "loss/hidden": 3.237109375, + "loss/jsd": 0.0, + "loss/logits": 0.1693508742377162, + "step": 26960 + }, + { + "epoch": 0.67425, + "grad_norm": 31.375, + "grad_norm_var": 2.8309895833333334, + "learning_rate": 0.0001, + "loss": 7.3494, + "loss/crossentropy": 2.034488780796528, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.18367824982851744, + "step": 26970 + }, + { + "epoch": 0.6745, + "grad_norm": 32.0, + "grad_norm_var": 1.6082682291666666, + "learning_rate": 0.0001, + "loss": 7.2714, + "loss/crossentropy": 2.127776539325714, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.19541560169309377, + "step": 26980 + }, + { + "epoch": 0.67475, + "grad_norm": 32.0, + "grad_norm_var": 2.8431640625, + "learning_rate": 0.0001, + "loss": 7.2163, + "loss/crossentropy": 1.9815926529467105, + "loss/hidden": 3.291796875, + "loss/jsd": 0.0, + "loss/logits": 0.187463073246181, + "step": 26990 + }, + { + "epoch": 0.675, + "grad_norm": 29.875, + "grad_norm_var": 12.0572265625, + "learning_rate": 0.0001, + "loss": 7.3076, + "loss/crossentropy": 2.025939218699932, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.18148638047277926, + "step": 27000 + }, + { + "epoch": 0.67525, + "grad_norm": 28.25, + "grad_norm_var": 2.4364583333333334, + "learning_rate": 0.0001, + "loss": 7.2504, + "loss/crossentropy": 2.1637841314077377, + "loss/hidden": 3.25625, + "loss/jsd": 0.0, + "loss/logits": 0.1768895300105214, + "step": 27010 + }, + { + "epoch": 0.6755, + "grad_norm": 27.375, + "grad_norm_var": 3.078125, + "learning_rate": 0.0001, + "loss": 7.2796, + "loss/crossentropy": 2.167166344821453, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.20178660629317163, + "step": 27020 + }, + { + "epoch": 0.67575, + "grad_norm": 30.25, + "grad_norm_var": 2.6186848958333333, + "learning_rate": 0.0001, + "loss": 7.3119, + "loss/crossentropy": 2.0646814957261084, + "loss/hidden": 3.433984375, + "loss/jsd": 0.0, + "loss/logits": 0.19334694920107723, + "step": 27030 + }, + { + "epoch": 0.676, + "grad_norm": 30.125, + "grad_norm_var": 2.6322265625, + "learning_rate": 0.0001, + "loss": 7.2636, + "loss/crossentropy": 2.04083681628108, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.17480793623253704, + "step": 27040 + }, + { + "epoch": 0.67625, + "grad_norm": 28.125, + "grad_norm_var": 2.957246447870105e+18, + "learning_rate": 0.0001, + "loss": 7.3215, + "loss/crossentropy": 2.062483212351799, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.1790636266581714, + "step": 27050 + }, + { + "epoch": 0.6765, + "grad_norm": 31.0, + "grad_norm_var": 2.9572464473757015e+18, + "learning_rate": 0.0001, + "loss": 7.2078, + "loss/crossentropy": 2.1033657416701317, + "loss/hidden": 3.312890625, + "loss/jsd": 0.0, + "loss/logits": 0.1807030899450183, + "step": 27060 + }, + { + "epoch": 0.67675, + "grad_norm": 28.875, + "grad_norm_var": 5.155208333333333, + "learning_rate": 0.0001, + "loss": 7.2486, + "loss/crossentropy": 1.976552402228117, + "loss/hidden": 3.340625, + "loss/jsd": 0.0, + "loss/logits": 0.17463484006002544, + "step": 27070 + }, + { + "epoch": 0.677, + "grad_norm": 30.625, + "grad_norm_var": 1.590625, + "learning_rate": 0.0001, + "loss": 7.2763, + "loss/crossentropy": 2.1383912026882173, + "loss/hidden": 3.283984375, + "loss/jsd": 0.0, + "loss/logits": 0.1793740052729845, + "step": 27080 + }, + { + "epoch": 0.67725, + "grad_norm": 29.125, + "grad_norm_var": 1.7791015625, + "learning_rate": 0.0001, + "loss": 7.3335, + "loss/crossentropy": 2.183702952414751, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.20361373238265515, + "step": 27090 + }, + { + "epoch": 0.6775, + "grad_norm": 31.625, + "grad_norm_var": 1.4552083333333334, + "learning_rate": 0.0001, + "loss": 7.1603, + "loss/crossentropy": 2.2112974256277083, + "loss/hidden": 3.250390625, + "loss/jsd": 0.0, + "loss/logits": 0.17802942357957363, + "step": 27100 + }, + { + "epoch": 0.67775, + "grad_norm": 29.375, + "grad_norm_var": 2.090625, + "learning_rate": 0.0001, + "loss": 7.282, + "loss/crossentropy": 2.1228010304272176, + "loss/hidden": 3.36171875, + "loss/jsd": 0.0, + "loss/logits": 0.1823104264214635, + "step": 27110 + }, + { + "epoch": 0.678, + "grad_norm": 31.125, + "grad_norm_var": 1.84765625, + "learning_rate": 0.0001, + "loss": 7.2531, + "loss/crossentropy": 2.0709615498781204, + "loss/hidden": 3.299609375, + "loss/jsd": 0.0, + "loss/logits": 0.171175280213356, + "step": 27120 + }, + { + "epoch": 0.67825, + "grad_norm": 30.125, + "grad_norm_var": 2.8655598958333335, + "learning_rate": 0.0001, + "loss": 7.3062, + "loss/crossentropy": 2.2544207006692885, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.20476784072816373, + "step": 27130 + }, + { + "epoch": 0.6785, + "grad_norm": 30.25, + "grad_norm_var": 1.9958333333333333, + "learning_rate": 0.0001, + "loss": 7.269, + "loss/crossentropy": 2.044566038250923, + "loss/hidden": 3.416015625, + "loss/jsd": 0.0, + "loss/logits": 0.18673994652926923, + "step": 27140 + }, + { + "epoch": 0.67875, + "grad_norm": 33.25, + "grad_norm_var": 2.787239583333333, + "learning_rate": 0.0001, + "loss": 7.3861, + "loss/crossentropy": 2.076129969954491, + "loss/hidden": 3.269921875, + "loss/jsd": 0.0, + "loss/logits": 0.17596831321716308, + "step": 27150 + }, + { + "epoch": 0.679, + "grad_norm": 30.375, + "grad_norm_var": 4.278059895833334, + "learning_rate": 0.0001, + "loss": 7.4392, + "loss/crossentropy": 2.0484411746263502, + "loss/hidden": 3.398828125, + "loss/jsd": 0.0, + "loss/logits": 0.1874468218535185, + "step": 27160 + }, + { + "epoch": 0.67925, + "grad_norm": 30.625, + "grad_norm_var": 2.25390625, + "learning_rate": 0.0001, + "loss": 7.1788, + "loss/crossentropy": 2.180029886960983, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.20117557551711798, + "step": 27170 + }, + { + "epoch": 0.6795, + "grad_norm": 31.625, + "grad_norm_var": 5.522330729166667, + "learning_rate": 0.0001, + "loss": 7.2891, + "loss/crossentropy": 1.8975796066224575, + "loss/hidden": 3.30703125, + "loss/jsd": 0.0, + "loss/logits": 0.17547342590987683, + "step": 27180 + }, + { + "epoch": 0.67975, + "grad_norm": 30.375, + "grad_norm_var": 11.6009765625, + "learning_rate": 0.0001, + "loss": 7.1943, + "loss/crossentropy": 2.1247182063758374, + "loss/hidden": 3.315625, + "loss/jsd": 0.0, + "loss/logits": 0.17981126178056, + "step": 27190 + }, + { + "epoch": 0.68, + "grad_norm": 28.25, + "grad_norm_var": 8.919205729166666, + "learning_rate": 0.0001, + "loss": 7.2694, + "loss/crossentropy": 2.037150639295578, + "loss/hidden": 3.262109375, + "loss/jsd": 0.0, + "loss/logits": 0.17851810567080975, + "step": 27200 + }, + { + "epoch": 0.68025, + "grad_norm": 31.875, + "grad_norm_var": 2.218489583333333, + "learning_rate": 0.0001, + "loss": 7.2526, + "loss/crossentropy": 1.9110659010708333, + "loss/hidden": 3.5203125, + "loss/jsd": 0.0, + "loss/logits": 0.19786193277686834, + "step": 27210 + }, + { + "epoch": 0.6805, + "grad_norm": 32.0, + "grad_norm_var": 1.8385416666666667, + "learning_rate": 0.0001, + "loss": 7.2515, + "loss/crossentropy": 2.2368867814540865, + "loss/hidden": 3.2171875, + "loss/jsd": 0.0, + "loss/logits": 0.1798218859359622, + "step": 27220 + }, + { + "epoch": 0.68075, + "grad_norm": 33.0, + "grad_norm_var": 3.54140625, + "learning_rate": 0.0001, + "loss": 7.2991, + "loss/crossentropy": 2.1260806649923323, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.20750290621072054, + "step": 27230 + }, + { + "epoch": 0.681, + "grad_norm": 29.875, + "grad_norm_var": 3.755989583333333, + "learning_rate": 0.0001, + "loss": 7.3501, + "loss/crossentropy": 2.031808337569237, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.18845813032239675, + "step": 27240 + }, + { + "epoch": 0.68125, + "grad_norm": 31.375, + "grad_norm_var": 3.9942057291666666, + "learning_rate": 0.0001, + "loss": 7.2698, + "loss/crossentropy": 2.1050592973828315, + "loss/hidden": 3.322265625, + "loss/jsd": 0.0, + "loss/logits": 0.18391012214124203, + "step": 27250 + }, + { + "epoch": 0.6815, + "grad_norm": 30.25, + "grad_norm_var": 2.36640625, + "learning_rate": 0.0001, + "loss": 7.3684, + "loss/crossentropy": 2.1474289670586586, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.1923911923542619, + "step": 27260 + }, + { + "epoch": 0.68175, + "grad_norm": 31.25, + "grad_norm_var": 1.9858723958333333, + "learning_rate": 0.0001, + "loss": 7.2966, + "loss/crossentropy": 2.0553861789405348, + "loss/hidden": 3.321484375, + "loss/jsd": 0.0, + "loss/logits": 0.17631796486675738, + "step": 27270 + }, + { + "epoch": 0.682, + "grad_norm": 30.375, + "grad_norm_var": 2.138541666666667, + "learning_rate": 0.0001, + "loss": 7.27, + "loss/crossentropy": 1.9470392048358918, + "loss/hidden": 3.433203125, + "loss/jsd": 0.0, + "loss/logits": 0.1823154903948307, + "step": 27280 + }, + { + "epoch": 0.68225, + "grad_norm": 29.375, + "grad_norm_var": 6.4525390625, + "learning_rate": 0.0001, + "loss": 7.3825, + "loss/crossentropy": 2.1857406944036484, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.19291127175092698, + "step": 27290 + }, + { + "epoch": 0.6825, + "grad_norm": 31.0, + "grad_norm_var": 2.48515625, + "learning_rate": 0.0001, + "loss": 7.3561, + "loss/crossentropy": 2.227454760670662, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.18692610040307045, + "step": 27300 + }, + { + "epoch": 0.68275, + "grad_norm": 30.25, + "grad_norm_var": 1.13515625, + "learning_rate": 0.0001, + "loss": 7.2776, + "loss/crossentropy": 1.9948305085301399, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.1988155659288168, + "step": 27310 + }, + { + "epoch": 0.683, + "grad_norm": 28.875, + "grad_norm_var": 1.8541015625, + "learning_rate": 0.0001, + "loss": 7.2481, + "loss/crossentropy": 2.1666594982147216, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.18878179658204317, + "step": 27320 + }, + { + "epoch": 0.68325, + "grad_norm": 29.875, + "grad_norm_var": 2.687239583333333, + "learning_rate": 0.0001, + "loss": 7.3854, + "loss/crossentropy": 2.1071100682020187, + "loss/hidden": 3.30625, + "loss/jsd": 0.0, + "loss/logits": 0.17986488938331605, + "step": 27330 + }, + { + "epoch": 0.6835, + "grad_norm": 31.375, + "grad_norm_var": 3.3889973958333335, + "learning_rate": 0.0001, + "loss": 7.235, + "loss/crossentropy": 2.029652649909258, + "loss/hidden": 3.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.1780470222234726, + "step": 27340 + }, + { + "epoch": 0.68375, + "grad_norm": 28.875, + "grad_norm_var": 2.155143229166667, + "learning_rate": 0.0001, + "loss": 7.3103, + "loss/crossentropy": 1.9602101355791093, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.18400138095021248, + "step": 27350 + }, + { + "epoch": 0.684, + "grad_norm": 29.375, + "grad_norm_var": 0.45598958333333334, + "learning_rate": 0.0001, + "loss": 7.213, + "loss/crossentropy": 2.106926290690899, + "loss/hidden": 3.305078125, + "loss/jsd": 0.0, + "loss/logits": 0.17419682666659356, + "step": 27360 + }, + { + "epoch": 0.68425, + "grad_norm": 29.75, + "grad_norm_var": 0.44973958333333336, + "learning_rate": 0.0001, + "loss": 7.2879, + "loss/crossentropy": 2.237940040230751, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.1868164110928774, + "step": 27370 + }, + { + "epoch": 0.6845, + "grad_norm": 36.25, + "grad_norm_var": 2.9947916666666665, + "learning_rate": 0.0001, + "loss": 7.4911, + "loss/crossentropy": 2.2364405103027822, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.1962603386491537, + "step": 27380 + }, + { + "epoch": 0.68475, + "grad_norm": 32.75, + "grad_norm_var": 7.664583333333334, + "learning_rate": 0.0001, + "loss": 7.3546, + "loss/crossentropy": 2.125661809742451, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.17499309908598662, + "step": 27390 + }, + { + "epoch": 0.685, + "grad_norm": 29.125, + "grad_norm_var": 7.216666666666667, + "learning_rate": 0.0001, + "loss": 7.2694, + "loss/crossentropy": 2.17802165299654, + "loss/hidden": 3.26953125, + "loss/jsd": 0.0, + "loss/logits": 0.18032563328742982, + "step": 27400 + }, + { + "epoch": 0.68525, + "grad_norm": 28.625, + "grad_norm_var": 2.5254557291666666, + "learning_rate": 0.0001, + "loss": 7.2098, + "loss/crossentropy": 2.0895639203488825, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.17885948363691567, + "step": 27410 + }, + { + "epoch": 0.6855, + "grad_norm": 31.875, + "grad_norm_var": 3.098893229166667, + "learning_rate": 0.0001, + "loss": 7.2392, + "loss/crossentropy": 2.0427750945091248, + "loss/hidden": 3.389453125, + "loss/jsd": 0.0, + "loss/logits": 0.184170157648623, + "step": 27420 + }, + { + "epoch": 0.68575, + "grad_norm": 31.125, + "grad_norm_var": 1.5853515625, + "learning_rate": 0.0001, + "loss": 7.4218, + "loss/crossentropy": 2.2016945309937, + "loss/hidden": 3.275390625, + "loss/jsd": 0.0, + "loss/logits": 0.1827758725732565, + "step": 27430 + }, + { + "epoch": 0.686, + "grad_norm": 33.25, + "grad_norm_var": 1.7332682291666666, + "learning_rate": 0.0001, + "loss": 7.2304, + "loss/crossentropy": 1.969399856030941, + "loss/hidden": 3.338671875, + "loss/jsd": 0.0, + "loss/logits": 0.17159626744687556, + "step": 27440 + }, + { + "epoch": 0.68625, + "grad_norm": 34.5, + "grad_norm_var": 22.256184895833332, + "learning_rate": 0.0001, + "loss": 7.2665, + "loss/crossentropy": 2.0050867199897766, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.18022035136818887, + "step": 27450 + }, + { + "epoch": 0.6865, + "grad_norm": 27.625, + "grad_norm_var": 22.336458333333333, + "learning_rate": 0.0001, + "loss": 7.2446, + "loss/crossentropy": 1.9901010252535343, + "loss/hidden": 3.275390625, + "loss/jsd": 0.0, + "loss/logits": 0.17804777259007096, + "step": 27460 + }, + { + "epoch": 0.68675, + "grad_norm": 28.125, + "grad_norm_var": 12.424739583333333, + "learning_rate": 0.0001, + "loss": 7.2441, + "loss/crossentropy": 2.031005633622408, + "loss/hidden": 3.290234375, + "loss/jsd": 0.0, + "loss/logits": 0.168888134136796, + "step": 27470 + }, + { + "epoch": 0.687, + "grad_norm": 29.0, + "grad_norm_var": 0.8817057291666667, + "learning_rate": 0.0001, + "loss": 7.2623, + "loss/crossentropy": 2.1447826243937014, + "loss/hidden": 3.35859375, + "loss/jsd": 0.0, + "loss/logits": 0.18581478726118802, + "step": 27480 + }, + { + "epoch": 0.68725, + "grad_norm": 31.25, + "grad_norm_var": 1.9895833333333333, + "learning_rate": 0.0001, + "loss": 7.3027, + "loss/crossentropy": 2.2375788524746896, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.19884238932281734, + "step": 27490 + }, + { + "epoch": 0.6875, + "grad_norm": 30.875, + "grad_norm_var": 10.562239583333334, + "learning_rate": 0.0001, + "loss": 7.3156, + "loss/crossentropy": 1.9248571954667568, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.16993207540363073, + "step": 27500 + }, + { + "epoch": 0.68775, + "grad_norm": 29.125, + "grad_norm_var": 6.201041666666667, + "learning_rate": 0.0001, + "loss": 7.1426, + "loss/crossentropy": 2.1742107778787614, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.19332415759563445, + "step": 27510 + }, + { + "epoch": 0.688, + "grad_norm": 28.625, + "grad_norm_var": 2.443489583333333, + "learning_rate": 0.0001, + "loss": 7.1899, + "loss/crossentropy": 2.064833169430494, + "loss/hidden": 3.226171875, + "loss/jsd": 0.0, + "loss/logits": 0.1686219139955938, + "step": 27520 + }, + { + "epoch": 0.68825, + "grad_norm": 29.875, + "grad_norm_var": 2.308333333333333, + "learning_rate": 0.0001, + "loss": 7.2861, + "loss/crossentropy": 2.2087809190154077, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.18179803155362606, + "step": 27530 + }, + { + "epoch": 0.6885, + "grad_norm": 29.5, + "grad_norm_var": 1.3559895833333333, + "learning_rate": 0.0001, + "loss": 7.2698, + "loss/crossentropy": 2.1848205491900443, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.18062485102564096, + "step": 27540 + }, + { + "epoch": 0.68875, + "grad_norm": 28.375, + "grad_norm_var": 2.5254557291666666, + "learning_rate": 0.0001, + "loss": 7.1884, + "loss/crossentropy": 2.10894877538085, + "loss/hidden": 3.278515625, + "loss/jsd": 0.0, + "loss/logits": 0.18315895553678274, + "step": 27550 + }, + { + "epoch": 0.689, + "grad_norm": 30.375, + "grad_norm_var": 1.7504557291666667, + "learning_rate": 0.0001, + "loss": 7.2776, + "loss/crossentropy": 1.9970835141837597, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.1800586089491844, + "step": 27560 + }, + { + "epoch": 0.68925, + "grad_norm": 63.0, + "grad_norm_var": 446.81223958333334, + "learning_rate": 0.0001, + "loss": 7.2313, + "loss/crossentropy": 2.108223394304514, + "loss/hidden": 3.310546875, + "loss/jsd": 0.0, + "loss/logits": 0.18529674978926777, + "step": 27570 + }, + { + "epoch": 0.6895, + "grad_norm": 27.75, + "grad_norm_var": 229.2228515625, + "learning_rate": 0.0001, + "loss": 7.311, + "loss/crossentropy": 2.140768423676491, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.18569578658789396, + "step": 27580 + }, + { + "epoch": 0.68975, + "grad_norm": 30.0, + "grad_norm_var": 141.71608072916666, + "learning_rate": 0.0001, + "loss": 7.1009, + "loss/crossentropy": 2.080308865010738, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.17526689525693656, + "step": 27590 + }, + { + "epoch": 0.69, + "grad_norm": 47.5, + "grad_norm_var": 127.42649739583334, + "learning_rate": 0.0001, + "loss": 7.1538, + "loss/crossentropy": 1.9624833583831787, + "loss/hidden": 3.242578125, + "loss/jsd": 0.0, + "loss/logits": 0.17015852518379687, + "step": 27600 + }, + { + "epoch": 0.69025, + "grad_norm": 28.0, + "grad_norm_var": 107.753125, + "learning_rate": 0.0001, + "loss": 7.2956, + "loss/crossentropy": 2.0645004503428934, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.19667823929339648, + "step": 27610 + }, + { + "epoch": 0.6905, + "grad_norm": 30.5, + "grad_norm_var": 73.18723958333334, + "learning_rate": 0.0001, + "loss": 7.2826, + "loss/crossentropy": 2.18245909512043, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.1764837397262454, + "step": 27620 + }, + { + "epoch": 0.69075, + "grad_norm": 54.5, + "grad_norm_var": 85.95774739583334, + "learning_rate": 0.0001, + "loss": 7.1622, + "loss/crossentropy": 2.022562848776579, + "loss/hidden": 3.396484375, + "loss/jsd": 0.0, + "loss/logits": 0.17506722770631314, + "step": 27630 + }, + { + "epoch": 0.691, + "grad_norm": 29.25, + "grad_norm_var": 62.54055989583333, + "learning_rate": 0.0001, + "loss": 7.3102, + "loss/crossentropy": 2.039566272497177, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.18705343939363955, + "step": 27640 + }, + { + "epoch": 0.69125, + "grad_norm": 31.625, + "grad_norm_var": 86.08639322916666, + "learning_rate": 0.0001, + "loss": 7.1129, + "loss/crossentropy": 2.006290066242218, + "loss/hidden": 3.28125, + "loss/jsd": 0.0, + "loss/logits": 0.17148794345557689, + "step": 27650 + }, + { + "epoch": 0.6915, + "grad_norm": 31.875, + "grad_norm_var": 3.344204167188264e+18, + "learning_rate": 0.0001, + "loss": 7.1452, + "loss/crossentropy": 2.08501241505146, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.1790543705224991, + "step": 27660 + }, + { + "epoch": 0.69175, + "grad_norm": 30.25, + "grad_norm_var": 75.49479166666667, + "learning_rate": 0.0001, + "loss": 7.226, + "loss/crossentropy": 2.061854050308466, + "loss/hidden": 3.241015625, + "loss/jsd": 0.0, + "loss/logits": 0.18336585741490125, + "step": 27670 + }, + { + "epoch": 0.692, + "grad_norm": 44.75, + "grad_norm_var": 54.40390625, + "learning_rate": 0.0001, + "loss": 7.192, + "loss/crossentropy": 2.12304325401783, + "loss/hidden": 3.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.19009512551128865, + "step": 27680 + }, + { + "epoch": 0.69225, + "grad_norm": 29.625, + "grad_norm_var": 43.609309895833334, + "learning_rate": 0.0001, + "loss": 7.2723, + "loss/crossentropy": 2.082204009592533, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.18473013881593942, + "step": 27690 + }, + { + "epoch": 0.6925, + "grad_norm": 27.125, + "grad_norm_var": 34.3587890625, + "learning_rate": 0.0001, + "loss": 7.2277, + "loss/crossentropy": 2.07097297757864, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.1826794480904937, + "step": 27700 + }, + { + "epoch": 0.69275, + "grad_norm": 40.0, + "grad_norm_var": 27.279622395833332, + "learning_rate": 0.0001, + "loss": 7.2755, + "loss/crossentropy": 2.029601737856865, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.1799439148977399, + "step": 27710 + }, + { + "epoch": 0.693, + "grad_norm": 39.25, + "grad_norm_var": 17.6291015625, + "learning_rate": 0.0001, + "loss": 7.2338, + "loss/crossentropy": 2.0642464771866798, + "loss/hidden": 3.241015625, + "loss/jsd": 0.0, + "loss/logits": 0.1682391857728362, + "step": 27720 + }, + { + "epoch": 0.69325, + "grad_norm": 32.25, + "grad_norm_var": 17.788997395833334, + "learning_rate": 0.0001, + "loss": 7.1999, + "loss/crossentropy": 1.9560820698738097, + "loss/hidden": 3.362109375, + "loss/jsd": 0.0, + "loss/logits": 0.18492446821182967, + "step": 27730 + }, + { + "epoch": 0.6935, + "grad_norm": 29.375, + "grad_norm_var": 19.187239583333334, + "learning_rate": 0.0001, + "loss": 7.2068, + "loss/crossentropy": 2.151790848374367, + "loss/hidden": 3.37109375, + "loss/jsd": 0.0, + "loss/logits": 0.18812405075877905, + "step": 27740 + }, + { + "epoch": 0.69375, + "grad_norm": 33.25, + "grad_norm_var": 25.820572916666666, + "learning_rate": 0.0001, + "loss": 7.2683, + "loss/crossentropy": 2.027320671826601, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.1938678870908916, + "step": 27750 + }, + { + "epoch": 0.694, + "grad_norm": 35.75, + "grad_norm_var": 13.291080729166667, + "learning_rate": 0.0001, + "loss": 7.2137, + "loss/crossentropy": 2.222668908536434, + "loss/hidden": 3.212890625, + "loss/jsd": 0.0, + "loss/logits": 0.1686727624386549, + "step": 27760 + }, + { + "epoch": 0.69425, + "grad_norm": 33.5, + "grad_norm_var": 17.365559895833332, + "learning_rate": 0.0001, + "loss": 7.2981, + "loss/crossentropy": 2.0196828715503217, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.17830958236008881, + "step": 27770 + }, + { + "epoch": 0.6945, + "grad_norm": 31.25, + "grad_norm_var": 13.575, + "learning_rate": 0.0001, + "loss": 7.2654, + "loss/crossentropy": 2.0276674374938013, + "loss/hidden": 3.34296875, + "loss/jsd": 0.0, + "loss/logits": 0.1810338616371155, + "step": 27780 + }, + { + "epoch": 0.69475, + "grad_norm": 38.5, + "grad_norm_var": 1.0995116099021482e+18, + "learning_rate": 0.0001, + "loss": 7.1378, + "loss/crossentropy": 1.9873436450958253, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.1692363377660513, + "step": 27790 + }, + { + "epoch": 0.695, + "grad_norm": 30.25, + "grad_norm_var": 19.480989583333333, + "learning_rate": 0.0001, + "loss": 7.1259, + "loss/crossentropy": 2.0556870639324187, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.1810257313773036, + "step": 27800 + }, + { + "epoch": 0.69525, + "grad_norm": 31.375, + "grad_norm_var": 18.948372395833335, + "learning_rate": 0.0001, + "loss": 7.2655, + "loss/crossentropy": 2.047024370729923, + "loss/hidden": 3.299609375, + "loss/jsd": 0.0, + "loss/logits": 0.176701095700264, + "step": 27810 + }, + { + "epoch": 0.6955, + "grad_norm": 28.375, + "grad_norm_var": 12.296875, + "learning_rate": 0.0001, + "loss": 7.2012, + "loss/crossentropy": 1.9956993229687214, + "loss/hidden": 3.25078125, + "loss/jsd": 0.0, + "loss/logits": 0.1679278828203678, + "step": 27820 + }, + { + "epoch": 0.69575, + "grad_norm": 29.875, + "grad_norm_var": 10.91015625, + "learning_rate": 0.0001, + "loss": 7.1434, + "loss/crossentropy": 1.9673665076494218, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.1817054906859994, + "step": 27830 + }, + { + "epoch": 0.696, + "grad_norm": 27.25, + "grad_norm_var": 65.115625, + "learning_rate": 0.0001, + "loss": 7.1678, + "loss/crossentropy": 2.110173484683037, + "loss/hidden": 3.299609375, + "loss/jsd": 0.0, + "loss/logits": 0.17987647745758295, + "step": 27840 + }, + { + "epoch": 0.69625, + "grad_norm": 27.0, + "grad_norm_var": 80.79375, + "learning_rate": 0.0001, + "loss": 7.1452, + "loss/crossentropy": 2.066749632358551, + "loss/hidden": 3.41484375, + "loss/jsd": 0.0, + "loss/logits": 0.18724118024110795, + "step": 27850 + }, + { + "epoch": 0.6965, + "grad_norm": 27.625, + "grad_norm_var": 37.25182291666667, + "learning_rate": 0.0001, + "loss": 7.1831, + "loss/crossentropy": 1.8799223847687245, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.1720001684501767, + "step": 27860 + }, + { + "epoch": 0.69675, + "grad_norm": 31.875, + "grad_norm_var": 6.83515625, + "learning_rate": 0.0001, + "loss": 7.2257, + "loss/crossentropy": 1.984929259866476, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.18079593311995268, + "step": 27870 + }, + { + "epoch": 0.697, + "grad_norm": 28.375, + "grad_norm_var": 12.743489583333334, + "learning_rate": 0.0001, + "loss": 7.1256, + "loss/crossentropy": 2.1968638002872467, + "loss/hidden": 3.26328125, + "loss/jsd": 0.0, + "loss/logits": 0.17731832768768072, + "step": 27880 + }, + { + "epoch": 0.69725, + "grad_norm": 29.0, + "grad_norm_var": 9.189583333333333, + "learning_rate": 0.0001, + "loss": 7.181, + "loss/crossentropy": 1.9869700893759727, + "loss/hidden": 3.27734375, + "loss/jsd": 0.0, + "loss/logits": 0.18951656930148603, + "step": 27890 + }, + { + "epoch": 0.6975, + "grad_norm": 29.5, + "grad_norm_var": 12.85390625, + "learning_rate": 0.0001, + "loss": 7.2834, + "loss/crossentropy": 2.1294341668486596, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.18512535598129035, + "step": 27900 + }, + { + "epoch": 0.69775, + "grad_norm": 29.625, + "grad_norm_var": 6.847916666666666, + "learning_rate": 0.0001, + "loss": 7.175, + "loss/crossentropy": 2.1979243993759154, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.2182347685098648, + "step": 27910 + }, + { + "epoch": 0.698, + "grad_norm": 29.0, + "grad_norm_var": 8.465625, + "learning_rate": 0.0001, + "loss": 7.2304, + "loss/crossentropy": 1.9434294357895852, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.1738783059641719, + "step": 27920 + }, + { + "epoch": 0.69825, + "grad_norm": 30.625, + "grad_norm_var": 2.443489583333333, + "learning_rate": 0.0001, + "loss": 7.1309, + "loss/crossentropy": 1.9772110790014268, + "loss/hidden": 3.26328125, + "loss/jsd": 0.0, + "loss/logits": 0.16446346305310727, + "step": 27930 + }, + { + "epoch": 0.6985, + "grad_norm": 31.625, + "grad_norm_var": 7.475, + "learning_rate": 0.0001, + "loss": 7.2323, + "loss/crossentropy": 2.046364203095436, + "loss/hidden": 3.24453125, + "loss/jsd": 0.0, + "loss/logits": 0.16903265602886677, + "step": 27940 + }, + { + "epoch": 0.69875, + "grad_norm": 29.5, + "grad_norm_var": 4.887434895833334, + "learning_rate": 0.0001, + "loss": 7.3053, + "loss/crossentropy": 2.240569099038839, + "loss/hidden": 3.337890625, + "loss/jsd": 0.0, + "loss/logits": 0.18274808302521706, + "step": 27950 + }, + { + "epoch": 0.699, + "grad_norm": 34.0, + "grad_norm_var": 7.443684895833333, + "learning_rate": 0.0001, + "loss": 7.3101, + "loss/crossentropy": 2.044392989575863, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.18658852139487864, + "step": 27960 + }, + { + "epoch": 0.69925, + "grad_norm": 31.0, + "grad_norm_var": 9.6166015625, + "learning_rate": 0.0001, + "loss": 7.359, + "loss/crossentropy": 2.178907059133053, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.1918690962716937, + "step": 27970 + }, + { + "epoch": 0.6995, + "grad_norm": 30.875, + "grad_norm_var": 6.7806640625, + "learning_rate": 0.0001, + "loss": 7.3059, + "loss/crossentropy": 2.07792577072978, + "loss/hidden": 3.40703125, + "loss/jsd": 0.0, + "loss/logits": 0.1873043139465153, + "step": 27980 + }, + { + "epoch": 0.69975, + "grad_norm": 29.75, + "grad_norm_var": 5.643489583333333, + "learning_rate": 0.0001, + "loss": 7.3217, + "loss/crossentropy": 1.9168295338749886, + "loss/hidden": 3.4546875, + "loss/jsd": 0.0, + "loss/logits": 0.18331131394952535, + "step": 27990 + }, + { + "epoch": 0.7, + "grad_norm": 30.625, + "grad_norm_var": 4.960416666666666, + "learning_rate": 0.0001, + "loss": 7.3017, + "loss/crossentropy": 1.9929687798023223, + "loss/hidden": 3.482421875, + "loss/jsd": 0.0, + "loss/logits": 0.21256182622164488, + "step": 28000 + }, + { + "epoch": 0.70025, + "grad_norm": 28.25, + "grad_norm_var": 227.75625, + "learning_rate": 0.0001, + "loss": 7.2574, + "loss/crossentropy": 2.0593857660889627, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.17820099778473378, + "step": 28010 + }, + { + "epoch": 0.7005, + "grad_norm": 31.25, + "grad_norm_var": 224.015625, + "learning_rate": 0.0001, + "loss": 7.412, + "loss/crossentropy": 2.174033749103546, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.1898373905569315, + "step": 28020 + }, + { + "epoch": 0.70075, + "grad_norm": 28.375, + "grad_norm_var": 4.4181640625, + "learning_rate": 0.0001, + "loss": 7.1913, + "loss/crossentropy": 2.0483900040388106, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.17332715671509505, + "step": 28030 + }, + { + "epoch": 0.701, + "grad_norm": 29.5, + "grad_norm_var": 5.629166666666666, + "learning_rate": 0.0001, + "loss": 7.2349, + "loss/crossentropy": 2.177481460571289, + "loss/hidden": 3.284375, + "loss/jsd": 0.0, + "loss/logits": 0.1783738609403372, + "step": 28040 + }, + { + "epoch": 0.70125, + "grad_norm": 31.25, + "grad_norm_var": 2.4809895833333333, + "learning_rate": 0.0001, + "loss": 7.3133, + "loss/crossentropy": 2.099740183353424, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.2023984882980585, + "step": 28050 + }, + { + "epoch": 0.7015, + "grad_norm": 29.375, + "grad_norm_var": 1.1051432291666667, + "learning_rate": 0.0001, + "loss": 7.2455, + "loss/crossentropy": 1.9891198098659515, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.1935951853170991, + "step": 28060 + }, + { + "epoch": 0.70175, + "grad_norm": 30.5, + "grad_norm_var": 23.0619140625, + "learning_rate": 0.0001, + "loss": 7.2426, + "loss/crossentropy": 1.9753205917775631, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.17461898345500232, + "step": 28070 + }, + { + "epoch": 0.702, + "grad_norm": 28.375, + "grad_norm_var": 25.145768229166666, + "learning_rate": 0.0001, + "loss": 7.2736, + "loss/crossentropy": 2.0158716015517713, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.19325515236705543, + "step": 28080 + }, + { + "epoch": 0.70225, + "grad_norm": 30.25, + "grad_norm_var": 3.1134765625, + "learning_rate": 0.0001, + "loss": 7.3422, + "loss/crossentropy": 2.217750224471092, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.18919748738408088, + "step": 28090 + }, + { + "epoch": 0.7025, + "grad_norm": 30.0, + "grad_norm_var": 4.51640625, + "learning_rate": 0.0001, + "loss": 7.3656, + "loss/crossentropy": 2.0768872730433943, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.17947383699938654, + "step": 28100 + }, + { + "epoch": 0.70275, + "grad_norm": 29.25, + "grad_norm_var": 3.4052083333333334, + "learning_rate": 0.0001, + "loss": 7.3558, + "loss/crossentropy": 2.115848197042942, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.1815752826165408, + "step": 28110 + }, + { + "epoch": 0.703, + "grad_norm": 29.5, + "grad_norm_var": 2.878125, + "learning_rate": 0.0001, + "loss": 7.2471, + "loss/crossentropy": 2.0716689586639405, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.19040394499897956, + "step": 28120 + }, + { + "epoch": 0.70325, + "grad_norm": 32.5, + "grad_norm_var": 2.8160807291666665, + "learning_rate": 0.0001, + "loss": 7.2455, + "loss/crossentropy": 2.171695147454739, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.186448429338634, + "step": 28130 + }, + { + "epoch": 0.7035, + "grad_norm": 30.875, + "grad_norm_var": 6.870833333333334, + "learning_rate": 0.0001, + "loss": 7.3198, + "loss/crossentropy": 2.2176968052983286, + "loss/hidden": 3.33515625, + "loss/jsd": 0.0, + "loss/logits": 0.18737617712467908, + "step": 28140 + }, + { + "epoch": 0.70375, + "grad_norm": 31.5, + "grad_norm_var": 7.795768229166667, + "learning_rate": 0.0001, + "loss": 7.3125, + "loss/crossentropy": 2.0401076070964335, + "loss/hidden": 3.305859375, + "loss/jsd": 0.0, + "loss/logits": 0.1749078031629324, + "step": 28150 + }, + { + "epoch": 0.704, + "grad_norm": 34.75, + "grad_norm_var": 5.884375, + "learning_rate": 0.0001, + "loss": 7.3385, + "loss/crossentropy": 2.0140421822667123, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.17568598408252, + "step": 28160 + }, + { + "epoch": 0.70425, + "grad_norm": 33.25, + "grad_norm_var": 631.7122395833334, + "learning_rate": 0.0001, + "loss": 7.2722, + "loss/crossentropy": 2.23368354216218, + "loss/hidden": 3.29140625, + "loss/jsd": 0.0, + "loss/logits": 0.19659436494112015, + "step": 28170 + }, + { + "epoch": 0.7045, + "grad_norm": 29.5, + "grad_norm_var": 5.282747395833334, + "learning_rate": 0.0001, + "loss": 7.2662, + "loss/crossentropy": 1.9696129187941551, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.1738910754211247, + "step": 28180 + }, + { + "epoch": 0.70475, + "grad_norm": 28.25, + "grad_norm_var": 15.142122395833333, + "learning_rate": 0.0001, + "loss": 7.2909, + "loss/crossentropy": 2.041120965778828, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.18105307575315238, + "step": 28190 + }, + { + "epoch": 0.705, + "grad_norm": 29.0, + "grad_norm_var": 1.99765625, + "learning_rate": 0.0001, + "loss": 7.2079, + "loss/crossentropy": 2.110129737854004, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.18080106116831302, + "step": 28200 + }, + { + "epoch": 0.70525, + "grad_norm": 29.25, + "grad_norm_var": 1.58515625, + "learning_rate": 0.0001, + "loss": 7.3249, + "loss/crossentropy": 2.148154190182686, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.18392155058681964, + "step": 28210 + }, + { + "epoch": 0.7055, + "grad_norm": 29.375, + "grad_norm_var": 1.0155598958333334, + "learning_rate": 0.0001, + "loss": 7.3179, + "loss/crossentropy": 2.190532484650612, + "loss/hidden": 3.311328125, + "loss/jsd": 0.0, + "loss/logits": 0.1767113889567554, + "step": 28220 + }, + { + "epoch": 0.70575, + "grad_norm": 29.0, + "grad_norm_var": 3.0395182291666667, + "learning_rate": 0.0001, + "loss": 7.2009, + "loss/crossentropy": 2.175555232167244, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.20121710691601039, + "step": 28230 + }, + { + "epoch": 0.706, + "grad_norm": 28.25, + "grad_norm_var": 2.7643229166666665, + "learning_rate": 0.0001, + "loss": 7.265, + "loss/crossentropy": 2.1481928557157515, + "loss/hidden": 3.24453125, + "loss/jsd": 0.0, + "loss/logits": 0.17819216307252644, + "step": 28240 + }, + { + "epoch": 0.70625, + "grad_norm": 31.0, + "grad_norm_var": 1.1718098958333334, + "learning_rate": 0.0001, + "loss": 7.2569, + "loss/crossentropy": 1.997367661446333, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.18763066958636046, + "step": 28250 + }, + { + "epoch": 0.7065, + "grad_norm": 29.125, + "grad_norm_var": 1.9580729166666666, + "learning_rate": 0.0001, + "loss": 7.2066, + "loss/crossentropy": 2.156166338920593, + "loss/hidden": 3.292578125, + "loss/jsd": 0.0, + "loss/logits": 0.18162646796554327, + "step": 28260 + }, + { + "epoch": 0.70675, + "grad_norm": 29.375, + "grad_norm_var": 1.5010416666666666, + "learning_rate": 0.0001, + "loss": 7.315, + "loss/crossentropy": 1.9849402651190757, + "loss/hidden": 3.35078125, + "loss/jsd": 0.0, + "loss/logits": 0.18158245990052818, + "step": 28270 + }, + { + "epoch": 0.707, + "grad_norm": 31.5, + "grad_norm_var": 1.209375, + "learning_rate": 0.0001, + "loss": 7.2722, + "loss/crossentropy": 2.0745258808135985, + "loss/hidden": 3.318359375, + "loss/jsd": 0.0, + "loss/logits": 0.17213933765888215, + "step": 28280 + }, + { + "epoch": 0.70725, + "grad_norm": 30.625, + "grad_norm_var": 2.771875, + "learning_rate": 0.0001, + "loss": 7.2265, + "loss/crossentropy": 2.0611407831311226, + "loss/hidden": 3.2765625, + "loss/jsd": 0.0, + "loss/logits": 0.17584825437515975, + "step": 28290 + }, + { + "epoch": 0.7075, + "grad_norm": 31.75, + "grad_norm_var": 2.690625, + "learning_rate": 0.0001, + "loss": 7.3203, + "loss/crossentropy": 2.0343645237386228, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.1779821278527379, + "step": 28300 + }, + { + "epoch": 0.70775, + "grad_norm": 30.5, + "grad_norm_var": 2.173958333333333, + "learning_rate": 0.0001, + "loss": 7.3906, + "loss/crossentropy": 2.023589651286602, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.19405677895992995, + "step": 28310 + }, + { + "epoch": 0.708, + "grad_norm": 31.75, + "grad_norm_var": 2.528059895833333, + "learning_rate": 0.0001, + "loss": 7.4391, + "loss/crossentropy": 2.058233138918877, + "loss/hidden": 3.45546875, + "loss/jsd": 0.0, + "loss/logits": 0.18879729304462672, + "step": 28320 + }, + { + "epoch": 0.70825, + "grad_norm": 32.5, + "grad_norm_var": 3.81015625, + "learning_rate": 0.0001, + "loss": 7.2291, + "loss/crossentropy": 1.9951315492391586, + "loss/hidden": 3.30234375, + "loss/jsd": 0.0, + "loss/logits": 0.1786661896854639, + "step": 28330 + }, + { + "epoch": 0.7085, + "grad_norm": 28.5, + "grad_norm_var": 3.408072916666667, + "learning_rate": 0.0001, + "loss": 7.2737, + "loss/crossentropy": 2.1709944613277914, + "loss/hidden": 3.280078125, + "loss/jsd": 0.0, + "loss/logits": 0.18447048161178828, + "step": 28340 + }, + { + "epoch": 0.70875, + "grad_norm": 29.75, + "grad_norm_var": 2.0014973958333333, + "learning_rate": 0.0001, + "loss": 7.3261, + "loss/crossentropy": 1.9754939682781696, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.1782332186587155, + "step": 28350 + }, + { + "epoch": 0.709, + "grad_norm": 28.375, + "grad_norm_var": 3.0546223958333334, + "learning_rate": 0.0001, + "loss": 7.3341, + "loss/crossentropy": 1.9486119627952576, + "loss/hidden": 3.333203125, + "loss/jsd": 0.0, + "loss/logits": 0.18434599563479423, + "step": 28360 + }, + { + "epoch": 0.70925, + "grad_norm": 28.375, + "grad_norm_var": 3.0936848958333334, + "learning_rate": 0.0001, + "loss": 7.2676, + "loss/crossentropy": 2.011347506940365, + "loss/hidden": 3.31484375, + "loss/jsd": 0.0, + "loss/logits": 0.17598988981917502, + "step": 28370 + }, + { + "epoch": 0.7095, + "grad_norm": 31.25, + "grad_norm_var": 3.278059895833333, + "learning_rate": 0.0001, + "loss": 7.2683, + "loss/crossentropy": 2.0628366231918336, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.20117868315428494, + "step": 28380 + }, + { + "epoch": 0.70975, + "grad_norm": 30.25, + "grad_norm_var": 1.9744140625, + "learning_rate": 0.0001, + "loss": 7.3684, + "loss/crossentropy": 2.1887825660407545, + "loss/hidden": 3.20390625, + "loss/jsd": 0.0, + "loss/logits": 0.180817243270576, + "step": 28390 + }, + { + "epoch": 0.71, + "grad_norm": 5838471168.0, + "grad_norm_var": 2.1304840768286884e+18, + "learning_rate": 0.0001, + "loss": 7.2715, + "loss/crossentropy": 1.938818733394146, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.18086879644542933, + "step": 28400 + }, + { + "epoch": 0.71025, + "grad_norm": 33.25, + "grad_norm_var": 2.1304840755150323e+18, + "learning_rate": 0.0001, + "loss": 7.3715, + "loss/crossentropy": 1.953948251157999, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.17758083064109087, + "step": 28410 + }, + { + "epoch": 0.7105, + "grad_norm": 31.375, + "grad_norm_var": 3.11640625, + "learning_rate": 0.0001, + "loss": 7.1884, + "loss/crossentropy": 2.3171644777059557, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.19651102535426618, + "step": 28420 + }, + { + "epoch": 0.71075, + "grad_norm": 32.0, + "grad_norm_var": 2.7643229166666665, + "learning_rate": 0.0001, + "loss": 7.3556, + "loss/crossentropy": 2.037036082148552, + "loss/hidden": 3.275, + "loss/jsd": 0.0, + "loss/logits": 0.17107345517724754, + "step": 28430 + }, + { + "epoch": 0.711, + "grad_norm": 30.75, + "grad_norm_var": 1.4561848958333334, + "learning_rate": 0.0001, + "loss": 7.2494, + "loss/crossentropy": 2.127303283661604, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.20090547651052476, + "step": 28440 + }, + { + "epoch": 0.71125, + "grad_norm": 32.0, + "grad_norm_var": 2.6809895833333335, + "learning_rate": 0.0001, + "loss": 7.3823, + "loss/crossentropy": 2.161235421895981, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.1780310433357954, + "step": 28450 + }, + { + "epoch": 0.7115, + "grad_norm": 28.75, + "grad_norm_var": 1.54140625, + "learning_rate": 0.0001, + "loss": 7.3632, + "loss/crossentropy": 2.1571759194135667, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.19603485772386192, + "step": 28460 + }, + { + "epoch": 0.71175, + "grad_norm": 29.75, + "grad_norm_var": 4.364518229166666, + "learning_rate": 0.0001, + "loss": 7.2657, + "loss/crossentropy": 2.1602049231529237, + "loss/hidden": 3.325390625, + "loss/jsd": 0.0, + "loss/logits": 0.18435865715146066, + "step": 28470 + }, + { + "epoch": 0.712, + "grad_norm": 30.5, + "grad_norm_var": 10.72890625, + "learning_rate": 0.0001, + "loss": 7.3178, + "loss/crossentropy": 2.001628914475441, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.1918376425281167, + "step": 28480 + }, + { + "epoch": 0.71225, + "grad_norm": 67.5, + "grad_norm_var": 87.51712239583334, + "learning_rate": 0.0001, + "loss": 7.2839, + "loss/crossentropy": 1.9931135676801204, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.17476971219293774, + "step": 28490 + }, + { + "epoch": 0.7125, + "grad_norm": 32.25, + "grad_norm_var": 96.16399739583333, + "learning_rate": 0.0001, + "loss": 7.3093, + "loss/crossentropy": 1.9668639115989208, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.1912683444097638, + "step": 28500 + }, + { + "epoch": 0.71275, + "grad_norm": 29.625, + "grad_norm_var": 2.3824041782086554e+18, + "learning_rate": 0.0001, + "loss": 7.3577, + "loss/crossentropy": 1.9727089330554008, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.17465871069580316, + "step": 28510 + }, + { + "epoch": 0.713, + "grad_norm": 28.875, + "grad_norm_var": 2.3824041781636367e+18, + "learning_rate": 0.0001, + "loss": 7.3469, + "loss/crossentropy": 2.1261328026652335, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.19634310100227595, + "step": 28520 + }, + { + "epoch": 0.71325, + "grad_norm": 44.5, + "grad_norm_var": 14.9853515625, + "learning_rate": 0.0001, + "loss": 7.3029, + "loss/crossentropy": 2.029768883436918, + "loss/hidden": 3.30234375, + "loss/jsd": 0.0, + "loss/logits": 0.1729470796417445, + "step": 28530 + }, + { + "epoch": 0.7135, + "grad_norm": 30.625, + "grad_norm_var": 15.383072916666666, + "learning_rate": 0.0001, + "loss": 7.1945, + "loss/crossentropy": 2.0496911972761156, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.18508521970361472, + "step": 28540 + }, + { + "epoch": 0.71375, + "grad_norm": 28.75, + "grad_norm_var": 2.428580729166667, + "learning_rate": 0.0001, + "loss": 7.323, + "loss/crossentropy": 2.064000000059605, + "loss/hidden": 3.3296875, + "loss/jsd": 0.0, + "loss/logits": 0.1856430722400546, + "step": 28550 + }, + { + "epoch": 0.714, + "grad_norm": 32.25, + "grad_norm_var": 5.41640625, + "learning_rate": 0.0001, + "loss": 7.3069, + "loss/crossentropy": 2.0117718100547792, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.17515756376087666, + "step": 28560 + }, + { + "epoch": 0.71425, + "grad_norm": 32.25, + "grad_norm_var": 2.8400390625, + "learning_rate": 0.0001, + "loss": 7.1966, + "loss/crossentropy": 1.9726905196905136, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.19007054381072522, + "step": 28570 + }, + { + "epoch": 0.7145, + "grad_norm": 28.625, + "grad_norm_var": 4.1416015625, + "learning_rate": 0.0001, + "loss": 7.4288, + "loss/crossentropy": 2.1557130038738253, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.19311564825475216, + "step": 28580 + }, + { + "epoch": 0.71475, + "grad_norm": 33.5, + "grad_norm_var": 4.809375, + "learning_rate": 0.0001, + "loss": 7.3429, + "loss/crossentropy": 2.097322532534599, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.18367934115231038, + "step": 28590 + }, + { + "epoch": 0.715, + "grad_norm": 31.75, + "grad_norm_var": 3.8895833333333334, + "learning_rate": 0.0001, + "loss": 7.2557, + "loss/crossentropy": 2.1415718257427216, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.18898510355502368, + "step": 28600 + }, + { + "epoch": 0.71525, + "grad_norm": 31.5, + "grad_norm_var": 2.3643229166666666, + "learning_rate": 0.0001, + "loss": 7.3755, + "loss/crossentropy": 2.1518311858177186, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.19082098407670856, + "step": 28610 + }, + { + "epoch": 0.7155, + "grad_norm": 29.25, + "grad_norm_var": 2.198372395833333, + "learning_rate": 0.0001, + "loss": 7.3797, + "loss/crossentropy": 2.2079875111579894, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.18495317455381155, + "step": 28620 + }, + { + "epoch": 0.71575, + "grad_norm": 27.0, + "grad_norm_var": 2.0608723958333335, + "learning_rate": 0.0001, + "loss": 7.2214, + "loss/crossentropy": 2.1074557095766067, + "loss/hidden": 3.30078125, + "loss/jsd": 0.0, + "loss/logits": 0.1770407123491168, + "step": 28630 + }, + { + "epoch": 0.716, + "grad_norm": 30.75, + "grad_norm_var": 1.9332682291666667, + "learning_rate": 0.0001, + "loss": 7.3267, + "loss/crossentropy": 2.0740742847323417, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.18051706086844205, + "step": 28640 + }, + { + "epoch": 0.71625, + "grad_norm": 30.375, + "grad_norm_var": 3.254622395833333, + "learning_rate": 0.0001, + "loss": 7.1893, + "loss/crossentropy": 1.9990646712481976, + "loss/hidden": 3.221875, + "loss/jsd": 0.0, + "loss/logits": 0.17936246804893016, + "step": 28650 + }, + { + "epoch": 0.7165, + "grad_norm": 30.75, + "grad_norm_var": 2.57890625, + "learning_rate": 0.0001, + "loss": 7.2947, + "loss/crossentropy": 2.1080400705337525, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.1867066564038396, + "step": 28660 + }, + { + "epoch": 0.71675, + "grad_norm": 31.75, + "grad_norm_var": 1.9247395833333334, + "learning_rate": 0.0001, + "loss": 7.2661, + "loss/crossentropy": 2.074036268889904, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.1867619127035141, + "step": 28670 + }, + { + "epoch": 0.717, + "grad_norm": 29.125, + "grad_norm_var": 2.0434895833333333, + "learning_rate": 0.0001, + "loss": 7.293, + "loss/crossentropy": 2.0304371282458304, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.18383819735608994, + "step": 28680 + }, + { + "epoch": 0.71725, + "grad_norm": 32.25, + "grad_norm_var": 1.9619140625, + "learning_rate": 0.0001, + "loss": 7.4196, + "loss/crossentropy": 2.0870690762996675, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.18118756357580423, + "step": 28690 + }, + { + "epoch": 0.7175, + "grad_norm": 29.375, + "grad_norm_var": 3.2504557291666667, + "learning_rate": 0.0001, + "loss": 7.2668, + "loss/crossentropy": 2.1232191398739815, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.17828409411013127, + "step": 28700 + }, + { + "epoch": 0.71775, + "grad_norm": 31.125, + "grad_norm_var": 3.216666666666667, + "learning_rate": 0.0001, + "loss": 7.3202, + "loss/crossentropy": 2.093307775259018, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.17778347730636596, + "step": 28710 + }, + { + "epoch": 0.718, + "grad_norm": 27.75, + "grad_norm_var": 2.728125, + "learning_rate": 0.0001, + "loss": 7.2286, + "loss/crossentropy": 2.086037275195122, + "loss/hidden": 3.234375, + "loss/jsd": 0.0, + "loss/logits": 0.17649652771651744, + "step": 28720 + }, + { + "epoch": 0.71825, + "grad_norm": 32.25, + "grad_norm_var": 2.872330729166667, + "learning_rate": 0.0001, + "loss": 7.2935, + "loss/crossentropy": 2.188430291414261, + "loss/hidden": 3.285546875, + "loss/jsd": 0.0, + "loss/logits": 0.18066751547157764, + "step": 28730 + }, + { + "epoch": 0.7185, + "grad_norm": 29.5, + "grad_norm_var": 2.159375, + "learning_rate": 0.0001, + "loss": 7.3229, + "loss/crossentropy": 2.103739258646965, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.18136450834572315, + "step": 28740 + }, + { + "epoch": 0.71875, + "grad_norm": 29.5, + "grad_norm_var": 3.215559895833333, + "learning_rate": 0.0001, + "loss": 7.2833, + "loss/crossentropy": 1.9982567869126797, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.18330774297937752, + "step": 28750 + }, + { + "epoch": 0.719, + "grad_norm": 30.25, + "grad_norm_var": 3.0747395833333333, + "learning_rate": 0.0001, + "loss": 7.251, + "loss/crossentropy": 2.1402099445462226, + "loss/hidden": 3.32734375, + "loss/jsd": 0.0, + "loss/logits": 0.1817182382568717, + "step": 28760 + }, + { + "epoch": 0.71925, + "grad_norm": 28.5, + "grad_norm_var": 2.667122395833333, + "learning_rate": 0.0001, + "loss": 7.2284, + "loss/crossentropy": 1.96514939814806, + "loss/hidden": 3.4609375, + "loss/jsd": 0.0, + "loss/logits": 0.18674888461828232, + "step": 28770 + }, + { + "epoch": 0.7195, + "grad_norm": 30.875, + "grad_norm_var": 13.762955729166666, + "learning_rate": 0.0001, + "loss": 7.4611, + "loss/crossentropy": 2.109529510140419, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.18532418198883532, + "step": 28780 + }, + { + "epoch": 0.71975, + "grad_norm": 29.75, + "grad_norm_var": 3.018684895833333, + "learning_rate": 0.0001, + "loss": 7.3621, + "loss/crossentropy": 2.164661727845669, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.22424146672710776, + "step": 28790 + }, + { + "epoch": 0.72, + "grad_norm": 30.875, + "grad_norm_var": 1.675, + "learning_rate": 0.0001, + "loss": 7.3163, + "loss/crossentropy": 2.0529872700572014, + "loss/hidden": 3.283203125, + "loss/jsd": 0.0, + "loss/logits": 0.17430850807577372, + "step": 28800 + }, + { + "epoch": 0.72025, + "grad_norm": 28.75, + "grad_norm_var": 3.463997395833333, + "learning_rate": 0.0001, + "loss": 7.3611, + "loss/crossentropy": 2.091229538619518, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.18271805476397276, + "step": 28810 + }, + { + "epoch": 0.7205, + "grad_norm": 29.75, + "grad_norm_var": 2.8306640625, + "learning_rate": 0.0001, + "loss": 7.3029, + "loss/crossentropy": 1.8339010991156102, + "loss/hidden": 3.455078125, + "loss/jsd": 0.0, + "loss/logits": 0.1796142913866788, + "step": 28820 + }, + { + "epoch": 0.72075, + "grad_norm": 29.5, + "grad_norm_var": 2.940559895833333, + "learning_rate": 0.0001, + "loss": 7.2718, + "loss/crossentropy": 2.2519613206386566, + "loss/hidden": 3.249609375, + "loss/jsd": 0.0, + "loss/logits": 0.1793349577113986, + "step": 28830 + }, + { + "epoch": 0.721, + "grad_norm": 32.5, + "grad_norm_var": 3.74140625, + "learning_rate": 0.0001, + "loss": 7.2763, + "loss/crossentropy": 2.0135060526430606, + "loss/hidden": 3.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.1750583214685321, + "step": 28840 + }, + { + "epoch": 0.72125, + "grad_norm": 28.375, + "grad_norm_var": 3.37265625, + "learning_rate": 0.0001, + "loss": 7.328, + "loss/crossentropy": 2.1612493455410005, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.1920415310189128, + "step": 28850 + }, + { + "epoch": 0.7215, + "grad_norm": 30.5, + "grad_norm_var": 1.76640625, + "learning_rate": 0.0001, + "loss": 7.3347, + "loss/crossentropy": 2.20034881234169, + "loss/hidden": 3.253515625, + "loss/jsd": 0.0, + "loss/logits": 0.18493964169174432, + "step": 28860 + }, + { + "epoch": 0.72175, + "grad_norm": 30.25, + "grad_norm_var": 0.9905598958333334, + "learning_rate": 0.0001, + "loss": 7.2071, + "loss/crossentropy": 2.147878049314022, + "loss/hidden": 3.33359375, + "loss/jsd": 0.0, + "loss/logits": 0.18159984424710274, + "step": 28870 + }, + { + "epoch": 0.722, + "grad_norm": 32.25, + "grad_norm_var": 1.5624348958333334, + "learning_rate": 0.0001, + "loss": 7.2372, + "loss/crossentropy": 1.9990291111171246, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.1772604424506426, + "step": 28880 + }, + { + "epoch": 0.72225, + "grad_norm": 28.75, + "grad_norm_var": 1.8525390625, + "learning_rate": 0.0001, + "loss": 7.2288, + "loss/crossentropy": 2.143052602559328, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.1891188271343708, + "step": 28890 + }, + { + "epoch": 0.7225, + "grad_norm": 29.5, + "grad_norm_var": 1.8936848958333334, + "learning_rate": 0.0001, + "loss": 7.1928, + "loss/crossentropy": 1.9795674622058868, + "loss/hidden": 3.433984375, + "loss/jsd": 0.0, + "loss/logits": 0.17800430227071046, + "step": 28900 + }, + { + "epoch": 0.72275, + "grad_norm": 42.0, + "grad_norm_var": 10.046809895833333, + "learning_rate": 0.0001, + "loss": 7.2488, + "loss/crossentropy": 2.1416710793972014, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.19329405389726162, + "step": 28910 + }, + { + "epoch": 0.723, + "grad_norm": 31.25, + "grad_norm_var": 10.276822916666667, + "learning_rate": 0.0001, + "loss": 7.2998, + "loss/crossentropy": 1.9632095940411092, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.1833349185064435, + "step": 28920 + }, + { + "epoch": 0.72325, + "grad_norm": 30.375, + "grad_norm_var": 2.0395182291666667, + "learning_rate": 0.0001, + "loss": 7.3208, + "loss/crossentropy": 2.1186553999781608, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.18248404301702975, + "step": 28930 + }, + { + "epoch": 0.7235, + "grad_norm": 30.0, + "grad_norm_var": 3.1434895833333334, + "learning_rate": 0.0001, + "loss": 7.397, + "loss/crossentropy": 2.1514319077134134, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.20566836930811405, + "step": 28940 + }, + { + "epoch": 0.72375, + "grad_norm": 28.25, + "grad_norm_var": 3.1030598958333333, + "learning_rate": 0.0001, + "loss": 7.2651, + "loss/crossentropy": 2.0852475076913835, + "loss/hidden": 3.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.17877694573253394, + "step": 28950 + }, + { + "epoch": 0.724, + "grad_norm": 30.125, + "grad_norm_var": 2.2171223958333335, + "learning_rate": 0.0001, + "loss": 7.3286, + "loss/crossentropy": 2.1540793150663378, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.17957639656960964, + "step": 28960 + }, + { + "epoch": 0.72425, + "grad_norm": 28.625, + "grad_norm_var": 4.618489583333333, + "learning_rate": 0.0001, + "loss": 7.3035, + "loss/crossentropy": 2.13916500210762, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.2003261223435402, + "step": 28970 + }, + { + "epoch": 0.7245, + "grad_norm": 31.125, + "grad_norm_var": 3.435416666666667, + "learning_rate": 0.0001, + "loss": 7.3627, + "loss/crossentropy": 2.0126554384827613, + "loss/hidden": 3.478515625, + "loss/jsd": 0.0, + "loss/logits": 0.192663181014359, + "step": 28980 + }, + { + "epoch": 0.72475, + "grad_norm": 27.5, + "grad_norm_var": 12.14765625, + "learning_rate": 0.0001, + "loss": 7.2051, + "loss/crossentropy": 1.9036022566258908, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.1818926576524973, + "step": 28990 + }, + { + "epoch": 0.725, + "grad_norm": 28.25, + "grad_norm_var": 12.360416666666667, + "learning_rate": 0.0001, + "loss": 7.247, + "loss/crossentropy": 2.158479106426239, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.2065505689010024, + "step": 29000 + }, + { + "epoch": 0.72525, + "grad_norm": 29.375, + "grad_norm_var": 2.158072916666667, + "learning_rate": 0.0001, + "loss": 7.3395, + "loss/crossentropy": 2.105581759661436, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.19470038171857595, + "step": 29010 + }, + { + "epoch": 0.7255, + "grad_norm": 36.75, + "grad_norm_var": 5.5462890625, + "learning_rate": 0.0001, + "loss": 7.3939, + "loss/crossentropy": 1.9542883321642877, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.1957132790237665, + "step": 29020 + }, + { + "epoch": 0.72575, + "grad_norm": 33.0, + "grad_norm_var": 7.20390625, + "learning_rate": 0.0001, + "loss": 7.2865, + "loss/crossentropy": 2.1816267639398577, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.2034391313791275, + "step": 29030 + }, + { + "epoch": 0.726, + "grad_norm": 30.125, + "grad_norm_var": 11.633072916666666, + "learning_rate": 0.0001, + "loss": 7.2807, + "loss/crossentropy": 2.1128641098737715, + "loss/hidden": 3.312890625, + "loss/jsd": 0.0, + "loss/logits": 0.17710135243833064, + "step": 29040 + }, + { + "epoch": 0.72625, + "grad_norm": 29.75, + "grad_norm_var": 11.9150390625, + "learning_rate": 0.0001, + "loss": 7.2213, + "loss/crossentropy": 2.154743219912052, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.1751030594110489, + "step": 29050 + }, + { + "epoch": 0.7265, + "grad_norm": 39.75, + "grad_norm_var": 63.35305989583333, + "learning_rate": 0.0001, + "loss": 7.2911, + "loss/crossentropy": 1.957652424275875, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.1776497988961637, + "step": 29060 + }, + { + "epoch": 0.72675, + "grad_norm": 29.0, + "grad_norm_var": 17.909309895833335, + "learning_rate": 0.0001, + "loss": 7.2493, + "loss/crossentropy": 1.99792113378644, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.18379787560552358, + "step": 29070 + }, + { + "epoch": 0.727, + "grad_norm": 29.875, + "grad_norm_var": 3.6488932291666667, + "learning_rate": 0.0001, + "loss": 7.2821, + "loss/crossentropy": 2.0582117259502413, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.18036038037389518, + "step": 29080 + }, + { + "epoch": 0.72725, + "grad_norm": 29.625, + "grad_norm_var": 1.8728515625, + "learning_rate": 0.0001, + "loss": 7.0774, + "loss/crossentropy": 2.014867861568928, + "loss/hidden": 3.3015625, + "loss/jsd": 0.0, + "loss/logits": 0.17281777542084456, + "step": 29090 + }, + { + "epoch": 0.7275, + "grad_norm": 28.625, + "grad_norm_var": 3.658333333333333, + "learning_rate": 0.0001, + "loss": 7.3509, + "loss/crossentropy": 2.085656464099884, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.17972098607569933, + "step": 29100 + }, + { + "epoch": 0.72775, + "grad_norm": 31.75, + "grad_norm_var": 2.487434895833333, + "learning_rate": 0.0001, + "loss": 7.3234, + "loss/crossentropy": 2.0445970110595226, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.16965955086052417, + "step": 29110 + }, + { + "epoch": 0.728, + "grad_norm": 28.0, + "grad_norm_var": 1.3020182291666667, + "learning_rate": 0.0001, + "loss": 7.2674, + "loss/crossentropy": 2.2715115547180176, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.1960804032161832, + "step": 29120 + }, + { + "epoch": 0.72825, + "grad_norm": 29.625, + "grad_norm_var": 5.242708333333334, + "learning_rate": 0.0001, + "loss": 7.3008, + "loss/crossentropy": 2.1775743424892426, + "loss/hidden": 3.32734375, + "loss/jsd": 0.0, + "loss/logits": 0.18752657640725373, + "step": 29130 + }, + { + "epoch": 0.7285, + "grad_norm": 38.5, + "grad_norm_var": 3.132886831921837e+18, + "learning_rate": 0.0001, + "loss": 7.219, + "loss/crossentropy": 1.903736773133278, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.17021676823496817, + "step": 29140 + }, + { + "epoch": 0.72875, + "grad_norm": 38.75, + "grad_norm_var": 20.174739583333334, + "learning_rate": 0.0001, + "loss": 7.3486, + "loss/crossentropy": 2.1836992099881174, + "loss/hidden": 3.478125, + "loss/jsd": 0.0, + "loss/logits": 0.2182300578802824, + "step": 29150 + }, + { + "epoch": 0.729, + "grad_norm": 32.5, + "grad_norm_var": 17.109830729166667, + "learning_rate": 0.0001, + "loss": 7.2701, + "loss/crossentropy": 2.0530862897634505, + "loss/hidden": 3.2484375, + "loss/jsd": 0.0, + "loss/logits": 0.18154476722702384, + "step": 29160 + }, + { + "epoch": 0.72925, + "grad_norm": 30.75, + "grad_norm_var": 1.2494140625, + "learning_rate": 0.0001, + "loss": 7.2304, + "loss/crossentropy": 2.0158081978559492, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.18534094132483006, + "step": 29170 + }, + { + "epoch": 0.7295, + "grad_norm": 30.25, + "grad_norm_var": 43.02682291666667, + "learning_rate": 0.0001, + "loss": 7.3198, + "loss/crossentropy": 2.254640057682991, + "loss/hidden": 3.26953125, + "loss/jsd": 0.0, + "loss/logits": 0.19149381387978792, + "step": 29180 + }, + { + "epoch": 0.72975, + "grad_norm": 32.25, + "grad_norm_var": 42.57265625, + "learning_rate": 0.0001, + "loss": 7.3284, + "loss/crossentropy": 2.1554953277111055, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.1914082583039999, + "step": 29190 + }, + { + "epoch": 0.73, + "grad_norm": 30.875, + "grad_norm_var": 2.520247395833333, + "learning_rate": 0.0001, + "loss": 7.188, + "loss/crossentropy": 2.095432303100824, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.17190228644758462, + "step": 29200 + }, + { + "epoch": 0.73025, + "grad_norm": 31.0, + "grad_norm_var": 1.8103515625, + "learning_rate": 0.0001, + "loss": 7.2669, + "loss/crossentropy": 1.9353884100914, + "loss/hidden": 3.3296875, + "loss/jsd": 0.0, + "loss/logits": 0.1733547686599195, + "step": 29210 + }, + { + "epoch": 0.7305, + "grad_norm": 29.75, + "grad_norm_var": 1.64375, + "learning_rate": 0.0001, + "loss": 7.38, + "loss/crossentropy": 2.1763652101159097, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.19759245738387107, + "step": 29220 + }, + { + "epoch": 0.73075, + "grad_norm": 29.625, + "grad_norm_var": 1.9811848958333333, + "learning_rate": 0.0001, + "loss": 7.316, + "loss/crossentropy": 2.059311767667532, + "loss/hidden": 3.273046875, + "loss/jsd": 0.0, + "loss/logits": 0.1732610635459423, + "step": 29230 + }, + { + "epoch": 0.731, + "grad_norm": 30.75, + "grad_norm_var": 1.6296223958333333, + "learning_rate": 0.0001, + "loss": 7.2387, + "loss/crossentropy": 2.1624471306800843, + "loss/hidden": 3.27421875, + "loss/jsd": 0.0, + "loss/logits": 0.18443353176116944, + "step": 29240 + }, + { + "epoch": 0.73125, + "grad_norm": 30.875, + "grad_norm_var": 1.1622395833333334, + "learning_rate": 0.0001, + "loss": 7.255, + "loss/crossentropy": 2.087037294358015, + "loss/hidden": 3.46484375, + "loss/jsd": 0.0, + "loss/logits": 0.18923133062198758, + "step": 29250 + }, + { + "epoch": 0.7315, + "grad_norm": 28.625, + "grad_norm_var": 1.5389973958333334, + "learning_rate": 0.0001, + "loss": 7.1715, + "loss/crossentropy": 2.0068927347660064, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.1776790694333613, + "step": 29260 + }, + { + "epoch": 0.73175, + "grad_norm": 34.75, + "grad_norm_var": 3.0125, + "learning_rate": 0.0001, + "loss": 7.3044, + "loss/crossentropy": 2.237174994498491, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.1987008555792272, + "step": 29270 + }, + { + "epoch": 0.732, + "grad_norm": 32.75, + "grad_norm_var": 4.869791666666667, + "learning_rate": 0.0001, + "loss": 7.3198, + "loss/crossentropy": 2.073240910470486, + "loss/hidden": 3.3578125, + "loss/jsd": 0.0, + "loss/logits": 0.19791424218565226, + "step": 29280 + }, + { + "epoch": 0.73225, + "grad_norm": 32.5, + "grad_norm_var": 3.595833333333333, + "learning_rate": 0.0001, + "loss": 7.3329, + "loss/crossentropy": 2.123592960834503, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.18660867251455784, + "step": 29290 + }, + { + "epoch": 0.7325, + "grad_norm": 27.25, + "grad_norm_var": 2.0957682291666666, + "learning_rate": 0.0001, + "loss": 7.2039, + "loss/crossentropy": 2.1879305720329283, + "loss/hidden": 3.269921875, + "loss/jsd": 0.0, + "loss/logits": 0.18721728827804326, + "step": 29300 + }, + { + "epoch": 0.73275, + "grad_norm": 26.375, + "grad_norm_var": 23.684830729166666, + "learning_rate": 0.0001, + "loss": 7.3075, + "loss/crossentropy": 2.0866588428616524, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.17703543622046708, + "step": 29310 + }, + { + "epoch": 0.733, + "grad_norm": 31.375, + "grad_norm_var": 23.417708333333334, + "learning_rate": 0.0001, + "loss": 7.2536, + "loss/crossentropy": 2.250990514457226, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.18253531642258167, + "step": 29320 + }, + { + "epoch": 0.73325, + "grad_norm": 29.125, + "grad_norm_var": 2.414322916666667, + "learning_rate": 0.0001, + "loss": 7.2226, + "loss/crossentropy": 2.102177432179451, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.19504718706011773, + "step": 29330 + }, + { + "epoch": 0.7335, + "grad_norm": 32.5, + "grad_norm_var": 2.820572916666667, + "learning_rate": 0.0001, + "loss": 7.1243, + "loss/crossentropy": 2.0512589052319528, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.18182420805096627, + "step": 29340 + }, + { + "epoch": 0.73375, + "grad_norm": 29.75, + "grad_norm_var": 1.9934895833333333, + "learning_rate": 0.0001, + "loss": 7.3111, + "loss/crossentropy": 2.143858629465103, + "loss/hidden": 3.480078125, + "loss/jsd": 0.0, + "loss/logits": 0.19919059351086615, + "step": 29350 + }, + { + "epoch": 0.734, + "grad_norm": 31.375, + "grad_norm_var": 1.80625, + "learning_rate": 0.0001, + "loss": 7.2538, + "loss/crossentropy": 2.0024397730827332, + "loss/hidden": 3.255859375, + "loss/jsd": 0.0, + "loss/logits": 0.17439783103764056, + "step": 29360 + }, + { + "epoch": 0.73425, + "grad_norm": 31.0, + "grad_norm_var": 3.5143229166666665, + "learning_rate": 0.0001, + "loss": 7.1658, + "loss/crossentropy": 1.8820870153605938, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.17721544997766614, + "step": 29370 + }, + { + "epoch": 0.7345, + "grad_norm": 29.75, + "grad_norm_var": 4.33125, + "learning_rate": 0.0001, + "loss": 7.2407, + "loss/crossentropy": 2.068721887469292, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.1961481623351574, + "step": 29380 + }, + { + "epoch": 0.73475, + "grad_norm": 31.5, + "grad_norm_var": 2.603125, + "learning_rate": 0.0001, + "loss": 7.252, + "loss/crossentropy": 2.016729524731636, + "loss/hidden": 3.3640625, + "loss/jsd": 0.0, + "loss/logits": 0.18200870957225562, + "step": 29390 + }, + { + "epoch": 0.735, + "grad_norm": 29.75, + "grad_norm_var": 1.48515625, + "learning_rate": 0.0001, + "loss": 7.3778, + "loss/crossentropy": 2.2266832500696183, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.18430891614407302, + "step": 29400 + }, + { + "epoch": 0.73525, + "grad_norm": 31.0, + "grad_norm_var": 7.8931640625, + "learning_rate": 0.0001, + "loss": 7.2621, + "loss/crossentropy": 1.9326732844114303, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.19005155926570297, + "step": 29410 + }, + { + "epoch": 0.7355, + "grad_norm": 29.25, + "grad_norm_var": 6.9681640625, + "learning_rate": 0.0001, + "loss": 7.3399, + "loss/crossentropy": 2.017210554331541, + "loss/hidden": 3.240625, + "loss/jsd": 0.0, + "loss/logits": 0.190559295238927, + "step": 29420 + }, + { + "epoch": 0.73575, + "grad_norm": 30.625, + "grad_norm_var": 1.7583333333333333, + "learning_rate": 0.0001, + "loss": 7.2686, + "loss/crossentropy": 1.7766004413366319, + "loss/hidden": 3.479296875, + "loss/jsd": 0.0, + "loss/logits": 0.18161138612776995, + "step": 29430 + }, + { + "epoch": 0.736, + "grad_norm": 30.625, + "grad_norm_var": 7.8041015625, + "learning_rate": 0.0001, + "loss": 7.4176, + "loss/crossentropy": 2.1919517919421194, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.19109788965433835, + "step": 29440 + }, + { + "epoch": 0.73625, + "grad_norm": 33.0, + "grad_norm_var": 2.630989583333333, + "learning_rate": 0.0001, + "loss": 7.2415, + "loss/crossentropy": 2.187614656984806, + "loss/hidden": 3.294921875, + "loss/jsd": 0.0, + "loss/logits": 0.17972742971032857, + "step": 29450 + }, + { + "epoch": 0.7365, + "grad_norm": 32.25, + "grad_norm_var": 3.4166015625, + "learning_rate": 0.0001, + "loss": 7.2987, + "loss/crossentropy": 2.2710886627435682, + "loss/hidden": 3.292578125, + "loss/jsd": 0.0, + "loss/logits": 0.1898843079805374, + "step": 29460 + }, + { + "epoch": 0.73675, + "grad_norm": 33.25, + "grad_norm_var": 2.787955729166667, + "learning_rate": 0.0001, + "loss": 7.1919, + "loss/crossentropy": 2.1406801946461202, + "loss/hidden": 3.28671875, + "loss/jsd": 0.0, + "loss/logits": 0.18407909590750932, + "step": 29470 + }, + { + "epoch": 0.737, + "grad_norm": 28.875, + "grad_norm_var": 10.710416666666667, + "learning_rate": 0.0001, + "loss": 7.3633, + "loss/crossentropy": 2.117516166716814, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.1789137402549386, + "step": 29480 + }, + { + "epoch": 0.73725, + "grad_norm": 29.25, + "grad_norm_var": 16.240559895833332, + "learning_rate": 0.0001, + "loss": 7.1497, + "loss/crossentropy": 2.0938372761011124, + "loss/hidden": 3.237109375, + "loss/jsd": 0.0, + "loss/logits": 0.16651670411229133, + "step": 29490 + }, + { + "epoch": 0.7375, + "grad_norm": 31.0, + "grad_norm_var": 8.467122395833334, + "learning_rate": 0.0001, + "loss": 7.1916, + "loss/crossentropy": 2.14051573574543, + "loss/hidden": 3.33671875, + "loss/jsd": 0.0, + "loss/logits": 0.17684268727898597, + "step": 29500 + }, + { + "epoch": 0.73775, + "grad_norm": 31.625, + "grad_norm_var": 1.834375, + "learning_rate": 0.0001, + "loss": 7.2436, + "loss/crossentropy": 1.9776017241179944, + "loss/hidden": 3.290234375, + "loss/jsd": 0.0, + "loss/logits": 0.17606793977320195, + "step": 29510 + }, + { + "epoch": 0.738, + "grad_norm": 29.75, + "grad_norm_var": 1.5052083333333333, + "learning_rate": 0.0001, + "loss": 7.2794, + "loss/crossentropy": 2.1177644938230515, + "loss/hidden": 3.40078125, + "loss/jsd": 0.0, + "loss/logits": 0.19227085784077644, + "step": 29520 + }, + { + "epoch": 0.73825, + "grad_norm": 34.75, + "grad_norm_var": 6.0197265625, + "learning_rate": 0.0001, + "loss": 7.2809, + "loss/crossentropy": 2.0195805758237837, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.16698714587837457, + "step": 29530 + }, + { + "epoch": 0.7385, + "grad_norm": 31.25, + "grad_norm_var": 5.507747395833333, + "learning_rate": 0.0001, + "loss": 7.2993, + "loss/crossentropy": 2.17318941950798, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.1887758129276335, + "step": 29540 + }, + { + "epoch": 0.73875, + "grad_norm": 28.875, + "grad_norm_var": 1.6400390625, + "learning_rate": 0.0001, + "loss": 7.2153, + "loss/crossentropy": 2.0143275789916517, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.16856615301221609, + "step": 29550 + }, + { + "epoch": 0.739, + "grad_norm": 29.375, + "grad_norm_var": 2.662239583333333, + "learning_rate": 0.0001, + "loss": 7.3949, + "loss/crossentropy": 2.2027142092585565, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.18870100285857916, + "step": 29560 + }, + { + "epoch": 0.73925, + "grad_norm": 29.5, + "grad_norm_var": 2.7436848958333333, + "learning_rate": 0.0001, + "loss": 7.3728, + "loss/crossentropy": 1.9897805616259574, + "loss/hidden": 3.44609375, + "loss/jsd": 0.0, + "loss/logits": 0.21316022276878357, + "step": 29570 + }, + { + "epoch": 0.7395, + "grad_norm": 36.0, + "grad_norm_var": 3.5747395833333333, + "learning_rate": 0.0001, + "loss": 7.2612, + "loss/crossentropy": 2.169425293803215, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.18564392458647488, + "step": 29580 + }, + { + "epoch": 0.73975, + "grad_norm": 33.5, + "grad_norm_var": 5.126497395833334, + "learning_rate": 0.0001, + "loss": 7.1713, + "loss/crossentropy": 1.8968232870101929, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.17540717963129282, + "step": 29590 + }, + { + "epoch": 0.74, + "grad_norm": 30.375, + "grad_norm_var": 3.905989583333333, + "learning_rate": 0.0001, + "loss": 7.3553, + "loss/crossentropy": 2.120753511786461, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.1904517425224185, + "step": 29600 + }, + { + "epoch": 0.74025, + "grad_norm": 30.375, + "grad_norm_var": 4.1375, + "learning_rate": 0.0001, + "loss": 7.3046, + "loss/crossentropy": 2.1590781211853027, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.19008232783526183, + "step": 29610 + }, + { + "epoch": 0.7405, + "grad_norm": 31.375, + "grad_norm_var": 3.600455729166667, + "learning_rate": 0.0001, + "loss": 7.3147, + "loss/crossentropy": 1.9427101552486419, + "loss/hidden": 3.308984375, + "loss/jsd": 0.0, + "loss/logits": 0.170235357247293, + "step": 29620 + }, + { + "epoch": 0.74075, + "grad_norm": 29.75, + "grad_norm_var": 1.8582682291666666, + "learning_rate": 0.0001, + "loss": 7.3301, + "loss/crossentropy": 2.0426175825297834, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.1734724512323737, + "step": 29630 + }, + { + "epoch": 0.741, + "grad_norm": 30.375, + "grad_norm_var": 1.0462890625, + "learning_rate": 0.0001, + "loss": 7.3503, + "loss/crossentropy": 2.1062657207250597, + "loss/hidden": 3.330078125, + "loss/jsd": 0.0, + "loss/logits": 0.1793691584840417, + "step": 29640 + }, + { + "epoch": 0.74125, + "grad_norm": 33.0, + "grad_norm_var": 2.2684895833333334, + "learning_rate": 0.0001, + "loss": 7.2387, + "loss/crossentropy": 2.0198217228055, + "loss/hidden": 3.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.17088997922837734, + "step": 29650 + }, + { + "epoch": 0.7415, + "grad_norm": 31.125, + "grad_norm_var": 3.3677083333333333, + "learning_rate": 0.0001, + "loss": 7.2345, + "loss/crossentropy": 2.131781388819218, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.18850240260362625, + "step": 29660 + }, + { + "epoch": 0.74175, + "grad_norm": 31.125, + "grad_norm_var": 3.145247395833333, + "learning_rate": 0.0001, + "loss": 7.2585, + "loss/crossentropy": 2.1001341193914413, + "loss/hidden": 3.291015625, + "loss/jsd": 0.0, + "loss/logits": 0.18017663452774285, + "step": 29670 + }, + { + "epoch": 0.742, + "grad_norm": 30.375, + "grad_norm_var": 4.264322916666667, + "learning_rate": 0.0001, + "loss": 7.3288, + "loss/crossentropy": 2.1850179612636564, + "loss/hidden": 3.337890625, + "loss/jsd": 0.0, + "loss/logits": 0.1874097477644682, + "step": 29680 + }, + { + "epoch": 0.74225, + "grad_norm": 31.125, + "grad_norm_var": 2.655143229166667, + "learning_rate": 0.0001, + "loss": 7.284, + "loss/crossentropy": 2.088411360234022, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.1895237101241946, + "step": 29690 + }, + { + "epoch": 0.7425, + "grad_norm": 28.125, + "grad_norm_var": 3.039322916666667, + "learning_rate": 0.0001, + "loss": 7.1583, + "loss/crossentropy": 2.0124595299363137, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.17138318605720998, + "step": 29700 + }, + { + "epoch": 0.74275, + "grad_norm": 30.875, + "grad_norm_var": 1.5801432291666666, + "learning_rate": 0.0001, + "loss": 7.2585, + "loss/crossentropy": 2.1081888318061828, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.17212112471461297, + "step": 29710 + }, + { + "epoch": 0.743, + "grad_norm": 30.25, + "grad_norm_var": 4.846809895833333, + "learning_rate": 0.0001, + "loss": 7.2807, + "loss/crossentropy": 2.052864673733711, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.17533118948340415, + "step": 29720 + }, + { + "epoch": 0.74325, + "grad_norm": 28.875, + "grad_norm_var": 1.8083333333333333, + "learning_rate": 0.0001, + "loss": 7.2476, + "loss/crossentropy": 2.0982922323048117, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.18855995871126652, + "step": 29730 + }, + { + "epoch": 0.7435, + "grad_norm": 32.75, + "grad_norm_var": 3.3103515625, + "learning_rate": 0.0001, + "loss": 7.3216, + "loss/crossentropy": 2.0989072144031526, + "loss/hidden": 3.290625, + "loss/jsd": 0.0, + "loss/logits": 0.1823441507294774, + "step": 29740 + }, + { + "epoch": 0.74375, + "grad_norm": 32.5, + "grad_norm_var": 2.7979166666666666, + "learning_rate": 0.0001, + "loss": 7.2959, + "loss/crossentropy": 2.1087387457489966, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.19064594879746438, + "step": 29750 + }, + { + "epoch": 0.744, + "grad_norm": 31.875, + "grad_norm_var": 1.8525390625, + "learning_rate": 0.0001, + "loss": 7.4191, + "loss/crossentropy": 1.9283446200191974, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.1755460049957037, + "step": 29760 + }, + { + "epoch": 0.74425, + "grad_norm": 32.25, + "grad_norm_var": 2.370247395833333, + "learning_rate": 0.0001, + "loss": 7.1647, + "loss/crossentropy": 2.0763636574149134, + "loss/hidden": 3.30078125, + "loss/jsd": 0.0, + "loss/logits": 0.18310361802577974, + "step": 29770 + }, + { + "epoch": 0.7445, + "grad_norm": 31.5, + "grad_norm_var": 2.1337890625, + "learning_rate": 0.0001, + "loss": 7.3702, + "loss/crossentropy": 2.1763921469449996, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.18296955525875092, + "step": 29780 + }, + { + "epoch": 0.74475, + "grad_norm": 35.0, + "grad_norm_var": 4.333072916666667, + "learning_rate": 0.0001, + "loss": 7.2519, + "loss/crossentropy": 2.101426640152931, + "loss/hidden": 3.403125, + "loss/jsd": 0.0, + "loss/logits": 0.18623395673930646, + "step": 29790 + }, + { + "epoch": 0.745, + "grad_norm": 30.0, + "grad_norm_var": 3.4559895833333334, + "learning_rate": 0.0001, + "loss": 7.2323, + "loss/crossentropy": 2.1355843022465706, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.19213641248643398, + "step": 29800 + }, + { + "epoch": 0.74525, + "grad_norm": 28.75, + "grad_norm_var": 2.5947265625, + "learning_rate": 0.0001, + "loss": 7.3239, + "loss/crossentropy": 2.138861904293299, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.19748148396611215, + "step": 29810 + }, + { + "epoch": 0.7455, + "grad_norm": 32.0, + "grad_norm_var": 2.6455729166666666, + "learning_rate": 0.0001, + "loss": 7.2485, + "loss/crossentropy": 2.074553743004799, + "loss/hidden": 3.25390625, + "loss/jsd": 0.0, + "loss/logits": 0.17721196599304676, + "step": 29820 + }, + { + "epoch": 0.74575, + "grad_norm": 33.25, + "grad_norm_var": 2.919205729166667, + "learning_rate": 0.0001, + "loss": 7.2774, + "loss/crossentropy": 2.206491988897324, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.19982497543096542, + "step": 29830 + }, + { + "epoch": 0.746, + "grad_norm": 29.25, + "grad_norm_var": 3.2625, + "learning_rate": 0.0001, + "loss": 7.1951, + "loss/crossentropy": 2.032223401963711, + "loss/hidden": 3.3328125, + "loss/jsd": 0.0, + "loss/logits": 0.18081800378859042, + "step": 29840 + }, + { + "epoch": 0.74625, + "grad_norm": 28.125, + "grad_norm_var": 2.6455729166666666, + "learning_rate": 0.0001, + "loss": 7.2018, + "loss/crossentropy": 2.1063923329114913, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.17570548821240664, + "step": 29850 + }, + { + "epoch": 0.7465, + "grad_norm": 30.75, + "grad_norm_var": 3.1328868320619617e+18, + "learning_rate": 0.0001, + "loss": 7.2075, + "loss/crossentropy": 2.008588743954897, + "loss/hidden": 3.26640625, + "loss/jsd": 0.0, + "loss/logits": 0.16772094201296567, + "step": 29860 + }, + { + "epoch": 0.74675, + "grad_norm": 30.25, + "grad_norm_var": 3.1328868314940877e+18, + "learning_rate": 0.0001, + "loss": 7.2256, + "loss/crossentropy": 2.327219396829605, + "loss/hidden": 3.30078125, + "loss/jsd": 0.0, + "loss/logits": 0.1916476283222437, + "step": 29870 + }, + { + "epoch": 0.747, + "grad_norm": 30.25, + "grad_norm_var": 2.06015625, + "learning_rate": 0.0001, + "loss": 7.2006, + "loss/crossentropy": 2.1967759788036347, + "loss/hidden": 3.255859375, + "loss/jsd": 0.0, + "loss/logits": 0.181925462000072, + "step": 29880 + }, + { + "epoch": 0.74725, + "grad_norm": 29.0, + "grad_norm_var": 1.07265625, + "learning_rate": 0.0001, + "loss": 7.3302, + "loss/crossentropy": 2.0784429952502252, + "loss/hidden": 3.4046875, + "loss/jsd": 0.0, + "loss/logits": 0.20122398622334003, + "step": 29890 + }, + { + "epoch": 0.7475, + "grad_norm": 30.5, + "grad_norm_var": 2.16015625, + "learning_rate": 0.0001, + "loss": 7.2241, + "loss/crossentropy": 2.044815068691969, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.17522517647594213, + "step": 29900 + }, + { + "epoch": 0.74775, + "grad_norm": 30.625, + "grad_norm_var": 2.7848307291666665, + "learning_rate": 0.0001, + "loss": 7.241, + "loss/crossentropy": 2.0202932924032213, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.18710787147283553, + "step": 29910 + }, + { + "epoch": 0.748, + "grad_norm": 35.75, + "grad_norm_var": 4.892643229166667, + "learning_rate": 0.0001, + "loss": 7.4702, + "loss/crossentropy": 1.9623441621661186, + "loss/hidden": 3.460546875, + "loss/jsd": 0.0, + "loss/logits": 0.18277720119804144, + "step": 29920 + }, + { + "epoch": 0.74825, + "grad_norm": 33.5, + "grad_norm_var": 32.8087890625, + "learning_rate": 0.0001, + "loss": 7.2328, + "loss/crossentropy": 2.1580356508493423, + "loss/hidden": 3.273828125, + "loss/jsd": 0.0, + "loss/logits": 0.18409592770040034, + "step": 29930 + }, + { + "epoch": 0.7485, + "grad_norm": 29.5, + "grad_norm_var": 5.335416666666666, + "learning_rate": 0.0001, + "loss": 7.2307, + "loss/crossentropy": 1.9020950391888618, + "loss/hidden": 3.2171875, + "loss/jsd": 0.0, + "loss/logits": 0.16977879907935858, + "step": 29940 + }, + { + "epoch": 0.74875, + "grad_norm": 28.5, + "grad_norm_var": 3.9124348958333335, + "learning_rate": 0.0001, + "loss": 7.1602, + "loss/crossentropy": 1.9830825686454774, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.19025738779455423, + "step": 29950 + }, + { + "epoch": 0.749, + "grad_norm": 30.125, + "grad_norm_var": 1.4872395833333334, + "learning_rate": 0.0001, + "loss": 7.2886, + "loss/crossentropy": 2.0990236908197404, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.20246837567538023, + "step": 29960 + }, + { + "epoch": 0.74925, + "grad_norm": 32.0, + "grad_norm_var": 1.5291015625, + "learning_rate": 0.0001, + "loss": 7.3944, + "loss/crossentropy": 2.0726598769426348, + "loss/hidden": 3.27890625, + "loss/jsd": 0.0, + "loss/logits": 0.17855302877724172, + "step": 29970 + }, + { + "epoch": 0.7495, + "grad_norm": 29.0, + "grad_norm_var": 3.155989583333333, + "learning_rate": 0.0001, + "loss": 7.0988, + "loss/crossentropy": 2.0020054180175064, + "loss/hidden": 3.27890625, + "loss/jsd": 0.0, + "loss/logits": 0.1707400310318917, + "step": 29980 + }, + { + "epoch": 0.74975, + "grad_norm": 29.75, + "grad_norm_var": 2.613541666666667, + "learning_rate": 0.0001, + "loss": 7.3437, + "loss/crossentropy": 2.1497740030288695, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.18407329265028238, + "step": 29990 + }, + { + "epoch": 0.75, + "grad_norm": 30.125, + "grad_norm_var": 1.3348307291666666, + "learning_rate": 0.0001, + "loss": 7.3011, + "loss/crossentropy": 2.10102431550622, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.18685767855495214, + "step": 30000 + }, + { + "epoch": 0.75025, + "grad_norm": 28.75, + "grad_norm_var": 1.8431640625, + "learning_rate": 0.0001, + "loss": 7.2246, + "loss/crossentropy": 2.2319724306464197, + "loss/hidden": 3.389453125, + "loss/jsd": 0.0, + "loss/logits": 0.21383447982370854, + "step": 30010 + }, + { + "epoch": 0.7505, + "grad_norm": 28.875, + "grad_norm_var": 3.6244140625, + "learning_rate": 0.0001, + "loss": 7.3341, + "loss/crossentropy": 2.021278867125511, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.20931231547147036, + "step": 30020 + }, + { + "epoch": 0.75075, + "grad_norm": 29.375, + "grad_norm_var": 4.070572916666666, + "learning_rate": 0.0001, + "loss": 7.2765, + "loss/crossentropy": 2.1085473522543907, + "loss/hidden": 3.330078125, + "loss/jsd": 0.0, + "loss/logits": 0.18091097762808203, + "step": 30030 + }, + { + "epoch": 0.751, + "grad_norm": 31.75, + "grad_norm_var": 4.53125, + "learning_rate": 0.0001, + "loss": 7.3646, + "loss/crossentropy": 2.1863505825400353, + "loss/hidden": 3.35078125, + "loss/jsd": 0.0, + "loss/logits": 0.1947094239294529, + "step": 30040 + }, + { + "epoch": 0.75125, + "grad_norm": 29.75, + "grad_norm_var": 3.1030598958333333, + "learning_rate": 0.0001, + "loss": 7.2667, + "loss/crossentropy": 1.9493524998426437, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.18073509354144335, + "step": 30050 + }, + { + "epoch": 0.7515, + "grad_norm": 33.0, + "grad_norm_var": 2.85625, + "learning_rate": 0.0001, + "loss": 7.3301, + "loss/crossentropy": 2.0162834845483304, + "loss/hidden": 3.274609375, + "loss/jsd": 0.0, + "loss/logits": 0.17740605510771273, + "step": 30060 + }, + { + "epoch": 0.75175, + "grad_norm": 30.0, + "grad_norm_var": 1.6854166666666666, + "learning_rate": 0.0001, + "loss": 7.3117, + "loss/crossentropy": 2.247773203253746, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.1955332661047578, + "step": 30070 + }, + { + "epoch": 0.752, + "grad_norm": 28.75, + "grad_norm_var": 2.721809895833333, + "learning_rate": 0.0001, + "loss": 7.2713, + "loss/crossentropy": 1.8765224024653435, + "loss/hidden": 3.3640625, + "loss/jsd": 0.0, + "loss/logits": 0.18920745309442283, + "step": 30080 + }, + { + "epoch": 0.75225, + "grad_norm": 29.75, + "grad_norm_var": 2.4228515625, + "learning_rate": 0.0001, + "loss": 7.2952, + "loss/crossentropy": 2.0629479378461837, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.1822500554844737, + "step": 30090 + }, + { + "epoch": 0.7525, + "grad_norm": 29.5, + "grad_norm_var": 1.8885416666666666, + "learning_rate": 0.0001, + "loss": 7.2476, + "loss/crossentropy": 2.0535168081521986, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.1875293802469969, + "step": 30100 + }, + { + "epoch": 0.75275, + "grad_norm": 29.5, + "grad_norm_var": 2.8353515625, + "learning_rate": 0.0001, + "loss": 7.3444, + "loss/crossentropy": 2.153412875533104, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.20092597529292106, + "step": 30110 + }, + { + "epoch": 0.753, + "grad_norm": 29.75, + "grad_norm_var": 13.36015625, + "learning_rate": 0.0001, + "loss": 7.1798, + "loss/crossentropy": 2.135878336429596, + "loss/hidden": 3.26328125, + "loss/jsd": 0.0, + "loss/logits": 0.17593010412529111, + "step": 30120 + }, + { + "epoch": 0.75325, + "grad_norm": 33.0, + "grad_norm_var": 18.307747395833335, + "learning_rate": 0.0001, + "loss": 7.2136, + "loss/crossentropy": 2.1997751981019973, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.18918296322226524, + "step": 30130 + }, + { + "epoch": 0.7535, + "grad_norm": 39.5, + "grad_norm_var": 3.0444333210866637e+18, + "learning_rate": 0.0001, + "loss": 7.332, + "loss/crossentropy": 1.9713260725140571, + "loss/hidden": 3.4015625, + "loss/jsd": 0.0, + "loss/logits": 0.19746107552200556, + "step": 30140 + }, + { + "epoch": 0.75375, + "grad_norm": 29.875, + "grad_norm_var": 3.04443332031603e+18, + "learning_rate": 0.0001, + "loss": 7.3182, + "loss/crossentropy": 2.0566159434616567, + "loss/hidden": 3.31015625, + "loss/jsd": 0.0, + "loss/logits": 0.17785473773255944, + "step": 30150 + }, + { + "epoch": 0.754, + "grad_norm": 30.75, + "grad_norm_var": 4.223372395833334, + "learning_rate": 0.0001, + "loss": 7.2001, + "loss/crossentropy": 1.8348468966782092, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.15625384994782507, + "step": 30160 + }, + { + "epoch": 0.75425, + "grad_norm": 35.25, + "grad_norm_var": 4.393489583333333, + "learning_rate": 0.0001, + "loss": 7.3332, + "loss/crossentropy": 1.963544089347124, + "loss/hidden": 3.5234375, + "loss/jsd": 0.0, + "loss/logits": 0.22075258214026688, + "step": 30170 + }, + { + "epoch": 0.7545, + "grad_norm": 29.25, + "grad_norm_var": 4.516080729166666, + "learning_rate": 0.0001, + "loss": 7.2782, + "loss/crossentropy": 1.9644573964178562, + "loss/hidden": 3.4171875, + "loss/jsd": 0.0, + "loss/logits": 0.18735937895253302, + "step": 30180 + }, + { + "epoch": 0.75475, + "grad_norm": 29.5, + "grad_norm_var": 4.713541666666667, + "learning_rate": 0.0001, + "loss": 7.2855, + "loss/crossentropy": 2.0815071538090706, + "loss/hidden": 3.3453125, + "loss/jsd": 0.0, + "loss/logits": 0.18255099412053824, + "step": 30190 + }, + { + "epoch": 0.755, + "grad_norm": 29.375, + "grad_norm_var": 7.876822916666667, + "learning_rate": 0.0001, + "loss": 7.2586, + "loss/crossentropy": 2.0835547246038915, + "loss/hidden": 3.294921875, + "loss/jsd": 0.0, + "loss/logits": 0.18153475932776927, + "step": 30200 + }, + { + "epoch": 0.75525, + "grad_norm": 44.0, + "grad_norm_var": 13.9884765625, + "learning_rate": 0.0001, + "loss": 7.3384, + "loss/crossentropy": 2.1766451790928842, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.1941084787249565, + "step": 30210 + }, + { + "epoch": 0.7555, + "grad_norm": 28.125, + "grad_norm_var": 16.192122395833334, + "learning_rate": 0.0001, + "loss": 7.3195, + "loss/crossentropy": 2.1572922460734842, + "loss/hidden": 3.2859375, + "loss/jsd": 0.0, + "loss/logits": 0.18345598923042417, + "step": 30220 + }, + { + "epoch": 0.75575, + "grad_norm": 29.375, + "grad_norm_var": 2.8363932291666667, + "learning_rate": 0.0001, + "loss": 7.1101, + "loss/crossentropy": 1.9785410642623902, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.1788899280130863, + "step": 30230 + }, + { + "epoch": 0.756, + "grad_norm": 30.75, + "grad_norm_var": 14.198958333333334, + "learning_rate": 0.0001, + "loss": 7.2872, + "loss/crossentropy": 2.032851605117321, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.19580046012997626, + "step": 30240 + }, + { + "epoch": 0.75625, + "grad_norm": 31.375, + "grad_norm_var": 17.10390625, + "learning_rate": 0.0001, + "loss": 7.112, + "loss/crossentropy": 2.0762212831526994, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.1714485607109964, + "step": 30250 + }, + { + "epoch": 0.7565, + "grad_norm": 29.5, + "grad_norm_var": 3.03125, + "learning_rate": 0.0001, + "loss": 7.2301, + "loss/crossentropy": 2.2272632330656053, + "loss/hidden": 3.43046875, + "loss/jsd": 0.0, + "loss/logits": 0.20788154676556586, + "step": 30260 + }, + { + "epoch": 0.75675, + "grad_norm": 31.625, + "grad_norm_var": 2.09140625, + "learning_rate": 0.0001, + "loss": 7.2818, + "loss/crossentropy": 2.0946265175938605, + "loss/hidden": 3.28203125, + "loss/jsd": 0.0, + "loss/logits": 0.18614709600806237, + "step": 30270 + }, + { + "epoch": 0.757, + "grad_norm": 28.375, + "grad_norm_var": 0.8514973958333333, + "learning_rate": 0.0001, + "loss": 7.1706, + "loss/crossentropy": 2.0933008804917335, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.17898790668696166, + "step": 30280 + }, + { + "epoch": 0.75725, + "grad_norm": 27.875, + "grad_norm_var": 1.0989583333333333, + "learning_rate": 0.0001, + "loss": 7.3146, + "loss/crossentropy": 2.122311710566282, + "loss/hidden": 3.312890625, + "loss/jsd": 0.0, + "loss/logits": 0.16947407890111207, + "step": 30290 + }, + { + "epoch": 0.7575, + "grad_norm": 31.625, + "grad_norm_var": 3.0152302917171804e+18, + "learning_rate": 0.0001, + "loss": 7.3609, + "loss/crossentropy": 2.206487476825714, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.19262611195445062, + "step": 30300 + }, + { + "epoch": 0.75775, + "grad_norm": 27.625, + "grad_norm_var": 3.122330729166667, + "learning_rate": 0.0001, + "loss": 7.208, + "loss/crossentropy": 2.166238710284233, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.19571597315371037, + "step": 30310 + }, + { + "epoch": 0.758, + "grad_norm": 27.75, + "grad_norm_var": 2.3692057291666666, + "learning_rate": 0.0001, + "loss": 7.3066, + "loss/crossentropy": 2.1176643908023833, + "loss/hidden": 3.2671875, + "loss/jsd": 0.0, + "loss/logits": 0.17054121531546115, + "step": 30320 + }, + { + "epoch": 0.75825, + "grad_norm": 34.0, + "grad_norm_var": 238.5572265625, + "learning_rate": 0.0001, + "loss": 7.2952, + "loss/crossentropy": 1.9073448725044728, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.1762677811086178, + "step": 30330 + }, + { + "epoch": 0.7585, + "grad_norm": 28.5, + "grad_norm_var": 231.375, + "learning_rate": 0.0001, + "loss": 7.1593, + "loss/crossentropy": 2.0104400724172593, + "loss/hidden": 3.259765625, + "loss/jsd": 0.0, + "loss/logits": 0.17894950192421674, + "step": 30340 + }, + { + "epoch": 0.75875, + "grad_norm": 31.375, + "grad_norm_var": 1.9681640625, + "learning_rate": 0.0001, + "loss": 7.2992, + "loss/crossentropy": 2.0130166873335837, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.17820241563022138, + "step": 30350 + }, + { + "epoch": 0.759, + "grad_norm": 34.75, + "grad_norm_var": 3.71640625, + "learning_rate": 0.0001, + "loss": 7.2691, + "loss/crossentropy": 2.0536349333822725, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.18395173139870166, + "step": 30360 + }, + { + "epoch": 0.75925, + "grad_norm": 29.875, + "grad_norm_var": 25.2056640625, + "learning_rate": 0.0001, + "loss": 7.2065, + "loss/crossentropy": 2.1257813230156897, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.17920450903475285, + "step": 30370 + }, + { + "epoch": 0.7595, + "grad_norm": 31.125, + "grad_norm_var": 24.911393229166666, + "learning_rate": 0.0001, + "loss": 7.2013, + "loss/crossentropy": 2.2414271771907806, + "loss/hidden": 3.2984375, + "loss/jsd": 0.0, + "loss/logits": 0.19076382871717215, + "step": 30380 + }, + { + "epoch": 0.75975, + "grad_norm": 30.75, + "grad_norm_var": 1.603125, + "learning_rate": 0.0001, + "loss": 7.163, + "loss/crossentropy": 2.077455496788025, + "loss/hidden": 3.286328125, + "loss/jsd": 0.0, + "loss/logits": 0.189267154969275, + "step": 30390 + }, + { + "epoch": 0.76, + "grad_norm": 32.75, + "grad_norm_var": 3.5791015625, + "learning_rate": 0.0001, + "loss": 7.3401, + "loss/crossentropy": 2.0604166358709337, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.1891783157363534, + "step": 30400 + }, + { + "epoch": 0.76025, + "grad_norm": 28.5, + "grad_norm_var": 4.080208333333333, + "learning_rate": 0.0001, + "loss": 7.2248, + "loss/crossentropy": 2.021010288596153, + "loss/hidden": 3.323046875, + "loss/jsd": 0.0, + "loss/logits": 0.1886049335822463, + "step": 30410 + }, + { + "epoch": 0.7605, + "grad_norm": 34.75, + "grad_norm_var": 2.6080729166666665, + "learning_rate": 0.0001, + "loss": 7.2918, + "loss/crossentropy": 2.104072753340006, + "loss/hidden": 3.34140625, + "loss/jsd": 0.0, + "loss/logits": 0.18789286576211453, + "step": 30420 + }, + { + "epoch": 0.76075, + "grad_norm": 29.375, + "grad_norm_var": 2.983268229166667, + "learning_rate": 0.0001, + "loss": 7.3593, + "loss/crossentropy": 2.077750712633133, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.18233956769108772, + "step": 30430 + }, + { + "epoch": 0.761, + "grad_norm": 29.75, + "grad_norm_var": 3.0737770858689853e+18, + "learning_rate": 0.0001, + "loss": 7.3333, + "loss/crossentropy": 2.1295972421765326, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.18910994809120893, + "step": 30440 + }, + { + "epoch": 0.76125, + "grad_norm": 32.25, + "grad_norm_var": 27.800455729166668, + "learning_rate": 0.0001, + "loss": 7.4099, + "loss/crossentropy": 2.0139816865324973, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.18354472760111093, + "step": 30450 + }, + { + "epoch": 0.7615, + "grad_norm": 29.0, + "grad_norm_var": 164.29837239583333, + "learning_rate": 0.0001, + "loss": 7.1649, + "loss/crossentropy": 2.0612717509269713, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.1756343010812998, + "step": 30460 + }, + { + "epoch": 0.76175, + "grad_norm": 28.0, + "grad_norm_var": 167.95104166666667, + "learning_rate": 0.0001, + "loss": 7.2015, + "loss/crossentropy": 2.110891255736351, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.19027273952960969, + "step": 30470 + }, + { + "epoch": 0.762, + "grad_norm": 29.0, + "grad_norm_var": 3.1747395833333334, + "learning_rate": 0.0001, + "loss": 7.277, + "loss/crossentropy": 2.1438290625810623, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.202046955563128, + "step": 30480 + }, + { + "epoch": 0.76225, + "grad_norm": 28.5, + "grad_norm_var": 22.799739583333334, + "learning_rate": 0.0001, + "loss": 7.2273, + "loss/crossentropy": 1.9993860483169557, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.18927458357065916, + "step": 30490 + }, + { + "epoch": 0.7625, + "grad_norm": 28.75, + "grad_norm_var": 27.1634765625, + "learning_rate": 0.0001, + "loss": 7.2857, + "loss/crossentropy": 2.0218796797096728, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.19677631575614213, + "step": 30500 + }, + { + "epoch": 0.76275, + "grad_norm": 30.0, + "grad_norm_var": 31.164583333333333, + "learning_rate": 0.0001, + "loss": 7.2166, + "loss/crossentropy": 2.013028331845999, + "loss/hidden": 3.326171875, + "loss/jsd": 0.0, + "loss/logits": 0.18638179991394282, + "step": 30510 + }, + { + "epoch": 0.763, + "grad_norm": 30.875, + "grad_norm_var": 15.534309895833333, + "learning_rate": 0.0001, + "loss": 7.2954, + "loss/crossentropy": 2.013714115321636, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.1986254785209894, + "step": 30520 + }, + { + "epoch": 0.76325, + "grad_norm": 36.75, + "grad_norm_var": 13.345572916666667, + "learning_rate": 0.0001, + "loss": 7.2507, + "loss/crossentropy": 2.0372176095843315, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.18047925494611264, + "step": 30530 + }, + { + "epoch": 0.7635, + "grad_norm": 29.5, + "grad_norm_var": 8.724739583333333, + "learning_rate": 0.0001, + "loss": 7.1431, + "loss/crossentropy": 1.8315919287502767, + "loss/hidden": 3.298828125, + "loss/jsd": 0.0, + "loss/logits": 0.16889077816158532, + "step": 30540 + }, + { + "epoch": 0.76375, + "grad_norm": 30.0, + "grad_norm_var": 6.551822916666667, + "learning_rate": 0.0001, + "loss": 7.3346, + "loss/crossentropy": 2.0934281766414644, + "loss/hidden": 3.321484375, + "loss/jsd": 0.0, + "loss/logits": 0.18256967328488827, + "step": 30550 + }, + { + "epoch": 0.764, + "grad_norm": 35.5, + "grad_norm_var": 10.449739583333333, + "learning_rate": 0.0001, + "loss": 7.1819, + "loss/crossentropy": 1.9229003690183162, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.17857125196605922, + "step": 30560 + }, + { + "epoch": 0.76425, + "grad_norm": 30.125, + "grad_norm_var": 3.8202487216409324e+18, + "learning_rate": 0.0001, + "loss": 7.2603, + "loss/crossentropy": 1.8773075222969056, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.182831759005785, + "step": 30570 + }, + { + "epoch": 0.7645, + "grad_norm": 30.25, + "grad_norm_var": 3.820248721478054e+18, + "learning_rate": 0.0001, + "loss": 7.2925, + "loss/crossentropy": 2.222408211231232, + "loss/hidden": 3.37109375, + "loss/jsd": 0.0, + "loss/logits": 0.1848822958767414, + "step": 30580 + }, + { + "epoch": 0.76475, + "grad_norm": 30.5, + "grad_norm_var": 6.410872395833334, + "learning_rate": 0.0001, + "loss": 7.1576, + "loss/crossentropy": 2.0349696803838015, + "loss/hidden": 3.259375, + "loss/jsd": 0.0, + "loss/logits": 0.17177348136901854, + "step": 30590 + }, + { + "epoch": 0.765, + "grad_norm": 37.5, + "grad_norm_var": 7.037239583333333, + "learning_rate": 0.0001, + "loss": 7.2271, + "loss/crossentropy": 2.164412271976471, + "loss/hidden": 3.22734375, + "loss/jsd": 0.0, + "loss/logits": 0.17602096050977706, + "step": 30600 + }, + { + "epoch": 0.76525, + "grad_norm": 32.75, + "grad_norm_var": 8.624739583333334, + "learning_rate": 0.0001, + "loss": 7.301, + "loss/crossentropy": 2.0298607796430588, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.1785941503942013, + "step": 30610 + }, + { + "epoch": 0.7655, + "grad_norm": 34.5, + "grad_norm_var": 4.123958333333333, + "learning_rate": 0.0001, + "loss": 7.3098, + "loss/crossentropy": 2.2510849207639696, + "loss/hidden": 3.316796875, + "loss/jsd": 0.0, + "loss/logits": 0.18415474183857442, + "step": 30620 + }, + { + "epoch": 0.76575, + "grad_norm": 29.375, + "grad_norm_var": 4.39140625, + "learning_rate": 0.0001, + "loss": 7.2049, + "loss/crossentropy": 2.003419761359692, + "loss/hidden": 3.2765625, + "loss/jsd": 0.0, + "loss/logits": 0.18635775558650494, + "step": 30630 + }, + { + "epoch": 0.766, + "grad_norm": 31.375, + "grad_norm_var": 2.545768229166667, + "learning_rate": 0.0001, + "loss": 7.3128, + "loss/crossentropy": 2.138288116455078, + "loss/hidden": 3.29140625, + "loss/jsd": 0.0, + "loss/logits": 0.18150971587747336, + "step": 30640 + }, + { + "epoch": 0.76625, + "grad_norm": 27.875, + "grad_norm_var": 2.600455729166667, + "learning_rate": 0.0001, + "loss": 7.2306, + "loss/crossentropy": 2.03582369312644, + "loss/hidden": 3.309765625, + "loss/jsd": 0.0, + "loss/logits": 0.19014431536197662, + "step": 30650 + }, + { + "epoch": 0.7665, + "grad_norm": 27.5, + "grad_norm_var": 5.403059895833334, + "learning_rate": 0.0001, + "loss": 7.0785, + "loss/crossentropy": 2.023620304465294, + "loss/hidden": 3.297265625, + "loss/jsd": 0.0, + "loss/logits": 0.17827179692685605, + "step": 30660 + }, + { + "epoch": 0.76675, + "grad_norm": 31.0, + "grad_norm_var": 5.3603515625, + "learning_rate": 0.0001, + "loss": 7.2912, + "loss/crossentropy": 2.2133360475301744, + "loss/hidden": 3.422265625, + "loss/jsd": 0.0, + "loss/logits": 0.19060195097699761, + "step": 30670 + }, + { + "epoch": 0.767, + "grad_norm": 29.75, + "grad_norm_var": 4.738997395833334, + "learning_rate": 0.0001, + "loss": 7.1324, + "loss/crossentropy": 2.0448534056544303, + "loss/hidden": 3.3390625, + "loss/jsd": 0.0, + "loss/logits": 0.1814136505126953, + "step": 30680 + }, + { + "epoch": 0.76725, + "grad_norm": 29.5, + "grad_norm_var": 3.6879557291666667, + "learning_rate": 0.0001, + "loss": 7.2882, + "loss/crossentropy": 2.0667665399610997, + "loss/hidden": 3.218359375, + "loss/jsd": 0.0, + "loss/logits": 0.17576975822448732, + "step": 30690 + }, + { + "epoch": 0.7675, + "grad_norm": 30.375, + "grad_norm_var": 2.371809895833333, + "learning_rate": 0.0001, + "loss": 7.1785, + "loss/crossentropy": 1.9460737973451614, + "loss/hidden": 3.28671875, + "loss/jsd": 0.0, + "loss/logits": 0.16462803408503532, + "step": 30700 + }, + { + "epoch": 0.76775, + "grad_norm": 31.375, + "grad_norm_var": 2.758072916666667, + "learning_rate": 0.0001, + "loss": 7.3006, + "loss/crossentropy": 2.1625903218984606, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.19786368366330861, + "step": 30710 + }, + { + "epoch": 0.768, + "grad_norm": 31.625, + "grad_norm_var": 4.087955729166667, + "learning_rate": 0.0001, + "loss": 7.2984, + "loss/crossentropy": 2.115101757645607, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.1990343313664198, + "step": 30720 + }, + { + "epoch": 0.76825, + "grad_norm": 30.0, + "grad_norm_var": 31.29375, + "learning_rate": 0.0001, + "loss": 7.3308, + "loss/crossentropy": 2.1543532446026803, + "loss/hidden": 3.354296875, + "loss/jsd": 0.0, + "loss/logits": 0.18993538860231637, + "step": 30730 + }, + { + "epoch": 0.7685, + "grad_norm": 31.625, + "grad_norm_var": 13.255208333333334, + "learning_rate": 0.0001, + "loss": 7.3246, + "loss/crossentropy": 2.147171225398779, + "loss/hidden": 3.284765625, + "loss/jsd": 0.0, + "loss/logits": 0.18650086652487516, + "step": 30740 + }, + { + "epoch": 0.76875, + "grad_norm": 31.75, + "grad_norm_var": 13.110872395833333, + "learning_rate": 0.0001, + "loss": 7.3292, + "loss/crossentropy": 2.050896894186735, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.19013228435069324, + "step": 30750 + }, + { + "epoch": 0.769, + "grad_norm": 31.625, + "grad_norm_var": 42.296809895833334, + "learning_rate": 0.0001, + "loss": 7.2417, + "loss/crossentropy": 1.9304289899766445, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.18646475402638316, + "step": 30760 + }, + { + "epoch": 0.76925, + "grad_norm": 31.5, + "grad_norm_var": 4.1150390625, + "learning_rate": 0.0001, + "loss": 7.2404, + "loss/crossentropy": 2.0934823572635652, + "loss/hidden": 3.294140625, + "loss/jsd": 0.0, + "loss/logits": 0.19288243278861045, + "step": 30770 + }, + { + "epoch": 0.7695, + "grad_norm": 28.75, + "grad_norm_var": 3.3622395833333334, + "learning_rate": 0.0001, + "loss": 7.1472, + "loss/crossentropy": 2.119046524167061, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.19379116278141736, + "step": 30780 + }, + { + "epoch": 0.76975, + "grad_norm": 28.125, + "grad_norm_var": 3.0747395833333333, + "learning_rate": 0.0001, + "loss": 7.228, + "loss/crossentropy": 1.8516628809273243, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.17867155242711305, + "step": 30790 + }, + { + "epoch": 0.77, + "grad_norm": 30.625, + "grad_norm_var": 2.3854166666666665, + "learning_rate": 0.0001, + "loss": 7.2606, + "loss/crossentropy": 2.140632301568985, + "loss/hidden": 3.315625, + "loss/jsd": 0.0, + "loss/logits": 0.17682827562093734, + "step": 30800 + }, + { + "epoch": 0.77025, + "grad_norm": 31.625, + "grad_norm_var": 1.33515625, + "learning_rate": 0.0001, + "loss": 7.1309, + "loss/crossentropy": 2.063164585828781, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.18230299968272448, + "step": 30810 + }, + { + "epoch": 0.7705, + "grad_norm": 30.375, + "grad_norm_var": 2.8103515625, + "learning_rate": 0.0001, + "loss": 7.2604, + "loss/crossentropy": 2.110166022181511, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.18783345203846694, + "step": 30820 + }, + { + "epoch": 0.77075, + "grad_norm": 29.625, + "grad_norm_var": 3.004166666666667, + "learning_rate": 0.0001, + "loss": 7.298, + "loss/crossentropy": 2.1085711009800434, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.2007377685047686, + "step": 30830 + }, + { + "epoch": 0.771, + "grad_norm": 32.25, + "grad_norm_var": 2.7343098958333334, + "learning_rate": 0.0001, + "loss": 7.2494, + "loss/crossentropy": 1.923082959651947, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.17613047789782285, + "step": 30840 + }, + { + "epoch": 0.77125, + "grad_norm": 30.25, + "grad_norm_var": 2.986167998798452e+18, + "learning_rate": 0.0001, + "loss": 7.245, + "loss/crossentropy": 2.139000003039837, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.18553725760430098, + "step": 30850 + }, + { + "epoch": 0.7715, + "grad_norm": 31.75, + "grad_norm_var": 6.801041666666666, + "learning_rate": 0.0001, + "loss": 7.2407, + "loss/crossentropy": 1.9860042557120323, + "loss/hidden": 3.293359375, + "loss/jsd": 0.0, + "loss/logits": 0.16922438573092222, + "step": 30860 + }, + { + "epoch": 0.77175, + "grad_norm": 30.0, + "grad_norm_var": 2.1455729166666666, + "learning_rate": 0.0001, + "loss": 7.3205, + "loss/crossentropy": 2.2089149236679075, + "loss/hidden": 3.340625, + "loss/jsd": 0.0, + "loss/logits": 0.19413873814046384, + "step": 30870 + }, + { + "epoch": 0.772, + "grad_norm": 26.125, + "grad_norm_var": 3.0122395833333333, + "learning_rate": 0.0001, + "loss": 7.2108, + "loss/crossentropy": 2.0838850632309915, + "loss/hidden": 3.22265625, + "loss/jsd": 0.0, + "loss/logits": 0.1679495433345437, + "step": 30880 + }, + { + "epoch": 0.77225, + "grad_norm": 31.0, + "grad_norm_var": 4.1375, + "learning_rate": 0.0001, + "loss": 7.1952, + "loss/crossentropy": 2.0687290251255037, + "loss/hidden": 3.387890625, + "loss/jsd": 0.0, + "loss/logits": 0.17775915022939442, + "step": 30890 + }, + { + "epoch": 0.7725, + "grad_norm": 28.0, + "grad_norm_var": 4.011393229166667, + "learning_rate": 0.0001, + "loss": 7.2563, + "loss/crossentropy": 1.9918573498725891, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.17511531990021467, + "step": 30900 + }, + { + "epoch": 0.77275, + "grad_norm": 30.0, + "grad_norm_var": 2.5603515625, + "learning_rate": 0.0001, + "loss": 7.2497, + "loss/crossentropy": 1.9922025822103024, + "loss/hidden": 3.43203125, + "loss/jsd": 0.0, + "loss/logits": 0.18127510212361814, + "step": 30910 + }, + { + "epoch": 0.773, + "grad_norm": 29.5, + "grad_norm_var": 1.7614583333333333, + "learning_rate": 0.0001, + "loss": 7.2854, + "loss/crossentropy": 2.0986859157681463, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.1825839588418603, + "step": 30920 + }, + { + "epoch": 0.77325, + "grad_norm": 32.75, + "grad_norm_var": 5.251822916666667, + "learning_rate": 0.0001, + "loss": 7.2091, + "loss/crossentropy": 1.9582366243004798, + "loss/hidden": 3.43125, + "loss/jsd": 0.0, + "loss/logits": 0.1809095237404108, + "step": 30930 + }, + { + "epoch": 0.7735, + "grad_norm": 31.5, + "grad_norm_var": 2.8372395833333335, + "learning_rate": 0.0001, + "loss": 7.2826, + "loss/crossentropy": 1.9070976704359055, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.1710755567997694, + "step": 30940 + }, + { + "epoch": 0.77375, + "grad_norm": 29.0, + "grad_norm_var": 3.6020833333333333, + "learning_rate": 0.0001, + "loss": 7.2084, + "loss/crossentropy": 2.0629549741744997, + "loss/hidden": 3.369921875, + "loss/jsd": 0.0, + "loss/logits": 0.18181780502200126, + "step": 30950 + }, + { + "epoch": 0.774, + "grad_norm": 33.5, + "grad_norm_var": 4.3853515625, + "learning_rate": 0.0001, + "loss": 7.1774, + "loss/crossentropy": 2.1738601446151735, + "loss/hidden": 3.35, + "loss/jsd": 0.0, + "loss/logits": 0.2032660385593772, + "step": 30960 + }, + { + "epoch": 0.77425, + "grad_norm": 32.25, + "grad_norm_var": 2.38515625, + "learning_rate": 0.0001, + "loss": 7.317, + "loss/crossentropy": 2.079986987262964, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.19429001584649086, + "step": 30970 + }, + { + "epoch": 0.7745, + "grad_norm": 31.125, + "grad_norm_var": 4.668684895833334, + "learning_rate": 0.0001, + "loss": 7.1493, + "loss/crossentropy": 2.0519056364893915, + "loss/hidden": 3.22265625, + "loss/jsd": 0.0, + "loss/logits": 0.16815223284065722, + "step": 30980 + }, + { + "epoch": 0.77475, + "grad_norm": 28.375, + "grad_norm_var": 5.291080729166667, + "learning_rate": 0.0001, + "loss": 7.2312, + "loss/crossentropy": 2.052493926882744, + "loss/hidden": 3.391796875, + "loss/jsd": 0.0, + "loss/logits": 0.18330379351973533, + "step": 30990 + }, + { + "epoch": 0.775, + "grad_norm": 29.875, + "grad_norm_var": 2.12265625, + "learning_rate": 0.0001, + "loss": 7.2823, + "loss/crossentropy": 2.086025407910347, + "loss/hidden": 3.294921875, + "loss/jsd": 0.0, + "loss/logits": 0.18335492983460427, + "step": 31000 + }, + { + "epoch": 0.77525, + "grad_norm": 29.0, + "grad_norm_var": 5.005989583333333, + "learning_rate": 0.0001, + "loss": 7.2367, + "loss/crossentropy": 2.048542061448097, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.18725468441843987, + "step": 31010 + }, + { + "epoch": 0.7755, + "grad_norm": 28.75, + "grad_norm_var": 2.0184895833333334, + "learning_rate": 0.0001, + "loss": 7.3251, + "loss/crossentropy": 2.036131452769041, + "loss/hidden": 3.347265625, + "loss/jsd": 0.0, + "loss/logits": 0.1737860631197691, + "step": 31020 + }, + { + "epoch": 0.77575, + "grad_norm": 31.75, + "grad_norm_var": 1.3999348958333333, + "learning_rate": 0.0001, + "loss": 7.262, + "loss/crossentropy": 2.017168144881725, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.167899782769382, + "step": 31030 + }, + { + "epoch": 0.776, + "grad_norm": 32.25, + "grad_norm_var": 3.342643229166667, + "learning_rate": 0.0001, + "loss": 7.1701, + "loss/crossentropy": 2.019613781571388, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.19059567637741565, + "step": 31040 + }, + { + "epoch": 0.77625, + "grad_norm": 31.125, + "grad_norm_var": 3.4270833333333335, + "learning_rate": 0.0001, + "loss": 7.2972, + "loss/crossentropy": 2.0828250303864477, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.18261822033673525, + "step": 31050 + }, + { + "epoch": 0.7765, + "grad_norm": 29.75, + "grad_norm_var": 2.8910807291666667, + "learning_rate": 0.0001, + "loss": 7.326, + "loss/crossentropy": 2.0209526136517524, + "loss/hidden": 3.322265625, + "loss/jsd": 0.0, + "loss/logits": 0.17059254441410304, + "step": 31060 + }, + { + "epoch": 0.77675, + "grad_norm": 28.75, + "grad_norm_var": 2.356705729166667, + "learning_rate": 0.0001, + "loss": 7.2019, + "loss/crossentropy": 2.1125204801559447, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.17937722243368626, + "step": 31070 + }, + { + "epoch": 0.777, + "grad_norm": 33.5, + "grad_norm_var": 2.4181640625, + "learning_rate": 0.0001, + "loss": 7.1838, + "loss/crossentropy": 2.2028726637363434, + "loss/hidden": 3.256640625, + "loss/jsd": 0.0, + "loss/logits": 0.17989998534321786, + "step": 31080 + }, + { + "epoch": 0.77725, + "grad_norm": 30.75, + "grad_norm_var": 2.671875, + "learning_rate": 0.0001, + "loss": 7.1709, + "loss/crossentropy": 2.1625898659229277, + "loss/hidden": 3.390625, + "loss/jsd": 0.0, + "loss/logits": 0.19090434685349464, + "step": 31090 + }, + { + "epoch": 0.7775, + "grad_norm": 28.875, + "grad_norm_var": 2.36640625, + "learning_rate": 0.0001, + "loss": 7.1326, + "loss/crossentropy": 2.036789299547672, + "loss/hidden": 3.3421875, + "loss/jsd": 0.0, + "loss/logits": 0.17714376896619796, + "step": 31100 + }, + { + "epoch": 0.77775, + "grad_norm": 31.75, + "grad_norm_var": 11.40390625, + "learning_rate": 0.0001, + "loss": 7.1735, + "loss/crossentropy": 1.9948045410215856, + "loss/hidden": 3.24921875, + "loss/jsd": 0.0, + "loss/logits": 0.1798159122467041, + "step": 31110 + }, + { + "epoch": 0.778, + "grad_norm": 34.5, + "grad_norm_var": 4.034309895833333, + "learning_rate": 0.0001, + "loss": 7.2609, + "loss/crossentropy": 2.1702212870121, + "loss/hidden": 3.223828125, + "loss/jsd": 0.0, + "loss/logits": 0.17852376736700534, + "step": 31120 + }, + { + "epoch": 0.77825, + "grad_norm": 31.625, + "grad_norm_var": 4.676822916666667, + "learning_rate": 0.0001, + "loss": 7.3185, + "loss/crossentropy": 2.0733341470360758, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.18980380473658442, + "step": 31130 + }, + { + "epoch": 0.7785, + "grad_norm": 31.25, + "grad_norm_var": 4.430143229166666, + "learning_rate": 0.0001, + "loss": 7.1541, + "loss/crossentropy": 1.9170797616243362, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.1943414082750678, + "step": 31140 + }, + { + "epoch": 0.77875, + "grad_norm": 29.125, + "grad_norm_var": 14.3228515625, + "learning_rate": 0.0001, + "loss": 7.0941, + "loss/crossentropy": 2.0785103261470796, + "loss/hidden": 3.370703125, + "loss/jsd": 0.0, + "loss/logits": 0.2052849479019642, + "step": 31150 + }, + { + "epoch": 0.779, + "grad_norm": 28.625, + "grad_norm_var": 4.9869140625, + "learning_rate": 0.0001, + "loss": 7.3521, + "loss/crossentropy": 2.0296490490436554, + "loss/hidden": 3.391796875, + "loss/jsd": 0.0, + "loss/logits": 0.1873609971255064, + "step": 31160 + }, + { + "epoch": 0.77925, + "grad_norm": 29.75, + "grad_norm_var": 3.9150390625, + "learning_rate": 0.0001, + "loss": 7.2165, + "loss/crossentropy": 2.070235808193684, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.18882376216351987, + "step": 31170 + }, + { + "epoch": 0.7795, + "grad_norm": 28.0, + "grad_norm_var": 4.44765625, + "learning_rate": 0.0001, + "loss": 7.2705, + "loss/crossentropy": 1.8607801221311093, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.16972522204741836, + "step": 31180 + }, + { + "epoch": 0.77975, + "grad_norm": 32.5, + "grad_norm_var": 5.82265625, + "learning_rate": 0.0001, + "loss": 7.3687, + "loss/crossentropy": 2.151284599304199, + "loss/hidden": 3.364453125, + "loss/jsd": 0.0, + "loss/logits": 0.1864160757511854, + "step": 31190 + }, + { + "epoch": 0.78, + "grad_norm": 31.25, + "grad_norm_var": 3.4530598958333334, + "learning_rate": 0.0001, + "loss": 7.2288, + "loss/crossentropy": 2.014324263483286, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.18028491344302894, + "step": 31200 + }, + { + "epoch": 0.78025, + "grad_norm": 33.25, + "grad_norm_var": 2.230208333333333, + "learning_rate": 0.0001, + "loss": 7.2867, + "loss/crossentropy": 2.024941012263298, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.1874180819839239, + "step": 31210 + }, + { + "epoch": 0.7805, + "grad_norm": 30.25, + "grad_norm_var": 2.124739583333333, + "learning_rate": 0.0001, + "loss": 7.3255, + "loss/crossentropy": 2.0621088325977324, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.18965848479419947, + "step": 31220 + }, + { + "epoch": 0.78075, + "grad_norm": 32.0, + "grad_norm_var": 2.1369140625, + "learning_rate": 0.0001, + "loss": 7.1658, + "loss/crossentropy": 2.0059763975441456, + "loss/hidden": 3.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.20813378430902957, + "step": 31230 + }, + { + "epoch": 0.781, + "grad_norm": 32.25, + "grad_norm_var": 3.903125, + "learning_rate": 0.0001, + "loss": 7.3986, + "loss/crossentropy": 2.168729566037655, + "loss/hidden": 3.34296875, + "loss/jsd": 0.0, + "loss/logits": 0.19630307257175444, + "step": 31240 + }, + { + "epoch": 0.78125, + "grad_norm": 30.375, + "grad_norm_var": 1.8056640625, + "learning_rate": 0.0001, + "loss": 7.3344, + "loss/crossentropy": 2.1517349526286127, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.18015162888914346, + "step": 31250 + }, + { + "epoch": 0.7815, + "grad_norm": 31.625, + "grad_norm_var": 14.411458333333334, + "learning_rate": 0.0001, + "loss": 7.2129, + "loss/crossentropy": 2.06531667560339, + "loss/hidden": 3.431640625, + "loss/jsd": 0.0, + "loss/logits": 0.19317530654370785, + "step": 31260 + }, + { + "epoch": 0.78175, + "grad_norm": 38.25, + "grad_norm_var": 7.57265625, + "learning_rate": 0.0001, + "loss": 7.3526, + "loss/crossentropy": 2.1732255190610887, + "loss/hidden": 3.3203125, + "loss/jsd": 0.0, + "loss/logits": 0.19884103927761315, + "step": 31270 + }, + { + "epoch": 0.782, + "grad_norm": 30.5, + "grad_norm_var": 8.1947265625, + "learning_rate": 0.0001, + "loss": 7.2598, + "loss/crossentropy": 1.9770694464445113, + "loss/hidden": 3.264453125, + "loss/jsd": 0.0, + "loss/logits": 0.18086720816791058, + "step": 31280 + }, + { + "epoch": 0.78225, + "grad_norm": 30.5, + "grad_norm_var": 2.708268229166667, + "learning_rate": 0.0001, + "loss": 7.3301, + "loss/crossentropy": 2.0416681550443174, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.17893482595682145, + "step": 31290 + }, + { + "epoch": 0.7825, + "grad_norm": 32.75, + "grad_norm_var": 8.041666666666666, + "learning_rate": 0.0001, + "loss": 7.2272, + "loss/crossentropy": 1.9394095972180367, + "loss/hidden": 3.362890625, + "loss/jsd": 0.0, + "loss/logits": 0.1878746159374714, + "step": 31300 + }, + { + "epoch": 0.78275, + "grad_norm": 36.5, + "grad_norm_var": 8.885872395833333, + "learning_rate": 0.0001, + "loss": 7.2945, + "loss/crossentropy": 2.1827884405851363, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.1965128371492028, + "step": 31310 + }, + { + "epoch": 0.783, + "grad_norm": 29.625, + "grad_norm_var": 4.603125, + "learning_rate": 0.0001, + "loss": 7.2984, + "loss/crossentropy": 2.0775409430265426, + "loss/hidden": 3.24140625, + "loss/jsd": 0.0, + "loss/logits": 0.17661921344697476, + "step": 31320 + }, + { + "epoch": 0.78325, + "grad_norm": 30.625, + "grad_norm_var": 8.294205729166666, + "learning_rate": 0.0001, + "loss": 7.1817, + "loss/crossentropy": 2.049132463335991, + "loss/hidden": 3.316796875, + "loss/jsd": 0.0, + "loss/logits": 0.1787800071761012, + "step": 31330 + }, + { + "epoch": 0.7835, + "grad_norm": 32.0, + "grad_norm_var": 4.706184895833333, + "learning_rate": 0.0001, + "loss": 7.3231, + "loss/crossentropy": 2.022278678417206, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.19319646768271923, + "step": 31340 + }, + { + "epoch": 0.78375, + "grad_norm": 28.875, + "grad_norm_var": 3.845572916666667, + "learning_rate": 0.0001, + "loss": 7.3237, + "loss/crossentropy": 1.9787674874067307, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.18886952772736548, + "step": 31350 + }, + { + "epoch": 0.784, + "grad_norm": 29.5, + "grad_norm_var": 4.2744140625, + "learning_rate": 0.0001, + "loss": 7.3666, + "loss/crossentropy": 2.1626318007707597, + "loss/hidden": 3.330078125, + "loss/jsd": 0.0, + "loss/logits": 0.19823330659419297, + "step": 31360 + }, + { + "epoch": 0.78425, + "grad_norm": 27.75, + "grad_norm_var": 4.5025390625, + "learning_rate": 0.0001, + "loss": 7.2335, + "loss/crossentropy": 2.2147083409130572, + "loss/hidden": 3.284765625, + "loss/jsd": 0.0, + "loss/logits": 0.18974209930747749, + "step": 31370 + }, + { + "epoch": 0.7845, + "grad_norm": 29.75, + "grad_norm_var": 5.703059895833333, + "learning_rate": 0.0001, + "loss": 7.2754, + "loss/crossentropy": 1.9638444483280182, + "loss/hidden": 3.210546875, + "loss/jsd": 0.0, + "loss/logits": 0.1664100494235754, + "step": 31380 + }, + { + "epoch": 0.78475, + "grad_norm": 28.625, + "grad_norm_var": 5.27265625, + "learning_rate": 0.0001, + "loss": 7.2791, + "loss/crossentropy": 2.1053778350353243, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.18106228299438953, + "step": 31390 + }, + { + "epoch": 0.785, + "grad_norm": 30.25, + "grad_norm_var": 56.23515625, + "learning_rate": 0.0001, + "loss": 7.3181, + "loss/crossentropy": 2.079754628241062, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.19201560821384192, + "step": 31400 + }, + { + "epoch": 0.78525, + "grad_norm": 30.5, + "grad_norm_var": 31.951041666666665, + "learning_rate": 0.0001, + "loss": 7.2104, + "loss/crossentropy": 2.1266677528619766, + "loss/hidden": 3.327734375, + "loss/jsd": 0.0, + "loss/logits": 0.18508619628846645, + "step": 31410 + }, + { + "epoch": 0.7855, + "grad_norm": 32.0, + "grad_norm_var": 35.484309895833334, + "learning_rate": 0.0001, + "loss": 7.0706, + "loss/crossentropy": 1.9031139247119426, + "loss/hidden": 3.255078125, + "loss/jsd": 0.0, + "loss/logits": 0.16362121552228928, + "step": 31420 + }, + { + "epoch": 0.78575, + "grad_norm": 31.125, + "grad_norm_var": 10.720833333333333, + "learning_rate": 0.0001, + "loss": 7.1675, + "loss/crossentropy": 2.0395915627479555, + "loss/hidden": 3.315625, + "loss/jsd": 0.0, + "loss/logits": 0.17914918782189487, + "step": 31430 + }, + { + "epoch": 0.786, + "grad_norm": 27.375, + "grad_norm_var": 15.347330729166666, + "learning_rate": 0.0001, + "loss": 7.2693, + "loss/crossentropy": 2.0384490221738814, + "loss/hidden": 3.349609375, + "loss/jsd": 0.0, + "loss/logits": 0.17985808495432137, + "step": 31440 + }, + { + "epoch": 0.78625, + "grad_norm": 33.0, + "grad_norm_var": 10.926822916666667, + "learning_rate": 0.0001, + "loss": 7.1775, + "loss/crossentropy": 2.0808909833431244, + "loss/hidden": 3.24609375, + "loss/jsd": 0.0, + "loss/logits": 0.17647454431280493, + "step": 31450 + }, + { + "epoch": 0.7865, + "grad_norm": 31.875, + "grad_norm_var": 3.29375, + "learning_rate": 0.0001, + "loss": 7.2395, + "loss/crossentropy": 2.2060162991285326, + "loss/hidden": 3.312109375, + "loss/jsd": 0.0, + "loss/logits": 0.18310822043567895, + "step": 31460 + }, + { + "epoch": 0.78675, + "grad_norm": 28.75, + "grad_norm_var": 372.24140625, + "learning_rate": 0.0001, + "loss": 7.2611, + "loss/crossentropy": 1.9570839531719684, + "loss/hidden": 3.308203125, + "loss/jsd": 0.0, + "loss/logits": 0.1838856378570199, + "step": 31470 + }, + { + "epoch": 0.787, + "grad_norm": 28.375, + "grad_norm_var": 5.662955729166667, + "learning_rate": 0.0001, + "loss": 7.2132, + "loss/crossentropy": 1.8940647289156913, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.18987012309953571, + "step": 31480 + }, + { + "epoch": 0.78725, + "grad_norm": 28.875, + "grad_norm_var": 3.489518229166667, + "learning_rate": 0.0001, + "loss": 7.2335, + "loss/crossentropy": 2.2421848446130754, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.18429996352642775, + "step": 31490 + }, + { + "epoch": 0.7875, + "grad_norm": 29.625, + "grad_norm_var": 4.330989583333333, + "learning_rate": 0.0001, + "loss": 7.3104, + "loss/crossentropy": 2.0037948578596114, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.18187961634248495, + "step": 31500 + }, + { + "epoch": 0.78775, + "grad_norm": 29.375, + "grad_norm_var": 7.914583333333334, + "learning_rate": 0.0001, + "loss": 7.2202, + "loss/crossentropy": 1.9032418720424176, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.1838343466632068, + "step": 31510 + }, + { + "epoch": 0.788, + "grad_norm": 27.375, + "grad_norm_var": 4.1603515625, + "learning_rate": 0.0001, + "loss": 7.2262, + "loss/crossentropy": 2.0772057265043258, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.19711360968649388, + "step": 31520 + }, + { + "epoch": 0.78825, + "grad_norm": 35.5, + "grad_norm_var": 9.198372395833333, + "learning_rate": 0.0001, + "loss": 7.3303, + "loss/crossentropy": 2.2158702582120897, + "loss/hidden": 3.263671875, + "loss/jsd": 0.0, + "loss/logits": 0.18437313195317984, + "step": 31530 + }, + { + "epoch": 0.7885, + "grad_norm": 35.75, + "grad_norm_var": 7.027083333333334, + "learning_rate": 0.0001, + "loss": 7.302, + "loss/crossentropy": 2.0729110084474085, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.19737910926342012, + "step": 31540 + }, + { + "epoch": 0.78875, + "grad_norm": 32.25, + "grad_norm_var": 6.404166666666667, + "learning_rate": 0.0001, + "loss": 7.2099, + "loss/crossentropy": 1.9750752970576286, + "loss/hidden": 3.21640625, + "loss/jsd": 0.0, + "loss/logits": 0.17290026945993303, + "step": 31550 + }, + { + "epoch": 0.789, + "grad_norm": 29.75, + "grad_norm_var": 3.4629557291666666, + "learning_rate": 0.0001, + "loss": 7.2183, + "loss/crossentropy": 2.0953447185456753, + "loss/hidden": 3.398828125, + "loss/jsd": 0.0, + "loss/logits": 0.1804077338427305, + "step": 31560 + }, + { + "epoch": 0.78925, + "grad_norm": 31.375, + "grad_norm_var": 3.66015625, + "learning_rate": 0.0001, + "loss": 7.2443, + "loss/crossentropy": 1.9566784024238586, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.18815564569085835, + "step": 31570 + }, + { + "epoch": 0.7895, + "grad_norm": 33.75, + "grad_norm_var": 9.175, + "learning_rate": 0.0001, + "loss": 7.3692, + "loss/crossentropy": 2.2338188737630844, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.18873065207153558, + "step": 31580 + }, + { + "epoch": 0.78975, + "grad_norm": 29.25, + "grad_norm_var": 9.987239583333333, + "learning_rate": 0.0001, + "loss": 7.2562, + "loss/crossentropy": 2.080914391577244, + "loss/hidden": 3.29296875, + "loss/jsd": 0.0, + "loss/logits": 0.17150079626590015, + "step": 31590 + }, + { + "epoch": 0.79, + "grad_norm": 29.625, + "grad_norm_var": 2.3212890625, + "learning_rate": 0.0001, + "loss": 7.2378, + "loss/crossentropy": 2.064169317483902, + "loss/hidden": 3.45390625, + "loss/jsd": 0.0, + "loss/logits": 0.20276217330247165, + "step": 31600 + }, + { + "epoch": 0.79025, + "grad_norm": 32.0, + "grad_norm_var": 2.957291666666667, + "learning_rate": 0.0001, + "loss": 7.3602, + "loss/crossentropy": 2.1499217487871647, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.18527807723730802, + "step": 31610 + }, + { + "epoch": 0.7905, + "grad_norm": 29.875, + "grad_norm_var": 3.027083333333333, + "learning_rate": 0.0001, + "loss": 7.3215, + "loss/crossentropy": 2.1228356897830962, + "loss/hidden": 3.25703125, + "loss/jsd": 0.0, + "loss/logits": 0.17830363363027574, + "step": 31620 + }, + { + "epoch": 0.79075, + "grad_norm": 30.75, + "grad_norm_var": 3.4160807291666666, + "learning_rate": 0.0001, + "loss": 7.2688, + "loss/crossentropy": 2.0881614208221437, + "loss/hidden": 3.258984375, + "loss/jsd": 0.0, + "loss/logits": 0.17270825430750847, + "step": 31630 + }, + { + "epoch": 0.791, + "grad_norm": 28.625, + "grad_norm_var": 3.7525390625, + "learning_rate": 0.0001, + "loss": 7.2279, + "loss/crossentropy": 1.9683130264282227, + "loss/hidden": 3.426953125, + "loss/jsd": 0.0, + "loss/logits": 0.17459874097257852, + "step": 31640 + }, + { + "epoch": 0.79125, + "grad_norm": 29.0, + "grad_norm_var": 3.24375, + "learning_rate": 0.0001, + "loss": 7.2418, + "loss/crossentropy": 1.9514197140932084, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.18445694111287594, + "step": 31650 + }, + { + "epoch": 0.7915, + "grad_norm": 32.5, + "grad_norm_var": 4.755143229166666, + "learning_rate": 0.0001, + "loss": 7.1507, + "loss/crossentropy": 2.079793556034565, + "loss/hidden": 3.2609375, + "loss/jsd": 0.0, + "loss/logits": 0.17297195587307215, + "step": 31660 + }, + { + "epoch": 0.79175, + "grad_norm": 33.0, + "grad_norm_var": 4.960872395833333, + "learning_rate": 0.0001, + "loss": 7.4178, + "loss/crossentropy": 2.0618962317705156, + "loss/hidden": 3.32421875, + "loss/jsd": 0.0, + "loss/logits": 0.19019667170941829, + "step": 31670 + }, + { + "epoch": 0.792, + "grad_norm": 30.625, + "grad_norm_var": 5.7509765625, + "learning_rate": 0.0001, + "loss": 7.2239, + "loss/crossentropy": 1.9956995896995067, + "loss/hidden": 3.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.17896303534507751, + "step": 31680 + }, + { + "epoch": 0.79225, + "grad_norm": 30.0, + "grad_norm_var": 3.070572916666667, + "learning_rate": 0.0001, + "loss": 7.1065, + "loss/crossentropy": 2.023432979732752, + "loss/hidden": 3.29453125, + "loss/jsd": 0.0, + "loss/logits": 0.1705683045089245, + "step": 31690 + }, + { + "epoch": 0.7925, + "grad_norm": 27.125, + "grad_norm_var": 2.5697265625, + "learning_rate": 0.0001, + "loss": 7.1217, + "loss/crossentropy": 1.946426709741354, + "loss/hidden": 3.348046875, + "loss/jsd": 0.0, + "loss/logits": 0.17420749654993414, + "step": 31700 + }, + { + "epoch": 0.79275, + "grad_norm": 30.25, + "grad_norm_var": 2.3197265625, + "learning_rate": 0.0001, + "loss": 7.2364, + "loss/crossentropy": 2.0409046217799185, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.18621115908026695, + "step": 31710 + }, + { + "epoch": 0.793, + "grad_norm": 30.375, + "grad_norm_var": 2.3207682291666667, + "learning_rate": 0.0001, + "loss": 7.2649, + "loss/crossentropy": 2.114277198910713, + "loss/hidden": 3.3, + "loss/jsd": 0.0, + "loss/logits": 0.18227699343115092, + "step": 31720 + }, + { + "epoch": 0.79325, + "grad_norm": 29.5, + "grad_norm_var": 2.3583333333333334, + "learning_rate": 0.0001, + "loss": 7.1856, + "loss/crossentropy": 1.9716971188783645, + "loss/hidden": 3.4265625, + "loss/jsd": 0.0, + "loss/logits": 0.1855311430990696, + "step": 31730 + }, + { + "epoch": 0.7935, + "grad_norm": 29.75, + "grad_norm_var": 1.7712890625, + "learning_rate": 0.0001, + "loss": 7.2383, + "loss/crossentropy": 2.037404325604439, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.1694024408236146, + "step": 31740 + }, + { + "epoch": 0.79375, + "grad_norm": 31.25, + "grad_norm_var": 2.5403116401424246e+18, + "learning_rate": 0.0001, + "loss": 7.2469, + "loss/crossentropy": 1.9700333438813686, + "loss/hidden": 3.295703125, + "loss/jsd": 0.0, + "loss/logits": 0.1846833111718297, + "step": 31750 + }, + { + "epoch": 0.794, + "grad_norm": 30.75, + "grad_norm_var": 2.5403116394849674e+18, + "learning_rate": 0.0001, + "loss": 7.1773, + "loss/crossentropy": 2.212307733297348, + "loss/hidden": 3.373046875, + "loss/jsd": 0.0, + "loss/logits": 0.18856476843357087, + "step": 31760 + }, + { + "epoch": 0.79425, + "grad_norm": 41.25, + "grad_norm_var": 14.876822916666667, + "learning_rate": 0.0001, + "loss": 7.256, + "loss/crossentropy": 2.0969568237662317, + "loss/hidden": 3.275390625, + "loss/jsd": 0.0, + "loss/logits": 0.179645280726254, + "step": 31770 + }, + { + "epoch": 0.7945, + "grad_norm": 28.875, + "grad_norm_var": 10.3712890625, + "learning_rate": 0.0001, + "loss": 7.1819, + "loss/crossentropy": 1.9347251377999783, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.18520953366532922, + "step": 31780 + }, + { + "epoch": 0.79475, + "grad_norm": 29.75, + "grad_norm_var": 2.5327473958333333, + "learning_rate": 0.0001, + "loss": 7.2494, + "loss/crossentropy": 2.0052943006157875, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.17678212691098452, + "step": 31790 + }, + { + "epoch": 0.795, + "grad_norm": 33.25, + "grad_norm_var": 2.6518229166666667, + "learning_rate": 0.0001, + "loss": 7.251, + "loss/crossentropy": 1.9200729496777058, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.1845573834143579, + "step": 31800 + }, + { + "epoch": 0.79525, + "grad_norm": 29.875, + "grad_norm_var": 6.238541666666666, + "learning_rate": 0.0001, + "loss": 7.3009, + "loss/crossentropy": 1.982697883248329, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.19470857754349707, + "step": 31810 + }, + { + "epoch": 0.7955, + "grad_norm": 30.625, + "grad_norm_var": 6.095768229166667, + "learning_rate": 0.0001, + "loss": 7.2466, + "loss/crossentropy": 1.9890022471547126, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.1903789708390832, + "step": 31820 + }, + { + "epoch": 0.79575, + "grad_norm": 31.875, + "grad_norm_var": 2.4806640625, + "learning_rate": 0.0001, + "loss": 7.2873, + "loss/crossentropy": 1.9891909763216973, + "loss/hidden": 3.441796875, + "loss/jsd": 0.0, + "loss/logits": 0.1799360180273652, + "step": 31830 + }, + { + "epoch": 0.796, + "grad_norm": 30.5, + "grad_norm_var": 1.8962890625, + "learning_rate": 0.0001, + "loss": 7.237, + "loss/crossentropy": 2.1192200325429438, + "loss/hidden": 3.292578125, + "loss/jsd": 0.0, + "loss/logits": 0.18091769460588694, + "step": 31840 + }, + { + "epoch": 0.79625, + "grad_norm": 28.75, + "grad_norm_var": 1.8872395833333333, + "learning_rate": 0.0001, + "loss": 7.1984, + "loss/crossentropy": 2.197053752094507, + "loss/hidden": 3.341015625, + "loss/jsd": 0.0, + "loss/logits": 0.19096684809774161, + "step": 31850 + }, + { + "epoch": 0.7965, + "grad_norm": 34.75, + "grad_norm_var": 1.8240281960738632e+18, + "learning_rate": 0.0001, + "loss": 7.3018, + "loss/crossentropy": 1.9654372721910476, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.18191756587475538, + "step": 31860 + }, + { + "epoch": 0.79675, + "grad_norm": 30.375, + "grad_norm_var": 4.390625, + "learning_rate": 0.0001, + "loss": 7.1343, + "loss/crossentropy": 2.1827359586954116, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.19493103586137295, + "step": 31870 + }, + { + "epoch": 0.797, + "grad_norm": 35.0, + "grad_norm_var": 4.96875, + "learning_rate": 0.0001, + "loss": 7.2734, + "loss/crossentropy": 2.0850317165255547, + "loss/hidden": 3.4046875, + "loss/jsd": 0.0, + "loss/logits": 0.1965682774782181, + "step": 31880 + }, + { + "epoch": 0.79725, + "grad_norm": 29.0, + "grad_norm_var": 4.255989583333333, + "learning_rate": 0.0001, + "loss": 7.2898, + "loss/crossentropy": 2.078572702407837, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.19365447759628296, + "step": 31890 + }, + { + "epoch": 0.7975, + "grad_norm": 29.875, + "grad_norm_var": 5.2650390625, + "learning_rate": 0.0001, + "loss": 7.3819, + "loss/crossentropy": 2.0934070348739624, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.17799469958990813, + "step": 31900 + }, + { + "epoch": 0.79775, + "grad_norm": 30.0, + "grad_norm_var": 3.856705729166667, + "learning_rate": 0.0001, + "loss": 7.2383, + "loss/crossentropy": 1.9600264072418212, + "loss/hidden": 3.501171875, + "loss/jsd": 0.0, + "loss/logits": 0.1934324972331524, + "step": 31910 + }, + { + "epoch": 0.798, + "grad_norm": 30.125, + "grad_norm_var": 2.471809895833333, + "learning_rate": 0.0001, + "loss": 7.2629, + "loss/crossentropy": 2.2288677148520946, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.20668185902759434, + "step": 31920 + }, + { + "epoch": 0.79825, + "grad_norm": 32.75, + "grad_norm_var": 6.603125, + "learning_rate": 0.0001, + "loss": 7.2431, + "loss/crossentropy": 2.189570128917694, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.19532648399472236, + "step": 31930 + }, + { + "epoch": 0.7985, + "grad_norm": 34.0, + "grad_norm_var": 10.4603515625, + "learning_rate": 0.0001, + "loss": 7.246, + "loss/crossentropy": 2.052780894935131, + "loss/hidden": 3.2640625, + "loss/jsd": 0.0, + "loss/logits": 0.18245863653719424, + "step": 31940 + }, + { + "epoch": 0.79875, + "grad_norm": 30.25, + "grad_norm_var": 11.5791015625, + "learning_rate": 0.0001, + "loss": 7.2474, + "loss/crossentropy": 2.1735198058187963, + "loss/hidden": 3.386328125, + "loss/jsd": 0.0, + "loss/logits": 0.18904183022677898, + "step": 31950 + }, + { + "epoch": 0.799, + "grad_norm": 29.875, + "grad_norm_var": 6.057291666666667, + "learning_rate": 0.0001, + "loss": 7.2311, + "loss/crossentropy": 1.9964329220354557, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.17117326706647873, + "step": 31960 + }, + { + "epoch": 0.79925, + "grad_norm": 28.875, + "grad_norm_var": 7.140625, + "learning_rate": 0.0001, + "loss": 7.2959, + "loss/crossentropy": 2.0637815829366444, + "loss/hidden": 3.348046875, + "loss/jsd": 0.0, + "loss/logits": 0.1760326312389225, + "step": 31970 + }, + { + "epoch": 0.7995, + "grad_norm": 31.875, + "grad_norm_var": 5.26015625, + "learning_rate": 0.0001, + "loss": 7.2168, + "loss/crossentropy": 1.9845145612955093, + "loss/hidden": 3.405078125, + "loss/jsd": 0.0, + "loss/logits": 0.1851344184949994, + "step": 31980 + }, + { + "epoch": 0.79975, + "grad_norm": 31.5, + "grad_norm_var": 3.655989583333333, + "learning_rate": 0.0001, + "loss": 7.2525, + "loss/crossentropy": 2.039809539914131, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.17651782967150212, + "step": 31990 + }, + { + "epoch": 0.8, + "grad_norm": 32.0, + "grad_norm_var": 5.589322916666666, + "learning_rate": 0.0001, + "loss": 7.2411, + "loss/crossentropy": 2.051901635527611, + "loss/hidden": 3.36171875, + "loss/jsd": 0.0, + "loss/logits": 0.17814793512225152, + "step": 32000 + }, + { + "epoch": 0.80025, + "grad_norm": 30.375, + "grad_norm_var": 2.0666015625, + "learning_rate": 0.0001, + "loss": 7.2601, + "loss/crossentropy": 2.167706046998501, + "loss/hidden": 3.4578125, + "loss/jsd": 0.0, + "loss/logits": 0.20020014122128488, + "step": 32010 + }, + { + "epoch": 0.8005, + "grad_norm": 30.875, + "grad_norm_var": 2.7509765625, + "learning_rate": 0.0001, + "loss": 7.2548, + "loss/crossentropy": 2.031809562444687, + "loss/hidden": 3.4171875, + "loss/jsd": 0.0, + "loss/logits": 0.1915106700733304, + "step": 32020 + }, + { + "epoch": 0.80075, + "grad_norm": 30.75, + "grad_norm_var": 3.5479166666666666, + "learning_rate": 0.0001, + "loss": 7.3371, + "loss/crossentropy": 2.151665323972702, + "loss/hidden": 3.242578125, + "loss/jsd": 0.0, + "loss/logits": 0.17666382808238268, + "step": 32030 + }, + { + "epoch": 0.801, + "grad_norm": 34.0, + "grad_norm_var": 3.2744140625, + "learning_rate": 0.0001, + "loss": 7.2745, + "loss/crossentropy": 2.155677234381437, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.1825503976084292, + "step": 32040 + }, + { + "epoch": 0.80125, + "grad_norm": 29.375, + "grad_norm_var": 4.258333333333334, + "learning_rate": 0.0001, + "loss": 7.2659, + "loss/crossentropy": 2.0210466548800468, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.18113178480416536, + "step": 32050 + }, + { + "epoch": 0.8015, + "grad_norm": 28.75, + "grad_norm_var": 4.313997395833334, + "learning_rate": 0.0001, + "loss": 7.1235, + "loss/crossentropy": 1.9424000725150108, + "loss/hidden": 3.414453125, + "loss/jsd": 0.0, + "loss/logits": 0.1696411618962884, + "step": 32060 + }, + { + "epoch": 0.80175, + "grad_norm": 30.375, + "grad_norm_var": 6.124934895833333, + "learning_rate": 0.0001, + "loss": 7.2674, + "loss/crossentropy": 2.0733315840363504, + "loss/hidden": 3.38046875, + "loss/jsd": 0.0, + "loss/logits": 0.18977922163903713, + "step": 32070 + }, + { + "epoch": 0.802, + "grad_norm": 29.25, + "grad_norm_var": 7.609309895833333, + "learning_rate": 0.0001, + "loss": 7.259, + "loss/crossentropy": 2.227187469601631, + "loss/hidden": 3.248828125, + "loss/jsd": 0.0, + "loss/logits": 0.17911691386252643, + "step": 32080 + }, + { + "epoch": 0.80225, + "grad_norm": 30.125, + "grad_norm_var": 3.6754557291666665, + "learning_rate": 0.0001, + "loss": 7.234, + "loss/crossentropy": 2.087716729938984, + "loss/hidden": 3.321484375, + "loss/jsd": 0.0, + "loss/logits": 0.18047652654349805, + "step": 32090 + }, + { + "epoch": 0.8025, + "grad_norm": 30.0, + "grad_norm_var": 3.0400390625, + "learning_rate": 0.0001, + "loss": 7.4021, + "loss/crossentropy": 2.131810285151005, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.17247401773929597, + "step": 32100 + }, + { + "epoch": 0.80275, + "grad_norm": 29.375, + "grad_norm_var": 3.55390625, + "learning_rate": 0.0001, + "loss": 7.3817, + "loss/crossentropy": 2.1549403719604014, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.18837472070008515, + "step": 32110 + }, + { + "epoch": 0.803, + "grad_norm": 31.125, + "grad_norm_var": 1.9442057291666666, + "learning_rate": 0.0001, + "loss": 7.1035, + "loss/crossentropy": 1.9505949556827544, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.18204545034095646, + "step": 32120 + }, + { + "epoch": 0.80325, + "grad_norm": 33.75, + "grad_norm_var": 2.92890625, + "learning_rate": 0.0001, + "loss": 7.2371, + "loss/crossentropy": 2.1348301008343697, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.18901300579309463, + "step": 32130 + }, + { + "epoch": 0.8035, + "grad_norm": 33.25, + "grad_norm_var": 3.1728515625, + "learning_rate": 0.0001, + "loss": 7.2579, + "loss/crossentropy": 1.9484988793730735, + "loss/hidden": 3.271484375, + "loss/jsd": 0.0, + "loss/logits": 0.16889106966555117, + "step": 32140 + }, + { + "epoch": 0.80375, + "grad_norm": 31.375, + "grad_norm_var": 2.8393229166666667, + "learning_rate": 0.0001, + "loss": 7.2884, + "loss/crossentropy": 1.9826452344655991, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.1743983631953597, + "step": 32150 + }, + { + "epoch": 0.804, + "grad_norm": 31.0, + "grad_norm_var": 3.5322916666666666, + "learning_rate": 0.0001, + "loss": 7.1942, + "loss/crossentropy": 1.9426303870975972, + "loss/hidden": 3.42890625, + "loss/jsd": 0.0, + "loss/logits": 0.1772203067317605, + "step": 32160 + }, + { + "epoch": 0.80425, + "grad_norm": 29.75, + "grad_norm_var": 4.874934895833333, + "learning_rate": 0.0001, + "loss": 7.2248, + "loss/crossentropy": 1.870173018425703, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.1688270630314946, + "step": 32170 + }, + { + "epoch": 0.8045, + "grad_norm": 31.25, + "grad_norm_var": 3.5759765625, + "learning_rate": 0.0001, + "loss": 7.1157, + "loss/crossentropy": 1.9939643263816833, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.17504618484526874, + "step": 32180 + }, + { + "epoch": 0.80475, + "grad_norm": 31.5, + "grad_norm_var": 2.2587890625, + "learning_rate": 0.0001, + "loss": 7.2281, + "loss/crossentropy": 2.026793968677521, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.17457820419222117, + "step": 32190 + }, + { + "epoch": 0.805, + "grad_norm": 29.375, + "grad_norm_var": 2.758736220941137e+18, + "learning_rate": 0.0001, + "loss": 7.2813, + "loss/crossentropy": 1.8412863582372665, + "loss/hidden": 3.398828125, + "loss/jsd": 0.0, + "loss/logits": 0.18289455082267522, + "step": 32200 + }, + { + "epoch": 0.80525, + "grad_norm": 30.375, + "grad_norm_var": 2.758736220602027e+18, + "learning_rate": 0.0001, + "loss": 7.3068, + "loss/crossentropy": 2.1275208979845046, + "loss/hidden": 3.328515625, + "loss/jsd": 0.0, + "loss/logits": 0.182334803044796, + "step": 32210 + }, + { + "epoch": 0.8055, + "grad_norm": 33.0, + "grad_norm_var": 2.758736219923808e+18, + "learning_rate": 0.0001, + "loss": 7.3141, + "loss/crossentropy": 2.134224483370781, + "loss/hidden": 3.266015625, + "loss/jsd": 0.0, + "loss/logits": 0.17503854436799884, + "step": 32220 + }, + { + "epoch": 0.80575, + "grad_norm": 32.75, + "grad_norm_var": 2.758736219342478e+18, + "learning_rate": 0.0001, + "loss": 7.4317, + "loss/crossentropy": 2.066732983291149, + "loss/hidden": 3.4828125, + "loss/jsd": 0.0, + "loss/logits": 0.20872533712536095, + "step": 32230 + }, + { + "epoch": 0.806, + "grad_norm": 26.625, + "grad_norm_var": 5.002018229166667, + "learning_rate": 0.0001, + "loss": 7.2406, + "loss/crossentropy": 1.9790171518921853, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.17605258710682392, + "step": 32240 + }, + { + "epoch": 0.80625, + "grad_norm": 29.375, + "grad_norm_var": 2.5551432291666667, + "learning_rate": 0.0001, + "loss": 7.3137, + "loss/crossentropy": 2.115340569615364, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.191914526745677, + "step": 32250 + }, + { + "epoch": 0.8065, + "grad_norm": 33.0, + "grad_norm_var": 1.95625, + "learning_rate": 0.0001, + "loss": 7.2523, + "loss/crossentropy": 1.9939750552177429, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.18340199431404472, + "step": 32260 + }, + { + "epoch": 0.80675, + "grad_norm": 29.875, + "grad_norm_var": 3.0518229166666666, + "learning_rate": 0.0001, + "loss": 7.2693, + "loss/crossentropy": 2.1312461979687214, + "loss/hidden": 3.22109375, + "loss/jsd": 0.0, + "loss/logits": 0.17243078984320165, + "step": 32270 + }, + { + "epoch": 0.807, + "grad_norm": 29.625, + "grad_norm_var": 2.7080729166666666, + "learning_rate": 0.0001, + "loss": 7.2228, + "loss/crossentropy": 1.9979605212807656, + "loss/hidden": 3.335546875, + "loss/jsd": 0.0, + "loss/logits": 0.1727626511827111, + "step": 32280 + }, + { + "epoch": 0.80725, + "grad_norm": 30.375, + "grad_norm_var": 2.4259765625, + "learning_rate": 0.0001, + "loss": 7.1558, + "loss/crossentropy": 2.046182955801487, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.1849920243024826, + "step": 32290 + }, + { + "epoch": 0.8075, + "grad_norm": 28.25, + "grad_norm_var": 3.084830729166667, + "learning_rate": 0.0001, + "loss": 7.2157, + "loss/crossentropy": 2.107586032152176, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.18789408281445502, + "step": 32300 + }, + { + "epoch": 0.80775, + "grad_norm": 31.75, + "grad_norm_var": 2.986458333333333, + "learning_rate": 0.0001, + "loss": 7.135, + "loss/crossentropy": 1.966269464045763, + "loss/hidden": 3.33984375, + "loss/jsd": 0.0, + "loss/logits": 0.17599091026932, + "step": 32310 + }, + { + "epoch": 0.808, + "grad_norm": 30.5, + "grad_norm_var": 1.6926432291666667, + "learning_rate": 0.0001, + "loss": 7.258, + "loss/crossentropy": 2.03958098590374, + "loss/hidden": 3.459765625, + "loss/jsd": 0.0, + "loss/logits": 0.18970332182943822, + "step": 32320 + }, + { + "epoch": 0.80825, + "grad_norm": 30.5, + "grad_norm_var": 1.4893229166666666, + "learning_rate": 0.0001, + "loss": 7.2886, + "loss/crossentropy": 1.927356332540512, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.18331324979662894, + "step": 32330 + }, + { + "epoch": 0.8085, + "grad_norm": 29.375, + "grad_norm_var": 1.8212890625, + "learning_rate": 0.0001, + "loss": 7.3217, + "loss/crossentropy": 2.1595819175243376, + "loss/hidden": 3.243359375, + "loss/jsd": 0.0, + "loss/logits": 0.1796510087326169, + "step": 32340 + }, + { + "epoch": 0.80875, + "grad_norm": 32.0, + "grad_norm_var": 2.135872395833333, + "learning_rate": 0.0001, + "loss": 7.1902, + "loss/crossentropy": 2.0535615980625153, + "loss/hidden": 3.271875, + "loss/jsd": 0.0, + "loss/logits": 0.17316078525036574, + "step": 32350 + }, + { + "epoch": 0.809, + "grad_norm": 29.5, + "grad_norm_var": 4.070833333333334, + "learning_rate": 0.0001, + "loss": 7.1575, + "loss/crossentropy": 1.870839986205101, + "loss/hidden": 3.29375, + "loss/jsd": 0.0, + "loss/logits": 0.1662488218396902, + "step": 32360 + }, + { + "epoch": 0.80925, + "grad_norm": 29.25, + "grad_norm_var": 3.912239583333333, + "learning_rate": 0.0001, + "loss": 7.1563, + "loss/crossentropy": 2.0238643392920492, + "loss/hidden": 3.265234375, + "loss/jsd": 0.0, + "loss/logits": 0.16979822777211667, + "step": 32370 + }, + { + "epoch": 0.8095, + "grad_norm": 34.0, + "grad_norm_var": 3.70625, + "learning_rate": 0.0001, + "loss": 7.2548, + "loss/crossentropy": 2.030308040976524, + "loss/hidden": 3.32265625, + "loss/jsd": 0.0, + "loss/logits": 0.17933697663247586, + "step": 32380 + }, + { + "epoch": 0.80975, + "grad_norm": 31.25, + "grad_norm_var": 3.705208333333333, + "learning_rate": 0.0001, + "loss": 7.1595, + "loss/crossentropy": 2.097300034761429, + "loss/hidden": 3.277734375, + "loss/jsd": 0.0, + "loss/logits": 0.1716611221432686, + "step": 32390 + }, + { + "epoch": 0.81, + "grad_norm": 27.75, + "grad_norm_var": 8.028580729166666, + "learning_rate": 0.0001, + "loss": 7.2592, + "loss/crossentropy": 2.1147085294127463, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.17622052859514953, + "step": 32400 + }, + { + "epoch": 0.81025, + "grad_norm": 31.875, + "grad_norm_var": 4.1619140625, + "learning_rate": 0.0001, + "loss": 7.1481, + "loss/crossentropy": 2.1348901122808455, + "loss/hidden": 3.3109375, + "loss/jsd": 0.0, + "loss/logits": 0.18319515213370324, + "step": 32410 + }, + { + "epoch": 0.8105, + "grad_norm": 33.25, + "grad_norm_var": 5.037239583333333, + "learning_rate": 0.0001, + "loss": 7.2796, + "loss/crossentropy": 2.0976379089057446, + "loss/hidden": 3.294921875, + "loss/jsd": 0.0, + "loss/logits": 0.16924114488065242, + "step": 32420 + }, + { + "epoch": 0.81075, + "grad_norm": 32.0, + "grad_norm_var": 15.3806640625, + "learning_rate": 0.0001, + "loss": 7.3044, + "loss/crossentropy": 2.1474139034748077, + "loss/hidden": 3.408203125, + "loss/jsd": 0.0, + "loss/logits": 0.1934512373059988, + "step": 32430 + }, + { + "epoch": 0.811, + "grad_norm": 34.5, + "grad_norm_var": 6.9244140625, + "learning_rate": 0.0001, + "loss": 7.2681, + "loss/crossentropy": 2.1129303738474845, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.19321042932569982, + "step": 32440 + }, + { + "epoch": 0.81125, + "grad_norm": 31.5, + "grad_norm_var": 1.9455729166666667, + "learning_rate": 0.0001, + "loss": 7.4055, + "loss/crossentropy": 2.2073852330446244, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.18046779334545135, + "step": 32450 + }, + { + "epoch": 0.8115, + "grad_norm": 28.0, + "grad_norm_var": 2.247330729166667, + "learning_rate": 0.0001, + "loss": 7.2706, + "loss/crossentropy": 2.009214510768652, + "loss/hidden": 3.44765625, + "loss/jsd": 0.0, + "loss/logits": 0.20849293209612368, + "step": 32460 + }, + { + "epoch": 0.81175, + "grad_norm": 29.875, + "grad_norm_var": 3.4058471889324826e+18, + "learning_rate": 0.0001, + "loss": 7.4795, + "loss/crossentropy": 2.1204657569527625, + "loss/hidden": 3.307421875, + "loss/jsd": 0.0, + "loss/logits": 0.18255203776061535, + "step": 32470 + }, + { + "epoch": 0.812, + "grad_norm": 28.375, + "grad_norm_var": 3.1010416666666667, + "learning_rate": 0.0001, + "loss": 7.163, + "loss/crossentropy": 1.9438684552907943, + "loss/hidden": 3.297265625, + "loss/jsd": 0.0, + "loss/logits": 0.1655712489038706, + "step": 32480 + }, + { + "epoch": 0.81225, + "grad_norm": 27.75, + "grad_norm_var": 13.784309895833333, + "learning_rate": 0.0001, + "loss": 7.2748, + "loss/crossentropy": 1.906498458981514, + "loss/hidden": 3.275390625, + "loss/jsd": 0.0, + "loss/logits": 0.17149967923760415, + "step": 32490 + }, + { + "epoch": 0.8125, + "grad_norm": 32.75, + "grad_norm_var": 18.0337890625, + "learning_rate": 0.0001, + "loss": 7.2568, + "loss/crossentropy": 2.1418545827269555, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.18672561962157488, + "step": 32500 + }, + { + "epoch": 0.81275, + "grad_norm": 30.75, + "grad_norm_var": 2.83515625, + "learning_rate": 0.0001, + "loss": 7.2452, + "loss/crossentropy": 2.0591210328042506, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.1714493166655302, + "step": 32510 + }, + { + "epoch": 0.813, + "grad_norm": 31.625, + "grad_norm_var": 1.671875, + "learning_rate": 0.0001, + "loss": 7.1253, + "loss/crossentropy": 2.0408148244023323, + "loss/hidden": 3.3375, + "loss/jsd": 0.0, + "loss/logits": 0.18097794353961943, + "step": 32520 + }, + { + "epoch": 0.81325, + "grad_norm": 30.75, + "grad_norm_var": 1.7809895833333333, + "learning_rate": 0.0001, + "loss": 7.2145, + "loss/crossentropy": 1.9985629439353942, + "loss/hidden": 3.3296875, + "loss/jsd": 0.0, + "loss/logits": 0.19131027571856976, + "step": 32530 + }, + { + "epoch": 0.8135, + "grad_norm": 32.0, + "grad_norm_var": 3.2426432291666667, + "learning_rate": 0.0001, + "loss": 7.3301, + "loss/crossentropy": 2.1079444982111455, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.18785760272294283, + "step": 32540 + }, + { + "epoch": 0.81375, + "grad_norm": 28.875, + "grad_norm_var": 3.5431640625, + "learning_rate": 0.0001, + "loss": 7.174, + "loss/crossentropy": 1.9769691236317157, + "loss/hidden": 3.228125, + "loss/jsd": 0.0, + "loss/logits": 0.16153431748971342, + "step": 32550 + }, + { + "epoch": 0.814, + "grad_norm": 31.875, + "grad_norm_var": 2.133072916666667, + "learning_rate": 0.0001, + "loss": 7.1737, + "loss/crossentropy": 1.9369481757283211, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.18391805402934552, + "step": 32560 + }, + { + "epoch": 0.81425, + "grad_norm": 31.375, + "grad_norm_var": 5.5322265625, + "learning_rate": 0.0001, + "loss": 7.3592, + "loss/crossentropy": 2.074393917620182, + "loss/hidden": 3.2703125, + "loss/jsd": 0.0, + "loss/logits": 0.17730166036635636, + "step": 32570 + }, + { + "epoch": 0.8145, + "grad_norm": 30.25, + "grad_norm_var": 7.649934895833334, + "learning_rate": 0.0001, + "loss": 7.2958, + "loss/crossentropy": 2.152645838260651, + "loss/hidden": 3.2671875, + "loss/jsd": 0.0, + "loss/logits": 0.1779523069038987, + "step": 32580 + }, + { + "epoch": 0.81475, + "grad_norm": 30.0, + "grad_norm_var": 3.687239583333333, + "learning_rate": 0.0001, + "loss": 7.2586, + "loss/crossentropy": 2.1111764326691627, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.2009077413007617, + "step": 32590 + }, + { + "epoch": 0.815, + "grad_norm": 31.0, + "grad_norm_var": 0.853125, + "learning_rate": 0.0001, + "loss": 7.2938, + "loss/crossentropy": 2.095028135180473, + "loss/hidden": 3.312109375, + "loss/jsd": 0.0, + "loss/logits": 0.18357501607388257, + "step": 32600 + }, + { + "epoch": 0.81525, + "grad_norm": 29.125, + "grad_norm_var": 2.2643229166666665, + "learning_rate": 0.0001, + "loss": 7.3045, + "loss/crossentropy": 2.18403902053833, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.20085975714027882, + "step": 32610 + }, + { + "epoch": 0.8155, + "grad_norm": 7549747200.0, + "grad_norm_var": 3.562417644801884e+18, + "learning_rate": 0.0001, + "loss": 7.3539, + "loss/crossentropy": 2.0945954963564875, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.19835255537182092, + "step": 32620 + }, + { + "epoch": 0.81575, + "grad_norm": 29.25, + "grad_norm_var": 3.562417644180603e+18, + "learning_rate": 0.0001, + "loss": 7.2437, + "loss/crossentropy": 2.1617246508598327, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.1850573167204857, + "step": 32630 + }, + { + "epoch": 0.816, + "grad_norm": 29.25, + "grad_norm_var": 4.133268229166666, + "learning_rate": 0.0001, + "loss": 7.2513, + "loss/crossentropy": 2.0281495213508607, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.17307685893028973, + "step": 32640 + }, + { + "epoch": 0.81625, + "grad_norm": 31.375, + "grad_norm_var": 10.743489583333334, + "learning_rate": 0.0001, + "loss": 7.1824, + "loss/crossentropy": 1.9506276100873947, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.17777946423739194, + "step": 32650 + }, + { + "epoch": 0.8165, + "grad_norm": 30.625, + "grad_norm_var": 9.271809895833334, + "learning_rate": 0.0001, + "loss": 7.1952, + "loss/crossentropy": 2.1222976088523864, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.18664605617523194, + "step": 32660 + }, + { + "epoch": 0.81675, + "grad_norm": 30.25, + "grad_norm_var": 2.0072265625, + "learning_rate": 0.0001, + "loss": 7.2998, + "loss/crossentropy": 2.107238310575485, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.19150303788483142, + "step": 32670 + }, + { + "epoch": 0.817, + "grad_norm": 31.5, + "grad_norm_var": 2.9150390625, + "learning_rate": 0.0001, + "loss": 7.3482, + "loss/crossentropy": 1.946744628250599, + "loss/hidden": 3.34140625, + "loss/jsd": 0.0, + "loss/logits": 0.18452872475609183, + "step": 32680 + }, + { + "epoch": 0.81725, + "grad_norm": 31.375, + "grad_norm_var": 3.1708333333333334, + "learning_rate": 0.0001, + "loss": 7.2419, + "loss/crossentropy": 2.169080337882042, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.18254951611161233, + "step": 32690 + }, + { + "epoch": 0.8175, + "grad_norm": 29.0, + "grad_norm_var": 2.3052083333333333, + "learning_rate": 0.0001, + "loss": 7.2529, + "loss/crossentropy": 2.2490514785051348, + "loss/hidden": 3.312109375, + "loss/jsd": 0.0, + "loss/logits": 0.1880893513560295, + "step": 32700 + }, + { + "epoch": 0.81775, + "grad_norm": 28.625, + "grad_norm_var": 20.976497395833334, + "learning_rate": 0.0001, + "loss": 7.3429, + "loss/crossentropy": 2.1124695405364036, + "loss/hidden": 3.41171875, + "loss/jsd": 0.0, + "loss/logits": 0.1869946725666523, + "step": 32710 + }, + { + "epoch": 0.818, + "grad_norm": 28.625, + "grad_norm_var": 21.052083333333332, + "learning_rate": 0.0001, + "loss": 7.2308, + "loss/crossentropy": 1.9733390748500823, + "loss/hidden": 3.284765625, + "loss/jsd": 0.0, + "loss/logits": 0.17016562707722188, + "step": 32720 + }, + { + "epoch": 0.81825, + "grad_norm": 30.25, + "grad_norm_var": 3.2416666666666667, + "learning_rate": 0.0001, + "loss": 7.2682, + "loss/crossentropy": 2.0912339106202125, + "loss/hidden": 3.35078125, + "loss/jsd": 0.0, + "loss/logits": 0.18757227193564177, + "step": 32730 + }, + { + "epoch": 0.8185, + "grad_norm": 29.875, + "grad_norm_var": 1.9098307291666667, + "learning_rate": 0.0001, + "loss": 7.2908, + "loss/crossentropy": 2.046810332685709, + "loss/hidden": 3.30703125, + "loss/jsd": 0.0, + "loss/logits": 0.1753769489005208, + "step": 32740 + }, + { + "epoch": 0.81875, + "grad_norm": 28.875, + "grad_norm_var": 85.02682291666666, + "learning_rate": 0.0001, + "loss": 7.1812, + "loss/crossentropy": 1.8806068405508996, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.1857995780184865, + "step": 32750 + }, + { + "epoch": 0.819, + "grad_norm": 31.75, + "grad_norm_var": 83.22916666666667, + "learning_rate": 0.0001, + "loss": 7.2393, + "loss/crossentropy": 2.263414332270622, + "loss/hidden": 3.304296875, + "loss/jsd": 0.0, + "loss/logits": 0.19157245978713036, + "step": 32760 + }, + { + "epoch": 0.81925, + "grad_norm": 30.875, + "grad_norm_var": 2.247330729166667, + "learning_rate": 0.0001, + "loss": 7.2049, + "loss/crossentropy": 2.0481057576835155, + "loss/hidden": 3.237109375, + "loss/jsd": 0.0, + "loss/logits": 0.16489372327923774, + "step": 32770 + }, + { + "epoch": 0.8195, + "grad_norm": 28.875, + "grad_norm_var": 2.637955729166667, + "learning_rate": 0.0001, + "loss": 7.3102, + "loss/crossentropy": 2.0778881147503854, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.18638940406963228, + "step": 32780 + }, + { + "epoch": 0.81975, + "grad_norm": 30.625, + "grad_norm_var": 2.450455729166667, + "learning_rate": 0.0001, + "loss": 7.2894, + "loss/crossentropy": 2.0030703216791155, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.17577996235340834, + "step": 32790 + }, + { + "epoch": 0.82, + "grad_norm": 29.125, + "grad_norm_var": 1.9561848958333334, + "learning_rate": 0.0001, + "loss": 7.1411, + "loss/crossentropy": 1.983315745741129, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.1809986751526594, + "step": 32800 + }, + { + "epoch": 0.82025, + "grad_norm": 31.25, + "grad_norm_var": 1.5622395833333333, + "learning_rate": 0.0001, + "loss": 7.201, + "loss/crossentropy": 2.0666268527507783, + "loss/hidden": 3.298046875, + "loss/jsd": 0.0, + "loss/logits": 0.17382380943745374, + "step": 32810 + }, + { + "epoch": 0.8205, + "grad_norm": 28.625, + "grad_norm_var": 8.472916666666666, + "learning_rate": 0.0001, + "loss": 7.217, + "loss/crossentropy": 1.9913094595074654, + "loss/hidden": 3.451171875, + "loss/jsd": 0.0, + "loss/logits": 0.17583483047783374, + "step": 32820 + }, + { + "epoch": 0.82075, + "grad_norm": 29.625, + "grad_norm_var": 9.392708333333333, + "learning_rate": 0.0001, + "loss": 7.2831, + "loss/crossentropy": 2.131434071063995, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.18594115655869245, + "step": 32830 + }, + { + "epoch": 0.821, + "grad_norm": 28.25, + "grad_norm_var": 4.114583333333333, + "learning_rate": 0.0001, + "loss": 7.162, + "loss/crossentropy": 2.108327967301011, + "loss/hidden": 3.3171875, + "loss/jsd": 0.0, + "loss/logits": 0.18347832215949894, + "step": 32840 + }, + { + "epoch": 0.82125, + "grad_norm": 32.75, + "grad_norm_var": 2.1426432291666666, + "learning_rate": 0.0001, + "loss": 7.1914, + "loss/crossentropy": 1.989778284728527, + "loss/hidden": 3.370703125, + "loss/jsd": 0.0, + "loss/logits": 0.1808457243256271, + "step": 32850 + }, + { + "epoch": 0.8215, + "grad_norm": 28.25, + "grad_norm_var": 2.7124348958333333, + "learning_rate": 0.0001, + "loss": 7.2049, + "loss/crossentropy": 2.120199552178383, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.18457971159368752, + "step": 32860 + }, + { + "epoch": 0.82175, + "grad_norm": 35.5, + "grad_norm_var": 25.367708333333333, + "learning_rate": 0.0001, + "loss": 7.2266, + "loss/crossentropy": 2.0434179216623307, + "loss/hidden": 3.268359375, + "loss/jsd": 0.0, + "loss/logits": 0.1738625967875123, + "step": 32870 + }, + { + "epoch": 0.822, + "grad_norm": 28.875, + "grad_norm_var": 20.552018229166666, + "learning_rate": 0.0001, + "loss": 7.3397, + "loss/crossentropy": 2.1949902296066286, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.20381880030035973, + "step": 32880 + }, + { + "epoch": 0.82225, + "grad_norm": 30.625, + "grad_norm_var": 20.99375, + "learning_rate": 0.0001, + "loss": 7.2043, + "loss/crossentropy": 2.0969431951642035, + "loss/hidden": 3.19375, + "loss/jsd": 0.0, + "loss/logits": 0.16738458890467883, + "step": 32890 + }, + { + "epoch": 0.8225, + "grad_norm": 29.5, + "grad_norm_var": 2.6, + "learning_rate": 0.0001, + "loss": 7.3717, + "loss/crossentropy": 1.910037212073803, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.21138516459614037, + "step": 32900 + }, + { + "epoch": 0.82275, + "grad_norm": 31.25, + "grad_norm_var": 3.47890625, + "learning_rate": 0.0001, + "loss": 7.2944, + "loss/crossentropy": 2.128745178878307, + "loss/hidden": 3.290234375, + "loss/jsd": 0.0, + "loss/logits": 0.16938637476414442, + "step": 32910 + }, + { + "epoch": 0.823, + "grad_norm": 31.125, + "grad_norm_var": 4.466080729166666, + "learning_rate": 0.0001, + "loss": 7.3261, + "loss/crossentropy": 2.1102588385343553, + "loss/hidden": 3.233984375, + "loss/jsd": 0.0, + "loss/logits": 0.18096004836261273, + "step": 32920 + }, + { + "epoch": 0.82325, + "grad_norm": 28.75, + "grad_norm_var": 2.6264973958333333, + "learning_rate": 0.0001, + "loss": 7.2731, + "loss/crossentropy": 2.0781460791826247, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.20577163994312286, + "step": 32930 + }, + { + "epoch": 0.8235, + "grad_norm": 31.0, + "grad_norm_var": 1.6613932291666667, + "learning_rate": 0.0001, + "loss": 7.3275, + "loss/crossentropy": 2.0820566445589064, + "loss/hidden": 3.337890625, + "loss/jsd": 0.0, + "loss/logits": 0.17738431375473737, + "step": 32940 + }, + { + "epoch": 0.82375, + "grad_norm": 30.5, + "grad_norm_var": 1.8696271412957893e+18, + "learning_rate": 0.0001, + "loss": 7.3092, + "loss/crossentropy": 2.1891760557889937, + "loss/hidden": 3.32265625, + "loss/jsd": 0.0, + "loss/logits": 0.18127844985574484, + "step": 32950 + }, + { + "epoch": 0.824, + "grad_norm": 31.75, + "grad_norm_var": 6.083333333333333, + "learning_rate": 0.0001, + "loss": 7.243, + "loss/crossentropy": 2.1428692400455476, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.19357558973133565, + "step": 32960 + }, + { + "epoch": 0.82425, + "grad_norm": 28.875, + "grad_norm_var": 5.588997395833333, + "learning_rate": 0.0001, + "loss": 7.2652, + "loss/crossentropy": 1.9545904472470284, + "loss/hidden": 3.45625, + "loss/jsd": 0.0, + "loss/logits": 0.18726088060066104, + "step": 32970 + }, + { + "epoch": 0.8245, + "grad_norm": 30.875, + "grad_norm_var": 1.6593098958333334, + "learning_rate": 0.0001, + "loss": 7.2011, + "loss/crossentropy": 2.19513940513134, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.18281609732657672, + "step": 32980 + }, + { + "epoch": 0.82475, + "grad_norm": 30.25, + "grad_norm_var": 3.0087890625, + "learning_rate": 0.0001, + "loss": 7.1861, + "loss/crossentropy": 1.9000049561262131, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.18603194374591112, + "step": 32990 + }, + { + "epoch": 0.825, + "grad_norm": 34.5, + "grad_norm_var": 5.230143229166667, + "learning_rate": 0.0001, + "loss": 7.3087, + "loss/crossentropy": 2.176528325676918, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.19508822299540043, + "step": 33000 + }, + { + "epoch": 0.82525, + "grad_norm": 29.75, + "grad_norm_var": 6.870768229166667, + "learning_rate": 0.0001, + "loss": 7.294, + "loss/crossentropy": 2.074225351214409, + "loss/hidden": 3.287890625, + "loss/jsd": 0.0, + "loss/logits": 0.1698618305847049, + "step": 33010 + }, + { + "epoch": 0.8255, + "grad_norm": 30.25, + "grad_norm_var": 3.5259765625, + "learning_rate": 0.0001, + "loss": 7.2318, + "loss/crossentropy": 1.9874062195420266, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.17083260375075043, + "step": 33020 + }, + { + "epoch": 0.82575, + "grad_norm": 34.0, + "grad_norm_var": 17.623958333333334, + "learning_rate": 0.0001, + "loss": 7.2262, + "loss/crossentropy": 2.009798040986061, + "loss/hidden": 3.38359375, + "loss/jsd": 0.0, + "loss/logits": 0.17537497971206903, + "step": 33030 + }, + { + "epoch": 0.826, + "grad_norm": 31.875, + "grad_norm_var": 4.7509765625, + "learning_rate": 0.0001, + "loss": 7.247, + "loss/crossentropy": 1.975242407619953, + "loss/hidden": 3.297265625, + "loss/jsd": 0.0, + "loss/logits": 0.1771410683169961, + "step": 33040 + }, + { + "epoch": 0.82625, + "grad_norm": 29.625, + "grad_norm_var": 3.6885416666666666, + "learning_rate": 0.0001, + "loss": 7.3327, + "loss/crossentropy": 2.1277087360620497, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.18781726695597173, + "step": 33050 + }, + { + "epoch": 0.8265, + "grad_norm": 29.5, + "grad_norm_var": 19.176041666666666, + "learning_rate": 0.0001, + "loss": 7.2487, + "loss/crossentropy": 2.0511118680238725, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.18265996258705855, + "step": 33060 + }, + { + "epoch": 0.82675, + "grad_norm": 28.875, + "grad_norm_var": 21.9181640625, + "learning_rate": 0.0001, + "loss": 7.2396, + "loss/crossentropy": 2.0069188207387922, + "loss/hidden": 3.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.18895241040736438, + "step": 33070 + }, + { + "epoch": 0.827, + "grad_norm": 29.0, + "grad_norm_var": 5.089518229166667, + "learning_rate": 0.0001, + "loss": 7.1739, + "loss/crossentropy": 2.0877312146127225, + "loss/hidden": 3.300390625, + "loss/jsd": 0.0, + "loss/logits": 0.1880142292007804, + "step": 33080 + }, + { + "epoch": 0.82725, + "grad_norm": 30.75, + "grad_norm_var": 2.7997395833333334, + "learning_rate": 0.0001, + "loss": 7.2304, + "loss/crossentropy": 2.1121929422020913, + "loss/hidden": 3.38203125, + "loss/jsd": 0.0, + "loss/logits": 0.18759419713169337, + "step": 33090 + }, + { + "epoch": 0.8275, + "grad_norm": 30.125, + "grad_norm_var": 2.0629557291666667, + "learning_rate": 0.0001, + "loss": 7.2302, + "loss/crossentropy": 2.0372960224747656, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.1759640196338296, + "step": 33100 + }, + { + "epoch": 0.82775, + "grad_norm": 28.625, + "grad_norm_var": 1.12265625, + "learning_rate": 0.0001, + "loss": 7.1867, + "loss/crossentropy": 2.0432544320821764, + "loss/hidden": 3.247265625, + "loss/jsd": 0.0, + "loss/logits": 0.17888994030654431, + "step": 33110 + }, + { + "epoch": 0.828, + "grad_norm": 30.0, + "grad_norm_var": 2.155042767301509e+18, + "learning_rate": 0.0001, + "loss": 7.3383, + "loss/crossentropy": 1.9840383768081664, + "loss/hidden": 3.459375, + "loss/jsd": 0.0, + "loss/logits": 0.1890584809705615, + "step": 33120 + }, + { + "epoch": 0.82825, + "grad_norm": 29.625, + "grad_norm_var": 1.7580729166666667, + "learning_rate": 0.0001, + "loss": 7.1764, + "loss/crossentropy": 1.9866980195045472, + "loss/hidden": 3.280859375, + "loss/jsd": 0.0, + "loss/logits": 0.16517556998878719, + "step": 33130 + }, + { + "epoch": 0.8285, + "grad_norm": 29.25, + "grad_norm_var": 3.1416015625, + "learning_rate": 0.0001, + "loss": 7.2066, + "loss/crossentropy": 2.0350388549268246, + "loss/hidden": 3.298046875, + "loss/jsd": 0.0, + "loss/logits": 0.17439607214182615, + "step": 33140 + }, + { + "epoch": 0.82875, + "grad_norm": 31.625, + "grad_norm_var": 2.4358723958333335, + "learning_rate": 0.0001, + "loss": 7.2689, + "loss/crossentropy": 2.1262506932020186, + "loss/hidden": 3.244921875, + "loss/jsd": 0.0, + "loss/logits": 0.17747630644589663, + "step": 33150 + }, + { + "epoch": 0.829, + "grad_norm": 31.0, + "grad_norm_var": 9.006184895833334, + "learning_rate": 0.0001, + "loss": 7.2443, + "loss/crossentropy": 2.1790440320968627, + "loss/hidden": 3.275390625, + "loss/jsd": 0.0, + "loss/logits": 0.18086491283029318, + "step": 33160 + }, + { + "epoch": 0.82925, + "grad_norm": 28.75, + "grad_norm_var": 10.318684895833334, + "learning_rate": 0.0001, + "loss": 7.3319, + "loss/crossentropy": 2.174597004055977, + "loss/hidden": 3.269140625, + "loss/jsd": 0.0, + "loss/logits": 0.1854943221434951, + "step": 33170 + }, + { + "epoch": 0.8295, + "grad_norm": 34.5, + "grad_norm_var": 2.9228515625, + "learning_rate": 0.0001, + "loss": 7.2453, + "loss/crossentropy": 2.1312096804380416, + "loss/hidden": 3.279296875, + "loss/jsd": 0.0, + "loss/logits": 0.17487460002303123, + "step": 33180 + }, + { + "epoch": 0.82975, + "grad_norm": 30.5, + "grad_norm_var": 2.530989583333333, + "learning_rate": 0.0001, + "loss": 7.2681, + "loss/crossentropy": 2.1367484860122206, + "loss/hidden": 3.385546875, + "loss/jsd": 0.0, + "loss/logits": 0.18738685622811319, + "step": 33190 + }, + { + "epoch": 0.83, + "grad_norm": 28.5, + "grad_norm_var": 1.828125, + "learning_rate": 0.0001, + "loss": 7.228, + "loss/crossentropy": 2.1254208497703075, + "loss/hidden": 3.281640625, + "loss/jsd": 0.0, + "loss/logits": 0.1795659614726901, + "step": 33200 + }, + { + "epoch": 0.83025, + "grad_norm": 28.5, + "grad_norm_var": 2.9530598958333334, + "learning_rate": 0.0001, + "loss": 7.2011, + "loss/crossentropy": 1.9877605877816678, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.18452598052099348, + "step": 33210 + }, + { + "epoch": 0.8305, + "grad_norm": 35.25, + "grad_norm_var": 3.9431640625, + "learning_rate": 0.0001, + "loss": 7.1594, + "loss/crossentropy": 2.0039492681622506, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.18381198737770318, + "step": 33220 + }, + { + "epoch": 0.83075, + "grad_norm": 43.5, + "grad_norm_var": 14.514322916666666, + "learning_rate": 0.0001, + "loss": 7.2984, + "loss/crossentropy": 2.1737947434186937, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.17826213724911213, + "step": 33230 + }, + { + "epoch": 0.831, + "grad_norm": 28.125, + "grad_norm_var": 13.200455729166666, + "learning_rate": 0.0001, + "loss": 7.1889, + "loss/crossentropy": 2.1893670335412025, + "loss/hidden": 3.312890625, + "loss/jsd": 0.0, + "loss/logits": 0.1756696652621031, + "step": 33240 + }, + { + "epoch": 0.83125, + "grad_norm": 32.25, + "grad_norm_var": 6.585416666666666, + "learning_rate": 0.0001, + "loss": 7.2065, + "loss/crossentropy": 2.2689021706581114, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.18847335111349822, + "step": 33250 + }, + { + "epoch": 0.8315, + "grad_norm": 29.875, + "grad_norm_var": 4.733072916666667, + "learning_rate": 0.0001, + "loss": 7.273, + "loss/crossentropy": 2.1217512801289558, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.18651887271553277, + "step": 33260 + }, + { + "epoch": 0.83175, + "grad_norm": 32.75, + "grad_norm_var": 4.162434895833333, + "learning_rate": 0.0001, + "loss": 7.2661, + "loss/crossentropy": 1.952786525338888, + "loss/hidden": 3.212109375, + "loss/jsd": 0.0, + "loss/logits": 0.15934878159314395, + "step": 33270 + }, + { + "epoch": 0.832, + "grad_norm": 28.875, + "grad_norm_var": 18.381184895833332, + "learning_rate": 0.0001, + "loss": 7.1678, + "loss/crossentropy": 2.1539489194750785, + "loss/hidden": 3.331640625, + "loss/jsd": 0.0, + "loss/logits": 0.1839644392952323, + "step": 33280 + }, + { + "epoch": 0.83225, + "grad_norm": 31.5, + "grad_norm_var": 6.798958333333333, + "learning_rate": 0.0001, + "loss": 7.295, + "loss/crossentropy": 2.144645670056343, + "loss/hidden": 3.29765625, + "loss/jsd": 0.0, + "loss/logits": 0.17679207548499107, + "step": 33290 + }, + { + "epoch": 0.8325, + "grad_norm": 30.375, + "grad_norm_var": 3.0150390625, + "learning_rate": 0.0001, + "loss": 7.3755, + "loss/crossentropy": 2.0700169153511525, + "loss/hidden": 3.3234375, + "loss/jsd": 0.0, + "loss/logits": 0.18612122694030403, + "step": 33300 + }, + { + "epoch": 0.83275, + "grad_norm": 31.125, + "grad_norm_var": 2.7768229166666667, + "learning_rate": 0.0001, + "loss": 7.1297, + "loss/crossentropy": 2.0893400743603707, + "loss/hidden": 3.269140625, + "loss/jsd": 0.0, + "loss/logits": 0.17862566062249244, + "step": 33310 + }, + { + "epoch": 0.833, + "grad_norm": 32.25, + "grad_norm_var": 3.01015625, + "learning_rate": 0.0001, + "loss": 7.2992, + "loss/crossentropy": 2.1281204342842104, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.19392694123089313, + "step": 33320 + }, + { + "epoch": 0.83325, + "grad_norm": 32.25, + "grad_norm_var": 2.66640625, + "learning_rate": 0.0001, + "loss": 7.266, + "loss/crossentropy": 1.9223125174641609, + "loss/hidden": 3.365625, + "loss/jsd": 0.0, + "loss/logits": 0.1853881362825632, + "step": 33330 + }, + { + "epoch": 0.8335, + "grad_norm": 29.875, + "grad_norm_var": 1.8166015625, + "learning_rate": 0.0001, + "loss": 7.2632, + "loss/crossentropy": 1.9951855912804604, + "loss/hidden": 3.263671875, + "loss/jsd": 0.0, + "loss/logits": 0.17544143386185168, + "step": 33340 + }, + { + "epoch": 0.83375, + "grad_norm": 27.5, + "grad_norm_var": 1.9083333333333334, + "learning_rate": 0.0001, + "loss": 7.2816, + "loss/crossentropy": 1.8624406516551972, + "loss/hidden": 3.326953125, + "loss/jsd": 0.0, + "loss/logits": 0.166407653875649, + "step": 33350 + }, + { + "epoch": 0.834, + "grad_norm": 29.25, + "grad_norm_var": 4.018684895833333, + "learning_rate": 0.0001, + "loss": 7.3705, + "loss/crossentropy": 2.019158259034157, + "loss/hidden": 3.377734375, + "loss/jsd": 0.0, + "loss/logits": 0.1928328016772866, + "step": 33360 + }, + { + "epoch": 0.83425, + "grad_norm": 31.625, + "grad_norm_var": 4.66015625, + "learning_rate": 0.0001, + "loss": 7.2398, + "loss/crossentropy": 2.002238343656063, + "loss/hidden": 3.36875, + "loss/jsd": 0.0, + "loss/logits": 0.18211230151355268, + "step": 33370 + }, + { + "epoch": 0.8345, + "grad_norm": 29.75, + "grad_norm_var": 5.110872395833334, + "learning_rate": 0.0001, + "loss": 7.2584, + "loss/crossentropy": 2.167521375417709, + "loss/hidden": 3.292578125, + "loss/jsd": 0.0, + "loss/logits": 0.17853855043649675, + "step": 33380 + }, + { + "epoch": 0.83475, + "grad_norm": 29.375, + "grad_norm_var": 3.7447916666666665, + "learning_rate": 0.0001, + "loss": 7.1261, + "loss/crossentropy": 2.034111428260803, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.18338973429054023, + "step": 33390 + }, + { + "epoch": 0.835, + "grad_norm": 30.375, + "grad_norm_var": 2.1705729166666665, + "learning_rate": 0.0001, + "loss": 7.2059, + "loss/crossentropy": 1.937609364837408, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.18285921812057496, + "step": 33400 + }, + { + "epoch": 0.83525, + "grad_norm": 28.625, + "grad_norm_var": 2.6988932291666665, + "learning_rate": 0.0001, + "loss": 7.222, + "loss/crossentropy": 2.170501431822777, + "loss/hidden": 3.253515625, + "loss/jsd": 0.0, + "loss/logits": 0.17850385289639234, + "step": 33410 + }, + { + "epoch": 0.8355, + "grad_norm": 31.0, + "grad_norm_var": 3.5369140625, + "learning_rate": 0.0001, + "loss": 7.2907, + "loss/crossentropy": 1.869483571499586, + "loss/hidden": 3.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.19331245701760053, + "step": 33420 + }, + { + "epoch": 0.83575, + "grad_norm": 33.5, + "grad_norm_var": 3.5559895833333335, + "learning_rate": 0.0001, + "loss": 7.2744, + "loss/crossentropy": 2.044697883725166, + "loss/hidden": 3.322265625, + "loss/jsd": 0.0, + "loss/logits": 0.17986318413168192, + "step": 33430 + }, + { + "epoch": 0.836, + "grad_norm": 31.125, + "grad_norm_var": 5.074739583333334, + "learning_rate": 0.0001, + "loss": 7.1971, + "loss/crossentropy": 2.074133487045765, + "loss/hidden": 3.2578125, + "loss/jsd": 0.0, + "loss/logits": 0.17584548909217118, + "step": 33440 + }, + { + "epoch": 0.83625, + "grad_norm": 32.5, + "grad_norm_var": 4.56875, + "learning_rate": 0.0001, + "loss": 7.2264, + "loss/crossentropy": 2.1718404933810236, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.1941602316685021, + "step": 33450 + }, + { + "epoch": 0.8365, + "grad_norm": 28.5, + "grad_norm_var": 3.4833333333333334, + "learning_rate": 0.0001, + "loss": 7.3675, + "loss/crossentropy": 2.105057214200497, + "loss/hidden": 3.2921875, + "loss/jsd": 0.0, + "loss/logits": 0.1713842570781708, + "step": 33460 + }, + { + "epoch": 0.83675, + "grad_norm": 28.0, + "grad_norm_var": 2.684375, + "learning_rate": 0.0001, + "loss": 7.176, + "loss/crossentropy": 2.1097921282052994, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.20385158769786357, + "step": 33470 + }, + { + "epoch": 0.837, + "grad_norm": 30.5, + "grad_norm_var": 1.5916666666666666, + "learning_rate": 0.0001, + "loss": 7.1873, + "loss/crossentropy": 2.0274341255426407, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.17628777120262384, + "step": 33480 + }, + { + "epoch": 0.83725, + "grad_norm": 33.75, + "grad_norm_var": 3.06015625, + "learning_rate": 0.0001, + "loss": 7.2902, + "loss/crossentropy": 2.1492093369364738, + "loss/hidden": 3.3078125, + "loss/jsd": 0.0, + "loss/logits": 0.18647899404168128, + "step": 33490 + }, + { + "epoch": 0.8375, + "grad_norm": 30.125, + "grad_norm_var": 3.1518229166666667, + "learning_rate": 0.0001, + "loss": 7.2008, + "loss/crossentropy": 2.2044859319925307, + "loss/hidden": 3.3125, + "loss/jsd": 0.0, + "loss/logits": 0.1958764422684908, + "step": 33500 + }, + { + "epoch": 0.83775, + "grad_norm": 31.75, + "grad_norm_var": 1.1791015625, + "learning_rate": 0.0001, + "loss": 7.2588, + "loss/crossentropy": 1.9938185468316079, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.17682940270751715, + "step": 33510 + }, + { + "epoch": 0.838, + "grad_norm": 28.625, + "grad_norm_var": 3.0916015625, + "learning_rate": 0.0001, + "loss": 7.2187, + "loss/crossentropy": 1.8948181390762329, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.17874590419232844, + "step": 33520 + }, + { + "epoch": 0.83825, + "grad_norm": 30.875, + "grad_norm_var": 2.660416666666667, + "learning_rate": 0.0001, + "loss": 7.2563, + "loss/crossentropy": 1.9899358443915844, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.17318380083888768, + "step": 33530 + }, + { + "epoch": 0.8385, + "grad_norm": 32.0, + "grad_norm_var": 3.2639973958333335, + "learning_rate": 0.0001, + "loss": 7.1997, + "loss/crossentropy": 2.09279451072216, + "loss/hidden": 3.344921875, + "loss/jsd": 0.0, + "loss/logits": 0.19197166245430708, + "step": 33540 + }, + { + "epoch": 0.83875, + "grad_norm": 31.875, + "grad_norm_var": 1.8114583333333334, + "learning_rate": 0.0001, + "loss": 7.2935, + "loss/crossentropy": 2.2584090396761893, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.20011573657393456, + "step": 33550 + }, + { + "epoch": 0.839, + "grad_norm": 29.0, + "grad_norm_var": 1.4705729166666666, + "learning_rate": 0.0001, + "loss": 7.2413, + "loss/crossentropy": 2.064199483394623, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.18448180723935365, + "step": 33560 + }, + { + "epoch": 0.83925, + "grad_norm": 32.5, + "grad_norm_var": 1.6811848958333333, + "learning_rate": 0.0001, + "loss": 7.2521, + "loss/crossentropy": 2.1389085978269575, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.17851551342755556, + "step": 33570 + }, + { + "epoch": 0.8395, + "grad_norm": 31.375, + "grad_norm_var": 2.9624348958333333, + "learning_rate": 0.0001, + "loss": 7.2863, + "loss/crossentropy": 2.1424932688474656, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.18908166233450174, + "step": 33580 + }, + { + "epoch": 0.83975, + "grad_norm": 30.75, + "grad_norm_var": 2.928465630524028e+18, + "learning_rate": 0.0001, + "loss": 7.2873, + "loss/crossentropy": 1.968450340628624, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.18586775716394185, + "step": 33590 + }, + { + "epoch": 0.84, + "grad_norm": 29.0, + "grad_norm_var": 25.168489583333333, + "learning_rate": 0.0001, + "loss": 7.2015, + "loss/crossentropy": 2.125929144024849, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.1928325628861785, + "step": 33600 + }, + { + "epoch": 0.84025, + "grad_norm": 33.0, + "grad_norm_var": 30.908268229166666, + "learning_rate": 0.0001, + "loss": 7.3396, + "loss/crossentropy": 2.133786876499653, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.18853625562041998, + "step": 33610 + }, + { + "epoch": 0.8405, + "grad_norm": 31.25, + "grad_norm_var": 8.537434895833334, + "learning_rate": 0.0001, + "loss": 7.2523, + "loss/crossentropy": 2.0643911197781564, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.16706958692520857, + "step": 33620 + }, + { + "epoch": 0.84075, + "grad_norm": 29.125, + "grad_norm_var": 15.953580729166667, + "learning_rate": 0.0001, + "loss": 7.253, + "loss/crossentropy": 2.180924139916897, + "loss/hidden": 3.282421875, + "loss/jsd": 0.0, + "loss/logits": 0.1812012242153287, + "step": 33630 + }, + { + "epoch": 0.841, + "grad_norm": 27.75, + "grad_norm_var": 13.53125, + "learning_rate": 0.0001, + "loss": 7.3464, + "loss/crossentropy": 2.1258861504495146, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.2176899950951338, + "step": 33640 + }, + { + "epoch": 0.84125, + "grad_norm": 33.75, + "grad_norm_var": 9.324739583333333, + "learning_rate": 0.0001, + "loss": 7.2401, + "loss/crossentropy": 2.0648413375020027, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.1812899900600314, + "step": 33650 + }, + { + "epoch": 0.8415, + "grad_norm": 36.0, + "grad_norm_var": 6.6509765625, + "learning_rate": 0.0001, + "loss": 7.2464, + "loss/crossentropy": 2.2324777007102967, + "loss/hidden": 3.27109375, + "loss/jsd": 0.0, + "loss/logits": 0.18248971588909627, + "step": 33660 + }, + { + "epoch": 0.84175, + "grad_norm": 36.25, + "grad_norm_var": 6.999934895833333, + "learning_rate": 0.0001, + "loss": 7.2403, + "loss/crossentropy": 2.233609616756439, + "loss/hidden": 3.2484375, + "loss/jsd": 0.0, + "loss/logits": 0.17655839212238789, + "step": 33670 + }, + { + "epoch": 0.842, + "grad_norm": 30.5, + "grad_norm_var": 4.526822916666666, + "learning_rate": 0.0001, + "loss": 7.291, + "loss/crossentropy": 2.103627168387175, + "loss/hidden": 3.370703125, + "loss/jsd": 0.0, + "loss/logits": 0.20420456351712346, + "step": 33680 + }, + { + "epoch": 0.84225, + "grad_norm": 29.25, + "grad_norm_var": 4.9134765625, + "learning_rate": 0.0001, + "loss": 7.218, + "loss/crossentropy": 2.1048862997442486, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.1857083396986127, + "step": 33690 + }, + { + "epoch": 0.8425, + "grad_norm": 30.5, + "grad_norm_var": 6.632291666666666, + "learning_rate": 0.0001, + "loss": 7.241, + "loss/crossentropy": 2.1183654844760893, + "loss/hidden": 3.26171875, + "loss/jsd": 0.0, + "loss/logits": 0.17055343594402075, + "step": 33700 + }, + { + "epoch": 0.84275, + "grad_norm": 30.375, + "grad_norm_var": 45.45618489583333, + "learning_rate": 0.0001, + "loss": 7.1277, + "loss/crossentropy": 1.9154896408319473, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.17995868530124426, + "step": 33710 + }, + { + "epoch": 0.843, + "grad_norm": 33.0, + "grad_norm_var": 42.3447265625, + "learning_rate": 0.0001, + "loss": 7.2229, + "loss/crossentropy": 1.8783961586654185, + "loss/hidden": 3.281640625, + "loss/jsd": 0.0, + "loss/logits": 0.15951730422675608, + "step": 33720 + }, + { + "epoch": 0.84325, + "grad_norm": 27.875, + "grad_norm_var": 4.575, + "learning_rate": 0.0001, + "loss": 7.1576, + "loss/crossentropy": 2.2371614634990693, + "loss/hidden": 3.29765625, + "loss/jsd": 0.0, + "loss/logits": 0.20098869111388923, + "step": 33730 + }, + { + "epoch": 0.8435, + "grad_norm": 29.0, + "grad_norm_var": 3.2259765625, + "learning_rate": 0.0001, + "loss": 7.2066, + "loss/crossentropy": 2.0927552759647368, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.17918878868222238, + "step": 33740 + }, + { + "epoch": 0.84375, + "grad_norm": 30.75, + "grad_norm_var": 2.2660807291666667, + "learning_rate": 0.0001, + "loss": 7.186, + "loss/crossentropy": 2.0027615509927275, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.19067690251395106, + "step": 33750 + }, + { + "epoch": 0.844, + "grad_norm": 26.125, + "grad_norm_var": 5.451822916666667, + "learning_rate": 0.0001, + "loss": 7.1761, + "loss/crossentropy": 2.010700835287571, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.1867529757320881, + "step": 33760 + }, + { + "epoch": 0.84425, + "grad_norm": 36.25, + "grad_norm_var": 8.837434895833333, + "learning_rate": 0.0001, + "loss": 7.2334, + "loss/crossentropy": 1.998683289438486, + "loss/hidden": 3.231640625, + "loss/jsd": 0.0, + "loss/logits": 0.17415560763329269, + "step": 33770 + }, + { + "epoch": 0.8445, + "grad_norm": 28.5, + "grad_norm_var": 7.392122395833334, + "learning_rate": 0.0001, + "loss": 7.1901, + "loss/crossentropy": 2.180643618106842, + "loss/hidden": 3.4046875, + "loss/jsd": 0.0, + "loss/logits": 0.18043227680027485, + "step": 33780 + }, + { + "epoch": 0.84475, + "grad_norm": 35.0, + "grad_norm_var": 43.71432291666667, + "learning_rate": 0.0001, + "loss": 7.1083, + "loss/crossentropy": 2.000649718940258, + "loss/hidden": 3.34140625, + "loss/jsd": 0.0, + "loss/logits": 0.17601131778210402, + "step": 33790 + }, + { + "epoch": 0.845, + "grad_norm": 29.25, + "grad_norm_var": 59.41041666666667, + "learning_rate": 0.0001, + "loss": 7.158, + "loss/crossentropy": 1.9586811505258084, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.17341443207114934, + "step": 33800 + }, + { + "epoch": 0.84525, + "grad_norm": 28.875, + "grad_norm_var": 50.3228515625, + "learning_rate": 0.0001, + "loss": 7.255, + "loss/crossentropy": 2.2014808893203734, + "loss/hidden": 3.407421875, + "loss/jsd": 0.0, + "loss/logits": 0.1981699451804161, + "step": 33810 + }, + { + "epoch": 0.8455, + "grad_norm": 30.625, + "grad_norm_var": 37.398372395833334, + "learning_rate": 0.0001, + "loss": 7.3061, + "loss/crossentropy": 2.1012670308351518, + "loss/hidden": 3.294140625, + "loss/jsd": 0.0, + "loss/logits": 0.1775698646903038, + "step": 33820 + }, + { + "epoch": 0.84575, + "grad_norm": 28.25, + "grad_norm_var": 22.502083333333335, + "learning_rate": 0.0001, + "loss": 7.1653, + "loss/crossentropy": 2.0572401136159897, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.182903066650033, + "step": 33830 + }, + { + "epoch": 0.846, + "grad_norm": 42.0, + "grad_norm_var": 27.852018229166667, + "learning_rate": 0.0001, + "loss": 7.208, + "loss/crossentropy": 1.9303223744034768, + "loss/hidden": 3.258984375, + "loss/jsd": 0.0, + "loss/logits": 0.16786477230489255, + "step": 33840 + }, + { + "epoch": 0.84625, + "grad_norm": 30.125, + "grad_norm_var": 15.620572916666667, + "learning_rate": 0.0001, + "loss": 7.1522, + "loss/crossentropy": 2.2037465661764144, + "loss/hidden": 3.27421875, + "loss/jsd": 0.0, + "loss/logits": 0.18479030914604663, + "step": 33850 + }, + { + "epoch": 0.8465, + "grad_norm": 30.0, + "grad_norm_var": 14.82890625, + "learning_rate": 0.0001, + "loss": 7.2389, + "loss/crossentropy": 2.118402448296547, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.19767413921654226, + "step": 33860 + }, + { + "epoch": 0.84675, + "grad_norm": 28.875, + "grad_norm_var": 15.980989583333333, + "learning_rate": 0.0001, + "loss": 7.2069, + "loss/crossentropy": 2.0344385020434856, + "loss/hidden": 3.406640625, + "loss/jsd": 0.0, + "loss/logits": 0.18299854248762132, + "step": 33870 + }, + { + "epoch": 0.847, + "grad_norm": 30.0, + "grad_norm_var": 13.670572916666666, + "learning_rate": 0.0001, + "loss": 7.1785, + "loss/crossentropy": 2.100412330776453, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.18360245916992426, + "step": 33880 + }, + { + "epoch": 0.84725, + "grad_norm": 28.875, + "grad_norm_var": 8.151497395833333, + "learning_rate": 0.0001, + "loss": 7.1457, + "loss/crossentropy": 2.0797202557325365, + "loss/hidden": 3.353515625, + "loss/jsd": 0.0, + "loss/logits": 0.1801828995347023, + "step": 33890 + }, + { + "epoch": 0.8475, + "grad_norm": 33.25, + "grad_norm_var": 7.064583333333333, + "learning_rate": 0.0001, + "loss": 7.2686, + "loss/crossentropy": 2.143502997606993, + "loss/hidden": 3.317578125, + "loss/jsd": 0.0, + "loss/logits": 0.191785538662225, + "step": 33900 + }, + { + "epoch": 0.84775, + "grad_norm": 32.5, + "grad_norm_var": 10.1009765625, + "learning_rate": 0.0001, + "loss": 7.1032, + "loss/crossentropy": 1.9439681842923164, + "loss/hidden": 3.23984375, + "loss/jsd": 0.0, + "loss/logits": 0.15850237123668193, + "step": 33910 + }, + { + "epoch": 0.848, + "grad_norm": 29.75, + "grad_norm_var": 8.961458333333333, + "learning_rate": 0.0001, + "loss": 7.1842, + "loss/crossentropy": 2.141938117146492, + "loss/hidden": 3.24921875, + "loss/jsd": 0.0, + "loss/logits": 0.18331899046897887, + "step": 33920 + }, + { + "epoch": 0.84825, + "grad_norm": 29.25, + "grad_norm_var": 14.096875, + "learning_rate": 0.0001, + "loss": 7.2517, + "loss/crossentropy": 2.0895468056201936, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.18215047754347324, + "step": 33930 + }, + { + "epoch": 0.8485, + "grad_norm": 30.0, + "grad_norm_var": 7.476822916666666, + "learning_rate": 0.0001, + "loss": 7.2402, + "loss/crossentropy": 2.0649702072143556, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.193724649772048, + "step": 33940 + }, + { + "epoch": 0.84875, + "grad_norm": 31.0, + "grad_norm_var": 4.956184895833333, + "learning_rate": 0.0001, + "loss": 7.1436, + "loss/crossentropy": 2.058744602650404, + "loss/hidden": 3.469921875, + "loss/jsd": 0.0, + "loss/logits": 0.1868588777258992, + "step": 33950 + }, + { + "epoch": 0.849, + "grad_norm": 33.25, + "grad_norm_var": 3.496809895833333, + "learning_rate": 0.0001, + "loss": 7.4438, + "loss/crossentropy": 2.0169904246926307, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.18656898457556964, + "step": 33960 + }, + { + "epoch": 0.84925, + "grad_norm": 43.75, + "grad_norm_var": 13.911458333333334, + "learning_rate": 0.0001, + "loss": 7.2415, + "loss/crossentropy": 1.9969091698527337, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.19089818820357324, + "step": 33970 + }, + { + "epoch": 0.8495, + "grad_norm": 31.375, + "grad_norm_var": 16.1212890625, + "learning_rate": 0.0001, + "loss": 7.2844, + "loss/crossentropy": 2.1538162916898727, + "loss/hidden": 3.29453125, + "loss/jsd": 0.0, + "loss/logits": 0.18127647042274475, + "step": 33980 + }, + { + "epoch": 0.84975, + "grad_norm": 28.125, + "grad_norm_var": 2.0686848958333335, + "learning_rate": 0.0001, + "loss": 7.2479, + "loss/crossentropy": 1.9829765126109122, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.18695083800703288, + "step": 33990 + }, + { + "epoch": 0.85, + "grad_norm": 29.875, + "grad_norm_var": 1.88515625, + "learning_rate": 0.0001, + "loss": 7.2118, + "loss/crossentropy": 2.14188566505909, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.19902665503323078, + "step": 34000 + }, + { + "epoch": 0.85025, + "grad_norm": 82.5, + "grad_norm_var": 341.77076822916666, + "learning_rate": 0.0001, + "loss": 7.3306, + "loss/crossentropy": 2.0443466253578664, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.18300221683457493, + "step": 34010 + }, + { + "epoch": 0.8505, + "grad_norm": 30.25, + "grad_norm_var": 394.2884765625, + "learning_rate": 0.0001, + "loss": 7.1513, + "loss/crossentropy": 2.0158767446875574, + "loss/hidden": 3.244140625, + "loss/jsd": 0.0, + "loss/logits": 0.16400964334607124, + "step": 34020 + }, + { + "epoch": 0.85075, + "grad_norm": 33.0, + "grad_norm_var": 6.6931640625, + "learning_rate": 0.0001, + "loss": 7.2646, + "loss/crossentropy": 2.0715938180685045, + "loss/hidden": 3.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.17798414621502162, + "step": 34030 + }, + { + "epoch": 0.851, + "grad_norm": 29.75, + "grad_norm_var": 1.2879557291666666, + "learning_rate": 0.0001, + "loss": 7.3028, + "loss/crossentropy": 2.0998909667134287, + "loss/hidden": 3.227734375, + "loss/jsd": 0.0, + "loss/logits": 0.17252598106861114, + "step": 34040 + }, + { + "epoch": 0.85125, + "grad_norm": 31.375, + "grad_norm_var": 0.8093098958333333, + "learning_rate": 0.0001, + "loss": 7.3888, + "loss/crossentropy": 2.110021045804024, + "loss/hidden": 3.35, + "loss/jsd": 0.0, + "loss/logits": 0.18911605402827264, + "step": 34050 + }, + { + "epoch": 0.8515, + "grad_norm": 29.75, + "grad_norm_var": 5.635416666666667, + "learning_rate": 0.0001, + "loss": 7.187, + "loss/crossentropy": 1.9248976431787015, + "loss/hidden": 3.353125, + "loss/jsd": 0.0, + "loss/logits": 0.17050397992134095, + "step": 34060 + }, + { + "epoch": 0.85175, + "grad_norm": 29.375, + "grad_norm_var": 0.9635416666666666, + "learning_rate": 0.0001, + "loss": 7.3192, + "loss/crossentropy": 2.181374564766884, + "loss/hidden": 3.415625, + "loss/jsd": 0.0, + "loss/logits": 0.19645040035247802, + "step": 34070 + }, + { + "epoch": 0.852, + "grad_norm": 30.0, + "grad_norm_var": 2.314322916666667, + "learning_rate": 0.0001, + "loss": 7.2283, + "loss/crossentropy": 2.0438218235969545, + "loss/hidden": 3.391796875, + "loss/jsd": 0.0, + "loss/logits": 0.19108605310320853, + "step": 34080 + }, + { + "epoch": 0.85225, + "grad_norm": 31.25, + "grad_norm_var": 2.176497395833333, + "learning_rate": 0.0001, + "loss": 7.2638, + "loss/crossentropy": 2.1723656713962556, + "loss/hidden": 3.336328125, + "loss/jsd": 0.0, + "loss/logits": 0.18390613235533237, + "step": 34090 + }, + { + "epoch": 0.8525, + "grad_norm": 28.5, + "grad_norm_var": 1.8622395833333334, + "learning_rate": 0.0001, + "loss": 7.1693, + "loss/crossentropy": 2.0392055988311766, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.20077015552669764, + "step": 34100 + }, + { + "epoch": 0.85275, + "grad_norm": 32.25, + "grad_norm_var": 2.6499348958333333, + "learning_rate": 0.0001, + "loss": 7.268, + "loss/crossentropy": 2.0958330132067204, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.1809829636476934, + "step": 34110 + }, + { + "epoch": 0.853, + "grad_norm": 30.75, + "grad_norm_var": 2.8759765625, + "learning_rate": 0.0001, + "loss": 7.1929, + "loss/crossentropy": 2.13548312112689, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.18752679266035557, + "step": 34120 + }, + { + "epoch": 0.85325, + "grad_norm": 42.25, + "grad_norm_var": 10.880989583333333, + "learning_rate": 0.0001, + "loss": 7.2939, + "loss/crossentropy": 2.2205469399690627, + "loss/hidden": 3.340234375, + "loss/jsd": 0.0, + "loss/logits": 0.20043759122490884, + "step": 34130 + }, + { + "epoch": 0.8535, + "grad_norm": 32.0, + "grad_norm_var": 10.0806640625, + "learning_rate": 0.0001, + "loss": 7.3005, + "loss/crossentropy": 2.0488852843642236, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.17713683266192676, + "step": 34140 + }, + { + "epoch": 0.85375, + "grad_norm": 27.25, + "grad_norm_var": 2.3520833333333333, + "learning_rate": 0.0001, + "loss": 7.1977, + "loss/crossentropy": 2.097453436255455, + "loss/hidden": 3.27890625, + "loss/jsd": 0.0, + "loss/logits": 0.18755655977874994, + "step": 34150 + }, + { + "epoch": 0.854, + "grad_norm": 43.5, + "grad_norm_var": 12.536393229166666, + "learning_rate": 0.0001, + "loss": 7.2194, + "loss/crossentropy": 2.0824636235833167, + "loss/hidden": 3.2734375, + "loss/jsd": 0.0, + "loss/logits": 0.18008290715515612, + "step": 34160 + }, + { + "epoch": 0.85425, + "grad_norm": 28.625, + "grad_norm_var": 14.074934895833334, + "learning_rate": 0.0001, + "loss": 7.2031, + "loss/crossentropy": 2.1574205800890924, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.19417616873979568, + "step": 34170 + }, + { + "epoch": 0.8545, + "grad_norm": 31.625, + "grad_norm_var": 2.220833333333333, + "learning_rate": 0.0001, + "loss": 7.2651, + "loss/crossentropy": 2.0641315899789334, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.18580467496067285, + "step": 34180 + }, + { + "epoch": 0.85475, + "grad_norm": 29.375, + "grad_norm_var": 2.120247395833333, + "learning_rate": 0.0001, + "loss": 7.2528, + "loss/crossentropy": 2.1908591121435164, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.19210774190723895, + "step": 34190 + }, + { + "epoch": 0.855, + "grad_norm": 31.0, + "grad_norm_var": 8.9, + "learning_rate": 0.0001, + "loss": 7.234, + "loss/crossentropy": 1.9531598590314387, + "loss/hidden": 3.328125, + "loss/jsd": 0.0, + "loss/logits": 0.16695764372125269, + "step": 34200 + }, + { + "epoch": 0.85525, + "grad_norm": 31.125, + "grad_norm_var": 1.5622564690419953e+18, + "learning_rate": 0.0001, + "loss": 7.2399, + "loss/crossentropy": 2.1334476828575135, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.18677760362625123, + "step": 34210 + }, + { + "epoch": 0.8555, + "grad_norm": 28.25, + "grad_norm_var": 1.5622564690367872e+18, + "learning_rate": 0.0001, + "loss": 7.2243, + "loss/crossentropy": 2.1223787307739257, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.177429954521358, + "step": 34220 + }, + { + "epoch": 0.85575, + "grad_norm": 32.25, + "grad_norm_var": 6.009375, + "learning_rate": 0.0001, + "loss": 7.1667, + "loss/crossentropy": 2.053752092272043, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.17705130875110625, + "step": 34230 + }, + { + "epoch": 0.856, + "grad_norm": 31.25, + "grad_norm_var": 9.1212890625, + "learning_rate": 0.0001, + "loss": 7.2172, + "loss/crossentropy": 2.0997056752443313, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.1921197187155485, + "step": 34240 + }, + { + "epoch": 0.85625, + "grad_norm": 29.0, + "grad_norm_var": 14.873893229166667, + "learning_rate": 0.0001, + "loss": 7.356, + "loss/crossentropy": 2.250038433074951, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.2058642553165555, + "step": 34250 + }, + { + "epoch": 0.8565, + "grad_norm": 34.5, + "grad_norm_var": 6.51015625, + "learning_rate": 0.0001, + "loss": 7.2588, + "loss/crossentropy": 2.076680043339729, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.19434863217175008, + "step": 34260 + }, + { + "epoch": 0.85675, + "grad_norm": 31.5, + "grad_norm_var": 5.5416015625, + "learning_rate": 0.0001, + "loss": 7.2713, + "loss/crossentropy": 2.0715027406811712, + "loss/hidden": 3.28671875, + "loss/jsd": 0.0, + "loss/logits": 0.18325763065367937, + "step": 34270 + }, + { + "epoch": 0.857, + "grad_norm": 33.5, + "grad_norm_var": 6.362955729166667, + "learning_rate": 0.0001, + "loss": 7.2176, + "loss/crossentropy": 2.1784741804003716, + "loss/hidden": 3.3765625, + "loss/jsd": 0.0, + "loss/logits": 0.19372119139879942, + "step": 34280 + }, + { + "epoch": 0.85725, + "grad_norm": 28.5, + "grad_norm_var": 2.1797421956559012e+18, + "learning_rate": 0.0001, + "loss": 7.2276, + "loss/crossentropy": 1.9267006784677505, + "loss/hidden": 3.621484375, + "loss/jsd": 0.0, + "loss/logits": 0.17005468346178532, + "step": 34290 + }, + { + "epoch": 0.8575, + "grad_norm": 36.25, + "grad_norm_var": 19.323372395833335, + "learning_rate": 0.0001, + "loss": 7.2047, + "loss/crossentropy": 2.0357692562043668, + "loss/hidden": 3.265234375, + "loss/jsd": 0.0, + "loss/logits": 0.16413889769464732, + "step": 34300 + }, + { + "epoch": 0.85775, + "grad_norm": 29.75, + "grad_norm_var": 18.8625, + "learning_rate": 0.0001, + "loss": 7.3689, + "loss/crossentropy": 2.137700055539608, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.21653785947710275, + "step": 34310 + }, + { + "epoch": 0.858, + "grad_norm": 30.0, + "grad_norm_var": 6.951822916666667, + "learning_rate": 0.0001, + "loss": 7.3168, + "loss/crossentropy": 2.1582180485129356, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.18372731544077398, + "step": 34320 + }, + { + "epoch": 0.85825, + "grad_norm": 32.5, + "grad_norm_var": 5.91015625, + "learning_rate": 0.0001, + "loss": 7.1411, + "loss/crossentropy": 1.911540611833334, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.17052888721227646, + "step": 34330 + }, + { + "epoch": 0.8585, + "grad_norm": 33.5, + "grad_norm_var": 6.578580729166666, + "learning_rate": 0.0001, + "loss": 7.3572, + "loss/crossentropy": 2.364978903532028, + "loss/hidden": 3.28671875, + "loss/jsd": 0.0, + "loss/logits": 0.18875811491161584, + "step": 34340 + }, + { + "epoch": 0.85875, + "grad_norm": 28.375, + "grad_norm_var": 3.8583333333333334, + "learning_rate": 0.0001, + "loss": 7.2133, + "loss/crossentropy": 2.141178289055824, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.19198058787733316, + "step": 34350 + }, + { + "epoch": 0.859, + "grad_norm": 34.5, + "grad_norm_var": 5.176822916666667, + "learning_rate": 0.0001, + "loss": 7.3441, + "loss/crossentropy": 2.094452814757824, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.19298046212643385, + "step": 34360 + }, + { + "epoch": 0.85925, + "grad_norm": 29.5, + "grad_norm_var": 6.773372395833333, + "learning_rate": 0.0001, + "loss": 7.2284, + "loss/crossentropy": 2.1614732801914216, + "loss/hidden": 3.36484375, + "loss/jsd": 0.0, + "loss/logits": 0.18951235935091973, + "step": 34370 + }, + { + "epoch": 0.8595, + "grad_norm": 28.125, + "grad_norm_var": 8.332291666666666, + "learning_rate": 0.0001, + "loss": 7.2134, + "loss/crossentropy": 1.923812200129032, + "loss/hidden": 3.33046875, + "loss/jsd": 0.0, + "loss/logits": 0.17469180338084697, + "step": 34380 + }, + { + "epoch": 0.85975, + "grad_norm": 29.125, + "grad_norm_var": 6.509309895833334, + "learning_rate": 0.0001, + "loss": 7.2371, + "loss/crossentropy": 2.095948604494333, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.18810884784907103, + "step": 34390 + }, + { + "epoch": 0.86, + "grad_norm": 32.75, + "grad_norm_var": 2.5561848958333333, + "learning_rate": 0.0001, + "loss": 7.3054, + "loss/crossentropy": 2.100370090454817, + "loss/hidden": 3.296484375, + "loss/jsd": 0.0, + "loss/logits": 0.1727097921539098, + "step": 34400 + }, + { + "epoch": 0.86025, + "grad_norm": 32.25, + "grad_norm_var": 6.81640625, + "learning_rate": 0.0001, + "loss": 7.2286, + "loss/crossentropy": 2.0329787097871304, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.19481676891446115, + "step": 34410 + }, + { + "epoch": 0.8605, + "grad_norm": 27.625, + "grad_norm_var": 3.1384765625, + "learning_rate": 0.0001, + "loss": 7.1993, + "loss/crossentropy": 2.0509812578558924, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.1901794293895364, + "step": 34420 + }, + { + "epoch": 0.86075, + "grad_norm": 33.25, + "grad_norm_var": 4.1134765625, + "learning_rate": 0.0001, + "loss": 7.2347, + "loss/crossentropy": 2.080067491531372, + "loss/hidden": 3.213671875, + "loss/jsd": 0.0, + "loss/logits": 0.16772212591022254, + "step": 34430 + }, + { + "epoch": 0.861, + "grad_norm": 32.0, + "grad_norm_var": 2.3749348958333334, + "learning_rate": 0.0001, + "loss": 7.3447, + "loss/crossentropy": 2.1038970395922663, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.19416949711740017, + "step": 34440 + }, + { + "epoch": 0.86125, + "grad_norm": 30.25, + "grad_norm_var": 1.803125, + "learning_rate": 0.0001, + "loss": 7.1554, + "loss/crossentropy": 2.1600521214306356, + "loss/hidden": 3.258984375, + "loss/jsd": 0.0, + "loss/logits": 0.1786617579869926, + "step": 34450 + }, + { + "epoch": 0.8615, + "grad_norm": 31.75, + "grad_norm_var": 1.93125, + "learning_rate": 0.0001, + "loss": 7.2464, + "loss/crossentropy": 2.0537144929170608, + "loss/hidden": 3.386328125, + "loss/jsd": 0.0, + "loss/logits": 0.18485849518328906, + "step": 34460 + }, + { + "epoch": 0.86175, + "grad_norm": 28.75, + "grad_norm_var": 3.091080729166667, + "learning_rate": 0.0001, + "loss": 7.4084, + "loss/crossentropy": 2.187089277803898, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.19332499224692584, + "step": 34470 + }, + { + "epoch": 0.862, + "grad_norm": 30.0, + "grad_norm_var": 2.1259765625, + "learning_rate": 0.0001, + "loss": 7.3543, + "loss/crossentropy": 2.1333917580544948, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.2026039410382509, + "step": 34480 + }, + { + "epoch": 0.86225, + "grad_norm": 33.25, + "grad_norm_var": 2.695572916666667, + "learning_rate": 0.0001, + "loss": 7.3226, + "loss/crossentropy": 2.2536570131778717, + "loss/hidden": 3.3484375, + "loss/jsd": 0.0, + "loss/logits": 0.20088705383241176, + "step": 34490 + }, + { + "epoch": 0.8625, + "grad_norm": 30.5, + "grad_norm_var": 4.008072916666666, + "learning_rate": 0.0001, + "loss": 7.19, + "loss/crossentropy": 2.0807801954448224, + "loss/hidden": 3.343359375, + "loss/jsd": 0.0, + "loss/logits": 0.17837361767888069, + "step": 34500 + }, + { + "epoch": 0.86275, + "grad_norm": 31.5, + "grad_norm_var": 28.599739583333335, + "learning_rate": 0.0001, + "loss": 7.2724, + "loss/crossentropy": 2.045263282954693, + "loss/hidden": 3.206640625, + "loss/jsd": 0.0, + "loss/logits": 0.1748804647475481, + "step": 34510 + }, + { + "epoch": 0.863, + "grad_norm": 30.25, + "grad_norm_var": 31.26015625, + "learning_rate": 0.0001, + "loss": 7.2109, + "loss/crossentropy": 1.9880013443529605, + "loss/hidden": 3.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.18841168489307164, + "step": 34520 + }, + { + "epoch": 0.86325, + "grad_norm": 30.0, + "grad_norm_var": 3.3353515625, + "learning_rate": 0.0001, + "loss": 7.2164, + "loss/crossentropy": 1.9977822721004486, + "loss/hidden": 3.296875, + "loss/jsd": 0.0, + "loss/logits": 0.1732394440099597, + "step": 34530 + }, + { + "epoch": 0.8635, + "grad_norm": 31.375, + "grad_norm_var": 2.8489583333333335, + "learning_rate": 0.0001, + "loss": 7.3079, + "loss/crossentropy": 2.0394112788140775, + "loss/hidden": 3.306640625, + "loss/jsd": 0.0, + "loss/logits": 0.18080272432416677, + "step": 34540 + }, + { + "epoch": 0.86375, + "grad_norm": 29.375, + "grad_norm_var": 1.9479166666666667, + "learning_rate": 0.0001, + "loss": 7.3328, + "loss/crossentropy": 2.027179962396622, + "loss/hidden": 3.326171875, + "loss/jsd": 0.0, + "loss/logits": 0.17251962665468454, + "step": 34550 + }, + { + "epoch": 0.864, + "grad_norm": 30.625, + "grad_norm_var": 2.6488932291666667, + "learning_rate": 0.0001, + "loss": 7.1298, + "loss/crossentropy": 2.034211057424545, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.19268830977380275, + "step": 34560 + }, + { + "epoch": 0.86425, + "grad_norm": 29.875, + "grad_norm_var": 2.4447916666666667, + "learning_rate": 0.0001, + "loss": 7.3167, + "loss/crossentropy": 2.0971597135066986, + "loss/hidden": 3.357421875, + "loss/jsd": 0.0, + "loss/logits": 0.18605808783322572, + "step": 34570 + }, + { + "epoch": 0.8645, + "grad_norm": 48.75, + "grad_norm_var": 35.84348958333333, + "learning_rate": 0.0001, + "loss": 7.142, + "loss/crossentropy": 2.0621576346457005, + "loss/hidden": 3.260546875, + "loss/jsd": 0.0, + "loss/logits": 0.1724056525155902, + "step": 34580 + }, + { + "epoch": 0.86475, + "grad_norm": 28.5, + "grad_norm_var": 36.61764322916667, + "learning_rate": 0.0001, + "loss": 7.2735, + "loss/crossentropy": 2.1108843713998793, + "loss/hidden": 3.34140625, + "loss/jsd": 0.0, + "loss/logits": 0.18370629232376814, + "step": 34590 + }, + { + "epoch": 0.865, + "grad_norm": 29.875, + "grad_norm_var": 4.808072916666666, + "learning_rate": 0.0001, + "loss": 7.3068, + "loss/crossentropy": 2.1198356598615646, + "loss/hidden": 3.2515625, + "loss/jsd": 0.0, + "loss/logits": 0.17393486481159925, + "step": 34600 + }, + { + "epoch": 0.86525, + "grad_norm": 29.25, + "grad_norm_var": 7.821875, + "learning_rate": 0.0001, + "loss": 7.1895, + "loss/crossentropy": 2.1428691267967226, + "loss/hidden": 3.312890625, + "loss/jsd": 0.0, + "loss/logits": 0.17750228513032199, + "step": 34610 + }, + { + "epoch": 0.8655, + "grad_norm": 28.875, + "grad_norm_var": 3.534309895833333, + "learning_rate": 0.0001, + "loss": 7.1452, + "loss/crossentropy": 2.1150192469358444, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.18156156884506344, + "step": 34620 + }, + { + "epoch": 0.86575, + "grad_norm": 47.25, + "grad_norm_var": 20.116666666666667, + "learning_rate": 0.0001, + "loss": 7.1916, + "loss/crossentropy": 2.048045912384987, + "loss/hidden": 3.2953125, + "loss/jsd": 0.0, + "loss/logits": 0.17622407525777817, + "step": 34630 + }, + { + "epoch": 0.866, + "grad_norm": 44.5, + "grad_norm_var": 32.889322916666664, + "learning_rate": 0.0001, + "loss": 7.2463, + "loss/crossentropy": 2.1702506244182587, + "loss/hidden": 3.320703125, + "loss/jsd": 0.0, + "loss/logits": 0.18493551313877105, + "step": 34640 + }, + { + "epoch": 0.86625, + "grad_norm": 29.25, + "grad_norm_var": 17.79765625, + "learning_rate": 0.0001, + "loss": 7.1766, + "loss/crossentropy": 2.0337833762168884, + "loss/hidden": 3.2890625, + "loss/jsd": 0.0, + "loss/logits": 0.16715510403737427, + "step": 34650 + }, + { + "epoch": 0.8665, + "grad_norm": 32.5, + "grad_norm_var": 21.022916666666667, + "learning_rate": 0.0001, + "loss": 7.3194, + "loss/crossentropy": 2.120563616603613, + "loss/hidden": 3.244140625, + "loss/jsd": 0.0, + "loss/logits": 0.18254042975604534, + "step": 34660 + }, + { + "epoch": 0.86675, + "grad_norm": 30.5, + "grad_norm_var": 21.049934895833335, + "learning_rate": 0.0001, + "loss": 7.2838, + "loss/crossentropy": 2.179687091708183, + "loss/hidden": 3.31171875, + "loss/jsd": 0.0, + "loss/logits": 0.18392705954611302, + "step": 34670 + }, + { + "epoch": 0.867, + "grad_norm": 28.75, + "grad_norm_var": 22.86640625, + "learning_rate": 0.0001, + "loss": 7.1964, + "loss/crossentropy": 2.0371907725930214, + "loss/hidden": 3.272265625, + "loss/jsd": 0.0, + "loss/logits": 0.16766862897202373, + "step": 34680 + }, + { + "epoch": 0.86725, + "grad_norm": 37.5, + "grad_norm_var": 9.8853515625, + "learning_rate": 0.0001, + "loss": 7.2503, + "loss/crossentropy": 2.0847177296876906, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.19687753692269325, + "step": 34690 + }, + { + "epoch": 0.8675, + "grad_norm": 30.625, + "grad_norm_var": 8.3125, + "learning_rate": 0.0001, + "loss": 7.2033, + "loss/crossentropy": 2.1133716337382795, + "loss/hidden": 3.2671875, + "loss/jsd": 0.0, + "loss/logits": 0.17771089039742946, + "step": 34700 + }, + { + "epoch": 0.86775, + "grad_norm": 32.0, + "grad_norm_var": 15.16015625, + "learning_rate": 0.0001, + "loss": 7.1315, + "loss/crossentropy": 1.9481457851827144, + "loss/hidden": 3.29296875, + "loss/jsd": 0.0, + "loss/logits": 0.16198133826255798, + "step": 34710 + }, + { + "epoch": 0.868, + "grad_norm": 28.625, + "grad_norm_var": 9.898893229166667, + "learning_rate": 0.0001, + "loss": 7.217, + "loss/crossentropy": 2.067484679818153, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.1796406304463744, + "step": 34720 + }, + { + "epoch": 0.86825, + "grad_norm": 32.0, + "grad_norm_var": 7.970247395833334, + "learning_rate": 0.0001, + "loss": 7.2962, + "loss/crossentropy": 2.044479449093342, + "loss/hidden": 3.288671875, + "loss/jsd": 0.0, + "loss/logits": 0.1899338317103684, + "step": 34730 + }, + { + "epoch": 0.8685, + "grad_norm": 29.625, + "grad_norm_var": 10.008072916666666, + "learning_rate": 0.0001, + "loss": 7.2071, + "loss/crossentropy": 1.9862901911139488, + "loss/hidden": 3.302734375, + "loss/jsd": 0.0, + "loss/logits": 0.16481328159570693, + "step": 34740 + }, + { + "epoch": 0.86875, + "grad_norm": 29.625, + "grad_norm_var": 6.892122395833334, + "learning_rate": 0.0001, + "loss": 7.1911, + "loss/crossentropy": 2.061764293164015, + "loss/hidden": 3.1796875, + "loss/jsd": 0.0, + "loss/logits": 0.17607883377932013, + "step": 34750 + }, + { + "epoch": 0.869, + "grad_norm": 31.75, + "grad_norm_var": 6.23515625, + "learning_rate": 0.0001, + "loss": 7.1601, + "loss/crossentropy": 1.9567598104476929, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.1676634754985571, + "step": 34760 + }, + { + "epoch": 0.86925, + "grad_norm": 32.0, + "grad_norm_var": 5.8712890625, + "learning_rate": 0.0001, + "loss": 7.1628, + "loss/crossentropy": 2.0253935903310776, + "loss/hidden": 3.38203125, + "loss/jsd": 0.0, + "loss/logits": 0.2007642241194844, + "step": 34770 + }, + { + "epoch": 0.8695, + "grad_norm": 32.0, + "grad_norm_var": 5.207291666666666, + "learning_rate": 0.0001, + "loss": 7.2495, + "loss/crossentropy": 2.0621690064668656, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.18934654965996742, + "step": 34780 + }, + { + "epoch": 0.86975, + "grad_norm": 29.75, + "grad_norm_var": 30.786458333333332, + "learning_rate": 0.0001, + "loss": 7.3168, + "loss/crossentropy": 2.1084874011576176, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.17805544696748257, + "step": 34790 + }, + { + "epoch": 0.87, + "grad_norm": 28.25, + "grad_norm_var": 38.59765625, + "learning_rate": 0.0001, + "loss": 7.2599, + "loss/crossentropy": 2.0882460571825505, + "loss/hidden": 3.2828125, + "loss/jsd": 0.0, + "loss/logits": 0.175720988959074, + "step": 34800 + }, + { + "epoch": 0.87025, + "grad_norm": 31.375, + "grad_norm_var": 19.3400390625, + "learning_rate": 0.0001, + "loss": 7.1688, + "loss/crossentropy": 2.16546451151371, + "loss/hidden": 3.298828125, + "loss/jsd": 0.0, + "loss/logits": 0.17413612268865108, + "step": 34810 + }, + { + "epoch": 0.8705, + "grad_norm": 31.625, + "grad_norm_var": 12.055208333333333, + "learning_rate": 0.0001, + "loss": 7.271, + "loss/crossentropy": 2.0857179775834083, + "loss/hidden": 3.468359375, + "loss/jsd": 0.0, + "loss/logits": 0.1955510875210166, + "step": 34820 + }, + { + "epoch": 0.87075, + "grad_norm": 30.625, + "grad_norm_var": 3.5348307291666665, + "learning_rate": 0.0001, + "loss": 7.2399, + "loss/crossentropy": 2.0899414606392384, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.1947469917126, + "step": 34830 + }, + { + "epoch": 0.871, + "grad_norm": 31.125, + "grad_norm_var": 4.118489583333333, + "learning_rate": 0.0001, + "loss": 7.3026, + "loss/crossentropy": 1.9977812230587007, + "loss/hidden": 3.422265625, + "loss/jsd": 0.0, + "loss/logits": 0.1887421939522028, + "step": 34840 + }, + { + "epoch": 0.87125, + "grad_norm": 32.75, + "grad_norm_var": 106.38515625, + "learning_rate": 0.0001, + "loss": 7.1963, + "loss/crossentropy": 2.035346057265997, + "loss/hidden": 3.316796875, + "loss/jsd": 0.0, + "loss/logits": 0.18354044388979673, + "step": 34850 + }, + { + "epoch": 0.8715, + "grad_norm": 29.25, + "grad_norm_var": 40.138997395833336, + "learning_rate": 0.0001, + "loss": 7.2506, + "loss/crossentropy": 2.055702011287212, + "loss/hidden": 3.34296875, + "loss/jsd": 0.0, + "loss/logits": 0.18968002796173095, + "step": 34860 + }, + { + "epoch": 0.87175, + "grad_norm": 31.0, + "grad_norm_var": 3.1434895833333334, + "learning_rate": 0.0001, + "loss": 7.1322, + "loss/crossentropy": 2.205520695447922, + "loss/hidden": 3.261328125, + "loss/jsd": 0.0, + "loss/logits": 0.17120692394673825, + "step": 34870 + }, + { + "epoch": 0.872, + "grad_norm": 32.0, + "grad_norm_var": 13.1791015625, + "learning_rate": 0.0001, + "loss": 7.2685, + "loss/crossentropy": 2.1152933806180956, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.20207914523780346, + "step": 34880 + }, + { + "epoch": 0.87225, + "grad_norm": 29.75, + "grad_norm_var": 9.956705729166666, + "learning_rate": 0.0001, + "loss": 7.3239, + "loss/crossentropy": 2.1282338641583918, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.18587686102837325, + "step": 34890 + }, + { + "epoch": 0.8725, + "grad_norm": 52.25, + "grad_norm_var": 31.803580729166665, + "learning_rate": 0.0001, + "loss": 7.2091, + "loss/crossentropy": 2.1109050989151, + "loss/hidden": 3.400390625, + "loss/jsd": 0.0, + "loss/logits": 0.1893902899697423, + "step": 34900 + }, + { + "epoch": 0.87275, + "grad_norm": 52.75, + "grad_norm_var": 61.08958333333333, + "learning_rate": 0.0001, + "loss": 7.147, + "loss/crossentropy": 2.071830262243748, + "loss/hidden": 3.13828125, + "loss/jsd": 0.0, + "loss/logits": 0.16298614926636218, + "step": 34910 + }, + { + "epoch": 0.873, + "grad_norm": 53.25, + "grad_norm_var": 67.65598958333334, + "learning_rate": 0.0001, + "loss": 7.2023, + "loss/crossentropy": 2.1021742850542067, + "loss/hidden": 3.459375, + "loss/jsd": 0.0, + "loss/logits": 0.20394247360527515, + "step": 34920 + }, + { + "epoch": 0.87325, + "grad_norm": 31.375, + "grad_norm_var": 36.49108072916667, + "learning_rate": 0.0001, + "loss": 7.3976, + "loss/crossentropy": 2.1242296025156975, + "loss/hidden": 3.248046875, + "loss/jsd": 0.0, + "loss/logits": 0.17494969312101602, + "step": 34930 + }, + { + "epoch": 0.8735, + "grad_norm": 30.875, + "grad_norm_var": 1.8572265625, + "learning_rate": 0.0001, + "loss": 7.2589, + "loss/crossentropy": 1.9055627778172493, + "loss/hidden": 3.298828125, + "loss/jsd": 0.0, + "loss/logits": 0.17225345242768525, + "step": 34940 + }, + { + "epoch": 0.87375, + "grad_norm": 30.125, + "grad_norm_var": 1.5186848958333334, + "learning_rate": 0.0001, + "loss": 7.1511, + "loss/crossentropy": 1.9887107498943806, + "loss/hidden": 3.29140625, + "loss/jsd": 0.0, + "loss/logits": 0.17664323821663858, + "step": 34950 + }, + { + "epoch": 0.874, + "grad_norm": 29.75, + "grad_norm_var": 1.4518229166666667, + "learning_rate": 0.0001, + "loss": 7.1873, + "loss/crossentropy": 2.1399480305612086, + "loss/hidden": 3.304296875, + "loss/jsd": 0.0, + "loss/logits": 0.17877526078373193, + "step": 34960 + }, + { + "epoch": 0.87425, + "grad_norm": 27.875, + "grad_norm_var": 4.1478515625, + "learning_rate": 0.0001, + "loss": 7.1986, + "loss/crossentropy": 2.2206870928406715, + "loss/hidden": 3.332421875, + "loss/jsd": 0.0, + "loss/logits": 0.18415752444416283, + "step": 34970 + }, + { + "epoch": 0.8745, + "grad_norm": 34.25, + "grad_norm_var": 6.489518229166666, + "learning_rate": 0.0001, + "loss": 7.2735, + "loss/crossentropy": 2.185173198580742, + "loss/hidden": 3.2921875, + "loss/jsd": 0.0, + "loss/logits": 0.18359190542250872, + "step": 34980 + }, + { + "epoch": 0.87475, + "grad_norm": 29.875, + "grad_norm_var": 4.262239583333334, + "learning_rate": 0.0001, + "loss": 7.3373, + "loss/crossentropy": 2.264425238966942, + "loss/hidden": 3.309375, + "loss/jsd": 0.0, + "loss/logits": 0.18859156258404255, + "step": 34990 + }, + { + "epoch": 0.875, + "grad_norm": 29.625, + "grad_norm_var": 4.75625, + "learning_rate": 0.0001, + "loss": 7.1984, + "loss/crossentropy": 2.123842165619135, + "loss/hidden": 3.312109375, + "loss/jsd": 0.0, + "loss/logits": 0.1855444625020027, + "step": 35000 + } + ], + "logging_steps": 10, + "max_steps": 40000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0001285112030822e+20, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}