diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15903 @@ +{ + "best_metric": 78.0784, + "best_model_checkpoint": "marianMT_hin_eng_cs/checkpoint-22360", + "epoch": 20.0, + "eval_steps": 500, + "global_step": 22360, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008944543828264758, + "grad_norm": 568.685302734375, + "learning_rate": 3.3333333333333334e-08, + "loss": 45.8279, + "step": 10 + }, + { + "epoch": 0.017889087656529516, + "grad_norm": 569.685546875, + "learning_rate": 6.666666666666667e-08, + "loss": 45.7174, + "step": 20 + }, + { + "epoch": 0.026833631484794274, + "grad_norm": 574.8485717773438, + "learning_rate": 1e-07, + "loss": 45.079, + "step": 30 + }, + { + "epoch": 0.03577817531305903, + "grad_norm": 567.7886962890625, + "learning_rate": 1.3333333333333334e-07, + "loss": 43.9196, + "step": 40 + }, + { + "epoch": 0.044722719141323794, + "grad_norm": 576.7008056640625, + "learning_rate": 1.6666666666666665e-07, + "loss": 43.168, + "step": 50 + }, + { + "epoch": 0.05366726296958855, + "grad_norm": 562.3006591796875, + "learning_rate": 2e-07, + "loss": 41.2498, + "step": 60 + }, + { + "epoch": 0.0626118067978533, + "grad_norm": 570.4864501953125, + "learning_rate": 2.3333333333333333e-07, + "loss": 39.3382, + "step": 70 + }, + { + "epoch": 0.07155635062611806, + "grad_norm": 574.6784057617188, + "learning_rate": 2.6666666666666667e-07, + "loss": 37.5161, + "step": 80 + }, + { + "epoch": 0.08050089445438283, + "grad_norm": 569.1917114257812, + "learning_rate": 3e-07, + "loss": 34.6571, + "step": 90 + }, + { + "epoch": 0.08944543828264759, + "grad_norm": 575.496826171875, + "learning_rate": 3.333333333333333e-07, + "loss": 31.8409, + "step": 100 + }, + { + "epoch": 0.09838998211091235, + "grad_norm": 559.3043212890625, + "learning_rate": 3.666666666666666e-07, + "loss": 28.8879, + "step": 110 + }, + { + "epoch": 0.1073345259391771, + "grad_norm": 549.9949951171875, + "learning_rate": 4e-07, + "loss": 25.2081, + "step": 120 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 503.8363037109375, + "learning_rate": 4.3333333333333335e-07, + "loss": 21.1121, + "step": 130 + }, + { + "epoch": 0.1252236135957066, + "grad_norm": 403.3987121582031, + "learning_rate": 4.6666666666666666e-07, + "loss": 17.6223, + "step": 140 + }, + { + "epoch": 0.13416815742397137, + "grad_norm": 300.0085754394531, + "learning_rate": 5e-07, + "loss": 14.2539, + "step": 150 + }, + { + "epoch": 0.14311270125223613, + "grad_norm": 184.18865966796875, + "learning_rate": 5.333333333333333e-07, + "loss": 11.7277, + "step": 160 + }, + { + "epoch": 0.1520572450805009, + "grad_norm": 87.85770416259766, + "learning_rate": 5.666666666666666e-07, + "loss": 9.9414, + "step": 170 + }, + { + "epoch": 0.16100178890876565, + "grad_norm": 60.96449279785156, + "learning_rate": 6e-07, + "loss": 9.0615, + "step": 180 + }, + { + "epoch": 0.16994633273703041, + "grad_norm": 48.676456451416016, + "learning_rate": 6.333333333333332e-07, + "loss": 8.4672, + "step": 190 + }, + { + "epoch": 0.17889087656529518, + "grad_norm": 43.717464447021484, + "learning_rate": 6.666666666666666e-07, + "loss": 7.8418, + "step": 200 + }, + { + "epoch": 0.18783542039355994, + "grad_norm": 38.93547439575195, + "learning_rate": 7e-07, + "loss": 7.3526, + "step": 210 + }, + { + "epoch": 0.1967799642218247, + "grad_norm": 34.23944854736328, + "learning_rate": 7.333333333333332e-07, + "loss": 6.9987, + "step": 220 + }, + { + "epoch": 0.20572450805008943, + "grad_norm": 31.877201080322266, + "learning_rate": 7.666666666666667e-07, + "loss": 6.4741, + "step": 230 + }, + { + "epoch": 0.2146690518783542, + "grad_norm": 26.220779418945312, + "learning_rate": 8e-07, + "loss": 6.1131, + "step": 240 + }, + { + "epoch": 0.22361359570661896, + "grad_norm": 24.312641143798828, + "learning_rate": 8.333333333333333e-07, + "loss": 5.864, + "step": 250 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 21.310455322265625, + "learning_rate": 8.666666666666667e-07, + "loss": 5.6485, + "step": 260 + }, + { + "epoch": 0.24150268336314848, + "grad_norm": 20.5892276763916, + "learning_rate": 9e-07, + "loss": 5.4319, + "step": 270 + }, + { + "epoch": 0.2504472271914132, + "grad_norm": 16.048423767089844, + "learning_rate": 9.333333333333333e-07, + "loss": 5.2566, + "step": 280 + }, + { + "epoch": 0.259391771019678, + "grad_norm": 14.29564094543457, + "learning_rate": 9.666666666666666e-07, + "loss": 5.0271, + "step": 290 + }, + { + "epoch": 0.26833631484794274, + "grad_norm": 12.640876770019531, + "learning_rate": 1e-06, + "loss": 4.9135, + "step": 300 + }, + { + "epoch": 0.2772808586762075, + "grad_norm": 11.171175003051758, + "learning_rate": 9.99699157641396e-07, + "loss": 4.7181, + "step": 310 + }, + { + "epoch": 0.28622540250447226, + "grad_norm": 11.868614196777344, + "learning_rate": 9.993983152827917e-07, + "loss": 4.6156, + "step": 320 + }, + { + "epoch": 0.295169946332737, + "grad_norm": 9.595853805541992, + "learning_rate": 9.990974729241877e-07, + "loss": 4.5295, + "step": 330 + }, + { + "epoch": 0.3041144901610018, + "grad_norm": 8.965413093566895, + "learning_rate": 9.987966305655835e-07, + "loss": 4.3876, + "step": 340 + }, + { + "epoch": 0.31305903398926654, + "grad_norm": 8.445356369018555, + "learning_rate": 9.984957882069795e-07, + "loss": 4.3003, + "step": 350 + }, + { + "epoch": 0.3220035778175313, + "grad_norm": 7.859620094299316, + "learning_rate": 9.981949458483753e-07, + "loss": 4.1741, + "step": 360 + }, + { + "epoch": 0.33094812164579607, + "grad_norm": 7.521224498748779, + "learning_rate": 9.978941034897713e-07, + "loss": 4.1204, + "step": 370 + }, + { + "epoch": 0.33989266547406083, + "grad_norm": 7.16605281829834, + "learning_rate": 9.975932611311673e-07, + "loss": 4.025, + "step": 380 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 7.4897990226745605, + "learning_rate": 9.97292418772563e-07, + "loss": 3.9424, + "step": 390 + }, + { + "epoch": 0.35778175313059035, + "grad_norm": 6.378204345703125, + "learning_rate": 9.96991576413959e-07, + "loss": 3.9045, + "step": 400 + }, + { + "epoch": 0.3667262969588551, + "grad_norm": 7.619123935699463, + "learning_rate": 9.96690734055355e-07, + "loss": 3.8186, + "step": 410 + }, + { + "epoch": 0.3756708407871199, + "grad_norm": 6.114331245422363, + "learning_rate": 9.963898916967508e-07, + "loss": 3.7494, + "step": 420 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 6.384728908538818, + "learning_rate": 9.960890493381468e-07, + "loss": 3.7179, + "step": 430 + }, + { + "epoch": 0.3935599284436494, + "grad_norm": 5.996768951416016, + "learning_rate": 9.957882069795426e-07, + "loss": 3.6116, + "step": 440 + }, + { + "epoch": 0.40250447227191416, + "grad_norm": 6.2389140129089355, + "learning_rate": 9.954873646209386e-07, + "loss": 3.5626, + "step": 450 + }, + { + "epoch": 0.41144901610017887, + "grad_norm": 6.4500651359558105, + "learning_rate": 9.951865222623344e-07, + "loss": 3.4897, + "step": 460 + }, + { + "epoch": 0.4203935599284436, + "grad_norm": 6.330971717834473, + "learning_rate": 9.948856799037304e-07, + "loss": 3.4178, + "step": 470 + }, + { + "epoch": 0.4293381037567084, + "grad_norm": 5.717202663421631, + "learning_rate": 9.945848375451264e-07, + "loss": 3.3517, + "step": 480 + }, + { + "epoch": 0.43828264758497315, + "grad_norm": 5.807801723480225, + "learning_rate": 9.942839951865222e-07, + "loss": 3.2986, + "step": 490 + }, + { + "epoch": 0.4472271914132379, + "grad_norm": 5.935306072235107, + "learning_rate": 9.939831528279182e-07, + "loss": 3.2396, + "step": 500 + }, + { + "epoch": 0.4561717352415027, + "grad_norm": 6.021726608276367, + "learning_rate": 9.93682310469314e-07, + "loss": 3.1649, + "step": 510 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 5.559403419494629, + "learning_rate": 9.9338146811071e-07, + "loss": 3.1074, + "step": 520 + }, + { + "epoch": 0.4740608228980322, + "grad_norm": 5.631515026092529, + "learning_rate": 9.930806257521057e-07, + "loss": 3.049, + "step": 530 + }, + { + "epoch": 0.48300536672629696, + "grad_norm": 5.0299391746521, + "learning_rate": 9.927797833935017e-07, + "loss": 2.999, + "step": 540 + }, + { + "epoch": 0.4919499105545617, + "grad_norm": 5.350959777832031, + "learning_rate": 9.924789410348977e-07, + "loss": 2.9523, + "step": 550 + }, + { + "epoch": 0.5008944543828264, + "grad_norm": 5.76137638092041, + "learning_rate": 9.921780986762935e-07, + "loss": 2.8882, + "step": 560 + }, + { + "epoch": 0.5098389982110912, + "grad_norm": 5.530500411987305, + "learning_rate": 9.918772563176895e-07, + "loss": 2.8392, + "step": 570 + }, + { + "epoch": 0.518783542039356, + "grad_norm": 5.285860538482666, + "learning_rate": 9.915764139590855e-07, + "loss": 2.8097, + "step": 580 + }, + { + "epoch": 0.5277280858676208, + "grad_norm": 5.185799598693848, + "learning_rate": 9.912755716004813e-07, + "loss": 2.7741, + "step": 590 + }, + { + "epoch": 0.5366726296958855, + "grad_norm": 5.190760135650635, + "learning_rate": 9.909747292418773e-07, + "loss": 2.7067, + "step": 600 + }, + { + "epoch": 0.5456171735241503, + "grad_norm": 5.321168899536133, + "learning_rate": 9.90673886883273e-07, + "loss": 2.6636, + "step": 610 + }, + { + "epoch": 0.554561717352415, + "grad_norm": 4.955347537994385, + "learning_rate": 9.90373044524669e-07, + "loss": 2.6456, + "step": 620 + }, + { + "epoch": 0.5635062611806798, + "grad_norm": 5.024599552154541, + "learning_rate": 9.900722021660649e-07, + "loss": 2.5973, + "step": 630 + }, + { + "epoch": 0.5724508050089445, + "grad_norm": 4.905087471008301, + "learning_rate": 9.897713598074608e-07, + "loss": 2.5483, + "step": 640 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 4.616291522979736, + "learning_rate": 9.894705174488568e-07, + "loss": 2.5395, + "step": 650 + }, + { + "epoch": 0.590339892665474, + "grad_norm": 5.065720081329346, + "learning_rate": 9.891696750902526e-07, + "loss": 2.4616, + "step": 660 + }, + { + "epoch": 0.5992844364937389, + "grad_norm": 4.8041558265686035, + "learning_rate": 9.888688327316486e-07, + "loss": 2.4739, + "step": 670 + }, + { + "epoch": 0.6082289803220036, + "grad_norm": 5.215109348297119, + "learning_rate": 9.885679903730444e-07, + "loss": 2.4249, + "step": 680 + }, + { + "epoch": 0.6171735241502684, + "grad_norm": 4.656722545623779, + "learning_rate": 9.882671480144404e-07, + "loss": 2.3901, + "step": 690 + }, + { + "epoch": 0.6261180679785331, + "grad_norm": 4.943691730499268, + "learning_rate": 9.879663056558362e-07, + "loss": 2.3465, + "step": 700 + }, + { + "epoch": 0.6350626118067979, + "grad_norm": 4.540484428405762, + "learning_rate": 9.876654632972322e-07, + "loss": 2.3274, + "step": 710 + }, + { + "epoch": 0.6440071556350626, + "grad_norm": 4.563231468200684, + "learning_rate": 9.873646209386282e-07, + "loss": 2.2895, + "step": 720 + }, + { + "epoch": 0.6529516994633273, + "grad_norm": 4.655714988708496, + "learning_rate": 9.87063778580024e-07, + "loss": 2.2459, + "step": 730 + }, + { + "epoch": 0.6618962432915921, + "grad_norm": 4.837522029876709, + "learning_rate": 9.8676293622142e-07, + "loss": 2.225, + "step": 740 + }, + { + "epoch": 0.6708407871198568, + "grad_norm": 4.510364532470703, + "learning_rate": 9.86462093862816e-07, + "loss": 2.2263, + "step": 750 + }, + { + "epoch": 0.6797853309481217, + "grad_norm": 4.513600826263428, + "learning_rate": 9.861612515042117e-07, + "loss": 2.2018, + "step": 760 + }, + { + "epoch": 0.6887298747763864, + "grad_norm": 4.477957248687744, + "learning_rate": 9.858604091456077e-07, + "loss": 2.161, + "step": 770 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 4.512912750244141, + "learning_rate": 9.855595667870035e-07, + "loss": 2.151, + "step": 780 + }, + { + "epoch": 0.7066189624329159, + "grad_norm": 5.71266508102417, + "learning_rate": 9.852587244283995e-07, + "loss": 2.0973, + "step": 790 + }, + { + "epoch": 0.7155635062611807, + "grad_norm": 5.014021873474121, + "learning_rate": 9.849578820697953e-07, + "loss": 2.1292, + "step": 800 + }, + { + "epoch": 0.7245080500894454, + "grad_norm": 4.280351638793945, + "learning_rate": 9.846570397111913e-07, + "loss": 2.061, + "step": 810 + }, + { + "epoch": 0.7334525939177102, + "grad_norm": 4.5558929443359375, + "learning_rate": 9.843561973525873e-07, + "loss": 2.0387, + "step": 820 + }, + { + "epoch": 0.7423971377459749, + "grad_norm": 4.458014011383057, + "learning_rate": 9.84055354993983e-07, + "loss": 2.0474, + "step": 830 + }, + { + "epoch": 0.7513416815742398, + "grad_norm": 4.254577159881592, + "learning_rate": 9.83754512635379e-07, + "loss": 2.0304, + "step": 840 + }, + { + "epoch": 0.7602862254025045, + "grad_norm": 4.3591742515563965, + "learning_rate": 9.834536702767749e-07, + "loss": 2.0006, + "step": 850 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 4.3223042488098145, + "learning_rate": 9.831528279181708e-07, + "loss": 1.973, + "step": 860 + }, + { + "epoch": 0.778175313059034, + "grad_norm": 4.374037742614746, + "learning_rate": 9.828519855595666e-07, + "loss": 1.9684, + "step": 870 + }, + { + "epoch": 0.7871198568872988, + "grad_norm": 4.324105739593506, + "learning_rate": 9.825511432009626e-07, + "loss": 1.9514, + "step": 880 + }, + { + "epoch": 0.7960644007155635, + "grad_norm": 4.300051212310791, + "learning_rate": 9.822503008423586e-07, + "loss": 1.9275, + "step": 890 + }, + { + "epoch": 0.8050089445438283, + "grad_norm": 4.329366207122803, + "learning_rate": 9.819494584837544e-07, + "loss": 1.8943, + "step": 900 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 4.005756855010986, + "learning_rate": 9.816486161251504e-07, + "loss": 1.8852, + "step": 910 + }, + { + "epoch": 0.8228980322003577, + "grad_norm": 4.359870910644531, + "learning_rate": 9.813477737665464e-07, + "loss": 1.87, + "step": 920 + }, + { + "epoch": 0.8318425760286225, + "grad_norm": 4.084425926208496, + "learning_rate": 9.810469314079422e-07, + "loss": 1.8368, + "step": 930 + }, + { + "epoch": 0.8407871198568873, + "grad_norm": 3.967466354370117, + "learning_rate": 9.807460890493382e-07, + "loss": 1.8078, + "step": 940 + }, + { + "epoch": 0.8497316636851521, + "grad_norm": 4.0641021728515625, + "learning_rate": 9.80445246690734e-07, + "loss": 1.8084, + "step": 950 + }, + { + "epoch": 0.8586762075134168, + "grad_norm": 4.061304092407227, + "learning_rate": 9.8014440433213e-07, + "loss": 1.801, + "step": 960 + }, + { + "epoch": 0.8676207513416816, + "grad_norm": 4.145083427429199, + "learning_rate": 9.798435619735257e-07, + "loss": 1.7596, + "step": 970 + }, + { + "epoch": 0.8765652951699463, + "grad_norm": 4.002536296844482, + "learning_rate": 9.795427196149217e-07, + "loss": 1.7762, + "step": 980 + }, + { + "epoch": 0.8855098389982111, + "grad_norm": 3.7484164237976074, + "learning_rate": 9.792418772563177e-07, + "loss": 1.7532, + "step": 990 + }, + { + "epoch": 0.8944543828264758, + "grad_norm": 4.02016544342041, + "learning_rate": 9.789410348977135e-07, + "loss": 1.729, + "step": 1000 + }, + { + "epoch": 0.9033989266547406, + "grad_norm": 4.240043640136719, + "learning_rate": 9.786401925391095e-07, + "loss": 1.71, + "step": 1010 + }, + { + "epoch": 0.9123434704830053, + "grad_norm": 3.7680490016937256, + "learning_rate": 9.783393501805053e-07, + "loss": 1.6782, + "step": 1020 + }, + { + "epoch": 0.9212880143112702, + "grad_norm": 3.9167776107788086, + "learning_rate": 9.780385078219013e-07, + "loss": 1.6722, + "step": 1030 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 4.05009126663208, + "learning_rate": 9.77737665463297e-07, + "loss": 1.6392, + "step": 1040 + }, + { + "epoch": 0.9391771019677997, + "grad_norm": 4.031562805175781, + "learning_rate": 9.77436823104693e-07, + "loss": 1.6431, + "step": 1050 + }, + { + "epoch": 0.9481216457960644, + "grad_norm": 4.088625431060791, + "learning_rate": 9.77135980746089e-07, + "loss": 1.6517, + "step": 1060 + }, + { + "epoch": 0.9570661896243292, + "grad_norm": 4.250485420227051, + "learning_rate": 9.768351383874849e-07, + "loss": 1.6238, + "step": 1070 + }, + { + "epoch": 0.9660107334525939, + "grad_norm": 4.007058143615723, + "learning_rate": 9.765342960288808e-07, + "loss": 1.6368, + "step": 1080 + }, + { + "epoch": 0.9749552772808586, + "grad_norm": 4.101932525634766, + "learning_rate": 9.762334536702768e-07, + "loss": 1.6112, + "step": 1090 + }, + { + "epoch": 0.9838998211091234, + "grad_norm": 3.9927737712860107, + "learning_rate": 9.759326113116726e-07, + "loss": 1.5906, + "step": 1100 + }, + { + "epoch": 0.9928443649373881, + "grad_norm": 3.737098455429077, + "learning_rate": 9.756317689530686e-07, + "loss": 1.5823, + "step": 1110 + }, + { + "epoch": 1.0, + "eval_bleu": 11.6257, + "eval_gen_len": 77.1622, + "eval_loss": 1.177813172340393, + "eval_runtime": 59.2168, + "eval_samples_per_second": 17.596, + "eval_steps_per_second": 0.186, + "step": 1118 + }, + { + "epoch": 1.0017889087656529, + "grad_norm": 4.285216808319092, + "learning_rate": 9.753309265944644e-07, + "loss": 1.5971, + "step": 1120 + }, + { + "epoch": 1.0107334525939178, + "grad_norm": 4.146667003631592, + "learning_rate": 9.750300842358604e-07, + "loss": 1.5671, + "step": 1130 + }, + { + "epoch": 1.0196779964221825, + "grad_norm": 4.014759063720703, + "learning_rate": 9.747292418772562e-07, + "loss": 1.5578, + "step": 1140 + }, + { + "epoch": 1.0286225402504472, + "grad_norm": 3.8846042156219482, + "learning_rate": 9.744283995186522e-07, + "loss": 1.5398, + "step": 1150 + }, + { + "epoch": 1.037567084078712, + "grad_norm": 3.8807787895202637, + "learning_rate": 9.741275571600482e-07, + "loss": 1.5235, + "step": 1160 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 3.5784103870391846, + "learning_rate": 9.73826714801444e-07, + "loss": 1.5122, + "step": 1170 + }, + { + "epoch": 1.0554561717352415, + "grad_norm": 3.704495668411255, + "learning_rate": 9.7352587244284e-07, + "loss": 1.4861, + "step": 1180 + }, + { + "epoch": 1.0644007155635062, + "grad_norm": 3.9804067611694336, + "learning_rate": 9.732250300842357e-07, + "loss": 1.4979, + "step": 1190 + }, + { + "epoch": 1.073345259391771, + "grad_norm": 3.8511621952056885, + "learning_rate": 9.729241877256317e-07, + "loss": 1.4808, + "step": 1200 + }, + { + "epoch": 1.0822898032200359, + "grad_norm": 3.7490718364715576, + "learning_rate": 9.726233453670275e-07, + "loss": 1.4304, + "step": 1210 + }, + { + "epoch": 1.0912343470483006, + "grad_norm": 3.821725845336914, + "learning_rate": 9.723225030084235e-07, + "loss": 1.4809, + "step": 1220 + }, + { + "epoch": 1.1001788908765653, + "grad_norm": 3.8172216415405273, + "learning_rate": 9.720216606498195e-07, + "loss": 1.4608, + "step": 1230 + }, + { + "epoch": 1.10912343470483, + "grad_norm": 4.13401460647583, + "learning_rate": 9.717208182912153e-07, + "loss": 1.461, + "step": 1240 + }, + { + "epoch": 1.118067978533095, + "grad_norm": 3.9874379634857178, + "learning_rate": 9.714199759326113e-07, + "loss": 1.4375, + "step": 1250 + }, + { + "epoch": 1.1270125223613596, + "grad_norm": 3.4529519081115723, + "learning_rate": 9.711191335740073e-07, + "loss": 1.4295, + "step": 1260 + }, + { + "epoch": 1.1359570661896243, + "grad_norm": 3.6411478519439697, + "learning_rate": 9.70818291215403e-07, + "loss": 1.3979, + "step": 1270 + }, + { + "epoch": 1.144901610017889, + "grad_norm": 3.712270736694336, + "learning_rate": 9.70517448856799e-07, + "loss": 1.4054, + "step": 1280 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 3.6629765033721924, + "learning_rate": 9.702166064981949e-07, + "loss": 1.4118, + "step": 1290 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 3.4929187297821045, + "learning_rate": 9.699157641395908e-07, + "loss": 1.3981, + "step": 1300 + }, + { + "epoch": 1.1717352415026834, + "grad_norm": 3.4843080043792725, + "learning_rate": 9.696149217809866e-07, + "loss": 1.3595, + "step": 1310 + }, + { + "epoch": 1.180679785330948, + "grad_norm": 3.445066452026367, + "learning_rate": 9.693140794223826e-07, + "loss": 1.4054, + "step": 1320 + }, + { + "epoch": 1.1896243291592128, + "grad_norm": 3.7293801307678223, + "learning_rate": 9.690132370637786e-07, + "loss": 1.3854, + "step": 1330 + }, + { + "epoch": 1.1985688729874777, + "grad_norm": 3.4736623764038086, + "learning_rate": 9.687123947051744e-07, + "loss": 1.3661, + "step": 1340 + }, + { + "epoch": 1.2075134168157424, + "grad_norm": 3.5197012424468994, + "learning_rate": 9.684115523465704e-07, + "loss": 1.3532, + "step": 1350 + }, + { + "epoch": 1.2164579606440071, + "grad_norm": 3.8631348609924316, + "learning_rate": 9.681107099879662e-07, + "loss": 1.3467, + "step": 1360 + }, + { + "epoch": 1.2254025044722718, + "grad_norm": 3.636857509613037, + "learning_rate": 9.678098676293622e-07, + "loss": 1.3086, + "step": 1370 + }, + { + "epoch": 1.2343470483005368, + "grad_norm": 3.636439561843872, + "learning_rate": 9.67509025270758e-07, + "loss": 1.3196, + "step": 1380 + }, + { + "epoch": 1.2432915921288015, + "grad_norm": 3.594397783279419, + "learning_rate": 9.67208182912154e-07, + "loss": 1.3261, + "step": 1390 + }, + { + "epoch": 1.2522361359570662, + "grad_norm": 3.3458898067474365, + "learning_rate": 9.6690734055355e-07, + "loss": 1.3092, + "step": 1400 + }, + { + "epoch": 1.2611806797853309, + "grad_norm": 3.4665396213531494, + "learning_rate": 9.666064981949457e-07, + "loss": 1.2903, + "step": 1410 + }, + { + "epoch": 1.2701252236135958, + "grad_norm": 3.383054733276367, + "learning_rate": 9.663056558363417e-07, + "loss": 1.2863, + "step": 1420 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 3.463866949081421, + "learning_rate": 9.660048134777377e-07, + "loss": 1.28, + "step": 1430 + }, + { + "epoch": 1.2880143112701252, + "grad_norm": 3.318000078201294, + "learning_rate": 9.657039711191335e-07, + "loss": 1.2701, + "step": 1440 + }, + { + "epoch": 1.29695885509839, + "grad_norm": 3.5626983642578125, + "learning_rate": 9.654031287605295e-07, + "loss": 1.2798, + "step": 1450 + }, + { + "epoch": 1.3059033989266546, + "grad_norm": 3.5325281620025635, + "learning_rate": 9.651022864019253e-07, + "loss": 1.2848, + "step": 1460 + }, + { + "epoch": 1.3148479427549196, + "grad_norm": 3.1660006046295166, + "learning_rate": 9.648014440433213e-07, + "loss": 1.2856, + "step": 1470 + }, + { + "epoch": 1.3237924865831843, + "grad_norm": 3.7746942043304443, + "learning_rate": 9.64500601684717e-07, + "loss": 1.2511, + "step": 1480 + }, + { + "epoch": 1.332737030411449, + "grad_norm": 3.2455437183380127, + "learning_rate": 9.64199759326113e-07, + "loss": 1.2239, + "step": 1490 + }, + { + "epoch": 1.341681574239714, + "grad_norm": 3.6265106201171875, + "learning_rate": 9.63898916967509e-07, + "loss": 1.2483, + "step": 1500 + }, + { + "epoch": 1.3506261180679786, + "grad_norm": 3.450028419494629, + "learning_rate": 9.635980746089049e-07, + "loss": 1.2594, + "step": 1510 + }, + { + "epoch": 1.3595706618962433, + "grad_norm": 3.248667001724243, + "learning_rate": 9.632972322503009e-07, + "loss": 1.2371, + "step": 1520 + }, + { + "epoch": 1.368515205724508, + "grad_norm": 3.4102725982666016, + "learning_rate": 9.629963898916966e-07, + "loss": 1.2297, + "step": 1530 + }, + { + "epoch": 1.3774597495527727, + "grad_norm": 3.563889265060425, + "learning_rate": 9.626955475330926e-07, + "loss": 1.2094, + "step": 1540 + }, + { + "epoch": 1.3864042933810374, + "grad_norm": 3.4570846557617188, + "learning_rate": 9.623947051744884e-07, + "loss": 1.1988, + "step": 1550 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 3.3971352577209473, + "learning_rate": 9.620938628158844e-07, + "loss": 1.214, + "step": 1560 + }, + { + "epoch": 1.404293381037567, + "grad_norm": 3.2068612575531006, + "learning_rate": 9.617930204572804e-07, + "loss": 1.1866, + "step": 1570 + }, + { + "epoch": 1.4132379248658318, + "grad_norm": 3.515761137008667, + "learning_rate": 9.614921780986762e-07, + "loss": 1.1937, + "step": 1580 + }, + { + "epoch": 1.4221824686940967, + "grad_norm": 3.3083653450012207, + "learning_rate": 9.611913357400722e-07, + "loss": 1.1842, + "step": 1590 + }, + { + "epoch": 1.4311270125223614, + "grad_norm": 3.1191821098327637, + "learning_rate": 9.608904933814682e-07, + "loss": 1.1759, + "step": 1600 + }, + { + "epoch": 1.4400715563506261, + "grad_norm": 3.5757179260253906, + "learning_rate": 9.60589651022864e-07, + "loss": 1.1686, + "step": 1610 + }, + { + "epoch": 1.4490161001788908, + "grad_norm": 3.2756786346435547, + "learning_rate": 9.6028880866426e-07, + "loss": 1.1844, + "step": 1620 + }, + { + "epoch": 1.4579606440071555, + "grad_norm": 3.5108566284179688, + "learning_rate": 9.599879663056557e-07, + "loss": 1.1476, + "step": 1630 + }, + { + "epoch": 1.4669051878354205, + "grad_norm": 3.323366641998291, + "learning_rate": 9.596871239470517e-07, + "loss": 1.1609, + "step": 1640 + }, + { + "epoch": 1.4758497316636852, + "grad_norm": 3.339520215988159, + "learning_rate": 9.593862815884475e-07, + "loss": 1.1642, + "step": 1650 + }, + { + "epoch": 1.4847942754919499, + "grad_norm": 3.155186176300049, + "learning_rate": 9.590854392298435e-07, + "loss": 1.1638, + "step": 1660 + }, + { + "epoch": 1.4937388193202148, + "grad_norm": 3.370572805404663, + "learning_rate": 9.587845968712395e-07, + "loss": 1.1436, + "step": 1670 + }, + { + "epoch": 1.5026833631484795, + "grad_norm": 3.346024751663208, + "learning_rate": 9.584837545126353e-07, + "loss": 1.1417, + "step": 1680 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 3.285853862762451, + "learning_rate": 9.581829121540313e-07, + "loss": 1.1113, + "step": 1690 + }, + { + "epoch": 1.520572450805009, + "grad_norm": 3.1128010749816895, + "learning_rate": 9.57882069795427e-07, + "loss": 1.1205, + "step": 1700 + }, + { + "epoch": 1.5295169946332736, + "grad_norm": 3.3890109062194824, + "learning_rate": 9.57581227436823e-07, + "loss": 1.13, + "step": 1710 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 3.10841703414917, + "learning_rate": 9.572803850782189e-07, + "loss": 1.105, + "step": 1720 + }, + { + "epoch": 1.5474060822898033, + "grad_norm": 3.3679182529449463, + "learning_rate": 9.569795427196149e-07, + "loss": 1.1032, + "step": 1730 + }, + { + "epoch": 1.556350626118068, + "grad_norm": 3.1372768878936768, + "learning_rate": 9.566787003610109e-07, + "loss": 1.0836, + "step": 1740 + }, + { + "epoch": 1.5652951699463329, + "grad_norm": 3.139641284942627, + "learning_rate": 9.563778580024066e-07, + "loss": 1.1069, + "step": 1750 + }, + { + "epoch": 1.5742397137745976, + "grad_norm": 3.0833637714385986, + "learning_rate": 9.560770156438026e-07, + "loss": 1.0943, + "step": 1760 + }, + { + "epoch": 1.5831842576028623, + "grad_norm": 3.8753104209899902, + "learning_rate": 9.557761732851986e-07, + "loss": 1.1077, + "step": 1770 + }, + { + "epoch": 1.592128801431127, + "grad_norm": 3.6157336235046387, + "learning_rate": 9.554753309265944e-07, + "loss": 1.0878, + "step": 1780 + }, + { + "epoch": 1.6010733452593917, + "grad_norm": 3.359645366668701, + "learning_rate": 9.551744885679904e-07, + "loss": 1.0951, + "step": 1790 + }, + { + "epoch": 1.6100178890876564, + "grad_norm": 3.478275775909424, + "learning_rate": 9.548736462093862e-07, + "loss": 1.0714, + "step": 1800 + }, + { + "epoch": 1.6189624329159211, + "grad_norm": 3.354357957839966, + "learning_rate": 9.545728038507822e-07, + "loss": 1.0665, + "step": 1810 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 3.028160572052002, + "learning_rate": 9.54271961492178e-07, + "loss": 1.0745, + "step": 1820 + }, + { + "epoch": 1.6368515205724508, + "grad_norm": 3.5806148052215576, + "learning_rate": 9.53971119133574e-07, + "loss": 1.0503, + "step": 1830 + }, + { + "epoch": 1.6457960644007157, + "grad_norm": 3.154953718185425, + "learning_rate": 9.5367027677497e-07, + "loss": 1.0341, + "step": 1840 + }, + { + "epoch": 1.6547406082289804, + "grad_norm": 3.376316785812378, + "learning_rate": 9.533694344163657e-07, + "loss": 1.0852, + "step": 1850 + }, + { + "epoch": 1.663685152057245, + "grad_norm": 3.3376107215881348, + "learning_rate": 9.530685920577616e-07, + "loss": 1.0677, + "step": 1860 + }, + { + "epoch": 1.6726296958855098, + "grad_norm": 3.161376714706421, + "learning_rate": 9.527677496991576e-07, + "loss": 1.0487, + "step": 1870 + }, + { + "epoch": 1.6815742397137745, + "grad_norm": 3.0562355518341064, + "learning_rate": 9.524669073405534e-07, + "loss": 1.0354, + "step": 1880 + }, + { + "epoch": 1.6905187835420392, + "grad_norm": 3.007615804672241, + "learning_rate": 9.521660649819494e-07, + "loss": 1.0253, + "step": 1890 + }, + { + "epoch": 1.6994633273703041, + "grad_norm": 3.5059332847595215, + "learning_rate": 9.518652226233453e-07, + "loss": 1.0258, + "step": 1900 + }, + { + "epoch": 1.7084078711985689, + "grad_norm": 2.9691965579986572, + "learning_rate": 9.515643802647413e-07, + "loss": 1.024, + "step": 1910 + }, + { + "epoch": 1.7173524150268338, + "grad_norm": 3.405560255050659, + "learning_rate": 9.512635379061371e-07, + "loss": 1.027, + "step": 1920 + }, + { + "epoch": 1.7262969588550985, + "grad_norm": 3.0473272800445557, + "learning_rate": 9.509626955475331e-07, + "loss": 1.0313, + "step": 1930 + }, + { + "epoch": 1.7352415026833632, + "grad_norm": 3.096930980682373, + "learning_rate": 9.50661853188929e-07, + "loss": 0.9979, + "step": 1940 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 3.17958402633667, + "learning_rate": 9.503610108303249e-07, + "loss": 1.0146, + "step": 1950 + }, + { + "epoch": 1.7531305903398926, + "grad_norm": 2.9525468349456787, + "learning_rate": 9.500601684717207e-07, + "loss": 1.0005, + "step": 1960 + }, + { + "epoch": 1.7620751341681573, + "grad_norm": 3.1976871490478516, + "learning_rate": 9.497593261131167e-07, + "loss": 1.0108, + "step": 1970 + }, + { + "epoch": 1.7710196779964222, + "grad_norm": 3.2845773696899414, + "learning_rate": 9.494584837545125e-07, + "loss": 0.9818, + "step": 1980 + }, + { + "epoch": 1.779964221824687, + "grad_norm": 2.9621360301971436, + "learning_rate": 9.491576413959085e-07, + "loss": 0.9846, + "step": 1990 + }, + { + "epoch": 1.7889087656529516, + "grad_norm": 3.1748251914978027, + "learning_rate": 9.488567990373044e-07, + "loss": 0.993, + "step": 2000 + }, + { + "epoch": 1.7978533094812166, + "grad_norm": 3.0182857513427734, + "learning_rate": 9.485559566787004e-07, + "loss": 0.9828, + "step": 2010 + }, + { + "epoch": 1.8067978533094813, + "grad_norm": 2.983999490737915, + "learning_rate": 9.482551143200962e-07, + "loss": 0.9596, + "step": 2020 + }, + { + "epoch": 1.815742397137746, + "grad_norm": 3.285773277282715, + "learning_rate": 9.479542719614922e-07, + "loss": 0.99, + "step": 2030 + }, + { + "epoch": 1.8246869409660107, + "grad_norm": 2.963573455810547, + "learning_rate": 9.476534296028881e-07, + "loss": 0.9562, + "step": 2040 + }, + { + "epoch": 1.8336314847942754, + "grad_norm": 3.3950607776641846, + "learning_rate": 9.473525872442839e-07, + "loss": 0.9805, + "step": 2050 + }, + { + "epoch": 1.84257602862254, + "grad_norm": 2.9940521717071533, + "learning_rate": 9.470517448856799e-07, + "loss": 0.9535, + "step": 2060 + }, + { + "epoch": 1.851520572450805, + "grad_norm": 3.064953565597534, + "learning_rate": 9.467509025270757e-07, + "loss": 0.9832, + "step": 2070 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 3.1656112670898438, + "learning_rate": 9.464500601684717e-07, + "loss": 0.9431, + "step": 2080 + }, + { + "epoch": 1.8694096601073347, + "grad_norm": 3.262320041656494, + "learning_rate": 9.461492178098675e-07, + "loss": 0.9641, + "step": 2090 + }, + { + "epoch": 1.8783542039355994, + "grad_norm": 3.1577858924865723, + "learning_rate": 9.458483754512635e-07, + "loss": 0.9686, + "step": 2100 + }, + { + "epoch": 1.887298747763864, + "grad_norm": 3.1922857761383057, + "learning_rate": 9.455475330926594e-07, + "loss": 0.9449, + "step": 2110 + }, + { + "epoch": 1.8962432915921288, + "grad_norm": 2.9973959922790527, + "learning_rate": 9.452466907340553e-07, + "loss": 0.9772, + "step": 2120 + }, + { + "epoch": 1.9051878354203935, + "grad_norm": 3.2709221839904785, + "learning_rate": 9.449458483754512e-07, + "loss": 0.9299, + "step": 2130 + }, + { + "epoch": 1.9141323792486582, + "grad_norm": 2.9278268814086914, + "learning_rate": 9.446450060168472e-07, + "loss": 0.9355, + "step": 2140 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 3.182664632797241, + "learning_rate": 9.44344163658243e-07, + "loss": 0.9213, + "step": 2150 + }, + { + "epoch": 1.9320214669051878, + "grad_norm": 2.9108495712280273, + "learning_rate": 9.44043321299639e-07, + "loss": 0.9293, + "step": 2160 + }, + { + "epoch": 1.9409660107334525, + "grad_norm": 3.257746934890747, + "learning_rate": 9.437424789410349e-07, + "loss": 0.9202, + "step": 2170 + }, + { + "epoch": 1.9499105545617175, + "grad_norm": 2.7252960205078125, + "learning_rate": 9.434416365824309e-07, + "loss": 0.9369, + "step": 2180 + }, + { + "epoch": 1.9588550983899822, + "grad_norm": 3.083718776702881, + "learning_rate": 9.431407942238266e-07, + "loss": 0.9066, + "step": 2190 + }, + { + "epoch": 1.9677996422182469, + "grad_norm": 2.9476046562194824, + "learning_rate": 9.428399518652226e-07, + "loss": 0.9189, + "step": 2200 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 2.823620080947876, + "learning_rate": 9.425391095066185e-07, + "loss": 0.9092, + "step": 2210 + }, + { + "epoch": 1.9856887298747763, + "grad_norm": 3.1929540634155273, + "learning_rate": 9.422382671480143e-07, + "loss": 0.919, + "step": 2220 + }, + { + "epoch": 1.994633273703041, + "grad_norm": 3.178370475769043, + "learning_rate": 9.419374247894103e-07, + "loss": 0.921, + "step": 2230 + }, + { + "epoch": 2.0, + "eval_bleu": 33.2917, + "eval_gen_len": 76.1459, + "eval_loss": 0.6356510519981384, + "eval_runtime": 55.3064, + "eval_samples_per_second": 18.841, + "eval_steps_per_second": 0.199, + "step": 2236 + }, + { + "epoch": 2.0035778175313057, + "grad_norm": 3.014850616455078, + "learning_rate": 9.416365824308062e-07, + "loss": 0.9086, + "step": 2240 + }, + { + "epoch": 2.012522361359571, + "grad_norm": 2.8700523376464844, + "learning_rate": 9.413357400722022e-07, + "loss": 0.8985, + "step": 2250 + }, + { + "epoch": 2.0214669051878356, + "grad_norm": 3.0652620792388916, + "learning_rate": 9.41034897713598e-07, + "loss": 0.9062, + "step": 2260 + }, + { + "epoch": 2.0304114490161003, + "grad_norm": 3.1319339275360107, + "learning_rate": 9.40734055354994e-07, + "loss": 0.8954, + "step": 2270 + }, + { + "epoch": 2.039355992844365, + "grad_norm": 3.253756284713745, + "learning_rate": 9.404332129963899e-07, + "loss": 0.9073, + "step": 2280 + }, + { + "epoch": 2.0483005366726297, + "grad_norm": 2.780285120010376, + "learning_rate": 9.401323706377857e-07, + "loss": 0.8782, + "step": 2290 + }, + { + "epoch": 2.0572450805008944, + "grad_norm": 2.7521615028381348, + "learning_rate": 9.398315282791816e-07, + "loss": 0.8654, + "step": 2300 + }, + { + "epoch": 2.066189624329159, + "grad_norm": 3.2367069721221924, + "learning_rate": 9.395306859205776e-07, + "loss": 0.8866, + "step": 2310 + }, + { + "epoch": 2.075134168157424, + "grad_norm": 2.7654380798339844, + "learning_rate": 9.392298435619734e-07, + "loss": 0.8966, + "step": 2320 + }, + { + "epoch": 2.084078711985689, + "grad_norm": 2.697866916656494, + "learning_rate": 9.389290012033694e-07, + "loss": 0.871, + "step": 2330 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 3.0392096042633057, + "learning_rate": 9.386281588447653e-07, + "loss": 0.8726, + "step": 2340 + }, + { + "epoch": 2.1019677996422184, + "grad_norm": 2.8354122638702393, + "learning_rate": 9.383273164861613e-07, + "loss": 0.8677, + "step": 2350 + }, + { + "epoch": 2.110912343470483, + "grad_norm": 2.7471110820770264, + "learning_rate": 9.380264741275571e-07, + "loss": 0.8711, + "step": 2360 + }, + { + "epoch": 2.1198568872987478, + "grad_norm": 3.0733835697174072, + "learning_rate": 9.377256317689531e-07, + "loss": 0.8396, + "step": 2370 + }, + { + "epoch": 2.1288014311270125, + "grad_norm": 2.961397647857666, + "learning_rate": 9.37424789410349e-07, + "loss": 0.8514, + "step": 2380 + }, + { + "epoch": 2.137745974955277, + "grad_norm": 2.858783006668091, + "learning_rate": 9.371239470517448e-07, + "loss": 0.8549, + "step": 2390 + }, + { + "epoch": 2.146690518783542, + "grad_norm": 2.8222157955169678, + "learning_rate": 9.368231046931407e-07, + "loss": 0.8596, + "step": 2400 + }, + { + "epoch": 2.1556350626118066, + "grad_norm": 2.9093058109283447, + "learning_rate": 9.365222623345366e-07, + "loss": 0.8729, + "step": 2410 + }, + { + "epoch": 2.1645796064400717, + "grad_norm": 2.8252718448638916, + "learning_rate": 9.362214199759325e-07, + "loss": 0.8523, + "step": 2420 + }, + { + "epoch": 2.1735241502683365, + "grad_norm": 2.9063096046447754, + "learning_rate": 9.359205776173284e-07, + "loss": 0.852, + "step": 2430 + }, + { + "epoch": 2.182468694096601, + "grad_norm": 2.9497509002685547, + "learning_rate": 9.356197352587244e-07, + "loss": 0.8409, + "step": 2440 + }, + { + "epoch": 2.191413237924866, + "grad_norm": 3.069898843765259, + "learning_rate": 9.353188929001203e-07, + "loss": 0.8372, + "step": 2450 + }, + { + "epoch": 2.2003577817531306, + "grad_norm": 2.689397096633911, + "learning_rate": 9.350180505415162e-07, + "loss": 0.8282, + "step": 2460 + }, + { + "epoch": 2.2093023255813953, + "grad_norm": 2.8414719104766846, + "learning_rate": 9.347172081829121e-07, + "loss": 0.8352, + "step": 2470 + }, + { + "epoch": 2.21824686940966, + "grad_norm": 2.834557056427002, + "learning_rate": 9.344163658243081e-07, + "loss": 0.8425, + "step": 2480 + }, + { + "epoch": 2.2271914132379247, + "grad_norm": 3.1694188117980957, + "learning_rate": 9.341155234657039e-07, + "loss": 0.8362, + "step": 2490 + }, + { + "epoch": 2.23613595706619, + "grad_norm": 2.7628824710845947, + "learning_rate": 9.338146811070999e-07, + "loss": 0.8291, + "step": 2500 + }, + { + "epoch": 2.2450805008944545, + "grad_norm": 2.9400546550750732, + "learning_rate": 9.335138387484958e-07, + "loss": 0.8252, + "step": 2510 + }, + { + "epoch": 2.2540250447227193, + "grad_norm": 2.8414359092712402, + "learning_rate": 9.332129963898917e-07, + "loss": 0.8165, + "step": 2520 + }, + { + "epoch": 2.262969588550984, + "grad_norm": 2.7981302738189697, + "learning_rate": 9.329121540312875e-07, + "loss": 0.8054, + "step": 2530 + }, + { + "epoch": 2.2719141323792487, + "grad_norm": 3.369706630706787, + "learning_rate": 9.326113116726835e-07, + "loss": 0.8079, + "step": 2540 + }, + { + "epoch": 2.2808586762075134, + "grad_norm": 3.226773738861084, + "learning_rate": 9.323104693140794e-07, + "loss": 0.8184, + "step": 2550 + }, + { + "epoch": 2.289803220035778, + "grad_norm": 2.6831185817718506, + "learning_rate": 9.320096269554752e-07, + "loss": 0.8149, + "step": 2560 + }, + { + "epoch": 2.298747763864043, + "grad_norm": 2.8104960918426514, + "learning_rate": 9.317087845968712e-07, + "loss": 0.809, + "step": 2570 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 2.8982748985290527, + "learning_rate": 9.314079422382671e-07, + "loss": 0.828, + "step": 2580 + }, + { + "epoch": 2.3166368515205726, + "grad_norm": 2.8262553215026855, + "learning_rate": 9.31107099879663e-07, + "loss": 0.8061, + "step": 2590 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 2.9029324054718018, + "learning_rate": 9.308062575210589e-07, + "loss": 0.7949, + "step": 2600 + }, + { + "epoch": 2.334525939177102, + "grad_norm": 2.753194570541382, + "learning_rate": 9.305054151624549e-07, + "loss": 0.7933, + "step": 2610 + }, + { + "epoch": 2.3434704830053668, + "grad_norm": 2.6942641735076904, + "learning_rate": 9.302045728038508e-07, + "loss": 0.7904, + "step": 2620 + }, + { + "epoch": 2.3524150268336315, + "grad_norm": 2.987013816833496, + "learning_rate": 9.299037304452466e-07, + "loss": 0.8139, + "step": 2630 + }, + { + "epoch": 2.361359570661896, + "grad_norm": 3.0176124572753906, + "learning_rate": 9.296028880866425e-07, + "loss": 0.814, + "step": 2640 + }, + { + "epoch": 2.370304114490161, + "grad_norm": 2.544062614440918, + "learning_rate": 9.293020457280385e-07, + "loss": 0.7866, + "step": 2650 + }, + { + "epoch": 2.3792486583184256, + "grad_norm": 3.0904483795166016, + "learning_rate": 9.290012033694343e-07, + "loss": 0.7841, + "step": 2660 + }, + { + "epoch": 2.3881932021466907, + "grad_norm": 2.8507888317108154, + "learning_rate": 9.287003610108303e-07, + "loss": 0.7828, + "step": 2670 + }, + { + "epoch": 2.3971377459749554, + "grad_norm": 2.9731242656707764, + "learning_rate": 9.283995186522262e-07, + "loss": 0.7884, + "step": 2680 + }, + { + "epoch": 2.40608228980322, + "grad_norm": 2.6749331951141357, + "learning_rate": 9.280986762936222e-07, + "loss": 0.78, + "step": 2690 + }, + { + "epoch": 2.415026833631485, + "grad_norm": 2.694559335708618, + "learning_rate": 9.27797833935018e-07, + "loss": 0.7832, + "step": 2700 + }, + { + "epoch": 2.4239713774597496, + "grad_norm": 2.7893500328063965, + "learning_rate": 9.27496991576414e-07, + "loss": 0.7872, + "step": 2710 + }, + { + "epoch": 2.4329159212880143, + "grad_norm": 3.1287553310394287, + "learning_rate": 9.271961492178099e-07, + "loss": 0.7593, + "step": 2720 + }, + { + "epoch": 2.441860465116279, + "grad_norm": 3.007920503616333, + "learning_rate": 9.268953068592056e-07, + "loss": 0.7803, + "step": 2730 + }, + { + "epoch": 2.4508050089445437, + "grad_norm": 2.706397533416748, + "learning_rate": 9.265944645006016e-07, + "loss": 0.7833, + "step": 2740 + }, + { + "epoch": 2.4597495527728084, + "grad_norm": 2.878617763519287, + "learning_rate": 9.262936221419975e-07, + "loss": 0.756, + "step": 2750 + }, + { + "epoch": 2.4686940966010735, + "grad_norm": 3.0355637073516846, + "learning_rate": 9.259927797833934e-07, + "loss": 0.7572, + "step": 2760 + }, + { + "epoch": 2.4776386404293382, + "grad_norm": 2.7085120677948, + "learning_rate": 9.256919374247893e-07, + "loss": 0.7513, + "step": 2770 + }, + { + "epoch": 2.486583184257603, + "grad_norm": 3.1326651573181152, + "learning_rate": 9.253910950661853e-07, + "loss": 0.7592, + "step": 2780 + }, + { + "epoch": 2.4955277280858676, + "grad_norm": 3.014554500579834, + "learning_rate": 9.250902527075812e-07, + "loss": 0.7562, + "step": 2790 + }, + { + "epoch": 2.5044722719141324, + "grad_norm": 2.5138909816741943, + "learning_rate": 9.247894103489771e-07, + "loss": 0.7435, + "step": 2800 + }, + { + "epoch": 2.513416815742397, + "grad_norm": 2.6745359897613525, + "learning_rate": 9.24488567990373e-07, + "loss": 0.7361, + "step": 2810 + }, + { + "epoch": 2.5223613595706618, + "grad_norm": 3.270789623260498, + "learning_rate": 9.24187725631769e-07, + "loss": 0.7563, + "step": 2820 + }, + { + "epoch": 2.531305903398927, + "grad_norm": 2.8245232105255127, + "learning_rate": 9.238868832731648e-07, + "loss": 0.7571, + "step": 2830 + }, + { + "epoch": 2.5402504472271916, + "grad_norm": 2.7508704662323, + "learning_rate": 9.235860409145608e-07, + "loss": 0.7691, + "step": 2840 + }, + { + "epoch": 2.5491949910554563, + "grad_norm": 2.965639114379883, + "learning_rate": 9.232851985559566e-07, + "loss": 0.7438, + "step": 2850 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 2.903453826904297, + "learning_rate": 9.229843561973526e-07, + "loss": 0.7612, + "step": 2860 + }, + { + "epoch": 2.5670840787119857, + "grad_norm": 2.4326062202453613, + "learning_rate": 9.226835138387484e-07, + "loss": 0.7373, + "step": 2870 + }, + { + "epoch": 2.5760286225402504, + "grad_norm": 2.5255181789398193, + "learning_rate": 9.223826714801444e-07, + "loss": 0.703, + "step": 2880 + }, + { + "epoch": 2.584973166368515, + "grad_norm": 2.712981700897217, + "learning_rate": 9.220818291215403e-07, + "loss": 0.7386, + "step": 2890 + }, + { + "epoch": 2.59391771019678, + "grad_norm": 2.6336443424224854, + "learning_rate": 9.217809867629361e-07, + "loss": 0.7088, + "step": 2900 + }, + { + "epoch": 2.6028622540250446, + "grad_norm": 2.936994791030884, + "learning_rate": 9.214801444043321e-07, + "loss": 0.7431, + "step": 2910 + }, + { + "epoch": 2.6118067978533093, + "grad_norm": 2.911325693130493, + "learning_rate": 9.21179302045728e-07, + "loss": 0.741, + "step": 2920 + }, + { + "epoch": 2.620751341681574, + "grad_norm": 2.7983999252319336, + "learning_rate": 9.208784596871239e-07, + "loss": 0.7363, + "step": 2930 + }, + { + "epoch": 2.629695885509839, + "grad_norm": 2.58998703956604, + "learning_rate": 9.205776173285198e-07, + "loss": 0.7193, + "step": 2940 + }, + { + "epoch": 2.638640429338104, + "grad_norm": 2.603731632232666, + "learning_rate": 9.202767749699158e-07, + "loss": 0.7366, + "step": 2950 + }, + { + "epoch": 2.6475849731663685, + "grad_norm": 3.180845022201538, + "learning_rate": 9.199759326113116e-07, + "loss": 0.7156, + "step": 2960 + }, + { + "epoch": 2.6565295169946332, + "grad_norm": 2.47441029548645, + "learning_rate": 9.196750902527075e-07, + "loss": 0.7323, + "step": 2970 + }, + { + "epoch": 2.665474060822898, + "grad_norm": 2.681979179382324, + "learning_rate": 9.193742478941034e-07, + "loss": 0.7238, + "step": 2980 + }, + { + "epoch": 2.6744186046511627, + "grad_norm": 2.7500979900360107, + "learning_rate": 9.190734055354994e-07, + "loss": 0.7205, + "step": 2990 + }, + { + "epoch": 2.683363148479428, + "grad_norm": 2.9268977642059326, + "learning_rate": 9.187725631768952e-07, + "loss": 0.7104, + "step": 3000 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 2.865962028503418, + "learning_rate": 9.184717208182912e-07, + "loss": 0.7189, + "step": 3010 + }, + { + "epoch": 2.701252236135957, + "grad_norm": 3.1300830841064453, + "learning_rate": 9.181708784596871e-07, + "loss": 0.7118, + "step": 3020 + }, + { + "epoch": 2.710196779964222, + "grad_norm": 2.472964286804199, + "learning_rate": 9.178700361010831e-07, + "loss": 0.7168, + "step": 3030 + }, + { + "epoch": 2.7191413237924866, + "grad_norm": 2.9153640270233154, + "learning_rate": 9.175691937424789e-07, + "loss": 0.7052, + "step": 3040 + }, + { + "epoch": 2.7280858676207513, + "grad_norm": 2.578078031539917, + "learning_rate": 9.172683513838749e-07, + "loss": 0.7028, + "step": 3050 + }, + { + "epoch": 2.737030411449016, + "grad_norm": 2.813464879989624, + "learning_rate": 9.169675090252708e-07, + "loss": 0.682, + "step": 3060 + }, + { + "epoch": 2.7459749552772807, + "grad_norm": 2.818431854248047, + "learning_rate": 9.166666666666665e-07, + "loss": 0.6974, + "step": 3070 + }, + { + "epoch": 2.7549194991055455, + "grad_norm": 2.902752161026001, + "learning_rate": 9.163658243080625e-07, + "loss": 0.6881, + "step": 3080 + }, + { + "epoch": 2.76386404293381, + "grad_norm": 3.032303810119629, + "learning_rate": 9.160649819494584e-07, + "loss": 0.695, + "step": 3090 + }, + { + "epoch": 2.772808586762075, + "grad_norm": 2.627807140350342, + "learning_rate": 9.157641395908543e-07, + "loss": 0.6846, + "step": 3100 + }, + { + "epoch": 2.78175313059034, + "grad_norm": 2.711284875869751, + "learning_rate": 9.154632972322502e-07, + "loss": 0.6836, + "step": 3110 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 3.2660164833068848, + "learning_rate": 9.151624548736462e-07, + "loss": 0.703, + "step": 3120 + }, + { + "epoch": 2.7996422182468694, + "grad_norm": 2.5693178176879883, + "learning_rate": 9.148616125150421e-07, + "loss": 0.6885, + "step": 3130 + }, + { + "epoch": 2.808586762075134, + "grad_norm": 2.5643057823181152, + "learning_rate": 9.14560770156438e-07, + "loss": 0.6825, + "step": 3140 + }, + { + "epoch": 2.817531305903399, + "grad_norm": 2.8102810382843018, + "learning_rate": 9.142599277978339e-07, + "loss": 0.6685, + "step": 3150 + }, + { + "epoch": 2.8264758497316635, + "grad_norm": 2.8618929386138916, + "learning_rate": 9.139590854392299e-07, + "loss": 0.6847, + "step": 3160 + }, + { + "epoch": 2.8354203935599287, + "grad_norm": 2.610710382461548, + "learning_rate": 9.136582430806256e-07, + "loss": 0.6774, + "step": 3170 + }, + { + "epoch": 2.8443649373881934, + "grad_norm": 3.4627251625061035, + "learning_rate": 9.133574007220216e-07, + "loss": 0.6865, + "step": 3180 + }, + { + "epoch": 2.853309481216458, + "grad_norm": 2.6292290687561035, + "learning_rate": 9.130565583634175e-07, + "loss": 0.7044, + "step": 3190 + }, + { + "epoch": 2.862254025044723, + "grad_norm": 2.454831600189209, + "learning_rate": 9.127557160048135e-07, + "loss": 0.6791, + "step": 3200 + }, + { + "epoch": 2.8711985688729875, + "grad_norm": 2.51297926902771, + "learning_rate": 9.124548736462093e-07, + "loss": 0.6838, + "step": 3210 + }, + { + "epoch": 2.8801431127012522, + "grad_norm": 2.57023549079895, + "learning_rate": 9.121540312876053e-07, + "loss": 0.6691, + "step": 3220 + }, + { + "epoch": 2.889087656529517, + "grad_norm": 2.8879973888397217, + "learning_rate": 9.118531889290012e-07, + "loss": 0.6612, + "step": 3230 + }, + { + "epoch": 2.8980322003577816, + "grad_norm": 2.8545825481414795, + "learning_rate": 9.11552346570397e-07, + "loss": 0.6701, + "step": 3240 + }, + { + "epoch": 2.9069767441860463, + "grad_norm": 2.746073007583618, + "learning_rate": 9.11251504211793e-07, + "loss": 0.6867, + "step": 3250 + }, + { + "epoch": 2.915921288014311, + "grad_norm": 2.964799404144287, + "learning_rate": 9.109506618531889e-07, + "loss": 0.6676, + "step": 3260 + }, + { + "epoch": 2.9248658318425758, + "grad_norm": 2.561464548110962, + "learning_rate": 9.106498194945848e-07, + "loss": 0.6847, + "step": 3270 + }, + { + "epoch": 2.933810375670841, + "grad_norm": 2.822511672973633, + "learning_rate": 9.103489771359806e-07, + "loss": 0.6591, + "step": 3280 + }, + { + "epoch": 2.9427549194991056, + "grad_norm": 2.471494197845459, + "learning_rate": 9.100481347773766e-07, + "loss": 0.6356, + "step": 3290 + }, + { + "epoch": 2.9516994633273703, + "grad_norm": 2.75529146194458, + "learning_rate": 9.097472924187725e-07, + "loss": 0.6628, + "step": 3300 + }, + { + "epoch": 2.960644007155635, + "grad_norm": 2.634551525115967, + "learning_rate": 9.094464500601684e-07, + "loss": 0.6445, + "step": 3310 + }, + { + "epoch": 2.9695885509838997, + "grad_norm": 2.6194956302642822, + "learning_rate": 9.091456077015643e-07, + "loss": 0.6657, + "step": 3320 + }, + { + "epoch": 2.9785330948121644, + "grad_norm": 2.409846305847168, + "learning_rate": 9.088447653429603e-07, + "loss": 0.6449, + "step": 3330 + }, + { + "epoch": 2.9874776386404296, + "grad_norm": 2.7916181087493896, + "learning_rate": 9.085439229843561e-07, + "loss": 0.6567, + "step": 3340 + }, + { + "epoch": 2.9964221824686943, + "grad_norm": 2.529426336288452, + "learning_rate": 9.082430806257521e-07, + "loss": 0.6472, + "step": 3350 + }, + { + "epoch": 3.0, + "eval_bleu": 47.3533, + "eval_gen_len": 75.9194, + "eval_loss": 0.45044583082199097, + "eval_runtime": 57.4786, + "eval_samples_per_second": 18.128, + "eval_steps_per_second": 0.191, + "step": 3354 + }, + { + "epoch": 3.005366726296959, + "grad_norm": 2.2102513313293457, + "learning_rate": 9.07942238267148e-07, + "loss": 0.6442, + "step": 3360 + }, + { + "epoch": 3.0143112701252237, + "grad_norm": 2.70259428024292, + "learning_rate": 9.076413959085439e-07, + "loss": 0.6565, + "step": 3370 + }, + { + "epoch": 3.0232558139534884, + "grad_norm": 2.7066004276275635, + "learning_rate": 9.073405535499398e-07, + "loss": 0.656, + "step": 3380 + }, + { + "epoch": 3.032200357781753, + "grad_norm": 2.758183717727661, + "learning_rate": 9.070397111913358e-07, + "loss": 0.645, + "step": 3390 + }, + { + "epoch": 3.041144901610018, + "grad_norm": 2.4055726528167725, + "learning_rate": 9.067388688327316e-07, + "loss": 0.6314, + "step": 3400 + }, + { + "epoch": 3.0500894454382825, + "grad_norm": 2.4877970218658447, + "learning_rate": 9.064380264741275e-07, + "loss": 0.6397, + "step": 3410 + }, + { + "epoch": 3.0590339892665472, + "grad_norm": 2.6725428104400635, + "learning_rate": 9.061371841155234e-07, + "loss": 0.6429, + "step": 3420 + }, + { + "epoch": 3.067978533094812, + "grad_norm": 2.4992988109588623, + "learning_rate": 9.058363417569193e-07, + "loss": 0.6472, + "step": 3430 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 2.4662342071533203, + "learning_rate": 9.055354993983152e-07, + "loss": 0.6309, + "step": 3440 + }, + { + "epoch": 3.085867620751342, + "grad_norm": 2.4642457962036133, + "learning_rate": 9.052346570397111e-07, + "loss": 0.6161, + "step": 3450 + }, + { + "epoch": 3.0948121645796065, + "grad_norm": 2.5901832580566406, + "learning_rate": 9.049338146811071e-07, + "loss": 0.6439, + "step": 3460 + }, + { + "epoch": 3.103756708407871, + "grad_norm": 2.6220297813415527, + "learning_rate": 9.04632972322503e-07, + "loss": 0.6496, + "step": 3470 + }, + { + "epoch": 3.112701252236136, + "grad_norm": 2.4917731285095215, + "learning_rate": 9.043321299638989e-07, + "loss": 0.6178, + "step": 3480 + }, + { + "epoch": 3.1216457960644006, + "grad_norm": 2.4936294555664062, + "learning_rate": 9.040312876052948e-07, + "loss": 0.6482, + "step": 3490 + }, + { + "epoch": 3.1305903398926653, + "grad_norm": 2.4493210315704346, + "learning_rate": 9.037304452466908e-07, + "loss": 0.6177, + "step": 3500 + }, + { + "epoch": 3.13953488372093, + "grad_norm": 2.941797971725464, + "learning_rate": 9.034296028880865e-07, + "loss": 0.6231, + "step": 3510 + }, + { + "epoch": 3.148479427549195, + "grad_norm": 2.5311951637268066, + "learning_rate": 9.031287605294825e-07, + "loss": 0.6191, + "step": 3520 + }, + { + "epoch": 3.15742397137746, + "grad_norm": 2.6150107383728027, + "learning_rate": 9.028279181708784e-07, + "loss": 0.6243, + "step": 3530 + }, + { + "epoch": 3.1663685152057246, + "grad_norm": 2.4990551471710205, + "learning_rate": 9.025270758122743e-07, + "loss": 0.616, + "step": 3540 + }, + { + "epoch": 3.1753130590339893, + "grad_norm": 2.4558522701263428, + "learning_rate": 9.022262334536702e-07, + "loss": 0.6286, + "step": 3550 + }, + { + "epoch": 3.184257602862254, + "grad_norm": 2.2986326217651367, + "learning_rate": 9.019253910950662e-07, + "loss": 0.6191, + "step": 3560 + }, + { + "epoch": 3.1932021466905187, + "grad_norm": 2.9514191150665283, + "learning_rate": 9.016245487364621e-07, + "loss": 0.5976, + "step": 3570 + }, + { + "epoch": 3.2021466905187834, + "grad_norm": 2.2443857192993164, + "learning_rate": 9.01323706377858e-07, + "loss": 0.6335, + "step": 3580 + }, + { + "epoch": 3.211091234347048, + "grad_norm": 2.7957968711853027, + "learning_rate": 9.010228640192539e-07, + "loss": 0.6137, + "step": 3590 + }, + { + "epoch": 3.220035778175313, + "grad_norm": 2.374537944793701, + "learning_rate": 9.007220216606498e-07, + "loss": 0.6211, + "step": 3600 + }, + { + "epoch": 3.228980322003578, + "grad_norm": 2.501329183578491, + "learning_rate": 9.004211793020456e-07, + "loss": 0.6182, + "step": 3610 + }, + { + "epoch": 3.2379248658318427, + "grad_norm": 2.5942885875701904, + "learning_rate": 9.001203369434415e-07, + "loss": 0.6014, + "step": 3620 + }, + { + "epoch": 3.2468694096601074, + "grad_norm": 2.34264874458313, + "learning_rate": 8.998194945848375e-07, + "loss": 0.6056, + "step": 3630 + }, + { + "epoch": 3.255813953488372, + "grad_norm": 2.3675620555877686, + "learning_rate": 8.995186522262334e-07, + "loss": 0.6075, + "step": 3640 + }, + { + "epoch": 3.264758497316637, + "grad_norm": 2.7011513710021973, + "learning_rate": 8.992178098676293e-07, + "loss": 0.6236, + "step": 3650 + }, + { + "epoch": 3.2737030411449015, + "grad_norm": 2.562945604324341, + "learning_rate": 8.989169675090252e-07, + "loss": 0.6193, + "step": 3660 + }, + { + "epoch": 3.282647584973166, + "grad_norm": 2.497159957885742, + "learning_rate": 8.986161251504212e-07, + "loss": 0.5963, + "step": 3670 + }, + { + "epoch": 3.2915921288014314, + "grad_norm": 2.405364513397217, + "learning_rate": 8.98315282791817e-07, + "loss": 0.6122, + "step": 3680 + }, + { + "epoch": 3.300536672629696, + "grad_norm": 2.8417489528656006, + "learning_rate": 8.98014440433213e-07, + "loss": 0.6098, + "step": 3690 + }, + { + "epoch": 3.309481216457961, + "grad_norm": 2.4375507831573486, + "learning_rate": 8.977135980746089e-07, + "loss": 0.6107, + "step": 3700 + }, + { + "epoch": 3.3184257602862255, + "grad_norm": 2.3820364475250244, + "learning_rate": 8.974127557160048e-07, + "loss": 0.6088, + "step": 3710 + }, + { + "epoch": 3.32737030411449, + "grad_norm": 2.655949592590332, + "learning_rate": 8.971119133574007e-07, + "loss": 0.5994, + "step": 3720 + }, + { + "epoch": 3.336314847942755, + "grad_norm": 2.63189435005188, + "learning_rate": 8.968110709987966e-07, + "loss": 0.5925, + "step": 3730 + }, + { + "epoch": 3.3452593917710196, + "grad_norm": 2.4572486877441406, + "learning_rate": 8.965102286401925e-07, + "loss": 0.5931, + "step": 3740 + }, + { + "epoch": 3.3542039355992843, + "grad_norm": 2.2869107723236084, + "learning_rate": 8.962093862815884e-07, + "loss": 0.5864, + "step": 3750 + }, + { + "epoch": 3.363148479427549, + "grad_norm": 2.425100326538086, + "learning_rate": 8.959085439229843e-07, + "loss": 0.5963, + "step": 3760 + }, + { + "epoch": 3.3720930232558137, + "grad_norm": 2.5817906856536865, + "learning_rate": 8.956077015643802e-07, + "loss": 0.6021, + "step": 3770 + }, + { + "epoch": 3.381037567084079, + "grad_norm": 2.679655075073242, + "learning_rate": 8.953068592057761e-07, + "loss": 0.5926, + "step": 3780 + }, + { + "epoch": 3.3899821109123436, + "grad_norm": 2.54133677482605, + "learning_rate": 8.95006016847172e-07, + "loss": 0.5984, + "step": 3790 + }, + { + "epoch": 3.3989266547406083, + "grad_norm": 2.5012824535369873, + "learning_rate": 8.94705174488568e-07, + "loss": 0.5903, + "step": 3800 + }, + { + "epoch": 3.407871198568873, + "grad_norm": 2.6578915119171143, + "learning_rate": 8.944043321299639e-07, + "loss": 0.6046, + "step": 3810 + }, + { + "epoch": 3.4168157423971377, + "grad_norm": 2.964258909225464, + "learning_rate": 8.941034897713598e-07, + "loss": 0.6001, + "step": 3820 + }, + { + "epoch": 3.4257602862254024, + "grad_norm": 2.4920527935028076, + "learning_rate": 8.938026474127557e-07, + "loss": 0.5796, + "step": 3830 + }, + { + "epoch": 3.434704830053667, + "grad_norm": 2.3578991889953613, + "learning_rate": 8.935018050541516e-07, + "loss": 0.5828, + "step": 3840 + }, + { + "epoch": 3.4436493738819323, + "grad_norm": 2.511328935623169, + "learning_rate": 8.932009626955474e-07, + "loss": 0.5674, + "step": 3850 + }, + { + "epoch": 3.452593917710197, + "grad_norm": 2.626751184463501, + "learning_rate": 8.929001203369434e-07, + "loss": 0.5862, + "step": 3860 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 2.8260960578918457, + "learning_rate": 8.925992779783393e-07, + "loss": 0.5911, + "step": 3870 + }, + { + "epoch": 3.4704830053667264, + "grad_norm": 2.422102928161621, + "learning_rate": 8.922984356197352e-07, + "loss": 0.5726, + "step": 3880 + }, + { + "epoch": 3.479427549194991, + "grad_norm": 2.27225923538208, + "learning_rate": 8.919975932611311e-07, + "loss": 0.5553, + "step": 3890 + }, + { + "epoch": 3.488372093023256, + "grad_norm": 2.5205020904541016, + "learning_rate": 8.916967509025271e-07, + "loss": 0.5918, + "step": 3900 + }, + { + "epoch": 3.4973166368515205, + "grad_norm": 2.428128242492676, + "learning_rate": 8.91395908543923e-07, + "loss": 0.5759, + "step": 3910 + }, + { + "epoch": 3.506261180679785, + "grad_norm": 2.4170215129852295, + "learning_rate": 8.910950661853189e-07, + "loss": 0.5597, + "step": 3920 + }, + { + "epoch": 3.51520572450805, + "grad_norm": 2.1635830402374268, + "learning_rate": 8.907942238267148e-07, + "loss": 0.5546, + "step": 3930 + }, + { + "epoch": 3.5241502683363146, + "grad_norm": 3.3241753578186035, + "learning_rate": 8.904933814681108e-07, + "loss": 0.5603, + "step": 3940 + }, + { + "epoch": 3.5330948121645798, + "grad_norm": 2.1622519493103027, + "learning_rate": 8.901925391095065e-07, + "loss": 0.5674, + "step": 3950 + }, + { + "epoch": 3.5420393559928445, + "grad_norm": 2.3452277183532715, + "learning_rate": 8.898916967509024e-07, + "loss": 0.5768, + "step": 3960 + }, + { + "epoch": 3.550983899821109, + "grad_norm": 2.950314998626709, + "learning_rate": 8.895908543922984e-07, + "loss": 0.571, + "step": 3970 + }, + { + "epoch": 3.559928443649374, + "grad_norm": 2.2387633323669434, + "learning_rate": 8.892900120336943e-07, + "loss": 0.5706, + "step": 3980 + }, + { + "epoch": 3.5688729874776386, + "grad_norm": 2.417853832244873, + "learning_rate": 8.889891696750902e-07, + "loss": 0.56, + "step": 3990 + }, + { + "epoch": 3.5778175313059033, + "grad_norm": 2.549022674560547, + "learning_rate": 8.886883273164861e-07, + "loss": 0.5686, + "step": 4000 + }, + { + "epoch": 3.586762075134168, + "grad_norm": 2.2357287406921387, + "learning_rate": 8.883874849578821e-07, + "loss": 0.5694, + "step": 4010 + }, + { + "epoch": 3.595706618962433, + "grad_norm": 2.5129199028015137, + "learning_rate": 8.880866425992779e-07, + "loss": 0.565, + "step": 4020 + }, + { + "epoch": 3.604651162790698, + "grad_norm": 2.858186721801758, + "learning_rate": 8.877858002406739e-07, + "loss": 0.5604, + "step": 4030 + }, + { + "epoch": 3.6135957066189626, + "grad_norm": 2.274585247039795, + "learning_rate": 8.874849578820698e-07, + "loss": 0.5549, + "step": 4040 + }, + { + "epoch": 3.6225402504472273, + "grad_norm": 2.324376106262207, + "learning_rate": 8.871841155234657e-07, + "loss": 0.5607, + "step": 4050 + }, + { + "epoch": 3.631484794275492, + "grad_norm": 2.541766881942749, + "learning_rate": 8.868832731648615e-07, + "loss": 0.5387, + "step": 4060 + }, + { + "epoch": 3.6404293381037567, + "grad_norm": 2.4490244388580322, + "learning_rate": 8.865824308062575e-07, + "loss": 0.5613, + "step": 4070 + }, + { + "epoch": 3.6493738819320214, + "grad_norm": 2.665994167327881, + "learning_rate": 8.862815884476534e-07, + "loss": 0.5636, + "step": 4080 + }, + { + "epoch": 3.658318425760286, + "grad_norm": 2.3954038619995117, + "learning_rate": 8.859807460890493e-07, + "loss": 0.5684, + "step": 4090 + }, + { + "epoch": 3.667262969588551, + "grad_norm": 2.3968029022216797, + "learning_rate": 8.856799037304452e-07, + "loss": 0.5503, + "step": 4100 + }, + { + "epoch": 3.6762075134168155, + "grad_norm": 2.2309978008270264, + "learning_rate": 8.853790613718412e-07, + "loss": 0.5553, + "step": 4110 + }, + { + "epoch": 3.6851520572450807, + "grad_norm": 2.3942465782165527, + "learning_rate": 8.85078219013237e-07, + "loss": 0.5405, + "step": 4120 + }, + { + "epoch": 3.6940966010733454, + "grad_norm": 2.1299009323120117, + "learning_rate": 8.847773766546329e-07, + "loss": 0.5501, + "step": 4130 + }, + { + "epoch": 3.70304114490161, + "grad_norm": 2.2872629165649414, + "learning_rate": 8.844765342960289e-07, + "loss": 0.5402, + "step": 4140 + }, + { + "epoch": 3.7119856887298748, + "grad_norm": 2.65865159034729, + "learning_rate": 8.841756919374247e-07, + "loss": 0.5467, + "step": 4150 + }, + { + "epoch": 3.7209302325581395, + "grad_norm": 3.056772232055664, + "learning_rate": 8.838748495788207e-07, + "loss": 0.5516, + "step": 4160 + }, + { + "epoch": 3.729874776386404, + "grad_norm": 2.4120004177093506, + "learning_rate": 8.835740072202165e-07, + "loss": 0.5606, + "step": 4170 + }, + { + "epoch": 3.738819320214669, + "grad_norm": 2.7942564487457275, + "learning_rate": 8.832731648616125e-07, + "loss": 0.547, + "step": 4180 + }, + { + "epoch": 3.747763864042934, + "grad_norm": 2.5700485706329346, + "learning_rate": 8.829723225030083e-07, + "loss": 0.5318, + "step": 4190 + }, + { + "epoch": 3.7567084078711988, + "grad_norm": 2.232795476913452, + "learning_rate": 8.826714801444043e-07, + "loss": 0.5529, + "step": 4200 + }, + { + "epoch": 3.7656529516994635, + "grad_norm": 2.5302608013153076, + "learning_rate": 8.823706377858002e-07, + "loss": 0.5422, + "step": 4210 + }, + { + "epoch": 3.774597495527728, + "grad_norm": 2.1510462760925293, + "learning_rate": 8.820697954271961e-07, + "loss": 0.5389, + "step": 4220 + }, + { + "epoch": 3.783542039355993, + "grad_norm": 2.637227773666382, + "learning_rate": 8.81768953068592e-07, + "loss": 0.5516, + "step": 4230 + }, + { + "epoch": 3.7924865831842576, + "grad_norm": 2.3238768577575684, + "learning_rate": 8.81468110709988e-07, + "loss": 0.5304, + "step": 4240 + }, + { + "epoch": 3.8014311270125223, + "grad_norm": 2.2426376342773438, + "learning_rate": 8.811672683513839e-07, + "loss": 0.5337, + "step": 4250 + }, + { + "epoch": 3.810375670840787, + "grad_norm": 2.6865127086639404, + "learning_rate": 8.808664259927798e-07, + "loss": 0.5536, + "step": 4260 + }, + { + "epoch": 3.8193202146690517, + "grad_norm": 2.312762975692749, + "learning_rate": 8.805655836341757e-07, + "loss": 0.5353, + "step": 4270 + }, + { + "epoch": 3.8282647584973164, + "grad_norm": 2.282130241394043, + "learning_rate": 8.802647412755716e-07, + "loss": 0.5492, + "step": 4280 + }, + { + "epoch": 3.8372093023255816, + "grad_norm": 2.3553049564361572, + "learning_rate": 8.799638989169674e-07, + "loss": 0.5362, + "step": 4290 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 2.6251494884490967, + "learning_rate": 8.796630565583633e-07, + "loss": 0.5387, + "step": 4300 + }, + { + "epoch": 3.855098389982111, + "grad_norm": 2.353665590286255, + "learning_rate": 8.793622141997593e-07, + "loss": 0.5447, + "step": 4310 + }, + { + "epoch": 3.8640429338103757, + "grad_norm": 3.079887628555298, + "learning_rate": 8.790613718411551e-07, + "loss": 0.5483, + "step": 4320 + }, + { + "epoch": 3.8729874776386404, + "grad_norm": 2.3928916454315186, + "learning_rate": 8.787605294825511e-07, + "loss": 0.5224, + "step": 4330 + }, + { + "epoch": 3.881932021466905, + "grad_norm": 2.356884717941284, + "learning_rate": 8.78459687123947e-07, + "loss": 0.5263, + "step": 4340 + }, + { + "epoch": 3.89087656529517, + "grad_norm": 2.5218331813812256, + "learning_rate": 8.78158844765343e-07, + "loss": 0.527, + "step": 4350 + }, + { + "epoch": 3.899821109123435, + "grad_norm": 2.455214023590088, + "learning_rate": 8.778580024067388e-07, + "loss": 0.5356, + "step": 4360 + }, + { + "epoch": 3.9087656529516996, + "grad_norm": 2.339240312576294, + "learning_rate": 8.775571600481348e-07, + "loss": 0.5239, + "step": 4370 + }, + { + "epoch": 3.9177101967799643, + "grad_norm": 2.3760018348693848, + "learning_rate": 8.772563176895307e-07, + "loss": 0.5143, + "step": 4380 + }, + { + "epoch": 3.926654740608229, + "grad_norm": 2.7996761798858643, + "learning_rate": 8.769554753309265e-07, + "loss": 0.5442, + "step": 4390 + }, + { + "epoch": 3.9355992844364938, + "grad_norm": 2.385220527648926, + "learning_rate": 8.766546329723224e-07, + "loss": 0.5248, + "step": 4400 + }, + { + "epoch": 3.9445438282647585, + "grad_norm": 2.228893518447876, + "learning_rate": 8.763537906137184e-07, + "loss": 0.5097, + "step": 4410 + }, + { + "epoch": 3.953488372093023, + "grad_norm": 2.426032066345215, + "learning_rate": 8.760529482551143e-07, + "loss": 0.5188, + "step": 4420 + }, + { + "epoch": 3.962432915921288, + "grad_norm": 2.2290916442871094, + "learning_rate": 8.757521058965102e-07, + "loss": 0.5198, + "step": 4430 + }, + { + "epoch": 3.9713774597495526, + "grad_norm": 2.2398645877838135, + "learning_rate": 8.754512635379061e-07, + "loss": 0.5115, + "step": 4440 + }, + { + "epoch": 3.9803220035778173, + "grad_norm": 2.507640838623047, + "learning_rate": 8.751504211793021e-07, + "loss": 0.527, + "step": 4450 + }, + { + "epoch": 3.9892665474060824, + "grad_norm": 2.547366142272949, + "learning_rate": 8.748495788206979e-07, + "loss": 0.5188, + "step": 4460 + }, + { + "epoch": 3.998211091234347, + "grad_norm": 2.1927096843719482, + "learning_rate": 8.745487364620938e-07, + "loss": 0.5246, + "step": 4470 + }, + { + "epoch": 4.0, + "eval_bleu": 55.2169, + "eval_gen_len": 75.6871, + "eval_loss": 0.3579396903514862, + "eval_runtime": 56.6446, + "eval_samples_per_second": 18.395, + "eval_steps_per_second": 0.194, + "step": 4472 + }, + { + "epoch": 4.007155635062611, + "grad_norm": 2.197103977203369, + "learning_rate": 8.742478941034898e-07, + "loss": 0.5227, + "step": 4480 + }, + { + "epoch": 4.016100178890877, + "grad_norm": 2.287051200866699, + "learning_rate": 8.739470517448855e-07, + "loss": 0.5079, + "step": 4490 + }, + { + "epoch": 4.025044722719142, + "grad_norm": 2.386626958847046, + "learning_rate": 8.736462093862815e-07, + "loss": 0.5162, + "step": 4500 + }, + { + "epoch": 4.033989266547406, + "grad_norm": 2.674652099609375, + "learning_rate": 8.733453670276774e-07, + "loss": 0.5346, + "step": 4510 + }, + { + "epoch": 4.042933810375671, + "grad_norm": 2.3477110862731934, + "learning_rate": 8.730445246690734e-07, + "loss": 0.5254, + "step": 4520 + }, + { + "epoch": 4.051878354203936, + "grad_norm": 2.2642228603363037, + "learning_rate": 8.727436823104692e-07, + "loss": 0.5201, + "step": 4530 + }, + { + "epoch": 4.0608228980322005, + "grad_norm": 2.0289313793182373, + "learning_rate": 8.724428399518652e-07, + "loss": 0.5189, + "step": 4540 + }, + { + "epoch": 4.069767441860465, + "grad_norm": 2.5789413452148438, + "learning_rate": 8.721419975932611e-07, + "loss": 0.5165, + "step": 4550 + }, + { + "epoch": 4.07871198568873, + "grad_norm": 2.535637617111206, + "learning_rate": 8.71841155234657e-07, + "loss": 0.4972, + "step": 4560 + }, + { + "epoch": 4.087656529516995, + "grad_norm": 2.328303098678589, + "learning_rate": 8.715403128760529e-07, + "loss": 0.501, + "step": 4570 + }, + { + "epoch": 4.096601073345259, + "grad_norm": 2.030282735824585, + "learning_rate": 8.712394705174489e-07, + "loss": 0.491, + "step": 4580 + }, + { + "epoch": 4.105545617173524, + "grad_norm": 2.2180206775665283, + "learning_rate": 8.709386281588448e-07, + "loss": 0.5042, + "step": 4590 + }, + { + "epoch": 4.114490161001789, + "grad_norm": 2.104442596435547, + "learning_rate": 8.706377858002407e-07, + "loss": 0.5095, + "step": 4600 + }, + { + "epoch": 4.1234347048300535, + "grad_norm": 2.341421127319336, + "learning_rate": 8.703369434416365e-07, + "loss": 0.5096, + "step": 4610 + }, + { + "epoch": 4.132379248658318, + "grad_norm": 2.276421070098877, + "learning_rate": 8.700361010830325e-07, + "loss": 0.51, + "step": 4620 + }, + { + "epoch": 4.141323792486583, + "grad_norm": 2.3223495483398438, + "learning_rate": 8.697352587244283e-07, + "loss": 0.5036, + "step": 4630 + }, + { + "epoch": 4.150268336314848, + "grad_norm": 2.262240171432495, + "learning_rate": 8.694344163658242e-07, + "loss": 0.4992, + "step": 4640 + }, + { + "epoch": 4.159212880143112, + "grad_norm": 2.533426284790039, + "learning_rate": 8.691335740072202e-07, + "loss": 0.5042, + "step": 4650 + }, + { + "epoch": 4.168157423971378, + "grad_norm": 2.2887630462646484, + "learning_rate": 8.68832731648616e-07, + "loss": 0.5022, + "step": 4660 + }, + { + "epoch": 4.177101967799643, + "grad_norm": 1.967930793762207, + "learning_rate": 8.68531889290012e-07, + "loss": 0.4739, + "step": 4670 + }, + { + "epoch": 4.186046511627907, + "grad_norm": 2.440462589263916, + "learning_rate": 8.682310469314079e-07, + "loss": 0.4958, + "step": 4680 + }, + { + "epoch": 4.194991055456172, + "grad_norm": 2.404984712600708, + "learning_rate": 8.679302045728039e-07, + "loss": 0.494, + "step": 4690 + }, + { + "epoch": 4.203935599284437, + "grad_norm": 2.4689669609069824, + "learning_rate": 8.676293622141997e-07, + "loss": 0.4923, + "step": 4700 + }, + { + "epoch": 4.212880143112701, + "grad_norm": 2.2958297729492188, + "learning_rate": 8.673285198555957e-07, + "loss": 0.5075, + "step": 4710 + }, + { + "epoch": 4.221824686940966, + "grad_norm": 2.1192386150360107, + "learning_rate": 8.670276774969915e-07, + "loss": 0.4984, + "step": 4720 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 2.3063201904296875, + "learning_rate": 8.667268351383874e-07, + "loss": 0.5009, + "step": 4730 + }, + { + "epoch": 4.2397137745974955, + "grad_norm": 2.4694032669067383, + "learning_rate": 8.664259927797833e-07, + "loss": 0.5009, + "step": 4740 + }, + { + "epoch": 4.24865831842576, + "grad_norm": 2.2757201194763184, + "learning_rate": 8.661251504211793e-07, + "loss": 0.5108, + "step": 4750 + }, + { + "epoch": 4.257602862254025, + "grad_norm": 2.134617567062378, + "learning_rate": 8.658243080625752e-07, + "loss": 0.496, + "step": 4760 + }, + { + "epoch": 4.26654740608229, + "grad_norm": 2.357692003250122, + "learning_rate": 8.655234657039711e-07, + "loss": 0.4936, + "step": 4770 + }, + { + "epoch": 4.275491949910554, + "grad_norm": 2.2824032306671143, + "learning_rate": 8.65222623345367e-07, + "loss": 0.4818, + "step": 4780 + }, + { + "epoch": 4.284436493738819, + "grad_norm": 2.100003719329834, + "learning_rate": 8.64921780986763e-07, + "loss": 0.4943, + "step": 4790 + }, + { + "epoch": 4.293381037567084, + "grad_norm": 2.776961326599121, + "learning_rate": 8.646209386281588e-07, + "loss": 0.4815, + "step": 4800 + }, + { + "epoch": 4.3023255813953485, + "grad_norm": 2.3573222160339355, + "learning_rate": 8.643200962695547e-07, + "loss": 0.5031, + "step": 4810 + }, + { + "epoch": 4.311270125223613, + "grad_norm": 2.1648378372192383, + "learning_rate": 8.640192539109507e-07, + "loss": 0.4951, + "step": 4820 + }, + { + "epoch": 4.320214669051879, + "grad_norm": 2.6196367740631104, + "learning_rate": 8.637184115523464e-07, + "loss": 0.4832, + "step": 4830 + }, + { + "epoch": 4.3291592128801435, + "grad_norm": 2.0457088947296143, + "learning_rate": 8.634175691937424e-07, + "loss": 0.4645, + "step": 4840 + }, + { + "epoch": 4.338103756708408, + "grad_norm": 2.3532934188842773, + "learning_rate": 8.631167268351383e-07, + "loss": 0.4832, + "step": 4850 + }, + { + "epoch": 4.347048300536673, + "grad_norm": 2.3529703617095947, + "learning_rate": 8.628158844765343e-07, + "loss": 0.4777, + "step": 4860 + }, + { + "epoch": 4.355992844364938, + "grad_norm": 2.3178508281707764, + "learning_rate": 8.625150421179301e-07, + "loss": 0.4952, + "step": 4870 + }, + { + "epoch": 4.364937388193202, + "grad_norm": 2.1043753623962402, + "learning_rate": 8.622141997593261e-07, + "loss": 0.4872, + "step": 4880 + }, + { + "epoch": 4.373881932021467, + "grad_norm": 2.3565258979797363, + "learning_rate": 8.61913357400722e-07, + "loss": 0.4641, + "step": 4890 + }, + { + "epoch": 4.382826475849732, + "grad_norm": 2.2326138019561768, + "learning_rate": 8.616125150421179e-07, + "loss": 0.4719, + "step": 4900 + }, + { + "epoch": 4.391771019677996, + "grad_norm": 2.4230077266693115, + "learning_rate": 8.613116726835138e-07, + "loss": 0.4814, + "step": 4910 + }, + { + "epoch": 4.400715563506261, + "grad_norm": 2.375678300857544, + "learning_rate": 8.610108303249098e-07, + "loss": 0.4746, + "step": 4920 + }, + { + "epoch": 4.409660107334526, + "grad_norm": 2.1337859630584717, + "learning_rate": 8.607099879663056e-07, + "loss": 0.4658, + "step": 4930 + }, + { + "epoch": 4.4186046511627906, + "grad_norm": 2.2945430278778076, + "learning_rate": 8.604091456077015e-07, + "loss": 0.4875, + "step": 4940 + }, + { + "epoch": 4.427549194991055, + "grad_norm": 2.20735502243042, + "learning_rate": 8.601083032490974e-07, + "loss": 0.4671, + "step": 4950 + }, + { + "epoch": 4.43649373881932, + "grad_norm": 2.89607834815979, + "learning_rate": 8.598074608904934e-07, + "loss": 0.4757, + "step": 4960 + }, + { + "epoch": 4.445438282647585, + "grad_norm": 2.1082570552825928, + "learning_rate": 8.595066185318892e-07, + "loss": 0.4722, + "step": 4970 + }, + { + "epoch": 4.454382826475849, + "grad_norm": 2.1505637168884277, + "learning_rate": 8.592057761732851e-07, + "loss": 0.4911, + "step": 4980 + }, + { + "epoch": 4.463327370304114, + "grad_norm": 2.17527174949646, + "learning_rate": 8.589049338146811e-07, + "loss": 0.4774, + "step": 4990 + }, + { + "epoch": 4.47227191413238, + "grad_norm": 2.530137777328491, + "learning_rate": 8.586040914560769e-07, + "loss": 0.478, + "step": 5000 + }, + { + "epoch": 4.481216457960644, + "grad_norm": 2.1128530502319336, + "learning_rate": 8.583032490974729e-07, + "loss": 0.4885, + "step": 5010 + }, + { + "epoch": 4.490161001788909, + "grad_norm": 2.4407949447631836, + "learning_rate": 8.580024067388688e-07, + "loss": 0.4733, + "step": 5020 + }, + { + "epoch": 4.499105545617174, + "grad_norm": 2.1629414558410645, + "learning_rate": 8.577015643802648e-07, + "loss": 0.4683, + "step": 5030 + }, + { + "epoch": 4.5080500894454385, + "grad_norm": 2.161496639251709, + "learning_rate": 8.574007220216606e-07, + "loss": 0.4607, + "step": 5040 + }, + { + "epoch": 4.516994633273703, + "grad_norm": 2.093541383743286, + "learning_rate": 8.570998796630565e-07, + "loss": 0.4683, + "step": 5050 + }, + { + "epoch": 4.525939177101968, + "grad_norm": 2.1078083515167236, + "learning_rate": 8.567990373044524e-07, + "loss": 0.4698, + "step": 5060 + }, + { + "epoch": 4.534883720930233, + "grad_norm": 2.2364354133605957, + "learning_rate": 8.564981949458483e-07, + "loss": 0.4843, + "step": 5070 + }, + { + "epoch": 4.543828264758497, + "grad_norm": 2.204305648803711, + "learning_rate": 8.561973525872442e-07, + "loss": 0.4785, + "step": 5080 + }, + { + "epoch": 4.552772808586762, + "grad_norm": 2.314359426498413, + "learning_rate": 8.558965102286402e-07, + "loss": 0.4814, + "step": 5090 + }, + { + "epoch": 4.561717352415027, + "grad_norm": 2.3714308738708496, + "learning_rate": 8.55595667870036e-07, + "loss": 0.4677, + "step": 5100 + }, + { + "epoch": 4.5706618962432914, + "grad_norm": 2.1810977458953857, + "learning_rate": 8.55294825511432e-07, + "loss": 0.4567, + "step": 5110 + }, + { + "epoch": 4.579606440071556, + "grad_norm": 2.3970401287078857, + "learning_rate": 8.549939831528279e-07, + "loss": 0.4854, + "step": 5120 + }, + { + "epoch": 4.588550983899821, + "grad_norm": 2.307947874069214, + "learning_rate": 8.546931407942239e-07, + "loss": 0.4575, + "step": 5130 + }, + { + "epoch": 4.597495527728086, + "grad_norm": 2.3009941577911377, + "learning_rate": 8.543922984356197e-07, + "loss": 0.4662, + "step": 5140 + }, + { + "epoch": 4.60644007155635, + "grad_norm": 2.2120587825775146, + "learning_rate": 8.540914560770156e-07, + "loss": 0.4637, + "step": 5150 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 1.8983241319656372, + "learning_rate": 8.537906137184115e-07, + "loss": 0.4662, + "step": 5160 + }, + { + "epoch": 4.624329159212881, + "grad_norm": 2.24377703666687, + "learning_rate": 8.534897713598073e-07, + "loss": 0.4628, + "step": 5170 + }, + { + "epoch": 4.633273703041145, + "grad_norm": 2.33957839012146, + "learning_rate": 8.531889290012033e-07, + "loss": 0.4778, + "step": 5180 + }, + { + "epoch": 4.64221824686941, + "grad_norm": 2.2532317638397217, + "learning_rate": 8.528880866425992e-07, + "loss": 0.4579, + "step": 5190 + }, + { + "epoch": 4.651162790697675, + "grad_norm": 2.1577749252319336, + "learning_rate": 8.525872442839952e-07, + "loss": 0.4625, + "step": 5200 + }, + { + "epoch": 4.660107334525939, + "grad_norm": 2.20407772064209, + "learning_rate": 8.52286401925391e-07, + "loss": 0.4669, + "step": 5210 + }, + { + "epoch": 4.669051878354204, + "grad_norm": 2.2237601280212402, + "learning_rate": 8.51985559566787e-07, + "loss": 0.4705, + "step": 5220 + }, + { + "epoch": 4.677996422182469, + "grad_norm": 2.367025852203369, + "learning_rate": 8.516847172081829e-07, + "loss": 0.4538, + "step": 5230 + }, + { + "epoch": 4.6869409660107335, + "grad_norm": 2.0631942749023438, + "learning_rate": 8.513838748495788e-07, + "loss": 0.4572, + "step": 5240 + }, + { + "epoch": 4.695885509838998, + "grad_norm": 2.1658992767333984, + "learning_rate": 8.510830324909747e-07, + "loss": 0.4475, + "step": 5250 + }, + { + "epoch": 4.704830053667263, + "grad_norm": 4.232874870300293, + "learning_rate": 8.507821901323707e-07, + "loss": 0.4613, + "step": 5260 + }, + { + "epoch": 4.713774597495528, + "grad_norm": 2.224188804626465, + "learning_rate": 8.504813477737664e-07, + "loss": 0.4424, + "step": 5270 + }, + { + "epoch": 4.722719141323792, + "grad_norm": 2.1103410720825195, + "learning_rate": 8.501805054151624e-07, + "loss": 0.4552, + "step": 5280 + }, + { + "epoch": 4.731663685152057, + "grad_norm": 1.8934681415557861, + "learning_rate": 8.498796630565583e-07, + "loss": 0.4497, + "step": 5290 + }, + { + "epoch": 4.740608228980322, + "grad_norm": 2.366163492202759, + "learning_rate": 8.495788206979543e-07, + "loss": 0.4416, + "step": 5300 + }, + { + "epoch": 4.7495527728085865, + "grad_norm": 2.462538003921509, + "learning_rate": 8.492779783393501e-07, + "loss": 0.4612, + "step": 5310 + }, + { + "epoch": 4.758497316636851, + "grad_norm": 2.191615104675293, + "learning_rate": 8.489771359807461e-07, + "loss": 0.4596, + "step": 5320 + }, + { + "epoch": 4.767441860465116, + "grad_norm": 2.1773529052734375, + "learning_rate": 8.48676293622142e-07, + "loss": 0.4657, + "step": 5330 + }, + { + "epoch": 4.7763864042933815, + "grad_norm": 2.416468858718872, + "learning_rate": 8.483754512635378e-07, + "loss": 0.4553, + "step": 5340 + }, + { + "epoch": 4.785330948121646, + "grad_norm": 2.129228115081787, + "learning_rate": 8.480746089049338e-07, + "loss": 0.4545, + "step": 5350 + }, + { + "epoch": 4.794275491949911, + "grad_norm": 2.597944498062134, + "learning_rate": 8.477737665463297e-07, + "loss": 0.4424, + "step": 5360 + }, + { + "epoch": 4.803220035778176, + "grad_norm": 2.125438928604126, + "learning_rate": 8.474729241877257e-07, + "loss": 0.4565, + "step": 5370 + }, + { + "epoch": 4.81216457960644, + "grad_norm": 2.2695999145507812, + "learning_rate": 8.471720818291214e-07, + "loss": 0.4642, + "step": 5380 + }, + { + "epoch": 4.821109123434705, + "grad_norm": 2.325247049331665, + "learning_rate": 8.468712394705174e-07, + "loss": 0.4528, + "step": 5390 + }, + { + "epoch": 4.83005366726297, + "grad_norm": 2.11484432220459, + "learning_rate": 8.465703971119133e-07, + "loss": 0.4494, + "step": 5400 + }, + { + "epoch": 4.838998211091234, + "grad_norm": 2.2997634410858154, + "learning_rate": 8.462695547533092e-07, + "loss": 0.4426, + "step": 5410 + }, + { + "epoch": 4.847942754919499, + "grad_norm": 2.39806866645813, + "learning_rate": 8.459687123947051e-07, + "loss": 0.4298, + "step": 5420 + }, + { + "epoch": 4.856887298747764, + "grad_norm": 2.1731514930725098, + "learning_rate": 8.456678700361011e-07, + "loss": 0.4535, + "step": 5430 + }, + { + "epoch": 4.8658318425760285, + "grad_norm": 1.9812053442001343, + "learning_rate": 8.453670276774969e-07, + "loss": 0.4431, + "step": 5440 + }, + { + "epoch": 4.874776386404293, + "grad_norm": 2.2461302280426025, + "learning_rate": 8.450661853188929e-07, + "loss": 0.4432, + "step": 5450 + }, + { + "epoch": 4.883720930232558, + "grad_norm": 2.3431684970855713, + "learning_rate": 8.447653429602888e-07, + "loss": 0.4323, + "step": 5460 + }, + { + "epoch": 4.892665474060823, + "grad_norm": 2.1888179779052734, + "learning_rate": 8.444645006016848e-07, + "loss": 0.454, + "step": 5470 + }, + { + "epoch": 4.901610017889087, + "grad_norm": 2.4536116123199463, + "learning_rate": 8.441636582430806e-07, + "loss": 0.4468, + "step": 5480 + }, + { + "epoch": 4.910554561717352, + "grad_norm": 2.444634199142456, + "learning_rate": 8.438628158844765e-07, + "loss": 0.4511, + "step": 5490 + }, + { + "epoch": 4.919499105545617, + "grad_norm": 2.3427412509918213, + "learning_rate": 8.435619735258724e-07, + "loss": 0.427, + "step": 5500 + }, + { + "epoch": 4.928443649373882, + "grad_norm": 2.6630396842956543, + "learning_rate": 8.432611311672682e-07, + "loss": 0.4286, + "step": 5510 + }, + { + "epoch": 4.937388193202147, + "grad_norm": 2.0534238815307617, + "learning_rate": 8.429602888086642e-07, + "loss": 0.4479, + "step": 5520 + }, + { + "epoch": 4.946332737030412, + "grad_norm": 2.080934762954712, + "learning_rate": 8.426594464500601e-07, + "loss": 0.4274, + "step": 5530 + }, + { + "epoch": 4.9552772808586765, + "grad_norm": 2.3056702613830566, + "learning_rate": 8.423586040914561e-07, + "loss": 0.4229, + "step": 5540 + }, + { + "epoch": 4.964221824686941, + "grad_norm": 2.275747060775757, + "learning_rate": 8.420577617328519e-07, + "loss": 0.4371, + "step": 5550 + }, + { + "epoch": 4.973166368515206, + "grad_norm": 2.3384668827056885, + "learning_rate": 8.417569193742479e-07, + "loss": 0.4237, + "step": 5560 + }, + { + "epoch": 4.982110912343471, + "grad_norm": 2.177608013153076, + "learning_rate": 8.414560770156438e-07, + "loss": 0.4334, + "step": 5570 + }, + { + "epoch": 4.991055456171735, + "grad_norm": 1.9598808288574219, + "learning_rate": 8.411552346570397e-07, + "loss": 0.4296, + "step": 5580 + }, + { + "epoch": 5.0, + "grad_norm": 3.7194361686706543, + "learning_rate": 8.408543922984356e-07, + "loss": 0.4228, + "step": 5590 + }, + { + "epoch": 5.0, + "eval_bleu": 60.8262, + "eval_gen_len": 75.5777, + "eval_loss": 0.30407437682151794, + "eval_runtime": 57.0241, + "eval_samples_per_second": 18.273, + "eval_steps_per_second": 0.193, + "step": 5590 + }, + { + "epoch": 5.008944543828265, + "grad_norm": 2.257715940475464, + "learning_rate": 8.405535499398315e-07, + "loss": 0.4484, + "step": 5600 + }, + { + "epoch": 5.017889087656529, + "grad_norm": 2.1407155990600586, + "learning_rate": 8.402527075812273e-07, + "loss": 0.4399, + "step": 5610 + }, + { + "epoch": 5.026833631484794, + "grad_norm": 1.955741047859192, + "learning_rate": 8.399518652226233e-07, + "loss": 0.4216, + "step": 5620 + }, + { + "epoch": 5.035778175313059, + "grad_norm": 2.0968446731567383, + "learning_rate": 8.396510228640192e-07, + "loss": 0.441, + "step": 5630 + }, + { + "epoch": 5.0447227191413235, + "grad_norm": 2.1058216094970703, + "learning_rate": 8.393501805054152e-07, + "loss": 0.4197, + "step": 5640 + }, + { + "epoch": 5.053667262969588, + "grad_norm": 2.249011516571045, + "learning_rate": 8.39049338146811e-07, + "loss": 0.4409, + "step": 5650 + }, + { + "epoch": 5.062611806797853, + "grad_norm": 2.140320062637329, + "learning_rate": 8.38748495788207e-07, + "loss": 0.4305, + "step": 5660 + }, + { + "epoch": 5.071556350626118, + "grad_norm": 2.0407919883728027, + "learning_rate": 8.384476534296029e-07, + "loss": 0.4351, + "step": 5670 + }, + { + "epoch": 5.080500894454383, + "grad_norm": 2.0861878395080566, + "learning_rate": 8.381468110709987e-07, + "loss": 0.4366, + "step": 5680 + }, + { + "epoch": 5.089445438282648, + "grad_norm": 2.259369373321533, + "learning_rate": 8.378459687123947e-07, + "loss": 0.4188, + "step": 5690 + }, + { + "epoch": 5.098389982110913, + "grad_norm": 2.647836923599243, + "learning_rate": 8.375451263537906e-07, + "loss": 0.4298, + "step": 5700 + }, + { + "epoch": 5.107334525939177, + "grad_norm": 2.4857370853424072, + "learning_rate": 8.372442839951864e-07, + "loss": 0.4314, + "step": 5710 + }, + { + "epoch": 5.116279069767442, + "grad_norm": 2.1227731704711914, + "learning_rate": 8.369434416365823e-07, + "loss": 0.4378, + "step": 5720 + }, + { + "epoch": 5.125223613595707, + "grad_norm": 2.1125993728637695, + "learning_rate": 8.366425992779783e-07, + "loss": 0.4339, + "step": 5730 + }, + { + "epoch": 5.1341681574239715, + "grad_norm": 2.170203685760498, + "learning_rate": 8.363417569193742e-07, + "loss": 0.4329, + "step": 5740 + }, + { + "epoch": 5.143112701252236, + "grad_norm": 2.1479272842407227, + "learning_rate": 8.360409145607701e-07, + "loss": 0.4423, + "step": 5750 + }, + { + "epoch": 5.152057245080501, + "grad_norm": 2.229536771774292, + "learning_rate": 8.35740072202166e-07, + "loss": 0.4332, + "step": 5760 + }, + { + "epoch": 5.161001788908766, + "grad_norm": 2.070498466491699, + "learning_rate": 8.35439229843562e-07, + "loss": 0.435, + "step": 5770 + }, + { + "epoch": 5.16994633273703, + "grad_norm": 2.2134475708007812, + "learning_rate": 8.351383874849578e-07, + "loss": 0.4407, + "step": 5780 + }, + { + "epoch": 5.178890876565295, + "grad_norm": 2.3678598403930664, + "learning_rate": 8.348375451263538e-07, + "loss": 0.41, + "step": 5790 + }, + { + "epoch": 5.18783542039356, + "grad_norm": 2.2990410327911377, + "learning_rate": 8.345367027677497e-07, + "loss": 0.4115, + "step": 5800 + }, + { + "epoch": 5.196779964221824, + "grad_norm": 2.1436893939971924, + "learning_rate": 8.342358604091457e-07, + "loss": 0.4172, + "step": 5810 + }, + { + "epoch": 5.205724508050089, + "grad_norm": 1.9223712682724, + "learning_rate": 8.339350180505414e-07, + "loss": 0.4096, + "step": 5820 + }, + { + "epoch": 5.214669051878354, + "grad_norm": 1.97529935836792, + "learning_rate": 8.336341756919374e-07, + "loss": 0.4282, + "step": 5830 + }, + { + "epoch": 5.2236135957066185, + "grad_norm": 2.008768320083618, + "learning_rate": 8.333333333333333e-07, + "loss": 0.4185, + "step": 5840 + }, + { + "epoch": 5.232558139534884, + "grad_norm": 2.5357260704040527, + "learning_rate": 8.330324909747291e-07, + "loss": 0.4185, + "step": 5850 + }, + { + "epoch": 5.241502683363149, + "grad_norm": 1.980273962020874, + "learning_rate": 8.327316486161251e-07, + "loss": 0.4169, + "step": 5860 + }, + { + "epoch": 5.2504472271914135, + "grad_norm": 2.093468427658081, + "learning_rate": 8.32430806257521e-07, + "loss": 0.411, + "step": 5870 + }, + { + "epoch": 5.259391771019678, + "grad_norm": 2.3774306774139404, + "learning_rate": 8.321299638989169e-07, + "loss": 0.4457, + "step": 5880 + }, + { + "epoch": 5.268336314847943, + "grad_norm": 2.2056474685668945, + "learning_rate": 8.318291215403128e-07, + "loss": 0.4186, + "step": 5890 + }, + { + "epoch": 5.277280858676208, + "grad_norm": 1.9724749326705933, + "learning_rate": 8.315282791817088e-07, + "loss": 0.4209, + "step": 5900 + }, + { + "epoch": 5.286225402504472, + "grad_norm": 2.140307664871216, + "learning_rate": 8.312274368231047e-07, + "loss": 0.4132, + "step": 5910 + }, + { + "epoch": 5.295169946332737, + "grad_norm": 1.9825023412704468, + "learning_rate": 8.309265944645006e-07, + "loss": 0.4241, + "step": 5920 + }, + { + "epoch": 5.304114490161002, + "grad_norm": 2.3204171657562256, + "learning_rate": 8.306257521058964e-07, + "loss": 0.4332, + "step": 5930 + }, + { + "epoch": 5.3130590339892665, + "grad_norm": 1.8255951404571533, + "learning_rate": 8.303249097472924e-07, + "loss": 0.4166, + "step": 5940 + }, + { + "epoch": 5.322003577817531, + "grad_norm": 2.5971059799194336, + "learning_rate": 8.300240673886882e-07, + "loss": 0.4203, + "step": 5950 + }, + { + "epoch": 5.330948121645796, + "grad_norm": 2.185863494873047, + "learning_rate": 8.297232250300842e-07, + "loss": 0.414, + "step": 5960 + }, + { + "epoch": 5.339892665474061, + "grad_norm": 2.592667818069458, + "learning_rate": 8.294223826714801e-07, + "loss": 0.4288, + "step": 5970 + }, + { + "epoch": 5.348837209302325, + "grad_norm": 1.9179635047912598, + "learning_rate": 8.291215403128761e-07, + "loss": 0.4083, + "step": 5980 + }, + { + "epoch": 5.35778175313059, + "grad_norm": 2.0136308670043945, + "learning_rate": 8.288206979542719e-07, + "loss": 0.4153, + "step": 5990 + }, + { + "epoch": 5.366726296958855, + "grad_norm": 1.8011486530303955, + "learning_rate": 8.285198555956679e-07, + "loss": 0.4106, + "step": 6000 + }, + { + "epoch": 5.375670840787119, + "grad_norm": 1.690496802330017, + "learning_rate": 8.282190132370638e-07, + "loss": 0.4148, + "step": 6010 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 1.9623056650161743, + "learning_rate": 8.279181708784596e-07, + "loss": 0.4152, + "step": 6020 + }, + { + "epoch": 5.39355992844365, + "grad_norm": 2.0957489013671875, + "learning_rate": 8.276173285198556e-07, + "loss": 0.4076, + "step": 6030 + }, + { + "epoch": 5.402504472271914, + "grad_norm": 2.2126786708831787, + "learning_rate": 8.273164861612514e-07, + "loss": 0.4134, + "step": 6040 + }, + { + "epoch": 5.411449016100179, + "grad_norm": 2.522958517074585, + "learning_rate": 8.270156438026473e-07, + "loss": 0.4087, + "step": 6050 + }, + { + "epoch": 5.420393559928444, + "grad_norm": 2.0177793502807617, + "learning_rate": 8.267148014440432e-07, + "loss": 0.4058, + "step": 6060 + }, + { + "epoch": 5.4293381037567086, + "grad_norm": 2.025320291519165, + "learning_rate": 8.264139590854392e-07, + "loss": 0.4136, + "step": 6070 + }, + { + "epoch": 5.438282647584973, + "grad_norm": 2.357020378112793, + "learning_rate": 8.261131167268351e-07, + "loss": 0.3919, + "step": 6080 + }, + { + "epoch": 5.447227191413238, + "grad_norm": 2.289153575897217, + "learning_rate": 8.25812274368231e-07, + "loss": 0.3974, + "step": 6090 + }, + { + "epoch": 5.456171735241503, + "grad_norm": 2.366131067276001, + "learning_rate": 8.255114320096269e-07, + "loss": 0.4129, + "step": 6100 + }, + { + "epoch": 5.465116279069767, + "grad_norm": 2.0910308361053467, + "learning_rate": 8.252105896510229e-07, + "loss": 0.4023, + "step": 6110 + }, + { + "epoch": 5.474060822898032, + "grad_norm": 1.8637781143188477, + "learning_rate": 8.249097472924187e-07, + "loss": 0.3988, + "step": 6120 + }, + { + "epoch": 5.483005366726297, + "grad_norm": 2.1386542320251465, + "learning_rate": 8.246089049338147e-07, + "loss": 0.3972, + "step": 6130 + }, + { + "epoch": 5.4919499105545615, + "grad_norm": 2.4520576000213623, + "learning_rate": 8.243080625752106e-07, + "loss": 0.3992, + "step": 6140 + }, + { + "epoch": 5.500894454382826, + "grad_norm": 2.33306884765625, + "learning_rate": 8.240072202166066e-07, + "loss": 0.4174, + "step": 6150 + }, + { + "epoch": 5.509838998211091, + "grad_norm": 2.198904514312744, + "learning_rate": 8.237063778580023e-07, + "loss": 0.391, + "step": 6160 + }, + { + "epoch": 5.518783542039356, + "grad_norm": 2.1780428886413574, + "learning_rate": 8.234055354993983e-07, + "loss": 0.3996, + "step": 6170 + }, + { + "epoch": 5.52772808586762, + "grad_norm": 2.1225438117980957, + "learning_rate": 8.231046931407942e-07, + "loss": 0.4143, + "step": 6180 + }, + { + "epoch": 5.536672629695886, + "grad_norm": 2.3154380321502686, + "learning_rate": 8.2280385078219e-07, + "loss": 0.406, + "step": 6190 + }, + { + "epoch": 5.545617173524151, + "grad_norm": 2.2880001068115234, + "learning_rate": 8.22503008423586e-07, + "loss": 0.4119, + "step": 6200 + }, + { + "epoch": 5.554561717352415, + "grad_norm": 1.9084720611572266, + "learning_rate": 8.222021660649819e-07, + "loss": 0.402, + "step": 6210 + }, + { + "epoch": 5.56350626118068, + "grad_norm": 1.8375004529953003, + "learning_rate": 8.219013237063778e-07, + "loss": 0.4025, + "step": 6220 + }, + { + "epoch": 5.572450805008945, + "grad_norm": 1.825980544090271, + "learning_rate": 8.216004813477737e-07, + "loss": 0.3976, + "step": 6230 + }, + { + "epoch": 5.5813953488372094, + "grad_norm": 1.917263388633728, + "learning_rate": 8.212996389891697e-07, + "loss": 0.3974, + "step": 6240 + }, + { + "epoch": 5.590339892665474, + "grad_norm": 1.8471791744232178, + "learning_rate": 8.209987966305656e-07, + "loss": 0.3988, + "step": 6250 + }, + { + "epoch": 5.599284436493739, + "grad_norm": 2.0131492614746094, + "learning_rate": 8.206979542719614e-07, + "loss": 0.3971, + "step": 6260 + }, + { + "epoch": 5.608228980322004, + "grad_norm": 1.9742953777313232, + "learning_rate": 8.203971119133573e-07, + "loss": 0.4083, + "step": 6270 + }, + { + "epoch": 5.617173524150268, + "grad_norm": 1.6688611507415771, + "learning_rate": 8.200962695547533e-07, + "loss": 0.3905, + "step": 6280 + }, + { + "epoch": 5.626118067978533, + "grad_norm": 2.111806631088257, + "learning_rate": 8.197954271961491e-07, + "loss": 0.3913, + "step": 6290 + }, + { + "epoch": 5.635062611806798, + "grad_norm": 2.0097107887268066, + "learning_rate": 8.194945848375451e-07, + "loss": 0.3915, + "step": 6300 + }, + { + "epoch": 5.644007155635062, + "grad_norm": 2.3371779918670654, + "learning_rate": 8.19193742478941e-07, + "loss": 0.4012, + "step": 6310 + }, + { + "epoch": 5.652951699463327, + "grad_norm": 2.0339999198913574, + "learning_rate": 8.18892900120337e-07, + "loss": 0.4054, + "step": 6320 + }, + { + "epoch": 5.661896243291592, + "grad_norm": 2.0710511207580566, + "learning_rate": 8.185920577617328e-07, + "loss": 0.4002, + "step": 6330 + }, + { + "epoch": 5.6708407871198565, + "grad_norm": 1.9887924194335938, + "learning_rate": 8.182912154031288e-07, + "loss": 0.3952, + "step": 6340 + }, + { + "epoch": 5.679785330948121, + "grad_norm": 2.114912986755371, + "learning_rate": 8.179903730445247e-07, + "loss": 0.3923, + "step": 6350 + }, + { + "epoch": 5.688729874776387, + "grad_norm": 1.895389437675476, + "learning_rate": 8.176895306859205e-07, + "loss": 0.4021, + "step": 6360 + }, + { + "epoch": 5.6976744186046515, + "grad_norm": 2.2383832931518555, + "learning_rate": 8.173886883273164e-07, + "loss": 0.3911, + "step": 6370 + }, + { + "epoch": 5.706618962432916, + "grad_norm": 1.9833958148956299, + "learning_rate": 8.170878459687123e-07, + "loss": 0.3896, + "step": 6380 + }, + { + "epoch": 5.715563506261181, + "grad_norm": 1.88776695728302, + "learning_rate": 8.167870036101082e-07, + "loss": 0.3753, + "step": 6390 + }, + { + "epoch": 5.724508050089446, + "grad_norm": 1.8492844104766846, + "learning_rate": 8.164861612515041e-07, + "loss": 0.3803, + "step": 6400 + }, + { + "epoch": 5.73345259391771, + "grad_norm": 2.098515033721924, + "learning_rate": 8.161853188929001e-07, + "loss": 0.3929, + "step": 6410 + }, + { + "epoch": 5.742397137745975, + "grad_norm": 2.0026450157165527, + "learning_rate": 8.15884476534296e-07, + "loss": 0.4026, + "step": 6420 + }, + { + "epoch": 5.75134168157424, + "grad_norm": 2.0872244834899902, + "learning_rate": 8.155836341756919e-07, + "loss": 0.3999, + "step": 6430 + }, + { + "epoch": 5.7602862254025045, + "grad_norm": 2.047790765762329, + "learning_rate": 8.152827918170878e-07, + "loss": 0.4006, + "step": 6440 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 2.2762393951416016, + "learning_rate": 8.149819494584838e-07, + "loss": 0.3964, + "step": 6450 + }, + { + "epoch": 5.778175313059034, + "grad_norm": 2.1704812049865723, + "learning_rate": 8.146811070998796e-07, + "loss": 0.387, + "step": 6460 + }, + { + "epoch": 5.787119856887299, + "grad_norm": 2.2697205543518066, + "learning_rate": 8.143802647412756e-07, + "loss": 0.3891, + "step": 6470 + }, + { + "epoch": 5.796064400715563, + "grad_norm": 2.1582305431365967, + "learning_rate": 8.140794223826714e-07, + "loss": 0.3932, + "step": 6480 + }, + { + "epoch": 5.805008944543828, + "grad_norm": 2.3086400032043457, + "learning_rate": 8.137785800240674e-07, + "loss": 0.3955, + "step": 6490 + }, + { + "epoch": 5.813953488372093, + "grad_norm": 1.9696953296661377, + "learning_rate": 8.134777376654632e-07, + "loss": 0.3912, + "step": 6500 + }, + { + "epoch": 5.822898032200357, + "grad_norm": 2.045171022415161, + "learning_rate": 8.131768953068592e-07, + "loss": 0.3898, + "step": 6510 + }, + { + "epoch": 5.831842576028622, + "grad_norm": 1.8143988847732544, + "learning_rate": 8.128760529482551e-07, + "loss": 0.3799, + "step": 6520 + }, + { + "epoch": 5.840787119856888, + "grad_norm": 2.1355209350585938, + "learning_rate": 8.125752105896509e-07, + "loss": 0.3877, + "step": 6530 + }, + { + "epoch": 5.849731663685152, + "grad_norm": 1.9495118856430054, + "learning_rate": 8.122743682310469e-07, + "loss": 0.3993, + "step": 6540 + }, + { + "epoch": 5.858676207513417, + "grad_norm": 2.00313401222229, + "learning_rate": 8.119735258724428e-07, + "loss": 0.3859, + "step": 6550 + }, + { + "epoch": 5.867620751341682, + "grad_norm": 2.0196080207824707, + "learning_rate": 8.116726835138387e-07, + "loss": 0.3979, + "step": 6560 + }, + { + "epoch": 5.8765652951699465, + "grad_norm": 2.0028293132781982, + "learning_rate": 8.113718411552346e-07, + "loss": 0.3915, + "step": 6570 + }, + { + "epoch": 5.885509838998211, + "grad_norm": 1.7704436779022217, + "learning_rate": 8.110709987966306e-07, + "loss": 0.3942, + "step": 6580 + }, + { + "epoch": 5.894454382826476, + "grad_norm": 1.9910413026809692, + "learning_rate": 8.107701564380264e-07, + "loss": 0.3909, + "step": 6590 + }, + { + "epoch": 5.903398926654741, + "grad_norm": 2.428554058074951, + "learning_rate": 8.104693140794223e-07, + "loss": 0.363, + "step": 6600 + }, + { + "epoch": 5.912343470483005, + "grad_norm": 2.052687406539917, + "learning_rate": 8.101684717208182e-07, + "loss": 0.3876, + "step": 6610 + }, + { + "epoch": 5.92128801431127, + "grad_norm": 2.212726354598999, + "learning_rate": 8.098676293622142e-07, + "loss": 0.3824, + "step": 6620 + }, + { + "epoch": 5.930232558139535, + "grad_norm": 1.9802848100662231, + "learning_rate": 8.0956678700361e-07, + "loss": 0.3696, + "step": 6630 + }, + { + "epoch": 5.9391771019677995, + "grad_norm": 2.106544256210327, + "learning_rate": 8.09265944645006e-07, + "loss": 0.3884, + "step": 6640 + }, + { + "epoch": 5.948121645796064, + "grad_norm": 1.7887368202209473, + "learning_rate": 8.089651022864019e-07, + "loss": 0.3764, + "step": 6650 + }, + { + "epoch": 5.957066189624329, + "grad_norm": 1.9476968050003052, + "learning_rate": 8.086642599277978e-07, + "loss": 0.3758, + "step": 6660 + }, + { + "epoch": 5.966010733452594, + "grad_norm": 1.7526689767837524, + "learning_rate": 8.083634175691937e-07, + "loss": 0.3862, + "step": 6670 + }, + { + "epoch": 5.974955277280858, + "grad_norm": 1.9941354990005493, + "learning_rate": 8.080625752105897e-07, + "loss": 0.3759, + "step": 6680 + }, + { + "epoch": 5.983899821109123, + "grad_norm": 2.159471273422241, + "learning_rate": 8.077617328519856e-07, + "loss": 0.3838, + "step": 6690 + }, + { + "epoch": 5.992844364937389, + "grad_norm": 2.2671890258789062, + "learning_rate": 8.074608904933814e-07, + "loss": 0.3745, + "step": 6700 + }, + { + "epoch": 6.0, + "eval_bleu": 64.8987, + "eval_gen_len": 75.4424, + "eval_loss": 0.26931023597717285, + "eval_runtime": 57.2125, + "eval_samples_per_second": 18.213, + "eval_steps_per_second": 0.192, + "step": 6708 + }, + { + "epoch": 6.001788908765653, + "grad_norm": 2.142197847366333, + "learning_rate": 8.071600481347773e-07, + "loss": 0.3731, + "step": 6710 + }, + { + "epoch": 6.010733452593918, + "grad_norm": 2.0640597343444824, + "learning_rate": 8.068592057761732e-07, + "loss": 0.3712, + "step": 6720 + }, + { + "epoch": 6.019677996422183, + "grad_norm": 2.0293760299682617, + "learning_rate": 8.065583634175691e-07, + "loss": 0.3621, + "step": 6730 + }, + { + "epoch": 6.028622540250447, + "grad_norm": 2.1380388736724854, + "learning_rate": 8.06257521058965e-07, + "loss": 0.3859, + "step": 6740 + }, + { + "epoch": 6.037567084078712, + "grad_norm": 2.00776743888855, + "learning_rate": 8.05956678700361e-07, + "loss": 0.372, + "step": 6750 + }, + { + "epoch": 6.046511627906977, + "grad_norm": 2.1911516189575195, + "learning_rate": 8.056558363417569e-07, + "loss": 0.3605, + "step": 6760 + }, + { + "epoch": 6.0554561717352415, + "grad_norm": 1.8570131063461304, + "learning_rate": 8.053549939831528e-07, + "loss": 0.3731, + "step": 6770 + }, + { + "epoch": 6.064400715563506, + "grad_norm": 2.439568519592285, + "learning_rate": 8.050541516245487e-07, + "loss": 0.3819, + "step": 6780 + }, + { + "epoch": 6.073345259391771, + "grad_norm": 2.1658380031585693, + "learning_rate": 8.047533092659447e-07, + "loss": 0.3862, + "step": 6790 + }, + { + "epoch": 6.082289803220036, + "grad_norm": 1.8695380687713623, + "learning_rate": 8.044524669073405e-07, + "loss": 0.3829, + "step": 6800 + }, + { + "epoch": 6.0912343470483, + "grad_norm": 1.9415037631988525, + "learning_rate": 8.041516245487365e-07, + "loss": 0.3825, + "step": 6810 + }, + { + "epoch": 6.100178890876565, + "grad_norm": 1.9677858352661133, + "learning_rate": 8.038507821901323e-07, + "loss": 0.3645, + "step": 6820 + }, + { + "epoch": 6.10912343470483, + "grad_norm": 2.04807710647583, + "learning_rate": 8.035499398315282e-07, + "loss": 0.3718, + "step": 6830 + }, + { + "epoch": 6.1180679785330945, + "grad_norm": 2.2417209148406982, + "learning_rate": 8.032490974729241e-07, + "loss": 0.3703, + "step": 6840 + }, + { + "epoch": 6.127012522361359, + "grad_norm": 2.086850881576538, + "learning_rate": 8.029482551143201e-07, + "loss": 0.3684, + "step": 6850 + }, + { + "epoch": 6.135957066189624, + "grad_norm": 2.112938165664673, + "learning_rate": 8.02647412755716e-07, + "loss": 0.383, + "step": 6860 + }, + { + "epoch": 6.1449016100178895, + "grad_norm": 2.30755615234375, + "learning_rate": 8.023465703971119e-07, + "loss": 0.3692, + "step": 6870 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 1.943410873413086, + "learning_rate": 8.020457280385078e-07, + "loss": 0.3741, + "step": 6880 + }, + { + "epoch": 6.162790697674419, + "grad_norm": 1.8077304363250732, + "learning_rate": 8.017448856799037e-07, + "loss": 0.3713, + "step": 6890 + }, + { + "epoch": 6.171735241502684, + "grad_norm": 2.3878605365753174, + "learning_rate": 8.014440433212996e-07, + "loss": 0.3672, + "step": 6900 + }, + { + "epoch": 6.180679785330948, + "grad_norm": 1.8652573823928833, + "learning_rate": 8.011432009626955e-07, + "loss": 0.3611, + "step": 6910 + }, + { + "epoch": 6.189624329159213, + "grad_norm": 1.9883804321289062, + "learning_rate": 8.008423586040915e-07, + "loss": 0.3632, + "step": 6920 + }, + { + "epoch": 6.198568872987478, + "grad_norm": 1.811181664466858, + "learning_rate": 8.005415162454873e-07, + "loss": 0.3643, + "step": 6930 + }, + { + "epoch": 6.207513416815742, + "grad_norm": 2.0005087852478027, + "learning_rate": 8.002406738868832e-07, + "loss": 0.38, + "step": 6940 + }, + { + "epoch": 6.216457960644007, + "grad_norm": 2.004239082336426, + "learning_rate": 7.999398315282791e-07, + "loss": 0.3682, + "step": 6950 + }, + { + "epoch": 6.225402504472272, + "grad_norm": 2.189047336578369, + "learning_rate": 7.996389891696751e-07, + "loss": 0.3688, + "step": 6960 + }, + { + "epoch": 6.2343470483005365, + "grad_norm": 1.7951610088348389, + "learning_rate": 7.993381468110709e-07, + "loss": 0.363, + "step": 6970 + }, + { + "epoch": 6.243291592128801, + "grad_norm": 2.3958542346954346, + "learning_rate": 7.990373044524669e-07, + "loss": 0.3753, + "step": 6980 + }, + { + "epoch": 6.252236135957066, + "grad_norm": 1.7929232120513916, + "learning_rate": 7.987364620938628e-07, + "loss": 0.3627, + "step": 6990 + }, + { + "epoch": 6.261180679785331, + "grad_norm": 2.1548123359680176, + "learning_rate": 7.984356197352587e-07, + "loss": 0.3801, + "step": 7000 + }, + { + "epoch": 6.270125223613595, + "grad_norm": 2.0325024127960205, + "learning_rate": 7.981347773766546e-07, + "loss": 0.3719, + "step": 7010 + }, + { + "epoch": 6.27906976744186, + "grad_norm": 1.8743184804916382, + "learning_rate": 7.978339350180506e-07, + "loss": 0.3743, + "step": 7020 + }, + { + "epoch": 6.288014311270125, + "grad_norm": 2.211442708969116, + "learning_rate": 7.975330926594465e-07, + "loss": 0.3474, + "step": 7030 + }, + { + "epoch": 6.29695885509839, + "grad_norm": 2.0763611793518066, + "learning_rate": 7.972322503008423e-07, + "loss": 0.366, + "step": 7040 + }, + { + "epoch": 6.305903398926655, + "grad_norm": 2.271003484725952, + "learning_rate": 7.969314079422382e-07, + "loss": 0.3742, + "step": 7050 + }, + { + "epoch": 6.31484794275492, + "grad_norm": 2.1758437156677246, + "learning_rate": 7.966305655836341e-07, + "loss": 0.3637, + "step": 7060 + }, + { + "epoch": 6.3237924865831845, + "grad_norm": 2.1391825675964355, + "learning_rate": 7.9632972322503e-07, + "loss": 0.3567, + "step": 7070 + }, + { + "epoch": 6.332737030411449, + "grad_norm": 1.9387537240982056, + "learning_rate": 7.960288808664259e-07, + "loss": 0.3645, + "step": 7080 + }, + { + "epoch": 6.341681574239714, + "grad_norm": 2.2183737754821777, + "learning_rate": 7.957280385078219e-07, + "loss": 0.3575, + "step": 7090 + }, + { + "epoch": 6.350626118067979, + "grad_norm": 1.9483188390731812, + "learning_rate": 7.954271961492178e-07, + "loss": 0.3747, + "step": 7100 + }, + { + "epoch": 6.359570661896243, + "grad_norm": 1.929976463317871, + "learning_rate": 7.951263537906137e-07, + "loss": 0.3479, + "step": 7110 + }, + { + "epoch": 6.368515205724508, + "grad_norm": 1.893912434577942, + "learning_rate": 7.948255114320096e-07, + "loss": 0.3543, + "step": 7120 + }, + { + "epoch": 6.377459749552773, + "grad_norm": 2.3432559967041016, + "learning_rate": 7.945246690734056e-07, + "loss": 0.3651, + "step": 7130 + }, + { + "epoch": 6.386404293381037, + "grad_norm": 2.40236759185791, + "learning_rate": 7.942238267148013e-07, + "loss": 0.3757, + "step": 7140 + }, + { + "epoch": 6.395348837209302, + "grad_norm": 1.7062771320343018, + "learning_rate": 7.939229843561973e-07, + "loss": 0.3597, + "step": 7150 + }, + { + "epoch": 6.404293381037567, + "grad_norm": 1.9003205299377441, + "learning_rate": 7.936221419975932e-07, + "loss": 0.3616, + "step": 7160 + }, + { + "epoch": 6.4132379248658316, + "grad_norm": 1.929549217224121, + "learning_rate": 7.933212996389891e-07, + "loss": 0.37, + "step": 7170 + }, + { + "epoch": 6.422182468694096, + "grad_norm": 2.0662312507629395, + "learning_rate": 7.93020457280385e-07, + "loss": 0.3654, + "step": 7180 + }, + { + "epoch": 6.431127012522361, + "grad_norm": 2.080864191055298, + "learning_rate": 7.92719614921781e-07, + "loss": 0.361, + "step": 7190 + }, + { + "epoch": 6.440071556350626, + "grad_norm": 2.1335203647613525, + "learning_rate": 7.924187725631769e-07, + "loss": 0.3767, + "step": 7200 + }, + { + "epoch": 6.449016100178891, + "grad_norm": 2.0065715312957764, + "learning_rate": 7.921179302045728e-07, + "loss": 0.3654, + "step": 7210 + }, + { + "epoch": 6.457960644007156, + "grad_norm": 2.2601537704467773, + "learning_rate": 7.918170878459687e-07, + "loss": 0.3665, + "step": 7220 + }, + { + "epoch": 6.466905187835421, + "grad_norm": 1.9954750537872314, + "learning_rate": 7.915162454873647e-07, + "loss": 0.3744, + "step": 7230 + }, + { + "epoch": 6.475849731663685, + "grad_norm": 2.185459852218628, + "learning_rate": 7.912154031287605e-07, + "loss": 0.3667, + "step": 7240 + }, + { + "epoch": 6.48479427549195, + "grad_norm": 1.9032912254333496, + "learning_rate": 7.909145607701563e-07, + "loss": 0.3529, + "step": 7250 + }, + { + "epoch": 6.493738819320215, + "grad_norm": 1.798946738243103, + "learning_rate": 7.906137184115523e-07, + "loss": 0.3671, + "step": 7260 + }, + { + "epoch": 6.5026833631484795, + "grad_norm": 2.072037935256958, + "learning_rate": 7.903128760529482e-07, + "loss": 0.3748, + "step": 7270 + }, + { + "epoch": 6.511627906976744, + "grad_norm": 2.0674805641174316, + "learning_rate": 7.900120336943441e-07, + "loss": 0.3539, + "step": 7280 + }, + { + "epoch": 6.520572450805009, + "grad_norm": 1.8857333660125732, + "learning_rate": 7.8971119133574e-07, + "loss": 0.3476, + "step": 7290 + }, + { + "epoch": 6.529516994633274, + "grad_norm": 1.8023921251296997, + "learning_rate": 7.89410348977136e-07, + "loss": 0.3654, + "step": 7300 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 1.7650887966156006, + "learning_rate": 7.891095066185318e-07, + "loss": 0.3548, + "step": 7310 + }, + { + "epoch": 6.547406082289803, + "grad_norm": 1.9362927675247192, + "learning_rate": 7.888086642599278e-07, + "loss": 0.3561, + "step": 7320 + }, + { + "epoch": 6.556350626118068, + "grad_norm": 1.8597862720489502, + "learning_rate": 7.885078219013237e-07, + "loss": 0.3621, + "step": 7330 + }, + { + "epoch": 6.565295169946332, + "grad_norm": 2.072204351425171, + "learning_rate": 7.882069795427196e-07, + "loss": 0.3599, + "step": 7340 + }, + { + "epoch": 6.574239713774597, + "grad_norm": 2.5796756744384766, + "learning_rate": 7.879061371841155e-07, + "loss": 0.3505, + "step": 7350 + }, + { + "epoch": 6.583184257602863, + "grad_norm": 2.2466342449188232, + "learning_rate": 7.876052948255115e-07, + "loss": 0.3551, + "step": 7360 + }, + { + "epoch": 6.592128801431127, + "grad_norm": 1.9930235147476196, + "learning_rate": 7.873044524669073e-07, + "loss": 0.3643, + "step": 7370 + }, + { + "epoch": 6.601073345259392, + "grad_norm": 2.447427988052368, + "learning_rate": 7.870036101083032e-07, + "loss": 0.3649, + "step": 7380 + }, + { + "epoch": 6.610017889087657, + "grad_norm": 1.9434752464294434, + "learning_rate": 7.867027677496991e-07, + "loss": 0.3573, + "step": 7390 + }, + { + "epoch": 6.618962432915922, + "grad_norm": 1.8593007326126099, + "learning_rate": 7.864019253910951e-07, + "loss": 0.3528, + "step": 7400 + }, + { + "epoch": 6.627906976744186, + "grad_norm": 1.7246472835540771, + "learning_rate": 7.861010830324909e-07, + "loss": 0.3397, + "step": 7410 + }, + { + "epoch": 6.636851520572451, + "grad_norm": 1.7897623777389526, + "learning_rate": 7.858002406738868e-07, + "loss": 0.3483, + "step": 7420 + }, + { + "epoch": 6.645796064400716, + "grad_norm": 1.6254141330718994, + "learning_rate": 7.854993983152828e-07, + "loss": 0.3543, + "step": 7430 + }, + { + "epoch": 6.65474060822898, + "grad_norm": 2.3027360439300537, + "learning_rate": 7.851985559566786e-07, + "loss": 0.3663, + "step": 7440 + }, + { + "epoch": 6.663685152057245, + "grad_norm": 1.8483282327651978, + "learning_rate": 7.848977135980746e-07, + "loss": 0.352, + "step": 7450 + }, + { + "epoch": 6.67262969588551, + "grad_norm": 1.87179696559906, + "learning_rate": 7.845968712394705e-07, + "loss": 0.3652, + "step": 7460 + }, + { + "epoch": 6.6815742397137745, + "grad_norm": 1.859720230102539, + "learning_rate": 7.842960288808665e-07, + "loss": 0.3572, + "step": 7470 + }, + { + "epoch": 6.690518783542039, + "grad_norm": 1.9132184982299805, + "learning_rate": 7.839951865222622e-07, + "loss": 0.3506, + "step": 7480 + }, + { + "epoch": 6.699463327370304, + "grad_norm": 2.097236394882202, + "learning_rate": 7.836943441636582e-07, + "loss": 0.3484, + "step": 7490 + }, + { + "epoch": 6.708407871198569, + "grad_norm": 1.7888951301574707, + "learning_rate": 7.833935018050541e-07, + "loss": 0.3555, + "step": 7500 + }, + { + "epoch": 6.717352415026833, + "grad_norm": 2.1230521202087402, + "learning_rate": 7.8309265944645e-07, + "loss": 0.345, + "step": 7510 + }, + { + "epoch": 6.726296958855098, + "grad_norm": 2.173988103866577, + "learning_rate": 7.827918170878459e-07, + "loss": 0.3505, + "step": 7520 + }, + { + "epoch": 6.735241502683364, + "grad_norm": 2.3823533058166504, + "learning_rate": 7.824909747292419e-07, + "loss": 0.3517, + "step": 7530 + }, + { + "epoch": 6.7441860465116275, + "grad_norm": 1.8840171098709106, + "learning_rate": 7.821901323706378e-07, + "loss": 0.3518, + "step": 7540 + }, + { + "epoch": 6.753130590339893, + "grad_norm": 1.7635117769241333, + "learning_rate": 7.818892900120337e-07, + "loss": 0.355, + "step": 7550 + }, + { + "epoch": 6.762075134168158, + "grad_norm": 2.197007656097412, + "learning_rate": 7.815884476534296e-07, + "loss": 0.3512, + "step": 7560 + }, + { + "epoch": 6.7710196779964225, + "grad_norm": 1.796606421470642, + "learning_rate": 7.812876052948256e-07, + "loss": 0.3483, + "step": 7570 + }, + { + "epoch": 6.779964221824687, + "grad_norm": 1.990676760673523, + "learning_rate": 7.809867629362213e-07, + "loss": 0.3528, + "step": 7580 + }, + { + "epoch": 6.788908765652952, + "grad_norm": 1.8666518926620483, + "learning_rate": 7.806859205776172e-07, + "loss": 0.3449, + "step": 7590 + }, + { + "epoch": 6.797853309481217, + "grad_norm": 1.7525831460952759, + "learning_rate": 7.803850782190132e-07, + "loss": 0.3534, + "step": 7600 + }, + { + "epoch": 6.806797853309481, + "grad_norm": 2.3835935592651367, + "learning_rate": 7.80084235860409e-07, + "loss": 0.3486, + "step": 7610 + }, + { + "epoch": 6.815742397137746, + "grad_norm": 1.987086296081543, + "learning_rate": 7.79783393501805e-07, + "loss": 0.3554, + "step": 7620 + }, + { + "epoch": 6.824686940966011, + "grad_norm": 1.9066219329833984, + "learning_rate": 7.794825511432009e-07, + "loss": 0.3469, + "step": 7630 + }, + { + "epoch": 6.833631484794275, + "grad_norm": 1.7373526096343994, + "learning_rate": 7.791817087845969e-07, + "loss": 0.3421, + "step": 7640 + }, + { + "epoch": 6.84257602862254, + "grad_norm": 1.9056932926177979, + "learning_rate": 7.788808664259927e-07, + "loss": 0.3489, + "step": 7650 + }, + { + "epoch": 6.851520572450805, + "grad_norm": 1.7385718822479248, + "learning_rate": 7.785800240673887e-07, + "loss": 0.3528, + "step": 7660 + }, + { + "epoch": 6.8604651162790695, + "grad_norm": 2.358637571334839, + "learning_rate": 7.782791817087846e-07, + "loss": 0.3438, + "step": 7670 + }, + { + "epoch": 6.869409660107334, + "grad_norm": 2.082883834838867, + "learning_rate": 7.779783393501805e-07, + "loss": 0.3635, + "step": 7680 + }, + { + "epoch": 6.878354203935599, + "grad_norm": 1.8540581464767456, + "learning_rate": 7.776774969915763e-07, + "loss": 0.3412, + "step": 7690 + }, + { + "epoch": 6.8872987477638645, + "grad_norm": 2.0504400730133057, + "learning_rate": 7.773766546329723e-07, + "loss": 0.3476, + "step": 7700 + }, + { + "epoch": 6.896243291592128, + "grad_norm": 1.7489218711853027, + "learning_rate": 7.770758122743682e-07, + "loss": 0.3459, + "step": 7710 + }, + { + "epoch": 6.905187835420394, + "grad_norm": 1.8986119031906128, + "learning_rate": 7.767749699157641e-07, + "loss": 0.3532, + "step": 7720 + }, + { + "epoch": 6.914132379248659, + "grad_norm": 2.127493143081665, + "learning_rate": 7.7647412755716e-07, + "loss": 0.3532, + "step": 7730 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 1.7227003574371338, + "learning_rate": 7.76173285198556e-07, + "loss": 0.3277, + "step": 7740 + }, + { + "epoch": 6.932021466905188, + "grad_norm": 1.9514144659042358, + "learning_rate": 7.758724428399518e-07, + "loss": 0.3448, + "step": 7750 + }, + { + "epoch": 6.940966010733453, + "grad_norm": 2.035494089126587, + "learning_rate": 7.755716004813477e-07, + "loss": 0.3462, + "step": 7760 + }, + { + "epoch": 6.9499105545617175, + "grad_norm": 1.9275267124176025, + "learning_rate": 7.752707581227437e-07, + "loss": 0.3435, + "step": 7770 + }, + { + "epoch": 6.958855098389982, + "grad_norm": 1.8266222476959229, + "learning_rate": 7.749699157641395e-07, + "loss": 0.3354, + "step": 7780 + }, + { + "epoch": 6.967799642218247, + "grad_norm": 2.1054952144622803, + "learning_rate": 7.746690734055355e-07, + "loss": 0.348, + "step": 7790 + }, + { + "epoch": 6.976744186046512, + "grad_norm": 1.929856300354004, + "learning_rate": 7.743682310469313e-07, + "loss": 0.3368, + "step": 7800 + }, + { + "epoch": 6.985688729874776, + "grad_norm": 1.9243886470794678, + "learning_rate": 7.740673886883273e-07, + "loss": 0.339, + "step": 7810 + }, + { + "epoch": 6.994633273703041, + "grad_norm": 1.7127997875213623, + "learning_rate": 7.737665463297231e-07, + "loss": 0.3552, + "step": 7820 + }, + { + "epoch": 7.0, + "eval_bleu": 67.7607, + "eval_gen_len": 75.2438, + "eval_loss": 0.24548278748989105, + "eval_runtime": 55.6065, + "eval_samples_per_second": 18.739, + "eval_steps_per_second": 0.198, + "step": 7826 + }, + { + "epoch": 7.003577817531306, + "grad_norm": 1.4987674951553345, + "learning_rate": 7.734657039711191e-07, + "loss": 0.3365, + "step": 7830 + }, + { + "epoch": 7.01252236135957, + "grad_norm": 1.9626741409301758, + "learning_rate": 7.73164861612515e-07, + "loss": 0.3284, + "step": 7840 + }, + { + "epoch": 7.021466905187835, + "grad_norm": 2.43878436088562, + "learning_rate": 7.728640192539109e-07, + "loss": 0.3435, + "step": 7850 + }, + { + "epoch": 7.0304114490161, + "grad_norm": 2.184654712677002, + "learning_rate": 7.725631768953068e-07, + "loss": 0.3573, + "step": 7860 + }, + { + "epoch": 7.0393559928443645, + "grad_norm": 2.2012953758239746, + "learning_rate": 7.722623345367028e-07, + "loss": 0.3417, + "step": 7870 + }, + { + "epoch": 7.04830053667263, + "grad_norm": 2.001495599746704, + "learning_rate": 7.719614921780987e-07, + "loss": 0.3223, + "step": 7880 + }, + { + "epoch": 7.057245080500895, + "grad_norm": 1.6574831008911133, + "learning_rate": 7.716606498194946e-07, + "loss": 0.3331, + "step": 7890 + }, + { + "epoch": 7.0661896243291595, + "grad_norm": 1.8421671390533447, + "learning_rate": 7.713598074608905e-07, + "loss": 0.3402, + "step": 7900 + }, + { + "epoch": 7.075134168157424, + "grad_norm": 2.073538303375244, + "learning_rate": 7.710589651022865e-07, + "loss": 0.3427, + "step": 7910 + }, + { + "epoch": 7.084078711985689, + "grad_norm": 1.7038462162017822, + "learning_rate": 7.707581227436822e-07, + "loss": 0.3423, + "step": 7920 + }, + { + "epoch": 7.093023255813954, + "grad_norm": 1.9820359945297241, + "learning_rate": 7.704572803850781e-07, + "loss": 0.3418, + "step": 7930 + }, + { + "epoch": 7.101967799642218, + "grad_norm": 1.8462133407592773, + "learning_rate": 7.701564380264741e-07, + "loss": 0.3441, + "step": 7940 + }, + { + "epoch": 7.110912343470483, + "grad_norm": 1.9584110975265503, + "learning_rate": 7.698555956678699e-07, + "loss": 0.3414, + "step": 7950 + }, + { + "epoch": 7.119856887298748, + "grad_norm": 2.0095632076263428, + "learning_rate": 7.695547533092659e-07, + "loss": 0.3334, + "step": 7960 + }, + { + "epoch": 7.1288014311270125, + "grad_norm": 2.040109395980835, + "learning_rate": 7.692539109506618e-07, + "loss": 0.3381, + "step": 7970 + }, + { + "epoch": 7.137745974955277, + "grad_norm": 1.7459338903427124, + "learning_rate": 7.689530685920578e-07, + "loss": 0.3286, + "step": 7980 + }, + { + "epoch": 7.146690518783542, + "grad_norm": 1.7973277568817139, + "learning_rate": 7.686522262334536e-07, + "loss": 0.3241, + "step": 7990 + }, + { + "epoch": 7.155635062611807, + "grad_norm": 2.1315131187438965, + "learning_rate": 7.683513838748496e-07, + "loss": 0.3131, + "step": 8000 + }, + { + "epoch": 7.164579606440071, + "grad_norm": 2.5606272220611572, + "learning_rate": 7.680505415162455e-07, + "loss": 0.3306, + "step": 8010 + }, + { + "epoch": 7.173524150268336, + "grad_norm": 1.9568837881088257, + "learning_rate": 7.677496991576414e-07, + "loss": 0.3286, + "step": 8020 + }, + { + "epoch": 7.182468694096601, + "grad_norm": 1.9463409185409546, + "learning_rate": 7.674488567990372e-07, + "loss": 0.3464, + "step": 8030 + }, + { + "epoch": 7.191413237924865, + "grad_norm": 1.9276138544082642, + "learning_rate": 7.671480144404332e-07, + "loss": 0.3437, + "step": 8040 + }, + { + "epoch": 7.200357781753131, + "grad_norm": 1.7747093439102173, + "learning_rate": 7.668471720818291e-07, + "loss": 0.3372, + "step": 8050 + }, + { + "epoch": 7.209302325581396, + "grad_norm": 1.7246421575546265, + "learning_rate": 7.66546329723225e-07, + "loss": 0.3437, + "step": 8060 + }, + { + "epoch": 7.21824686940966, + "grad_norm": 2.0087859630584717, + "learning_rate": 7.662454873646209e-07, + "loss": 0.3437, + "step": 8070 + }, + { + "epoch": 7.227191413237925, + "grad_norm": 1.9562363624572754, + "learning_rate": 7.659446450060169e-07, + "loss": 0.3148, + "step": 8080 + }, + { + "epoch": 7.23613595706619, + "grad_norm": 1.9914944171905518, + "learning_rate": 7.656438026474127e-07, + "loss": 0.3327, + "step": 8090 + }, + { + "epoch": 7.2450805008944545, + "grad_norm": 2.276630401611328, + "learning_rate": 7.653429602888086e-07, + "loss": 0.3285, + "step": 8100 + }, + { + "epoch": 7.254025044722719, + "grad_norm": 1.6960512399673462, + "learning_rate": 7.650421179302046e-07, + "loss": 0.3303, + "step": 8110 + }, + { + "epoch": 7.262969588550984, + "grad_norm": 1.9524924755096436, + "learning_rate": 7.647412755716004e-07, + "loss": 0.3328, + "step": 8120 + }, + { + "epoch": 7.271914132379249, + "grad_norm": 1.7584964036941528, + "learning_rate": 7.644404332129964e-07, + "loss": 0.3264, + "step": 8130 + }, + { + "epoch": 7.280858676207513, + "grad_norm": 1.9152427911758423, + "learning_rate": 7.641395908543922e-07, + "loss": 0.333, + "step": 8140 + }, + { + "epoch": 7.289803220035778, + "grad_norm": 1.6606615781784058, + "learning_rate": 7.638387484957882e-07, + "loss": 0.328, + "step": 8150 + }, + { + "epoch": 7.298747763864043, + "grad_norm": 1.8712750673294067, + "learning_rate": 7.63537906137184e-07, + "loss": 0.3203, + "step": 8160 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 1.7427736520767212, + "learning_rate": 7.6323706377858e-07, + "loss": 0.3284, + "step": 8170 + }, + { + "epoch": 7.316636851520572, + "grad_norm": 2.324427366256714, + "learning_rate": 7.629362214199759e-07, + "loss": 0.3354, + "step": 8180 + }, + { + "epoch": 7.325581395348837, + "grad_norm": 1.9044541120529175, + "learning_rate": 7.626353790613718e-07, + "loss": 0.3379, + "step": 8190 + }, + { + "epoch": 7.334525939177102, + "grad_norm": 2.359497547149658, + "learning_rate": 7.623345367027677e-07, + "loss": 0.3252, + "step": 8200 + }, + { + "epoch": 7.343470483005366, + "grad_norm": 2.01823091506958, + "learning_rate": 7.620336943441637e-07, + "loss": 0.3203, + "step": 8210 + }, + { + "epoch": 7.352415026833631, + "grad_norm": 2.5603623390197754, + "learning_rate": 7.617328519855595e-07, + "loss": 0.3251, + "step": 8220 + }, + { + "epoch": 7.361359570661897, + "grad_norm": 2.2850921154022217, + "learning_rate": 7.614320096269555e-07, + "loss": 0.3468, + "step": 8230 + }, + { + "epoch": 7.370304114490161, + "grad_norm": 1.7620161771774292, + "learning_rate": 7.611311672683514e-07, + "loss": 0.3267, + "step": 8240 + }, + { + "epoch": 7.379248658318426, + "grad_norm": 1.9929213523864746, + "learning_rate": 7.608303249097473e-07, + "loss": 0.3309, + "step": 8250 + }, + { + "epoch": 7.388193202146691, + "grad_norm": 1.846187710762024, + "learning_rate": 7.605294825511431e-07, + "loss": 0.3391, + "step": 8260 + }, + { + "epoch": 7.397137745974955, + "grad_norm": 1.825868844985962, + "learning_rate": 7.60228640192539e-07, + "loss": 0.3348, + "step": 8270 + }, + { + "epoch": 7.40608228980322, + "grad_norm": 1.8973084688186646, + "learning_rate": 7.59927797833935e-07, + "loss": 0.3282, + "step": 8280 + }, + { + "epoch": 7.415026833631485, + "grad_norm": 2.0674967765808105, + "learning_rate": 7.596269554753308e-07, + "loss": 0.322, + "step": 8290 + }, + { + "epoch": 7.4239713774597496, + "grad_norm": 1.8808948993682861, + "learning_rate": 7.593261131167268e-07, + "loss": 0.3143, + "step": 8300 + }, + { + "epoch": 7.432915921288014, + "grad_norm": 2.2094178199768066, + "learning_rate": 7.590252707581227e-07, + "loss": 0.3319, + "step": 8310 + }, + { + "epoch": 7.441860465116279, + "grad_norm": 1.8964875936508179, + "learning_rate": 7.587244283995187e-07, + "loss": 0.3245, + "step": 8320 + }, + { + "epoch": 7.450805008944544, + "grad_norm": 1.961187481880188, + "learning_rate": 7.584235860409145e-07, + "loss": 0.3192, + "step": 8330 + }, + { + "epoch": 7.459749552772808, + "grad_norm": 2.1825661659240723, + "learning_rate": 7.581227436823105e-07, + "loss": 0.3427, + "step": 8340 + }, + { + "epoch": 7.468694096601073, + "grad_norm": 1.9353907108306885, + "learning_rate": 7.578219013237064e-07, + "loss": 0.315, + "step": 8350 + }, + { + "epoch": 7.477638640429338, + "grad_norm": 1.852070927619934, + "learning_rate": 7.575210589651022e-07, + "loss": 0.3275, + "step": 8360 + }, + { + "epoch": 7.4865831842576025, + "grad_norm": 1.8378336429595947, + "learning_rate": 7.572202166064981e-07, + "loss": 0.323, + "step": 8370 + }, + { + "epoch": 7.495527728085868, + "grad_norm": 1.6793603897094727, + "learning_rate": 7.569193742478941e-07, + "loss": 0.3284, + "step": 8380 + }, + { + "epoch": 7.504472271914132, + "grad_norm": 1.8029990196228027, + "learning_rate": 7.566185318892899e-07, + "loss": 0.3216, + "step": 8390 + }, + { + "epoch": 7.5134168157423975, + "grad_norm": 1.9036970138549805, + "learning_rate": 7.563176895306859e-07, + "loss": 0.324, + "step": 8400 + }, + { + "epoch": 7.522361359570662, + "grad_norm": 1.8701826333999634, + "learning_rate": 7.560168471720818e-07, + "loss": 0.3232, + "step": 8410 + }, + { + "epoch": 7.531305903398927, + "grad_norm": 1.6452906131744385, + "learning_rate": 7.557160048134778e-07, + "loss": 0.3327, + "step": 8420 + }, + { + "epoch": 7.540250447227192, + "grad_norm": 1.9045932292938232, + "learning_rate": 7.554151624548736e-07, + "loss": 0.3254, + "step": 8430 + }, + { + "epoch": 7.549194991055456, + "grad_norm": 1.8441321849822998, + "learning_rate": 7.551143200962695e-07, + "loss": 0.3255, + "step": 8440 + }, + { + "epoch": 7.558139534883721, + "grad_norm": 1.7707712650299072, + "learning_rate": 7.548134777376655e-07, + "loss": 0.33, + "step": 8450 + }, + { + "epoch": 7.567084078711986, + "grad_norm": 1.815173625946045, + "learning_rate": 7.545126353790612e-07, + "loss": 0.3307, + "step": 8460 + }, + { + "epoch": 7.5760286225402504, + "grad_norm": 1.857528805732727, + "learning_rate": 7.542117930204572e-07, + "loss": 0.3196, + "step": 8470 + }, + { + "epoch": 7.584973166368515, + "grad_norm": 2.055410623550415, + "learning_rate": 7.539109506618531e-07, + "loss": 0.3183, + "step": 8480 + }, + { + "epoch": 7.59391771019678, + "grad_norm": 1.8891446590423584, + "learning_rate": 7.536101083032491e-07, + "loss": 0.3357, + "step": 8490 + }, + { + "epoch": 7.602862254025045, + "grad_norm": 1.4552106857299805, + "learning_rate": 7.533092659446449e-07, + "loss": 0.314, + "step": 8500 + }, + { + "epoch": 7.611806797853309, + "grad_norm": 1.7067893743515015, + "learning_rate": 7.530084235860409e-07, + "loss": 0.3181, + "step": 8510 + }, + { + "epoch": 7.620751341681574, + "grad_norm": 1.9766528606414795, + "learning_rate": 7.527075812274368e-07, + "loss": 0.3173, + "step": 8520 + }, + { + "epoch": 7.629695885509839, + "grad_norm": 1.8885570764541626, + "learning_rate": 7.524067388688327e-07, + "loss": 0.32, + "step": 8530 + }, + { + "epoch": 7.638640429338103, + "grad_norm": 2.078587055206299, + "learning_rate": 7.521058965102286e-07, + "loss": 0.307, + "step": 8540 + }, + { + "epoch": 7.647584973166369, + "grad_norm": 1.9613964557647705, + "learning_rate": 7.518050541516246e-07, + "loss": 0.3182, + "step": 8550 + }, + { + "epoch": 7.656529516994633, + "grad_norm": 1.9764385223388672, + "learning_rate": 7.515042117930204e-07, + "loss": 0.3352, + "step": 8560 + }, + { + "epoch": 7.665474060822898, + "grad_norm": 2.0007693767547607, + "learning_rate": 7.512033694344164e-07, + "loss": 0.3244, + "step": 8570 + }, + { + "epoch": 7.674418604651163, + "grad_norm": 1.6230690479278564, + "learning_rate": 7.509025270758122e-07, + "loss": 0.3219, + "step": 8580 + }, + { + "epoch": 7.683363148479428, + "grad_norm": 1.8112707138061523, + "learning_rate": 7.506016847172082e-07, + "loss": 0.3196, + "step": 8590 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 1.8433512449264526, + "learning_rate": 7.50300842358604e-07, + "loss": 0.3201, + "step": 8600 + }, + { + "epoch": 7.701252236135957, + "grad_norm": 2.180025100708008, + "learning_rate": 7.5e-07, + "loss": 0.3189, + "step": 8610 + }, + { + "epoch": 7.710196779964222, + "grad_norm": 1.7990797758102417, + "learning_rate": 7.496991576413959e-07, + "loss": 0.3161, + "step": 8620 + }, + { + "epoch": 7.719141323792487, + "grad_norm": 1.9342771768569946, + "learning_rate": 7.493983152827917e-07, + "loss": 0.3198, + "step": 8630 + }, + { + "epoch": 7.728085867620751, + "grad_norm": 1.7878928184509277, + "learning_rate": 7.490974729241877e-07, + "loss": 0.3136, + "step": 8640 + }, + { + "epoch": 7.737030411449016, + "grad_norm": 1.9334170818328857, + "learning_rate": 7.487966305655836e-07, + "loss": 0.3101, + "step": 8650 + }, + { + "epoch": 7.745974955277281, + "grad_norm": 1.6836000680923462, + "learning_rate": 7.484957882069796e-07, + "loss": 0.3102, + "step": 8660 + }, + { + "epoch": 7.7549194991055455, + "grad_norm": 2.0518503189086914, + "learning_rate": 7.481949458483754e-07, + "loss": 0.3089, + "step": 8670 + }, + { + "epoch": 7.76386404293381, + "grad_norm": 1.9015884399414062, + "learning_rate": 7.478941034897714e-07, + "loss": 0.3116, + "step": 8680 + }, + { + "epoch": 7.772808586762075, + "grad_norm": 1.910198450088501, + "learning_rate": 7.475932611311672e-07, + "loss": 0.3133, + "step": 8690 + }, + { + "epoch": 7.78175313059034, + "grad_norm": 1.6957149505615234, + "learning_rate": 7.472924187725631e-07, + "loss": 0.3079, + "step": 8700 + }, + { + "epoch": 7.790697674418604, + "grad_norm": 1.673759937286377, + "learning_rate": 7.46991576413959e-07, + "loss": 0.3109, + "step": 8710 + }, + { + "epoch": 7.79964221824687, + "grad_norm": 1.676411509513855, + "learning_rate": 7.46690734055355e-07, + "loss": 0.3271, + "step": 8720 + }, + { + "epoch": 7.808586762075134, + "grad_norm": 1.729395866394043, + "learning_rate": 7.463898916967508e-07, + "loss": 0.3222, + "step": 8730 + }, + { + "epoch": 7.817531305903399, + "grad_norm": 1.7788453102111816, + "learning_rate": 7.460890493381468e-07, + "loss": 0.3099, + "step": 8740 + }, + { + "epoch": 7.826475849731664, + "grad_norm": 1.9072816371917725, + "learning_rate": 7.457882069795427e-07, + "loss": 0.3078, + "step": 8750 + }, + { + "epoch": 7.835420393559929, + "grad_norm": 1.8437711000442505, + "learning_rate": 7.454873646209387e-07, + "loss": 0.3151, + "step": 8760 + }, + { + "epoch": 7.844364937388193, + "grad_norm": 1.8351961374282837, + "learning_rate": 7.451865222623345e-07, + "loss": 0.3233, + "step": 8770 + }, + { + "epoch": 7.853309481216458, + "grad_norm": 1.892714023590088, + "learning_rate": 7.448856799037305e-07, + "loss": 0.3242, + "step": 8780 + }, + { + "epoch": 7.862254025044723, + "grad_norm": 1.8229130506515503, + "learning_rate": 7.445848375451264e-07, + "loss": 0.3077, + "step": 8790 + }, + { + "epoch": 7.8711985688729875, + "grad_norm": 1.7857228517532349, + "learning_rate": 7.442839951865221e-07, + "loss": 0.3269, + "step": 8800 + }, + { + "epoch": 7.880143112701252, + "grad_norm": 1.8515849113464355, + "learning_rate": 7.439831528279181e-07, + "loss": 0.3163, + "step": 8810 + }, + { + "epoch": 7.889087656529517, + "grad_norm": 1.964338779449463, + "learning_rate": 7.43682310469314e-07, + "loss": 0.3207, + "step": 8820 + }, + { + "epoch": 7.898032200357782, + "grad_norm": 1.8012237548828125, + "learning_rate": 7.4338146811071e-07, + "loss": 0.3204, + "step": 8830 + }, + { + "epoch": 7.906976744186046, + "grad_norm": 1.8915505409240723, + "learning_rate": 7.430806257521058e-07, + "loss": 0.3157, + "step": 8840 + }, + { + "epoch": 7.915921288014311, + "grad_norm": 1.9770463705062866, + "learning_rate": 7.427797833935018e-07, + "loss": 0.302, + "step": 8850 + }, + { + "epoch": 7.924865831842576, + "grad_norm": 1.6322442293167114, + "learning_rate": 7.424789410348977e-07, + "loss": 0.3229, + "step": 8860 + }, + { + "epoch": 7.9338103756708405, + "grad_norm": 2.0155563354492188, + "learning_rate": 7.421780986762936e-07, + "loss": 0.3021, + "step": 8870 + }, + { + "epoch": 7.942754919499105, + "grad_norm": 1.6682124137878418, + "learning_rate": 7.418772563176895e-07, + "loss": 0.3259, + "step": 8880 + }, + { + "epoch": 7.951699463327371, + "grad_norm": 1.604107141494751, + "learning_rate": 7.415764139590855e-07, + "loss": 0.3006, + "step": 8890 + }, + { + "epoch": 7.960644007155635, + "grad_norm": 2.204362630844116, + "learning_rate": 7.412755716004812e-07, + "loss": 0.31, + "step": 8900 + }, + { + "epoch": 7.9695885509839, + "grad_norm": 1.7704609632492065, + "learning_rate": 7.409747292418772e-07, + "loss": 0.3284, + "step": 8910 + }, + { + "epoch": 7.978533094812165, + "grad_norm": 2.1972293853759766, + "learning_rate": 7.406738868832731e-07, + "loss": 0.32, + "step": 8920 + }, + { + "epoch": 7.98747763864043, + "grad_norm": 1.8060202598571777, + "learning_rate": 7.403730445246691e-07, + "loss": 0.3107, + "step": 8930 + }, + { + "epoch": 7.996422182468694, + "grad_norm": 1.6772303581237793, + "learning_rate": 7.400722021660649e-07, + "loss": 0.3324, + "step": 8940 + }, + { + "epoch": 8.0, + "eval_bleu": 69.635, + "eval_gen_len": 75.1036, + "eval_loss": 0.22736571729183197, + "eval_runtime": 56.5288, + "eval_samples_per_second": 18.433, + "eval_steps_per_second": 0.195, + "step": 8944 + }, + { + "epoch": 8.005366726296959, + "grad_norm": 1.878150463104248, + "learning_rate": 7.397713598074609e-07, + "loss": 0.3096, + "step": 8950 + }, + { + "epoch": 8.014311270125223, + "grad_norm": 1.708453893661499, + "learning_rate": 7.394705174488568e-07, + "loss": 0.3129, + "step": 8960 + }, + { + "epoch": 8.023255813953488, + "grad_norm": 2.111327886581421, + "learning_rate": 7.391696750902526e-07, + "loss": 0.3115, + "step": 8970 + }, + { + "epoch": 8.032200357781754, + "grad_norm": 1.8746176958084106, + "learning_rate": 7.388688327316486e-07, + "loss": 0.3133, + "step": 8980 + }, + { + "epoch": 8.041144901610018, + "grad_norm": 1.7758466005325317, + "learning_rate": 7.385679903730445e-07, + "loss": 0.3204, + "step": 8990 + }, + { + "epoch": 8.050089445438283, + "grad_norm": 1.8004777431488037, + "learning_rate": 7.382671480144405e-07, + "loss": 0.3175, + "step": 9000 + }, + { + "epoch": 8.059033989266547, + "grad_norm": 1.8340625762939453, + "learning_rate": 7.379663056558362e-07, + "loss": 0.3095, + "step": 9010 + }, + { + "epoch": 8.067978533094813, + "grad_norm": 1.6976382732391357, + "learning_rate": 7.376654632972322e-07, + "loss": 0.3202, + "step": 9020 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 1.9153364896774292, + "learning_rate": 7.373646209386281e-07, + "loss": 0.3142, + "step": 9030 + }, + { + "epoch": 8.085867620751342, + "grad_norm": 1.8551111221313477, + "learning_rate": 7.37063778580024e-07, + "loss": 0.2903, + "step": 9040 + }, + { + "epoch": 8.094812164579606, + "grad_norm": 1.7493977546691895, + "learning_rate": 7.367629362214199e-07, + "loss": 0.3034, + "step": 9050 + }, + { + "epoch": 8.103756708407872, + "grad_norm": 1.7226780652999878, + "learning_rate": 7.364620938628159e-07, + "loss": 0.309, + "step": 9060 + }, + { + "epoch": 8.112701252236135, + "grad_norm": 2.0469791889190674, + "learning_rate": 7.361612515042117e-07, + "loss": 0.3073, + "step": 9070 + }, + { + "epoch": 8.121645796064401, + "grad_norm": 1.8778507709503174, + "learning_rate": 7.358604091456077e-07, + "loss": 0.3106, + "step": 9080 + }, + { + "epoch": 8.130590339892665, + "grad_norm": 2.180492639541626, + "learning_rate": 7.355595667870036e-07, + "loss": 0.3061, + "step": 9090 + }, + { + "epoch": 8.13953488372093, + "grad_norm": 1.9905855655670166, + "learning_rate": 7.352587244283996e-07, + "loss": 0.315, + "step": 9100 + }, + { + "epoch": 8.148479427549194, + "grad_norm": 1.7467095851898193, + "learning_rate": 7.349578820697954e-07, + "loss": 0.2965, + "step": 9110 + }, + { + "epoch": 8.15742397137746, + "grad_norm": 1.6741291284561157, + "learning_rate": 7.346570397111914e-07, + "loss": 0.2965, + "step": 9120 + }, + { + "epoch": 8.166368515205724, + "grad_norm": 1.6304727792739868, + "learning_rate": 7.343561973525872e-07, + "loss": 0.3133, + "step": 9130 + }, + { + "epoch": 8.17531305903399, + "grad_norm": 1.9069669246673584, + "learning_rate": 7.34055354993983e-07, + "loss": 0.3063, + "step": 9140 + }, + { + "epoch": 8.184257602862255, + "grad_norm": 1.7819583415985107, + "learning_rate": 7.33754512635379e-07, + "loss": 0.3107, + "step": 9150 + }, + { + "epoch": 8.193202146690519, + "grad_norm": 2.1937334537506104, + "learning_rate": 7.334536702767749e-07, + "loss": 0.312, + "step": 9160 + }, + { + "epoch": 8.202146690518784, + "grad_norm": 2.098236083984375, + "learning_rate": 7.331528279181708e-07, + "loss": 0.3091, + "step": 9170 + }, + { + "epoch": 8.211091234347048, + "grad_norm": 1.8808610439300537, + "learning_rate": 7.328519855595667e-07, + "loss": 0.3014, + "step": 9180 + }, + { + "epoch": 8.220035778175314, + "grad_norm": 2.0765531063079834, + "learning_rate": 7.325511432009627e-07, + "loss": 0.3057, + "step": 9190 + }, + { + "epoch": 8.228980322003578, + "grad_norm": 1.7418488264083862, + "learning_rate": 7.322503008423586e-07, + "loss": 0.2948, + "step": 9200 + }, + { + "epoch": 8.237924865831843, + "grad_norm": 2.491260528564453, + "learning_rate": 7.319494584837545e-07, + "loss": 0.2992, + "step": 9210 + }, + { + "epoch": 8.246869409660107, + "grad_norm": 1.9340630769729614, + "learning_rate": 7.316486161251504e-07, + "loss": 0.3065, + "step": 9220 + }, + { + "epoch": 8.255813953488373, + "grad_norm": 1.5319713354110718, + "learning_rate": 7.313477737665464e-07, + "loss": 0.2944, + "step": 9230 + }, + { + "epoch": 8.264758497316636, + "grad_norm": 1.8187763690948486, + "learning_rate": 7.310469314079421e-07, + "loss": 0.3155, + "step": 9240 + }, + { + "epoch": 8.273703041144902, + "grad_norm": 2.0588202476501465, + "learning_rate": 7.307460890493381e-07, + "loss": 0.3024, + "step": 9250 + }, + { + "epoch": 8.282647584973166, + "grad_norm": 1.6918816566467285, + "learning_rate": 7.30445246690734e-07, + "loss": 0.2959, + "step": 9260 + }, + { + "epoch": 8.291592128801431, + "grad_norm": 1.8093841075897217, + "learning_rate": 7.3014440433213e-07, + "loss": 0.3049, + "step": 9270 + }, + { + "epoch": 8.300536672629695, + "grad_norm": 1.782524824142456, + "learning_rate": 7.298435619735258e-07, + "loss": 0.2912, + "step": 9280 + }, + { + "epoch": 8.30948121645796, + "grad_norm": 1.842565894126892, + "learning_rate": 7.295427196149218e-07, + "loss": 0.2942, + "step": 9290 + }, + { + "epoch": 8.318425760286225, + "grad_norm": 1.786942958831787, + "learning_rate": 7.292418772563177e-07, + "loss": 0.2944, + "step": 9300 + }, + { + "epoch": 8.32737030411449, + "grad_norm": 2.370697021484375, + "learning_rate": 7.289410348977135e-07, + "loss": 0.3061, + "step": 9310 + }, + { + "epoch": 8.336314847942756, + "grad_norm": 1.8887977600097656, + "learning_rate": 7.286401925391095e-07, + "loss": 0.2966, + "step": 9320 + }, + { + "epoch": 8.34525939177102, + "grad_norm": 1.9103881120681763, + "learning_rate": 7.283393501805054e-07, + "loss": 0.3035, + "step": 9330 + }, + { + "epoch": 8.354203935599285, + "grad_norm": 1.6420018672943115, + "learning_rate": 7.280385078219013e-07, + "loss": 0.302, + "step": 9340 + }, + { + "epoch": 8.363148479427549, + "grad_norm": 1.8546104431152344, + "learning_rate": 7.277376654632971e-07, + "loss": 0.2954, + "step": 9350 + }, + { + "epoch": 8.372093023255815, + "grad_norm": 1.7940912246704102, + "learning_rate": 7.274368231046931e-07, + "loss": 0.307, + "step": 9360 + }, + { + "epoch": 8.381037567084078, + "grad_norm": 2.2668471336364746, + "learning_rate": 7.27135980746089e-07, + "loss": 0.301, + "step": 9370 + }, + { + "epoch": 8.389982110912344, + "grad_norm": 1.848625898361206, + "learning_rate": 7.268351383874849e-07, + "loss": 0.3041, + "step": 9380 + }, + { + "epoch": 8.398926654740608, + "grad_norm": 1.559826135635376, + "learning_rate": 7.265342960288808e-07, + "loss": 0.292, + "step": 9390 + }, + { + "epoch": 8.407871198568873, + "grad_norm": 1.899900197982788, + "learning_rate": 7.262334536702768e-07, + "loss": 0.2948, + "step": 9400 + }, + { + "epoch": 8.416815742397137, + "grad_norm": 1.8511258363723755, + "learning_rate": 7.259326113116726e-07, + "loss": 0.2937, + "step": 9410 + }, + { + "epoch": 8.425760286225403, + "grad_norm": 1.9412320852279663, + "learning_rate": 7.256317689530686e-07, + "loss": 0.303, + "step": 9420 + }, + { + "epoch": 8.434704830053667, + "grad_norm": 1.8842699527740479, + "learning_rate": 7.253309265944645e-07, + "loss": 0.3054, + "step": 9430 + }, + { + "epoch": 8.443649373881932, + "grad_norm": 2.0506179332733154, + "learning_rate": 7.250300842358605e-07, + "loss": 0.304, + "step": 9440 + }, + { + "epoch": 8.452593917710196, + "grad_norm": 1.71364164352417, + "learning_rate": 7.247292418772563e-07, + "loss": 0.3071, + "step": 9450 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 1.8477452993392944, + "learning_rate": 7.244283995186522e-07, + "loss": 0.3042, + "step": 9460 + }, + { + "epoch": 8.470483005366725, + "grad_norm": 2.0586087703704834, + "learning_rate": 7.241275571600481e-07, + "loss": 0.3098, + "step": 9470 + }, + { + "epoch": 8.479427549194991, + "grad_norm": 1.9386125802993774, + "learning_rate": 7.238267148014439e-07, + "loss": 0.3002, + "step": 9480 + }, + { + "epoch": 8.488372093023255, + "grad_norm": 1.7622777223587036, + "learning_rate": 7.235258724428399e-07, + "loss": 0.3056, + "step": 9490 + }, + { + "epoch": 8.49731663685152, + "grad_norm": 1.7191228866577148, + "learning_rate": 7.232250300842358e-07, + "loss": 0.2963, + "step": 9500 + }, + { + "epoch": 8.506261180679786, + "grad_norm": 1.622677206993103, + "learning_rate": 7.229241877256317e-07, + "loss": 0.305, + "step": 9510 + }, + { + "epoch": 8.51520572450805, + "grad_norm": 1.6148167848587036, + "learning_rate": 7.226233453670276e-07, + "loss": 0.3046, + "step": 9520 + }, + { + "epoch": 8.524150268336316, + "grad_norm": 1.6732158660888672, + "learning_rate": 7.223225030084236e-07, + "loss": 0.3014, + "step": 9530 + }, + { + "epoch": 8.53309481216458, + "grad_norm": 1.8138659000396729, + "learning_rate": 7.220216606498195e-07, + "loss": 0.3041, + "step": 9540 + }, + { + "epoch": 8.542039355992845, + "grad_norm": 1.5868489742279053, + "learning_rate": 7.217208182912154e-07, + "loss": 0.2994, + "step": 9550 + }, + { + "epoch": 8.550983899821109, + "grad_norm": 1.57341468334198, + "learning_rate": 7.214199759326113e-07, + "loss": 0.2986, + "step": 9560 + }, + { + "epoch": 8.559928443649374, + "grad_norm": 2.0454437732696533, + "learning_rate": 7.211191335740072e-07, + "loss": 0.3019, + "step": 9570 + }, + { + "epoch": 8.568872987477638, + "grad_norm": 1.9171931743621826, + "learning_rate": 7.20818291215403e-07, + "loss": 0.2963, + "step": 9580 + }, + { + "epoch": 8.577817531305904, + "grad_norm": 1.9101694822311401, + "learning_rate": 7.20517448856799e-07, + "loss": 0.3049, + "step": 9590 + }, + { + "epoch": 8.586762075134168, + "grad_norm": 2.052638292312622, + "learning_rate": 7.202166064981949e-07, + "loss": 0.3065, + "step": 9600 + }, + { + "epoch": 8.595706618962433, + "grad_norm": 1.6304017305374146, + "learning_rate": 7.199157641395909e-07, + "loss": 0.2959, + "step": 9610 + }, + { + "epoch": 8.604651162790697, + "grad_norm": 1.8963510990142822, + "learning_rate": 7.196149217809867e-07, + "loss": 0.3002, + "step": 9620 + }, + { + "epoch": 8.613595706618963, + "grad_norm": 1.8579572439193726, + "learning_rate": 7.193140794223827e-07, + "loss": 0.3044, + "step": 9630 + }, + { + "epoch": 8.622540250447226, + "grad_norm": 2.3206236362457275, + "learning_rate": 7.190132370637786e-07, + "loss": 0.2881, + "step": 9640 + }, + { + "epoch": 8.631484794275492, + "grad_norm": 2.7964982986450195, + "learning_rate": 7.187123947051744e-07, + "loss": 0.298, + "step": 9650 + }, + { + "epoch": 8.640429338103758, + "grad_norm": 1.6848253011703491, + "learning_rate": 7.184115523465704e-07, + "loss": 0.3032, + "step": 9660 + }, + { + "epoch": 8.649373881932021, + "grad_norm": 2.1918492317199707, + "learning_rate": 7.181107099879663e-07, + "loss": 0.2981, + "step": 9670 + }, + { + "epoch": 8.658318425760287, + "grad_norm": 1.746699571609497, + "learning_rate": 7.178098676293621e-07, + "loss": 0.2893, + "step": 9680 + }, + { + "epoch": 8.66726296958855, + "grad_norm": 1.6501363515853882, + "learning_rate": 7.17509025270758e-07, + "loss": 0.3029, + "step": 9690 + }, + { + "epoch": 8.676207513416816, + "grad_norm": 1.6390386819839478, + "learning_rate": 7.17208182912154e-07, + "loss": 0.2977, + "step": 9700 + }, + { + "epoch": 8.68515205724508, + "grad_norm": 1.9444026947021484, + "learning_rate": 7.169073405535499e-07, + "loss": 0.2797, + "step": 9710 + }, + { + "epoch": 8.694096601073346, + "grad_norm": 1.679573893547058, + "learning_rate": 7.166064981949458e-07, + "loss": 0.2978, + "step": 9720 + }, + { + "epoch": 8.70304114490161, + "grad_norm": 1.9855968952178955, + "learning_rate": 7.163056558363417e-07, + "loss": 0.2979, + "step": 9730 + }, + { + "epoch": 8.711985688729875, + "grad_norm": 1.834344506263733, + "learning_rate": 7.160048134777377e-07, + "loss": 0.2891, + "step": 9740 + }, + { + "epoch": 8.720930232558139, + "grad_norm": 1.6731619834899902, + "learning_rate": 7.157039711191335e-07, + "loss": 0.2886, + "step": 9750 + }, + { + "epoch": 8.729874776386405, + "grad_norm": 1.90044105052948, + "learning_rate": 7.154031287605295e-07, + "loss": 0.2836, + "step": 9760 + }, + { + "epoch": 8.738819320214668, + "grad_norm": 1.754361867904663, + "learning_rate": 7.151022864019254e-07, + "loss": 0.2967, + "step": 9770 + }, + { + "epoch": 8.747763864042934, + "grad_norm": 1.7115799188613892, + "learning_rate": 7.148014440433214e-07, + "loss": 0.2941, + "step": 9780 + }, + { + "epoch": 8.756708407871198, + "grad_norm": 1.5401322841644287, + "learning_rate": 7.145006016847171e-07, + "loss": 0.2962, + "step": 9790 + }, + { + "epoch": 8.765652951699463, + "grad_norm": 1.596837043762207, + "learning_rate": 7.141997593261131e-07, + "loss": 0.2987, + "step": 9800 + }, + { + "epoch": 8.774597495527727, + "grad_norm": 1.6483724117279053, + "learning_rate": 7.13898916967509e-07, + "loss": 0.292, + "step": 9810 + }, + { + "epoch": 8.783542039355993, + "grad_norm": 2.597574472427368, + "learning_rate": 7.135980746089048e-07, + "loss": 0.2954, + "step": 9820 + }, + { + "epoch": 8.792486583184257, + "grad_norm": 1.6123573780059814, + "learning_rate": 7.132972322503008e-07, + "loss": 0.291, + "step": 9830 + }, + { + "epoch": 8.801431127012522, + "grad_norm": 1.6893271207809448, + "learning_rate": 7.129963898916967e-07, + "loss": 0.2701, + "step": 9840 + }, + { + "epoch": 8.810375670840788, + "grad_norm": 2.026670217514038, + "learning_rate": 7.126955475330926e-07, + "loss": 0.2884, + "step": 9850 + }, + { + "epoch": 8.819320214669052, + "grad_norm": 1.7611801624298096, + "learning_rate": 7.123947051744885e-07, + "loss": 0.2849, + "step": 9860 + }, + { + "epoch": 8.828264758497317, + "grad_norm": 1.6061441898345947, + "learning_rate": 7.120938628158845e-07, + "loss": 0.2921, + "step": 9870 + }, + { + "epoch": 8.837209302325581, + "grad_norm": 2.0565128326416016, + "learning_rate": 7.117930204572804e-07, + "loss": 0.2856, + "step": 9880 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 2.602135181427002, + "learning_rate": 7.114921780986763e-07, + "loss": 0.2967, + "step": 9890 + }, + { + "epoch": 8.85509838998211, + "grad_norm": 1.7691055536270142, + "learning_rate": 7.111913357400721e-07, + "loss": 0.2996, + "step": 9900 + }, + { + "epoch": 8.864042933810376, + "grad_norm": 1.7833753824234009, + "learning_rate": 7.108904933814681e-07, + "loss": 0.2951, + "step": 9910 + }, + { + "epoch": 8.87298747763864, + "grad_norm": 1.9108906984329224, + "learning_rate": 7.105896510228639e-07, + "loss": 0.2925, + "step": 9920 + }, + { + "epoch": 8.881932021466906, + "grad_norm": 36.32560729980469, + "learning_rate": 7.102888086642599e-07, + "loss": 0.2947, + "step": 9930 + }, + { + "epoch": 8.89087656529517, + "grad_norm": 1.918143630027771, + "learning_rate": 7.099879663056558e-07, + "loss": 0.2847, + "step": 9940 + }, + { + "epoch": 8.899821109123435, + "grad_norm": 2.5026514530181885, + "learning_rate": 7.096871239470517e-07, + "loss": 0.2871, + "step": 9950 + }, + { + "epoch": 8.908765652951699, + "grad_norm": 1.8258767127990723, + "learning_rate": 7.093862815884476e-07, + "loss": 0.2853, + "step": 9960 + }, + { + "epoch": 8.917710196779964, + "grad_norm": 1.6664685010910034, + "learning_rate": 7.090854392298436e-07, + "loss": 0.2846, + "step": 9970 + }, + { + "epoch": 8.926654740608228, + "grad_norm": 1.9599624872207642, + "learning_rate": 7.087845968712395e-07, + "loss": 0.2807, + "step": 9980 + }, + { + "epoch": 8.935599284436494, + "grad_norm": 1.8693242073059082, + "learning_rate": 7.084837545126353e-07, + "loss": 0.2931, + "step": 9990 + }, + { + "epoch": 8.94454382826476, + "grad_norm": 2.025860548019409, + "learning_rate": 7.081829121540313e-07, + "loss": 0.2916, + "step": 10000 + }, + { + "epoch": 8.953488372093023, + "grad_norm": 1.6296007633209229, + "learning_rate": 7.078820697954271e-07, + "loss": 0.2901, + "step": 10010 + }, + { + "epoch": 8.962432915921289, + "grad_norm": 1.8131719827651978, + "learning_rate": 7.07581227436823e-07, + "loss": 0.2935, + "step": 10020 + }, + { + "epoch": 8.971377459749553, + "grad_norm": 1.7503496408462524, + "learning_rate": 7.072803850782189e-07, + "loss": 0.2843, + "step": 10030 + }, + { + "epoch": 8.980322003577818, + "grad_norm": 2.0243091583251953, + "learning_rate": 7.069795427196149e-07, + "loss": 0.2877, + "step": 10040 + }, + { + "epoch": 8.989266547406082, + "grad_norm": 2.1338999271392822, + "learning_rate": 7.066787003610108e-07, + "loss": 0.2816, + "step": 10050 + }, + { + "epoch": 8.998211091234348, + "grad_norm": 1.9088892936706543, + "learning_rate": 7.063778580024067e-07, + "loss": 0.2912, + "step": 10060 + }, + { + "epoch": 9.0, + "eval_bleu": 71.3086, + "eval_gen_len": 75.0326, + "eval_loss": 0.21165511012077332, + "eval_runtime": 57.4802, + "eval_samples_per_second": 18.128, + "eval_steps_per_second": 0.191, + "step": 10062 + }, + { + "epoch": 9.007155635062611, + "grad_norm": 1.994418978691101, + "learning_rate": 7.060770156438026e-07, + "loss": 0.2897, + "step": 10070 + }, + { + "epoch": 9.016100178890877, + "grad_norm": 1.5814578533172607, + "learning_rate": 7.057761732851986e-07, + "loss": 0.2765, + "step": 10080 + }, + { + "epoch": 9.02504472271914, + "grad_norm": 1.6096930503845215, + "learning_rate": 7.054753309265944e-07, + "loss": 0.2732, + "step": 10090 + }, + { + "epoch": 9.033989266547406, + "grad_norm": 1.7393819093704224, + "learning_rate": 7.051744885679904e-07, + "loss": 0.3009, + "step": 10100 + }, + { + "epoch": 9.04293381037567, + "grad_norm": 1.8216224908828735, + "learning_rate": 7.048736462093863e-07, + "loss": 0.2829, + "step": 10110 + }, + { + "epoch": 9.051878354203936, + "grad_norm": 1.5788861513137817, + "learning_rate": 7.045728038507821e-07, + "loss": 0.2716, + "step": 10120 + }, + { + "epoch": 9.0608228980322, + "grad_norm": 1.5919556617736816, + "learning_rate": 7.04271961492178e-07, + "loss": 0.2838, + "step": 10130 + }, + { + "epoch": 9.069767441860465, + "grad_norm": 1.6876708269119263, + "learning_rate": 7.03971119133574e-07, + "loss": 0.2832, + "step": 10140 + }, + { + "epoch": 9.078711985688729, + "grad_norm": 1.8378523588180542, + "learning_rate": 7.036702767749699e-07, + "loss": 0.2942, + "step": 10150 + }, + { + "epoch": 9.087656529516995, + "grad_norm": 1.6949840784072876, + "learning_rate": 7.033694344163658e-07, + "loss": 0.2952, + "step": 10160 + }, + { + "epoch": 9.09660107334526, + "grad_norm": 1.837931513786316, + "learning_rate": 7.030685920577617e-07, + "loss": 0.2837, + "step": 10170 + }, + { + "epoch": 9.105545617173524, + "grad_norm": 1.8467128276824951, + "learning_rate": 7.027677496991576e-07, + "loss": 0.2939, + "step": 10180 + }, + { + "epoch": 9.11449016100179, + "grad_norm": 1.966645359992981, + "learning_rate": 7.024669073405535e-07, + "loss": 0.2838, + "step": 10190 + }, + { + "epoch": 9.123434704830053, + "grad_norm": 1.8841787576675415, + "learning_rate": 7.021660649819494e-07, + "loss": 0.2896, + "step": 10200 + }, + { + "epoch": 9.132379248658319, + "grad_norm": 1.8857272863388062, + "learning_rate": 7.018652226233454e-07, + "loss": 0.2955, + "step": 10210 + }, + { + "epoch": 9.141323792486583, + "grad_norm": 1.6923903226852417, + "learning_rate": 7.015643802647413e-07, + "loss": 0.2851, + "step": 10220 + }, + { + "epoch": 9.150268336314848, + "grad_norm": 1.8548059463500977, + "learning_rate": 7.012635379061371e-07, + "loss": 0.2843, + "step": 10230 + }, + { + "epoch": 9.159212880143112, + "grad_norm": 2.048212766647339, + "learning_rate": 7.00962695547533e-07, + "loss": 0.2927, + "step": 10240 + }, + { + "epoch": 9.168157423971378, + "grad_norm": 1.835397481918335, + "learning_rate": 7.00661853188929e-07, + "loss": 0.2801, + "step": 10250 + }, + { + "epoch": 9.177101967799642, + "grad_norm": 1.7286008596420288, + "learning_rate": 7.003610108303248e-07, + "loss": 0.2852, + "step": 10260 + }, + { + "epoch": 9.186046511627907, + "grad_norm": 1.7264772653579712, + "learning_rate": 7.000601684717208e-07, + "loss": 0.2932, + "step": 10270 + }, + { + "epoch": 9.194991055456171, + "grad_norm": 2.0432634353637695, + "learning_rate": 6.997593261131167e-07, + "loss": 0.2696, + "step": 10280 + }, + { + "epoch": 9.203935599284437, + "grad_norm": 1.849487543106079, + "learning_rate": 6.994584837545126e-07, + "loss": 0.2886, + "step": 10290 + }, + { + "epoch": 9.2128801431127, + "grad_norm": 1.8255362510681152, + "learning_rate": 6.991576413959085e-07, + "loss": 0.2889, + "step": 10300 + }, + { + "epoch": 9.221824686940966, + "grad_norm": 1.9005097150802612, + "learning_rate": 6.988567990373045e-07, + "loss": 0.2798, + "step": 10310 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 1.788237452507019, + "learning_rate": 6.985559566787004e-07, + "loss": 0.2821, + "step": 10320 + }, + { + "epoch": 9.239713774597496, + "grad_norm": 1.9736496210098267, + "learning_rate": 6.982551143200963e-07, + "loss": 0.2824, + "step": 10330 + }, + { + "epoch": 9.248658318425761, + "grad_norm": 1.855490803718567, + "learning_rate": 6.979542719614921e-07, + "loss": 0.2907, + "step": 10340 + }, + { + "epoch": 9.257602862254025, + "grad_norm": 1.6318082809448242, + "learning_rate": 6.97653429602888e-07, + "loss": 0.2845, + "step": 10350 + }, + { + "epoch": 9.26654740608229, + "grad_norm": 1.9707635641098022, + "learning_rate": 6.973525872442839e-07, + "loss": 0.2981, + "step": 10360 + }, + { + "epoch": 9.275491949910554, + "grad_norm": 1.567692756652832, + "learning_rate": 6.970517448856798e-07, + "loss": 0.2837, + "step": 10370 + }, + { + "epoch": 9.28443649373882, + "grad_norm": 1.9208908081054688, + "learning_rate": 6.967509025270758e-07, + "loss": 0.2824, + "step": 10380 + }, + { + "epoch": 9.293381037567084, + "grad_norm": 1.8427250385284424, + "learning_rate": 6.964500601684717e-07, + "loss": 0.2848, + "step": 10390 + }, + { + "epoch": 9.30232558139535, + "grad_norm": 2.1784746646881104, + "learning_rate": 6.961492178098676e-07, + "loss": 0.2847, + "step": 10400 + }, + { + "epoch": 9.311270125223613, + "grad_norm": 1.7605880498886108, + "learning_rate": 6.958483754512635e-07, + "loss": 0.2803, + "step": 10410 + }, + { + "epoch": 9.320214669051879, + "grad_norm": 1.820336103439331, + "learning_rate": 6.955475330926595e-07, + "loss": 0.2756, + "step": 10420 + }, + { + "epoch": 9.329159212880143, + "grad_norm": 1.5924071073532104, + "learning_rate": 6.952466907340553e-07, + "loss": 0.2767, + "step": 10430 + }, + { + "epoch": 9.338103756708408, + "grad_norm": 1.563889741897583, + "learning_rate": 6.949458483754513e-07, + "loss": 0.2748, + "step": 10440 + }, + { + "epoch": 9.347048300536672, + "grad_norm": 1.9999738931655884, + "learning_rate": 6.946450060168471e-07, + "loss": 0.2816, + "step": 10450 + }, + { + "epoch": 9.355992844364938, + "grad_norm": 1.93051016330719, + "learning_rate": 6.94344163658243e-07, + "loss": 0.2838, + "step": 10460 + }, + { + "epoch": 9.364937388193201, + "grad_norm": 1.7076514959335327, + "learning_rate": 6.940433212996389e-07, + "loss": 0.2773, + "step": 10470 + }, + { + "epoch": 9.373881932021467, + "grad_norm": 1.8156001567840576, + "learning_rate": 6.937424789410349e-07, + "loss": 0.2745, + "step": 10480 + }, + { + "epoch": 9.38282647584973, + "grad_norm": 1.8215938806533813, + "learning_rate": 6.934416365824308e-07, + "loss": 0.2851, + "step": 10490 + }, + { + "epoch": 9.391771019677996, + "grad_norm": 2.2098872661590576, + "learning_rate": 6.931407942238267e-07, + "loss": 0.281, + "step": 10500 + }, + { + "epoch": 9.400715563506262, + "grad_norm": 1.8551905155181885, + "learning_rate": 6.928399518652226e-07, + "loss": 0.2785, + "step": 10510 + }, + { + "epoch": 9.409660107334526, + "grad_norm": 1.5589836835861206, + "learning_rate": 6.925391095066185e-07, + "loss": 0.2728, + "step": 10520 + }, + { + "epoch": 9.418604651162791, + "grad_norm": 1.5664674043655396, + "learning_rate": 6.922382671480144e-07, + "loss": 0.2886, + "step": 10530 + }, + { + "epoch": 9.427549194991055, + "grad_norm": 1.721440076828003, + "learning_rate": 6.919374247894103e-07, + "loss": 0.2747, + "step": 10540 + }, + { + "epoch": 9.43649373881932, + "grad_norm": 1.601876974105835, + "learning_rate": 6.916365824308063e-07, + "loss": 0.2685, + "step": 10550 + }, + { + "epoch": 9.445438282647585, + "grad_norm": 1.8169065713882446, + "learning_rate": 6.913357400722021e-07, + "loss": 0.2787, + "step": 10560 + }, + { + "epoch": 9.45438282647585, + "grad_norm": 1.734645128250122, + "learning_rate": 6.91034897713598e-07, + "loss": 0.2892, + "step": 10570 + }, + { + "epoch": 9.463327370304114, + "grad_norm": 1.8845486640930176, + "learning_rate": 6.907340553549939e-07, + "loss": 0.2785, + "step": 10580 + }, + { + "epoch": 9.47227191413238, + "grad_norm": 1.6264091730117798, + "learning_rate": 6.904332129963899e-07, + "loss": 0.2712, + "step": 10590 + }, + { + "epoch": 9.481216457960643, + "grad_norm": 1.8591593503952026, + "learning_rate": 6.901323706377857e-07, + "loss": 0.2745, + "step": 10600 + }, + { + "epoch": 9.490161001788909, + "grad_norm": 1.8113240003585815, + "learning_rate": 6.898315282791817e-07, + "loss": 0.2773, + "step": 10610 + }, + { + "epoch": 9.499105545617173, + "grad_norm": 2.180983066558838, + "learning_rate": 6.895306859205776e-07, + "loss": 0.2841, + "step": 10620 + }, + { + "epoch": 9.508050089445439, + "grad_norm": 1.5472530126571655, + "learning_rate": 6.892298435619735e-07, + "loss": 0.2784, + "step": 10630 + }, + { + "epoch": 9.516994633273702, + "grad_norm": 1.6539241075515747, + "learning_rate": 6.889290012033694e-07, + "loss": 0.2803, + "step": 10640 + }, + { + "epoch": 9.525939177101968, + "grad_norm": 2.5152432918548584, + "learning_rate": 6.886281588447654e-07, + "loss": 0.2754, + "step": 10650 + }, + { + "epoch": 9.534883720930232, + "grad_norm": 1.7078677415847778, + "learning_rate": 6.883273164861613e-07, + "loss": 0.2816, + "step": 10660 + }, + { + "epoch": 9.543828264758497, + "grad_norm": 1.7446966171264648, + "learning_rate": 6.880264741275571e-07, + "loss": 0.2845, + "step": 10670 + }, + { + "epoch": 9.552772808586763, + "grad_norm": 1.8630328178405762, + "learning_rate": 6.87725631768953e-07, + "loss": 0.2716, + "step": 10680 + }, + { + "epoch": 9.561717352415027, + "grad_norm": 1.6282811164855957, + "learning_rate": 6.87424789410349e-07, + "loss": 0.2888, + "step": 10690 + }, + { + "epoch": 9.570661896243292, + "grad_norm": 1.8140019178390503, + "learning_rate": 6.871239470517448e-07, + "loss": 0.284, + "step": 10700 + }, + { + "epoch": 9.579606440071556, + "grad_norm": 1.568709373474121, + "learning_rate": 6.868231046931407e-07, + "loss": 0.2728, + "step": 10710 + }, + { + "epoch": 9.588550983899822, + "grad_norm": 2.294747829437256, + "learning_rate": 6.865222623345367e-07, + "loss": 0.2732, + "step": 10720 + }, + { + "epoch": 9.597495527728086, + "grad_norm": 1.5327823162078857, + "learning_rate": 6.862214199759325e-07, + "loss": 0.2704, + "step": 10730 + }, + { + "epoch": 9.606440071556351, + "grad_norm": 1.429221272468567, + "learning_rate": 6.859205776173285e-07, + "loss": 0.2758, + "step": 10740 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 2.2276451587677, + "learning_rate": 6.856197352587244e-07, + "loss": 0.2649, + "step": 10750 + }, + { + "epoch": 9.62432915921288, + "grad_norm": 2.1165475845336914, + "learning_rate": 6.853188929001204e-07, + "loss": 0.2848, + "step": 10760 + }, + { + "epoch": 9.633273703041144, + "grad_norm": 1.5652105808258057, + "learning_rate": 6.850180505415162e-07, + "loss": 0.2683, + "step": 10770 + }, + { + "epoch": 9.64221824686941, + "grad_norm": 1.8563920259475708, + "learning_rate": 6.847172081829121e-07, + "loss": 0.2701, + "step": 10780 + }, + { + "epoch": 9.651162790697674, + "grad_norm": 1.9464433193206787, + "learning_rate": 6.84416365824308e-07, + "loss": 0.2643, + "step": 10790 + }, + { + "epoch": 9.66010733452594, + "grad_norm": 1.7110228538513184, + "learning_rate": 6.841155234657039e-07, + "loss": 0.2678, + "step": 10800 + }, + { + "epoch": 9.669051878354203, + "grad_norm": 1.559122920036316, + "learning_rate": 6.838146811070998e-07, + "loss": 0.2769, + "step": 10810 + }, + { + "epoch": 9.677996422182469, + "grad_norm": 1.5416113138198853, + "learning_rate": 6.835138387484958e-07, + "loss": 0.2847, + "step": 10820 + }, + { + "epoch": 9.686940966010733, + "grad_norm": 1.944888710975647, + "learning_rate": 6.832129963898917e-07, + "loss": 0.285, + "step": 10830 + }, + { + "epoch": 9.695885509838998, + "grad_norm": 1.6021989583969116, + "learning_rate": 6.829121540312876e-07, + "loss": 0.2639, + "step": 10840 + }, + { + "epoch": 9.704830053667262, + "grad_norm": 1.9095149040222168, + "learning_rate": 6.826113116726835e-07, + "loss": 0.2783, + "step": 10850 + }, + { + "epoch": 9.713774597495528, + "grad_norm": 1.979996919631958, + "learning_rate": 6.823104693140795e-07, + "loss": 0.27, + "step": 10860 + }, + { + "epoch": 9.722719141323793, + "grad_norm": 1.568677544593811, + "learning_rate": 6.820096269554753e-07, + "loss": 0.2717, + "step": 10870 + }, + { + "epoch": 9.731663685152057, + "grad_norm": 1.8690828084945679, + "learning_rate": 6.817087845968712e-07, + "loss": 0.2799, + "step": 10880 + }, + { + "epoch": 9.740608228980323, + "grad_norm": 2.1000804901123047, + "learning_rate": 6.814079422382671e-07, + "loss": 0.2891, + "step": 10890 + }, + { + "epoch": 9.749552772808586, + "grad_norm": 1.8976290225982666, + "learning_rate": 6.811070998796629e-07, + "loss": 0.2687, + "step": 10900 + }, + { + "epoch": 9.758497316636852, + "grad_norm": 1.5658888816833496, + "learning_rate": 6.808062575210589e-07, + "loss": 0.2728, + "step": 10910 + }, + { + "epoch": 9.767441860465116, + "grad_norm": 2.6436548233032227, + "learning_rate": 6.805054151624548e-07, + "loss": 0.2645, + "step": 10920 + }, + { + "epoch": 9.776386404293381, + "grad_norm": 1.801951289176941, + "learning_rate": 6.802045728038508e-07, + "loss": 0.2759, + "step": 10930 + }, + { + "epoch": 9.785330948121645, + "grad_norm": 1.6843072175979614, + "learning_rate": 6.799037304452466e-07, + "loss": 0.2751, + "step": 10940 + }, + { + "epoch": 9.79427549194991, + "grad_norm": 1.7897447347640991, + "learning_rate": 6.796028880866426e-07, + "loss": 0.2716, + "step": 10950 + }, + { + "epoch": 9.803220035778175, + "grad_norm": 1.4712356328964233, + "learning_rate": 6.793020457280385e-07, + "loss": 0.271, + "step": 10960 + }, + { + "epoch": 9.81216457960644, + "grad_norm": 1.9668405055999756, + "learning_rate": 6.790012033694344e-07, + "loss": 0.2825, + "step": 10970 + }, + { + "epoch": 9.821109123434704, + "grad_norm": 1.4257514476776123, + "learning_rate": 6.787003610108303e-07, + "loss": 0.2695, + "step": 10980 + }, + { + "epoch": 9.83005366726297, + "grad_norm": 2.357689380645752, + "learning_rate": 6.783995186522263e-07, + "loss": 0.285, + "step": 10990 + }, + { + "epoch": 9.838998211091234, + "grad_norm": 1.616826057434082, + "learning_rate": 6.780986762936222e-07, + "loss": 0.2639, + "step": 11000 + }, + { + "epoch": 9.847942754919499, + "grad_norm": 1.521409034729004, + "learning_rate": 6.77797833935018e-07, + "loss": 0.2713, + "step": 11010 + }, + { + "epoch": 9.856887298747765, + "grad_norm": 1.5054371356964111, + "learning_rate": 6.774969915764139e-07, + "loss": 0.2704, + "step": 11020 + }, + { + "epoch": 9.865831842576029, + "grad_norm": 1.6104198694229126, + "learning_rate": 6.771961492178099e-07, + "loss": 0.2672, + "step": 11030 + }, + { + "epoch": 9.874776386404294, + "grad_norm": 1.7688450813293457, + "learning_rate": 6.768953068592057e-07, + "loss": 0.2667, + "step": 11040 + }, + { + "epoch": 9.883720930232558, + "grad_norm": 1.5123931169509888, + "learning_rate": 6.765944645006016e-07, + "loss": 0.2616, + "step": 11050 + }, + { + "epoch": 9.892665474060824, + "grad_norm": 1.5442701578140259, + "learning_rate": 6.762936221419976e-07, + "loss": 0.2797, + "step": 11060 + }, + { + "epoch": 9.901610017889087, + "grad_norm": 1.9512841701507568, + "learning_rate": 6.759927797833934e-07, + "loss": 0.2676, + "step": 11070 + }, + { + "epoch": 9.910554561717353, + "grad_norm": 1.6428849697113037, + "learning_rate": 6.756919374247894e-07, + "loss": 0.2735, + "step": 11080 + }, + { + "epoch": 9.919499105545617, + "grad_norm": 2.7944211959838867, + "learning_rate": 6.753910950661853e-07, + "loss": 0.2712, + "step": 11090 + }, + { + "epoch": 9.928443649373882, + "grad_norm": 1.6916003227233887, + "learning_rate": 6.750902527075813e-07, + "loss": 0.272, + "step": 11100 + }, + { + "epoch": 9.937388193202146, + "grad_norm": 1.6299495697021484, + "learning_rate": 6.74789410348977e-07, + "loss": 0.2795, + "step": 11110 + }, + { + "epoch": 9.946332737030412, + "grad_norm": 1.7412554025650024, + "learning_rate": 6.74488567990373e-07, + "loss": 0.2697, + "step": 11120 + }, + { + "epoch": 9.955277280858676, + "grad_norm": 1.537885308265686, + "learning_rate": 6.741877256317689e-07, + "loss": 0.2593, + "step": 11130 + }, + { + "epoch": 9.964221824686941, + "grad_norm": 1.4216618537902832, + "learning_rate": 6.738868832731648e-07, + "loss": 0.2626, + "step": 11140 + }, + { + "epoch": 9.973166368515205, + "grad_norm": 1.3773558139801025, + "learning_rate": 6.735860409145607e-07, + "loss": 0.2648, + "step": 11150 + }, + { + "epoch": 9.98211091234347, + "grad_norm": 1.9161559343338013, + "learning_rate": 6.732851985559567e-07, + "loss": 0.2675, + "step": 11160 + }, + { + "epoch": 9.991055456171736, + "grad_norm": 1.692733645439148, + "learning_rate": 6.729843561973526e-07, + "loss": 0.2601, + "step": 11170 + }, + { + "epoch": 10.0, + "grad_norm": 2.6017653942108154, + "learning_rate": 6.726835138387485e-07, + "loss": 0.2591, + "step": 11180 + }, + { + "epoch": 10.0, + "eval_bleu": 72.392, + "eval_gen_len": 74.9607, + "eval_loss": 0.20007498562335968, + "eval_runtime": 57.3761, + "eval_samples_per_second": 18.161, + "eval_steps_per_second": 0.192, + "step": 11180 + }, + { + "epoch": 10.008944543828266, + "grad_norm": 1.955108642578125, + "learning_rate": 6.723826714801444e-07, + "loss": 0.2603, + "step": 11190 + }, + { + "epoch": 10.01788908765653, + "grad_norm": 1.7971755266189575, + "learning_rate": 6.720818291215404e-07, + "loss": 0.2574, + "step": 11200 + }, + { + "epoch": 10.026833631484795, + "grad_norm": 1.5595004558563232, + "learning_rate": 6.717809867629362e-07, + "loss": 0.2729, + "step": 11210 + }, + { + "epoch": 10.035778175313059, + "grad_norm": 1.6440141201019287, + "learning_rate": 6.71480144404332e-07, + "loss": 0.2706, + "step": 11220 + }, + { + "epoch": 10.044722719141324, + "grad_norm": 1.6471924781799316, + "learning_rate": 6.71179302045728e-07, + "loss": 0.2662, + "step": 11230 + }, + { + "epoch": 10.053667262969588, + "grad_norm": 1.8682457208633423, + "learning_rate": 6.708784596871238e-07, + "loss": 0.2615, + "step": 11240 + }, + { + "epoch": 10.062611806797854, + "grad_norm": 1.9249464273452759, + "learning_rate": 6.705776173285198e-07, + "loss": 0.2718, + "step": 11250 + }, + { + "epoch": 10.071556350626118, + "grad_norm": 1.7548773288726807, + "learning_rate": 6.702767749699157e-07, + "loss": 0.2604, + "step": 11260 + }, + { + "epoch": 10.080500894454383, + "grad_norm": 1.8712800741195679, + "learning_rate": 6.699759326113117e-07, + "loss": 0.2587, + "step": 11270 + }, + { + "epoch": 10.089445438282647, + "grad_norm": 1.5508815050125122, + "learning_rate": 6.696750902527075e-07, + "loss": 0.2692, + "step": 11280 + }, + { + "epoch": 10.098389982110913, + "grad_norm": 1.7437515258789062, + "learning_rate": 6.693742478941035e-07, + "loss": 0.2724, + "step": 11290 + }, + { + "epoch": 10.107334525939176, + "grad_norm": 1.8249415159225464, + "learning_rate": 6.690734055354994e-07, + "loss": 0.2701, + "step": 11300 + }, + { + "epoch": 10.116279069767442, + "grad_norm": 2.235722064971924, + "learning_rate": 6.687725631768953e-07, + "loss": 0.2718, + "step": 11310 + }, + { + "epoch": 10.125223613595706, + "grad_norm": 1.6692746877670288, + "learning_rate": 6.684717208182912e-07, + "loss": 0.2679, + "step": 11320 + }, + { + "epoch": 10.134168157423971, + "grad_norm": 1.651930332183838, + "learning_rate": 6.681708784596872e-07, + "loss": 0.2573, + "step": 11330 + }, + { + "epoch": 10.143112701252235, + "grad_norm": 2.019298791885376, + "learning_rate": 6.67870036101083e-07, + "loss": 0.2617, + "step": 11340 + }, + { + "epoch": 10.152057245080501, + "grad_norm": 1.7158833742141724, + "learning_rate": 6.675691937424789e-07, + "loss": 0.2806, + "step": 11350 + }, + { + "epoch": 10.161001788908766, + "grad_norm": 1.6784923076629639, + "learning_rate": 6.672683513838748e-07, + "loss": 0.2717, + "step": 11360 + }, + { + "epoch": 10.16994633273703, + "grad_norm": 1.6140066385269165, + "learning_rate": 6.669675090252708e-07, + "loss": 0.2557, + "step": 11370 + }, + { + "epoch": 10.178890876565296, + "grad_norm": 1.7521976232528687, + "learning_rate": 6.666666666666666e-07, + "loss": 0.2697, + "step": 11380 + }, + { + "epoch": 10.18783542039356, + "grad_norm": 1.640120267868042, + "learning_rate": 6.663658243080625e-07, + "loss": 0.268, + "step": 11390 + }, + { + "epoch": 10.196779964221825, + "grad_norm": 1.9078781604766846, + "learning_rate": 6.660649819494585e-07, + "loss": 0.2661, + "step": 11400 + }, + { + "epoch": 10.20572450805009, + "grad_norm": 1.6086190938949585, + "learning_rate": 6.657641395908543e-07, + "loss": 0.2656, + "step": 11410 + }, + { + "epoch": 10.214669051878355, + "grad_norm": 1.8679802417755127, + "learning_rate": 6.654632972322503e-07, + "loss": 0.2875, + "step": 11420 + }, + { + "epoch": 10.223613595706619, + "grad_norm": 1.6009517908096313, + "learning_rate": 6.651624548736462e-07, + "loss": 0.2691, + "step": 11430 + }, + { + "epoch": 10.232558139534884, + "grad_norm": 1.8012363910675049, + "learning_rate": 6.648616125150422e-07, + "loss": 0.2695, + "step": 11440 + }, + { + "epoch": 10.241502683363148, + "grad_norm": 2.0842690467834473, + "learning_rate": 6.645607701564379e-07, + "loss": 0.275, + "step": 11450 + }, + { + "epoch": 10.250447227191414, + "grad_norm": 1.7462635040283203, + "learning_rate": 6.642599277978339e-07, + "loss": 0.2578, + "step": 11460 + }, + { + "epoch": 10.259391771019677, + "grad_norm": 1.591439127922058, + "learning_rate": 6.639590854392298e-07, + "loss": 0.2627, + "step": 11470 + }, + { + "epoch": 10.268336314847943, + "grad_norm": 1.7550685405731201, + "learning_rate": 6.636582430806257e-07, + "loss": 0.2597, + "step": 11480 + }, + { + "epoch": 10.277280858676207, + "grad_norm": 1.7708865404129028, + "learning_rate": 6.633574007220216e-07, + "loss": 0.2737, + "step": 11490 + }, + { + "epoch": 10.286225402504472, + "grad_norm": 1.8559082746505737, + "learning_rate": 6.630565583634176e-07, + "loss": 0.2778, + "step": 11500 + }, + { + "epoch": 10.295169946332736, + "grad_norm": 1.6985397338867188, + "learning_rate": 6.627557160048135e-07, + "loss": 0.2616, + "step": 11510 + }, + { + "epoch": 10.304114490161002, + "grad_norm": 1.6595687866210938, + "learning_rate": 6.624548736462094e-07, + "loss": 0.2626, + "step": 11520 + }, + { + "epoch": 10.313059033989267, + "grad_norm": 1.7601011991500854, + "learning_rate": 6.621540312876053e-07, + "loss": 0.2625, + "step": 11530 + }, + { + "epoch": 10.322003577817531, + "grad_norm": 1.7115074396133423, + "learning_rate": 6.618531889290013e-07, + "loss": 0.2632, + "step": 11540 + }, + { + "epoch": 10.330948121645797, + "grad_norm": 1.7978862524032593, + "learning_rate": 6.61552346570397e-07, + "loss": 0.2591, + "step": 11550 + }, + { + "epoch": 10.33989266547406, + "grad_norm": 1.7057119607925415, + "learning_rate": 6.612515042117929e-07, + "loss": 0.2651, + "step": 11560 + }, + { + "epoch": 10.348837209302326, + "grad_norm": 1.6174237728118896, + "learning_rate": 6.609506618531889e-07, + "loss": 0.2562, + "step": 11570 + }, + { + "epoch": 10.35778175313059, + "grad_norm": 1.4859338998794556, + "learning_rate": 6.606498194945847e-07, + "loss": 0.2542, + "step": 11580 + }, + { + "epoch": 10.366726296958856, + "grad_norm": 1.6329357624053955, + "learning_rate": 6.603489771359807e-07, + "loss": 0.2652, + "step": 11590 + }, + { + "epoch": 10.37567084078712, + "grad_norm": 1.7987093925476074, + "learning_rate": 6.600481347773766e-07, + "loss": 0.2693, + "step": 11600 + }, + { + "epoch": 10.384615384615385, + "grad_norm": 2.0187013149261475, + "learning_rate": 6.597472924187726e-07, + "loss": 0.2544, + "step": 11610 + }, + { + "epoch": 10.393559928443649, + "grad_norm": 1.8823727369308472, + "learning_rate": 6.594464500601684e-07, + "loss": 0.2667, + "step": 11620 + }, + { + "epoch": 10.402504472271914, + "grad_norm": 2.0941925048828125, + "learning_rate": 6.591456077015644e-07, + "loss": 0.2643, + "step": 11630 + }, + { + "epoch": 10.411449016100178, + "grad_norm": 1.7243622541427612, + "learning_rate": 6.588447653429603e-07, + "loss": 0.257, + "step": 11640 + }, + { + "epoch": 10.420393559928444, + "grad_norm": 1.586456537246704, + "learning_rate": 6.585439229843562e-07, + "loss": 0.2544, + "step": 11650 + }, + { + "epoch": 10.429338103756708, + "grad_norm": 1.5672041177749634, + "learning_rate": 6.58243080625752e-07, + "loss": 0.2714, + "step": 11660 + }, + { + "epoch": 10.438282647584973, + "grad_norm": 1.4649882316589355, + "learning_rate": 6.57942238267148e-07, + "loss": 0.2523, + "step": 11670 + }, + { + "epoch": 10.447227191413237, + "grad_norm": 1.5319983959197998, + "learning_rate": 6.576413959085438e-07, + "loss": 0.2574, + "step": 11680 + }, + { + "epoch": 10.456171735241503, + "grad_norm": 1.672580599784851, + "learning_rate": 6.573405535499398e-07, + "loss": 0.253, + "step": 11690 + }, + { + "epoch": 10.465116279069768, + "grad_norm": 1.5990978479385376, + "learning_rate": 6.570397111913357e-07, + "loss": 0.2598, + "step": 11700 + }, + { + "epoch": 10.474060822898032, + "grad_norm": 1.6208809614181519, + "learning_rate": 6.567388688327317e-07, + "loss": 0.2591, + "step": 11710 + }, + { + "epoch": 10.483005366726298, + "grad_norm": 1.6113932132720947, + "learning_rate": 6.564380264741275e-07, + "loss": 0.2595, + "step": 11720 + }, + { + "epoch": 10.491949910554561, + "grad_norm": 1.9188364744186401, + "learning_rate": 6.561371841155234e-07, + "loss": 0.2634, + "step": 11730 + }, + { + "epoch": 10.500894454382827, + "grad_norm": 1.6541216373443604, + "learning_rate": 6.558363417569194e-07, + "loss": 0.2655, + "step": 11740 + }, + { + "epoch": 10.509838998211091, + "grad_norm": 1.631947636604309, + "learning_rate": 6.555354993983152e-07, + "loss": 0.262, + "step": 11750 + }, + { + "epoch": 10.518783542039357, + "grad_norm": 1.7661869525909424, + "learning_rate": 6.552346570397112e-07, + "loss": 0.2632, + "step": 11760 + }, + { + "epoch": 10.52772808586762, + "grad_norm": 1.4233602285385132, + "learning_rate": 6.54933814681107e-07, + "loss": 0.2599, + "step": 11770 + }, + { + "epoch": 10.536672629695886, + "grad_norm": 1.8507752418518066, + "learning_rate": 6.54632972322503e-07, + "loss": 0.2615, + "step": 11780 + }, + { + "epoch": 10.54561717352415, + "grad_norm": 1.598519206047058, + "learning_rate": 6.543321299638988e-07, + "loss": 0.2568, + "step": 11790 + }, + { + "epoch": 10.554561717352415, + "grad_norm": 1.6336430311203003, + "learning_rate": 6.540312876052948e-07, + "loss": 0.2549, + "step": 11800 + }, + { + "epoch": 10.56350626118068, + "grad_norm": 1.557985782623291, + "learning_rate": 6.537304452466907e-07, + "loss": 0.2638, + "step": 11810 + }, + { + "epoch": 10.572450805008945, + "grad_norm": 1.6330904960632324, + "learning_rate": 6.534296028880866e-07, + "loss": 0.256, + "step": 11820 + }, + { + "epoch": 10.581395348837209, + "grad_norm": 1.4486061334609985, + "learning_rate": 6.531287605294825e-07, + "loss": 0.2527, + "step": 11830 + }, + { + "epoch": 10.590339892665474, + "grad_norm": 1.7647820711135864, + "learning_rate": 6.528279181708785e-07, + "loss": 0.255, + "step": 11840 + }, + { + "epoch": 10.59928443649374, + "grad_norm": 1.6095329523086548, + "learning_rate": 6.525270758122743e-07, + "loss": 0.2484, + "step": 11850 + }, + { + "epoch": 10.608228980322004, + "grad_norm": 1.395642876625061, + "learning_rate": 6.522262334536703e-07, + "loss": 0.2507, + "step": 11860 + }, + { + "epoch": 10.61717352415027, + "grad_norm": 1.7519396543502808, + "learning_rate": 6.519253910950662e-07, + "loss": 0.2574, + "step": 11870 + }, + { + "epoch": 10.626118067978533, + "grad_norm": 1.5717977285385132, + "learning_rate": 6.516245487364622e-07, + "loss": 0.253, + "step": 11880 + }, + { + "epoch": 10.635062611806799, + "grad_norm": 1.6412906646728516, + "learning_rate": 6.513237063778579e-07, + "loss": 0.2586, + "step": 11890 + }, + { + "epoch": 10.644007155635062, + "grad_norm": 1.8597332239151, + "learning_rate": 6.510228640192538e-07, + "loss": 0.2592, + "step": 11900 + }, + { + "epoch": 10.652951699463328, + "grad_norm": 1.8751498460769653, + "learning_rate": 6.507220216606498e-07, + "loss": 0.27, + "step": 11910 + }, + { + "epoch": 10.661896243291592, + "grad_norm": 1.6349623203277588, + "learning_rate": 6.504211793020456e-07, + "loss": 0.2566, + "step": 11920 + }, + { + "epoch": 10.670840787119857, + "grad_norm": 1.5924897193908691, + "learning_rate": 6.501203369434416e-07, + "loss": 0.2518, + "step": 11930 + }, + { + "epoch": 10.679785330948121, + "grad_norm": 1.5484246015548706, + "learning_rate": 6.498194945848375e-07, + "loss": 0.25, + "step": 11940 + }, + { + "epoch": 10.688729874776387, + "grad_norm": 1.6486756801605225, + "learning_rate": 6.495186522262335e-07, + "loss": 0.2617, + "step": 11950 + }, + { + "epoch": 10.69767441860465, + "grad_norm": 1.730144739151001, + "learning_rate": 6.492178098676293e-07, + "loss": 0.2569, + "step": 11960 + }, + { + "epoch": 10.706618962432916, + "grad_norm": 2.03178334236145, + "learning_rate": 6.489169675090253e-07, + "loss": 0.2526, + "step": 11970 + }, + { + "epoch": 10.71556350626118, + "grad_norm": 1.855293869972229, + "learning_rate": 6.486161251504212e-07, + "loss": 0.2685, + "step": 11980 + }, + { + "epoch": 10.724508050089446, + "grad_norm": 1.7201274633407593, + "learning_rate": 6.48315282791817e-07, + "loss": 0.2529, + "step": 11990 + }, + { + "epoch": 10.73345259391771, + "grad_norm": 1.6383235454559326, + "learning_rate": 6.480144404332129e-07, + "loss": 0.267, + "step": 12000 + }, + { + "epoch": 10.742397137745975, + "grad_norm": 1.538323998451233, + "learning_rate": 6.477135980746089e-07, + "loss": 0.2543, + "step": 12010 + }, + { + "epoch": 10.751341681574239, + "grad_norm": 1.6423795223236084, + "learning_rate": 6.474127557160047e-07, + "loss": 0.2586, + "step": 12020 + }, + { + "epoch": 10.760286225402504, + "grad_norm": 1.6282501220703125, + "learning_rate": 6.471119133574007e-07, + "loss": 0.2552, + "step": 12030 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 1.472403645515442, + "learning_rate": 6.468110709987966e-07, + "loss": 0.2591, + "step": 12040 + }, + { + "epoch": 10.778175313059034, + "grad_norm": 1.8465627431869507, + "learning_rate": 6.465102286401926e-07, + "loss": 0.2547, + "step": 12050 + }, + { + "epoch": 10.7871198568873, + "grad_norm": 1.6513354778289795, + "learning_rate": 6.462093862815884e-07, + "loss": 0.2611, + "step": 12060 + }, + { + "epoch": 10.796064400715563, + "grad_norm": 1.4019272327423096, + "learning_rate": 6.459085439229844e-07, + "loss": 0.2526, + "step": 12070 + }, + { + "epoch": 10.805008944543829, + "grad_norm": 1.6927614212036133, + "learning_rate": 6.456077015643803e-07, + "loss": 0.2659, + "step": 12080 + }, + { + "epoch": 10.813953488372093, + "grad_norm": 1.825679063796997, + "learning_rate": 6.453068592057761e-07, + "loss": 0.2677, + "step": 12090 + }, + { + "epoch": 10.822898032200358, + "grad_norm": 1.6658412218093872, + "learning_rate": 6.45006016847172e-07, + "loss": 0.2548, + "step": 12100 + }, + { + "epoch": 10.831842576028622, + "grad_norm": 1.9499558210372925, + "learning_rate": 6.447051744885679e-07, + "loss": 0.2597, + "step": 12110 + }, + { + "epoch": 10.840787119856888, + "grad_norm": 1.8598222732543945, + "learning_rate": 6.444043321299639e-07, + "loss": 0.2574, + "step": 12120 + }, + { + "epoch": 10.849731663685152, + "grad_norm": 1.6625630855560303, + "learning_rate": 6.441034897713597e-07, + "loss": 0.2526, + "step": 12130 + }, + { + "epoch": 10.858676207513417, + "grad_norm": 1.5452898740768433, + "learning_rate": 6.438026474127557e-07, + "loss": 0.2515, + "step": 12140 + }, + { + "epoch": 10.867620751341681, + "grad_norm": 1.7677903175354004, + "learning_rate": 6.435018050541516e-07, + "loss": 0.2607, + "step": 12150 + }, + { + "epoch": 10.876565295169947, + "grad_norm": 1.5394271612167358, + "learning_rate": 6.432009626955475e-07, + "loss": 0.2502, + "step": 12160 + }, + { + "epoch": 10.88550983899821, + "grad_norm": 1.6044206619262695, + "learning_rate": 6.429001203369434e-07, + "loss": 0.246, + "step": 12170 + }, + { + "epoch": 10.894454382826476, + "grad_norm": 1.5506819486618042, + "learning_rate": 6.425992779783394e-07, + "loss": 0.2637, + "step": 12180 + }, + { + "epoch": 10.903398926654742, + "grad_norm": 1.797220230102539, + "learning_rate": 6.422984356197352e-07, + "loss": 0.2535, + "step": 12190 + }, + { + "epoch": 10.912343470483005, + "grad_norm": 1.6893784999847412, + "learning_rate": 6.419975932611312e-07, + "loss": 0.2554, + "step": 12200 + }, + { + "epoch": 10.921288014311271, + "grad_norm": 2.001776695251465, + "learning_rate": 6.41696750902527e-07, + "loss": 0.2591, + "step": 12210 + }, + { + "epoch": 10.930232558139535, + "grad_norm": 1.6749728918075562, + "learning_rate": 6.41395908543923e-07, + "loss": 0.2387, + "step": 12220 + }, + { + "epoch": 10.9391771019678, + "grad_norm": 1.7582108974456787, + "learning_rate": 6.410950661853188e-07, + "loss": 0.2542, + "step": 12230 + }, + { + "epoch": 10.948121645796064, + "grad_norm": 1.4224308729171753, + "learning_rate": 6.407942238267148e-07, + "loss": 0.2597, + "step": 12240 + }, + { + "epoch": 10.95706618962433, + "grad_norm": 1.700706958770752, + "learning_rate": 6.404933814681107e-07, + "loss": 0.256, + "step": 12250 + }, + { + "epoch": 10.966010733452594, + "grad_norm": 1.681830883026123, + "learning_rate": 6.401925391095065e-07, + "loss": 0.2516, + "step": 12260 + }, + { + "epoch": 10.97495527728086, + "grad_norm": 1.5968338251113892, + "learning_rate": 6.398916967509025e-07, + "loss": 0.2514, + "step": 12270 + }, + { + "epoch": 10.983899821109123, + "grad_norm": 1.705511450767517, + "learning_rate": 6.395908543922984e-07, + "loss": 0.2532, + "step": 12280 + }, + { + "epoch": 10.992844364937389, + "grad_norm": 1.6551766395568848, + "learning_rate": 6.392900120336944e-07, + "loss": 0.2471, + "step": 12290 + }, + { + "epoch": 11.0, + "eval_bleu": 73.4758, + "eval_gen_len": 74.9251, + "eval_loss": 0.18992315232753754, + "eval_runtime": 56.392, + "eval_samples_per_second": 18.478, + "eval_steps_per_second": 0.195, + "step": 12298 + }, + { + "epoch": 11.001788908765652, + "grad_norm": 1.6203548908233643, + "learning_rate": 6.389891696750902e-07, + "loss": 0.2487, + "step": 12300 + }, + { + "epoch": 11.010733452593918, + "grad_norm": 1.4733091592788696, + "learning_rate": 6.386883273164862e-07, + "loss": 0.2623, + "step": 12310 + }, + { + "epoch": 11.019677996422182, + "grad_norm": 1.8115324974060059, + "learning_rate": 6.38387484957882e-07, + "loss": 0.2616, + "step": 12320 + }, + { + "epoch": 11.028622540250447, + "grad_norm": 1.9844839572906494, + "learning_rate": 6.380866425992779e-07, + "loss": 0.2561, + "step": 12330 + }, + { + "epoch": 11.037567084078711, + "grad_norm": 3.2231359481811523, + "learning_rate": 6.377858002406738e-07, + "loss": 0.2553, + "step": 12340 + }, + { + "epoch": 11.046511627906977, + "grad_norm": 1.4005216360092163, + "learning_rate": 6.374849578820698e-07, + "loss": 0.2399, + "step": 12350 + }, + { + "epoch": 11.05545617173524, + "grad_norm": 1.6509549617767334, + "learning_rate": 6.371841155234656e-07, + "loss": 0.251, + "step": 12360 + }, + { + "epoch": 11.064400715563506, + "grad_norm": 1.507264494895935, + "learning_rate": 6.368832731648616e-07, + "loss": 0.2449, + "step": 12370 + }, + { + "epoch": 11.073345259391772, + "grad_norm": 1.9406224489212036, + "learning_rate": 6.365824308062575e-07, + "loss": 0.25, + "step": 12380 + }, + { + "epoch": 11.082289803220036, + "grad_norm": 1.834517002105713, + "learning_rate": 6.362815884476535e-07, + "loss": 0.2619, + "step": 12390 + }, + { + "epoch": 11.091234347048301, + "grad_norm": 1.7223912477493286, + "learning_rate": 6.359807460890493e-07, + "loss": 0.2576, + "step": 12400 + }, + { + "epoch": 11.100178890876565, + "grad_norm": 1.5235416889190674, + "learning_rate": 6.356799037304453e-07, + "loss": 0.24, + "step": 12410 + }, + { + "epoch": 11.10912343470483, + "grad_norm": 1.5340937376022339, + "learning_rate": 6.353790613718412e-07, + "loss": 0.2567, + "step": 12420 + }, + { + "epoch": 11.118067978533094, + "grad_norm": 1.5485321283340454, + "learning_rate": 6.350782190132369e-07, + "loss": 0.2501, + "step": 12430 + }, + { + "epoch": 11.12701252236136, + "grad_norm": 1.5416953563690186, + "learning_rate": 6.347773766546329e-07, + "loss": 0.2493, + "step": 12440 + }, + { + "epoch": 11.135957066189624, + "grad_norm": 1.7156434059143066, + "learning_rate": 6.344765342960288e-07, + "loss": 0.2451, + "step": 12450 + }, + { + "epoch": 11.14490161001789, + "grad_norm": 1.7634798288345337, + "learning_rate": 6.341756919374247e-07, + "loss": 0.2525, + "step": 12460 + }, + { + "epoch": 11.153846153846153, + "grad_norm": 1.6508985757827759, + "learning_rate": 6.338748495788206e-07, + "loss": 0.2492, + "step": 12470 + }, + { + "epoch": 11.162790697674419, + "grad_norm": 2.021599769592285, + "learning_rate": 6.335740072202166e-07, + "loss": 0.2534, + "step": 12480 + }, + { + "epoch": 11.171735241502683, + "grad_norm": 1.6555845737457275, + "learning_rate": 6.332731648616125e-07, + "loss": 0.2561, + "step": 12490 + }, + { + "epoch": 11.180679785330948, + "grad_norm": 1.904301404953003, + "learning_rate": 6.329723225030084e-07, + "loss": 0.243, + "step": 12500 + }, + { + "epoch": 11.189624329159212, + "grad_norm": 1.5915064811706543, + "learning_rate": 6.326714801444043e-07, + "loss": 0.2526, + "step": 12510 + }, + { + "epoch": 11.198568872987478, + "grad_norm": 1.4680665731430054, + "learning_rate": 6.323706377858003e-07, + "loss": 0.2336, + "step": 12520 + }, + { + "epoch": 11.207513416815742, + "grad_norm": 1.414535641670227, + "learning_rate": 6.320697954271961e-07, + "loss": 0.2491, + "step": 12530 + }, + { + "epoch": 11.216457960644007, + "grad_norm": 1.5655280351638794, + "learning_rate": 6.31768953068592e-07, + "loss": 0.2554, + "step": 12540 + }, + { + "epoch": 11.225402504472273, + "grad_norm": 1.8156970739364624, + "learning_rate": 6.314681107099879e-07, + "loss": 0.2549, + "step": 12550 + }, + { + "epoch": 11.234347048300537, + "grad_norm": 1.654660940170288, + "learning_rate": 6.311672683513839e-07, + "loss": 0.2481, + "step": 12560 + }, + { + "epoch": 11.243291592128802, + "grad_norm": 1.5700427293777466, + "learning_rate": 6.308664259927797e-07, + "loss": 0.2534, + "step": 12570 + }, + { + "epoch": 11.252236135957066, + "grad_norm": 1.6425267457962036, + "learning_rate": 6.305655836341757e-07, + "loss": 0.257, + "step": 12580 + }, + { + "epoch": 11.261180679785332, + "grad_norm": 1.6263014078140259, + "learning_rate": 6.302647412755716e-07, + "loss": 0.2413, + "step": 12590 + }, + { + "epoch": 11.270125223613595, + "grad_norm": 1.5932801961898804, + "learning_rate": 6.299638989169674e-07, + "loss": 0.239, + "step": 12600 + }, + { + "epoch": 11.279069767441861, + "grad_norm": 1.8274989128112793, + "learning_rate": 6.296630565583634e-07, + "loss": 0.2491, + "step": 12610 + }, + { + "epoch": 11.288014311270125, + "grad_norm": 1.538745641708374, + "learning_rate": 6.293622141997593e-07, + "loss": 0.243, + "step": 12620 + }, + { + "epoch": 11.29695885509839, + "grad_norm": 1.4591760635375977, + "learning_rate": 6.290613718411552e-07, + "loss": 0.2395, + "step": 12630 + }, + { + "epoch": 11.305903398926654, + "grad_norm": 1.4508098363876343, + "learning_rate": 6.287605294825511e-07, + "loss": 0.236, + "step": 12640 + }, + { + "epoch": 11.31484794275492, + "grad_norm": 1.6408956050872803, + "learning_rate": 6.28459687123947e-07, + "loss": 0.2486, + "step": 12650 + }, + { + "epoch": 11.323792486583184, + "grad_norm": 1.584268569946289, + "learning_rate": 6.281588447653429e-07, + "loss": 0.2416, + "step": 12660 + }, + { + "epoch": 11.33273703041145, + "grad_norm": 1.9997392892837524, + "learning_rate": 6.278580024067388e-07, + "loss": 0.2643, + "step": 12670 + }, + { + "epoch": 11.341681574239713, + "grad_norm": 1.5162452459335327, + "learning_rate": 6.275571600481347e-07, + "loss": 0.2404, + "step": 12680 + }, + { + "epoch": 11.350626118067979, + "grad_norm": 1.5581248998641968, + "learning_rate": 6.272563176895307e-07, + "loss": 0.2503, + "step": 12690 + }, + { + "epoch": 11.359570661896242, + "grad_norm": 1.477333903312683, + "learning_rate": 6.269554753309265e-07, + "loss": 0.2379, + "step": 12700 + }, + { + "epoch": 11.368515205724508, + "grad_norm": 1.7269163131713867, + "learning_rate": 6.266546329723225e-07, + "loss": 0.2526, + "step": 12710 + }, + { + "epoch": 11.377459749552774, + "grad_norm": 1.6344338655471802, + "learning_rate": 6.263537906137184e-07, + "loss": 0.2481, + "step": 12720 + }, + { + "epoch": 11.386404293381037, + "grad_norm": 1.5223592519760132, + "learning_rate": 6.260529482551144e-07, + "loss": 0.2525, + "step": 12730 + }, + { + "epoch": 11.395348837209303, + "grad_norm": 1.904078722000122, + "learning_rate": 6.257521058965102e-07, + "loss": 0.2427, + "step": 12740 + }, + { + "epoch": 11.404293381037567, + "grad_norm": 1.5667195320129395, + "learning_rate": 6.254512635379062e-07, + "loss": 0.2402, + "step": 12750 + }, + { + "epoch": 11.413237924865832, + "grad_norm": 1.8779903650283813, + "learning_rate": 6.251504211793021e-07, + "loss": 0.251, + "step": 12760 + }, + { + "epoch": 11.422182468694096, + "grad_norm": 1.4085509777069092, + "learning_rate": 6.248495788206978e-07, + "loss": 0.2532, + "step": 12770 + }, + { + "epoch": 11.431127012522362, + "grad_norm": 1.5099104642868042, + "learning_rate": 6.245487364620938e-07, + "loss": 0.2482, + "step": 12780 + }, + { + "epoch": 11.440071556350626, + "grad_norm": 1.6909841299057007, + "learning_rate": 6.242478941034897e-07, + "loss": 0.2376, + "step": 12790 + }, + { + "epoch": 11.449016100178891, + "grad_norm": 1.5439828634262085, + "learning_rate": 6.239470517448856e-07, + "loss": 0.2375, + "step": 12800 + }, + { + "epoch": 11.457960644007155, + "grad_norm": 1.8462985754013062, + "learning_rate": 6.236462093862815e-07, + "loss": 0.2528, + "step": 12810 + }, + { + "epoch": 11.46690518783542, + "grad_norm": 1.5367786884307861, + "learning_rate": 6.233453670276775e-07, + "loss": 0.2409, + "step": 12820 + }, + { + "epoch": 11.475849731663684, + "grad_norm": 1.5457161664962769, + "learning_rate": 6.230445246690734e-07, + "loss": 0.2508, + "step": 12830 + }, + { + "epoch": 11.48479427549195, + "grad_norm": 1.5234055519104004, + "learning_rate": 6.227436823104693e-07, + "loss": 0.2439, + "step": 12840 + }, + { + "epoch": 11.493738819320214, + "grad_norm": 1.925322413444519, + "learning_rate": 6.224428399518652e-07, + "loss": 0.2497, + "step": 12850 + }, + { + "epoch": 11.50268336314848, + "grad_norm": 1.5820813179016113, + "learning_rate": 6.221419975932612e-07, + "loss": 0.2616, + "step": 12860 + }, + { + "epoch": 11.511627906976745, + "grad_norm": 1.547390341758728, + "learning_rate": 6.21841155234657e-07, + "loss": 0.2432, + "step": 12870 + }, + { + "epoch": 11.520572450805009, + "grad_norm": 1.5614700317382812, + "learning_rate": 6.215403128760529e-07, + "loss": 0.2466, + "step": 12880 + }, + { + "epoch": 11.529516994633275, + "grad_norm": 1.9572466611862183, + "learning_rate": 6.212394705174488e-07, + "loss": 0.253, + "step": 12890 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 1.6293258666992188, + "learning_rate": 6.209386281588448e-07, + "loss": 0.2541, + "step": 12900 + }, + { + "epoch": 11.547406082289804, + "grad_norm": 1.781266212463379, + "learning_rate": 6.206377858002406e-07, + "loss": 0.2503, + "step": 12910 + }, + { + "epoch": 11.556350626118068, + "grad_norm": 1.7980941534042358, + "learning_rate": 6.203369434416366e-07, + "loss": 0.2477, + "step": 12920 + }, + { + "epoch": 11.565295169946333, + "grad_norm": 1.8397564888000488, + "learning_rate": 6.200361010830325e-07, + "loss": 0.2472, + "step": 12930 + }, + { + "epoch": 11.574239713774597, + "grad_norm": 1.603770136833191, + "learning_rate": 6.197352587244283e-07, + "loss": 0.2457, + "step": 12940 + }, + { + "epoch": 11.583184257602863, + "grad_norm": 1.601441740989685, + "learning_rate": 6.194344163658243e-07, + "loss": 0.2462, + "step": 12950 + }, + { + "epoch": 11.592128801431127, + "grad_norm": 1.6930180788040161, + "learning_rate": 6.191335740072202e-07, + "loss": 0.2463, + "step": 12960 + }, + { + "epoch": 11.601073345259392, + "grad_norm": 2.2629811763763428, + "learning_rate": 6.188327316486161e-07, + "loss": 0.2463, + "step": 12970 + }, + { + "epoch": 11.610017889087656, + "grad_norm": 1.9006010293960571, + "learning_rate": 6.18531889290012e-07, + "loss": 0.2487, + "step": 12980 + }, + { + "epoch": 11.618962432915922, + "grad_norm": 1.6862515211105347, + "learning_rate": 6.182310469314079e-07, + "loss": 0.2367, + "step": 12990 + }, + { + "epoch": 11.627906976744185, + "grad_norm": 1.5999614000320435, + "learning_rate": 6.179302045728038e-07, + "loss": 0.2415, + "step": 13000 + }, + { + "epoch": 11.636851520572451, + "grad_norm": 2.216756582260132, + "learning_rate": 6.176293622141997e-07, + "loss": 0.2416, + "step": 13010 + }, + { + "epoch": 11.645796064400715, + "grad_norm": 1.8712037801742554, + "learning_rate": 6.173285198555956e-07, + "loss": 0.2471, + "step": 13020 + }, + { + "epoch": 11.65474060822898, + "grad_norm": 1.4762934446334839, + "learning_rate": 6.170276774969916e-07, + "loss": 0.2512, + "step": 13030 + }, + { + "epoch": 11.663685152057244, + "grad_norm": 1.8089286088943481, + "learning_rate": 6.167268351383874e-07, + "loss": 0.2418, + "step": 13040 + }, + { + "epoch": 11.67262969588551, + "grad_norm": 1.5072790384292603, + "learning_rate": 6.164259927797834e-07, + "loss": 0.2431, + "step": 13050 + }, + { + "epoch": 11.681574239713775, + "grad_norm": 1.778215765953064, + "learning_rate": 6.161251504211793e-07, + "loss": 0.2433, + "step": 13060 + }, + { + "epoch": 11.69051878354204, + "grad_norm": 2.7389516830444336, + "learning_rate": 6.158243080625753e-07, + "loss": 0.2508, + "step": 13070 + }, + { + "epoch": 11.699463327370305, + "grad_norm": 1.4133468866348267, + "learning_rate": 6.155234657039711e-07, + "loss": 0.2408, + "step": 13080 + }, + { + "epoch": 11.708407871198569, + "grad_norm": 1.6936426162719727, + "learning_rate": 6.152226233453671e-07, + "loss": 0.2542, + "step": 13090 + }, + { + "epoch": 11.717352415026834, + "grad_norm": 1.8718106746673584, + "learning_rate": 6.149217809867629e-07, + "loss": 0.2432, + "step": 13100 + }, + { + "epoch": 11.726296958855098, + "grad_norm": 1.8084053993225098, + "learning_rate": 6.146209386281587e-07, + "loss": 0.2412, + "step": 13110 + }, + { + "epoch": 11.735241502683364, + "grad_norm": 1.9868453741073608, + "learning_rate": 6.143200962695547e-07, + "loss": 0.2508, + "step": 13120 + }, + { + "epoch": 11.744186046511627, + "grad_norm": 1.9329386949539185, + "learning_rate": 6.140192539109506e-07, + "loss": 0.2501, + "step": 13130 + }, + { + "epoch": 11.753130590339893, + "grad_norm": 1.5499323606491089, + "learning_rate": 6.137184115523465e-07, + "loss": 0.2395, + "step": 13140 + }, + { + "epoch": 11.762075134168157, + "grad_norm": 1.7129054069519043, + "learning_rate": 6.134175691937424e-07, + "loss": 0.247, + "step": 13150 + }, + { + "epoch": 11.771019677996422, + "grad_norm": 1.7108467817306519, + "learning_rate": 6.131167268351384e-07, + "loss": 0.2462, + "step": 13160 + }, + { + "epoch": 11.779964221824686, + "grad_norm": 1.5333002805709839, + "learning_rate": 6.128158844765343e-07, + "loss": 0.2364, + "step": 13170 + }, + { + "epoch": 11.788908765652952, + "grad_norm": 1.4564987421035767, + "learning_rate": 6.125150421179302e-07, + "loss": 0.2477, + "step": 13180 + }, + { + "epoch": 11.797853309481216, + "grad_norm": 1.80374276638031, + "learning_rate": 6.122141997593261e-07, + "loss": 0.2373, + "step": 13190 + }, + { + "epoch": 11.806797853309481, + "grad_norm": 1.8643357753753662, + "learning_rate": 6.119133574007221e-07, + "loss": 0.2483, + "step": 13200 + }, + { + "epoch": 11.815742397137747, + "grad_norm": 1.589076042175293, + "learning_rate": 6.116125150421178e-07, + "loss": 0.2448, + "step": 13210 + }, + { + "epoch": 11.82468694096601, + "grad_norm": 1.6423367261886597, + "learning_rate": 6.113116726835138e-07, + "loss": 0.2399, + "step": 13220 + }, + { + "epoch": 11.833631484794276, + "grad_norm": 1.806188702583313, + "learning_rate": 6.110108303249097e-07, + "loss": 0.2454, + "step": 13230 + }, + { + "epoch": 11.84257602862254, + "grad_norm": 1.4810848236083984, + "learning_rate": 6.107099879663056e-07, + "loss": 0.2393, + "step": 13240 + }, + { + "epoch": 11.851520572450806, + "grad_norm": 1.4600383043289185, + "learning_rate": 6.104091456077015e-07, + "loss": 0.2497, + "step": 13250 + }, + { + "epoch": 11.86046511627907, + "grad_norm": 1.5684937238693237, + "learning_rate": 6.101083032490975e-07, + "loss": 0.236, + "step": 13260 + }, + { + "epoch": 11.869409660107335, + "grad_norm": 1.8886797428131104, + "learning_rate": 6.098074608904934e-07, + "loss": 0.2515, + "step": 13270 + }, + { + "epoch": 11.878354203935599, + "grad_norm": 1.8850244283676147, + "learning_rate": 6.095066185318892e-07, + "loss": 0.2448, + "step": 13280 + }, + { + "epoch": 11.887298747763865, + "grad_norm": 1.6738568544387817, + "learning_rate": 6.092057761732852e-07, + "loss": 0.2504, + "step": 13290 + }, + { + "epoch": 11.896243291592128, + "grad_norm": 2.228408098220825, + "learning_rate": 6.089049338146811e-07, + "loss": 0.2402, + "step": 13300 + }, + { + "epoch": 11.905187835420394, + "grad_norm": 1.6836262941360474, + "learning_rate": 6.08604091456077e-07, + "loss": 0.2387, + "step": 13310 + }, + { + "epoch": 11.914132379248658, + "grad_norm": 1.552213430404663, + "learning_rate": 6.083032490974728e-07, + "loss": 0.2374, + "step": 13320 + }, + { + "epoch": 11.923076923076923, + "grad_norm": 1.9206942319869995, + "learning_rate": 6.080024067388688e-07, + "loss": 0.25, + "step": 13330 + }, + { + "epoch": 11.932021466905187, + "grad_norm": 1.483750343322754, + "learning_rate": 6.077015643802647e-07, + "loss": 0.2366, + "step": 13340 + }, + { + "epoch": 11.940966010733453, + "grad_norm": 1.670534372329712, + "learning_rate": 6.074007220216606e-07, + "loss": 0.2496, + "step": 13350 + }, + { + "epoch": 11.949910554561717, + "grad_norm": 1.7745046615600586, + "learning_rate": 6.070998796630565e-07, + "loss": 0.2434, + "step": 13360 + }, + { + "epoch": 11.958855098389982, + "grad_norm": 1.43484365940094, + "learning_rate": 6.067990373044525e-07, + "loss": 0.2424, + "step": 13370 + }, + { + "epoch": 11.967799642218246, + "grad_norm": 2.0954678058624268, + "learning_rate": 6.064981949458483e-07, + "loss": 0.2462, + "step": 13380 + }, + { + "epoch": 11.976744186046512, + "grad_norm": 1.776699185371399, + "learning_rate": 6.061973525872443e-07, + "loss": 0.2324, + "step": 13390 + }, + { + "epoch": 11.985688729874777, + "grad_norm": 1.5925743579864502, + "learning_rate": 6.058965102286402e-07, + "loss": 0.2372, + "step": 13400 + }, + { + "epoch": 11.994633273703041, + "grad_norm": 1.5993913412094116, + "learning_rate": 6.055956678700361e-07, + "loss": 0.236, + "step": 13410 + }, + { + "epoch": 12.0, + "eval_bleu": 74.4219, + "eval_gen_len": 74.833, + "eval_loss": 0.18223372101783752, + "eval_runtime": 57.8879, + "eval_samples_per_second": 18.0, + "eval_steps_per_second": 0.19, + "step": 13416 + }, + { + "epoch": 12.003577817531307, + "grad_norm": 1.4651010036468506, + "learning_rate": 6.05294825511432e-07, + "loss": 0.2337, + "step": 13420 + }, + { + "epoch": 12.01252236135957, + "grad_norm": 1.615624189376831, + "learning_rate": 6.049939831528279e-07, + "loss": 0.2309, + "step": 13430 + }, + { + "epoch": 12.021466905187836, + "grad_norm": 1.7083666324615479, + "learning_rate": 6.046931407942238e-07, + "loss": 0.2513, + "step": 13440 + }, + { + "epoch": 12.0304114490161, + "grad_norm": 1.6717640161514282, + "learning_rate": 6.043922984356197e-07, + "loss": 0.2406, + "step": 13450 + }, + { + "epoch": 12.039355992844365, + "grad_norm": 1.376678228378296, + "learning_rate": 6.040914560770156e-07, + "loss": 0.2402, + "step": 13460 + }, + { + "epoch": 12.04830053667263, + "grad_norm": 1.4994889497756958, + "learning_rate": 6.037906137184115e-07, + "loss": 0.2328, + "step": 13470 + }, + { + "epoch": 12.057245080500895, + "grad_norm": 1.5415583848953247, + "learning_rate": 6.034897713598074e-07, + "loss": 0.2435, + "step": 13480 + }, + { + "epoch": 12.066189624329159, + "grad_norm": 1.8038772344589233, + "learning_rate": 6.031889290012033e-07, + "loss": 0.2404, + "step": 13490 + }, + { + "epoch": 12.075134168157424, + "grad_norm": 1.442786693572998, + "learning_rate": 6.028880866425993e-07, + "loss": 0.2309, + "step": 13500 + }, + { + "epoch": 12.084078711985688, + "grad_norm": 1.993149757385254, + "learning_rate": 6.025872442839952e-07, + "loss": 0.2324, + "step": 13510 + }, + { + "epoch": 12.093023255813954, + "grad_norm": 1.6655985116958618, + "learning_rate": 6.022864019253911e-07, + "loss": 0.2397, + "step": 13520 + }, + { + "epoch": 12.101967799642217, + "grad_norm": 1.6622962951660156, + "learning_rate": 6.01985559566787e-07, + "loss": 0.2349, + "step": 13530 + }, + { + "epoch": 12.110912343470483, + "grad_norm": 1.6364377737045288, + "learning_rate": 6.01684717208183e-07, + "loss": 0.2277, + "step": 13540 + }, + { + "epoch": 12.119856887298747, + "grad_norm": 1.8937984704971313, + "learning_rate": 6.013838748495787e-07, + "loss": 0.244, + "step": 13550 + }, + { + "epoch": 12.128801431127012, + "grad_norm": 1.9228928089141846, + "learning_rate": 6.010830324909747e-07, + "loss": 0.2334, + "step": 13560 + }, + { + "epoch": 12.137745974955278, + "grad_norm": 1.6912461519241333, + "learning_rate": 6.007821901323706e-07, + "loss": 0.2524, + "step": 13570 + }, + { + "epoch": 12.146690518783542, + "grad_norm": 1.6879265308380127, + "learning_rate": 6.004813477737665e-07, + "loss": 0.2405, + "step": 13580 + }, + { + "epoch": 12.155635062611807, + "grad_norm": 1.6367559432983398, + "learning_rate": 6.001805054151624e-07, + "loss": 0.2489, + "step": 13590 + }, + { + "epoch": 12.164579606440071, + "grad_norm": 1.880110502243042, + "learning_rate": 5.998796630565584e-07, + "loss": 0.239, + "step": 13600 + }, + { + "epoch": 12.173524150268337, + "grad_norm": 1.7080553770065308, + "learning_rate": 5.995788206979543e-07, + "loss": 0.2482, + "step": 13610 + }, + { + "epoch": 12.1824686940966, + "grad_norm": 1.6933702230453491, + "learning_rate": 5.992779783393502e-07, + "loss": 0.2411, + "step": 13620 + }, + { + "epoch": 12.191413237924866, + "grad_norm": 1.5030890703201294, + "learning_rate": 5.989771359807461e-07, + "loss": 0.2363, + "step": 13630 + }, + { + "epoch": 12.20035778175313, + "grad_norm": 2.3736958503723145, + "learning_rate": 5.98676293622142e-07, + "loss": 0.2323, + "step": 13640 + }, + { + "epoch": 12.209302325581396, + "grad_norm": 1.4384323358535767, + "learning_rate": 5.983754512635378e-07, + "loss": 0.2327, + "step": 13650 + }, + { + "epoch": 12.21824686940966, + "grad_norm": 1.5894116163253784, + "learning_rate": 5.980746089049337e-07, + "loss": 0.2536, + "step": 13660 + }, + { + "epoch": 12.227191413237925, + "grad_norm": 1.9905613660812378, + "learning_rate": 5.977737665463297e-07, + "loss": 0.2452, + "step": 13670 + }, + { + "epoch": 12.236135957066189, + "grad_norm": 1.5962880849838257, + "learning_rate": 5.974729241877256e-07, + "loss": 0.2286, + "step": 13680 + }, + { + "epoch": 12.245080500894455, + "grad_norm": 1.4475836753845215, + "learning_rate": 5.971720818291215e-07, + "loss": 0.2287, + "step": 13690 + }, + { + "epoch": 12.254025044722718, + "grad_norm": 1.6959360837936401, + "learning_rate": 5.968712394705174e-07, + "loss": 0.2525, + "step": 13700 + }, + { + "epoch": 12.262969588550984, + "grad_norm": 1.7910102605819702, + "learning_rate": 5.965703971119134e-07, + "loss": 0.235, + "step": 13710 + }, + { + "epoch": 12.271914132379248, + "grad_norm": 1.5741043090820312, + "learning_rate": 5.962695547533092e-07, + "loss": 0.2418, + "step": 13720 + }, + { + "epoch": 12.280858676207513, + "grad_norm": 1.4918581247329712, + "learning_rate": 5.959687123947052e-07, + "loss": 0.2271, + "step": 13730 + }, + { + "epoch": 12.289803220035779, + "grad_norm": 1.4841476678848267, + "learning_rate": 5.956678700361011e-07, + "loss": 0.2324, + "step": 13740 + }, + { + "epoch": 12.298747763864043, + "grad_norm": 1.6960680484771729, + "learning_rate": 5.95367027677497e-07, + "loss": 0.2242, + "step": 13750 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 1.4989248514175415, + "learning_rate": 5.950661853188928e-07, + "loss": 0.2342, + "step": 13760 + }, + { + "epoch": 12.316636851520572, + "grad_norm": 1.4370133876800537, + "learning_rate": 5.947653429602888e-07, + "loss": 0.2478, + "step": 13770 + }, + { + "epoch": 12.325581395348838, + "grad_norm": 1.4599870443344116, + "learning_rate": 5.944645006016847e-07, + "loss": 0.2353, + "step": 13780 + }, + { + "epoch": 12.334525939177102, + "grad_norm": 1.5223127603530884, + "learning_rate": 5.941636582430806e-07, + "loss": 0.2349, + "step": 13790 + }, + { + "epoch": 12.343470483005367, + "grad_norm": 1.4748775959014893, + "learning_rate": 5.938628158844765e-07, + "loss": 0.2312, + "step": 13800 + }, + { + "epoch": 12.352415026833631, + "grad_norm": 1.7307710647583008, + "learning_rate": 5.935619735258724e-07, + "loss": 0.2412, + "step": 13810 + }, + { + "epoch": 12.361359570661897, + "grad_norm": 1.6681541204452515, + "learning_rate": 5.932611311672683e-07, + "loss": 0.243, + "step": 13820 + }, + { + "epoch": 12.37030411449016, + "grad_norm": 1.7818663120269775, + "learning_rate": 5.929602888086642e-07, + "loss": 0.2326, + "step": 13830 + }, + { + "epoch": 12.379248658318426, + "grad_norm": 1.616071343421936, + "learning_rate": 5.926594464500602e-07, + "loss": 0.2371, + "step": 13840 + }, + { + "epoch": 12.38819320214669, + "grad_norm": 1.447346568107605, + "learning_rate": 5.923586040914561e-07, + "loss": 0.2448, + "step": 13850 + }, + { + "epoch": 12.397137745974955, + "grad_norm": 1.6985827684402466, + "learning_rate": 5.92057761732852e-07, + "loss": 0.2446, + "step": 13860 + }, + { + "epoch": 12.40608228980322, + "grad_norm": 2.0520944595336914, + "learning_rate": 5.917569193742478e-07, + "loss": 0.2266, + "step": 13870 + }, + { + "epoch": 12.415026833631485, + "grad_norm": 1.4311727285385132, + "learning_rate": 5.914560770156438e-07, + "loss": 0.2322, + "step": 13880 + }, + { + "epoch": 12.42397137745975, + "grad_norm": 1.4834709167480469, + "learning_rate": 5.911552346570396e-07, + "loss": 0.233, + "step": 13890 + }, + { + "epoch": 12.432915921288014, + "grad_norm": 1.5307376384735107, + "learning_rate": 5.908543922984356e-07, + "loss": 0.2346, + "step": 13900 + }, + { + "epoch": 12.44186046511628, + "grad_norm": 1.6108955144882202, + "learning_rate": 5.905535499398315e-07, + "loss": 0.2394, + "step": 13910 + }, + { + "epoch": 12.450805008944544, + "grad_norm": 1.6023128032684326, + "learning_rate": 5.902527075812274e-07, + "loss": 0.24, + "step": 13920 + }, + { + "epoch": 12.45974955277281, + "grad_norm": 1.350578784942627, + "learning_rate": 5.899518652226233e-07, + "loss": 0.2354, + "step": 13930 + }, + { + "epoch": 12.468694096601073, + "grad_norm": 1.5131261348724365, + "learning_rate": 5.896510228640193e-07, + "loss": 0.2245, + "step": 13940 + }, + { + "epoch": 12.477638640429339, + "grad_norm": 2.112520456314087, + "learning_rate": 5.893501805054152e-07, + "loss": 0.2441, + "step": 13950 + }, + { + "epoch": 12.486583184257602, + "grad_norm": 1.790661096572876, + "learning_rate": 5.890493381468111e-07, + "loss": 0.2312, + "step": 13960 + }, + { + "epoch": 12.495527728085868, + "grad_norm": 1.4098833799362183, + "learning_rate": 5.88748495788207e-07, + "loss": 0.2353, + "step": 13970 + }, + { + "epoch": 12.504472271914132, + "grad_norm": 1.5231249332427979, + "learning_rate": 5.88447653429603e-07, + "loss": 0.235, + "step": 13980 + }, + { + "epoch": 12.513416815742398, + "grad_norm": 1.5738141536712646, + "learning_rate": 5.881468110709987e-07, + "loss": 0.2339, + "step": 13990 + }, + { + "epoch": 12.522361359570661, + "grad_norm": 1.6588445901870728, + "learning_rate": 5.878459687123946e-07, + "loss": 0.2472, + "step": 14000 + }, + { + "epoch": 12.531305903398927, + "grad_norm": 1.5778062343597412, + "learning_rate": 5.875451263537906e-07, + "loss": 0.2351, + "step": 14010 + }, + { + "epoch": 12.54025044722719, + "grad_norm": 1.7402443885803223, + "learning_rate": 5.872442839951864e-07, + "loss": 0.2331, + "step": 14020 + }, + { + "epoch": 12.549194991055456, + "grad_norm": 1.3807802200317383, + "learning_rate": 5.869434416365824e-07, + "loss": 0.2381, + "step": 14030 + }, + { + "epoch": 12.55813953488372, + "grad_norm": 1.5534865856170654, + "learning_rate": 5.866425992779783e-07, + "loss": 0.2285, + "step": 14040 + }, + { + "epoch": 12.567084078711986, + "grad_norm": 1.992496132850647, + "learning_rate": 5.863417569193743e-07, + "loss": 0.2476, + "step": 14050 + }, + { + "epoch": 12.57602862254025, + "grad_norm": 1.708743691444397, + "learning_rate": 5.860409145607701e-07, + "loss": 0.2262, + "step": 14060 + }, + { + "epoch": 12.584973166368515, + "grad_norm": 1.4764150381088257, + "learning_rate": 5.857400722021661e-07, + "loss": 0.2248, + "step": 14070 + }, + { + "epoch": 12.59391771019678, + "grad_norm": 1.5710704326629639, + "learning_rate": 5.85439229843562e-07, + "loss": 0.2392, + "step": 14080 + }, + { + "epoch": 12.602862254025045, + "grad_norm": 1.4406896829605103, + "learning_rate": 5.851383874849578e-07, + "loss": 0.233, + "step": 14090 + }, + { + "epoch": 12.61180679785331, + "grad_norm": 1.868032455444336, + "learning_rate": 5.848375451263537e-07, + "loss": 0.2333, + "step": 14100 + }, + { + "epoch": 12.620751341681574, + "grad_norm": 1.7434061765670776, + "learning_rate": 5.845367027677497e-07, + "loss": 0.2436, + "step": 14110 + }, + { + "epoch": 12.62969588550984, + "grad_norm": 1.3921831846237183, + "learning_rate": 5.842358604091456e-07, + "loss": 0.226, + "step": 14120 + }, + { + "epoch": 12.638640429338103, + "grad_norm": 1.5419607162475586, + "learning_rate": 5.839350180505415e-07, + "loss": 0.2331, + "step": 14130 + }, + { + "epoch": 12.647584973166369, + "grad_norm": 1.5571625232696533, + "learning_rate": 5.836341756919374e-07, + "loss": 0.2235, + "step": 14140 + }, + { + "epoch": 12.656529516994633, + "grad_norm": 1.3981750011444092, + "learning_rate": 5.833333333333334e-07, + "loss": 0.2311, + "step": 14150 + }, + { + "epoch": 12.665474060822898, + "grad_norm": 1.5649229288101196, + "learning_rate": 5.830324909747292e-07, + "loss": 0.2495, + "step": 14160 + }, + { + "epoch": 12.674418604651162, + "grad_norm": 1.7458295822143555, + "learning_rate": 5.827316486161251e-07, + "loss": 0.2369, + "step": 14170 + }, + { + "epoch": 12.683363148479428, + "grad_norm": 1.5172150135040283, + "learning_rate": 5.824308062575211e-07, + "loss": 0.2297, + "step": 14180 + }, + { + "epoch": 12.692307692307692, + "grad_norm": 1.622676968574524, + "learning_rate": 5.821299638989168e-07, + "loss": 0.2294, + "step": 14190 + }, + { + "epoch": 12.701252236135957, + "grad_norm": 1.3619602918624878, + "learning_rate": 5.818291215403128e-07, + "loss": 0.2456, + "step": 14200 + }, + { + "epoch": 12.710196779964221, + "grad_norm": 1.630332589149475, + "learning_rate": 5.815282791817087e-07, + "loss": 0.229, + "step": 14210 + }, + { + "epoch": 12.719141323792487, + "grad_norm": 1.5032150745391846, + "learning_rate": 5.812274368231047e-07, + "loss": 0.2365, + "step": 14220 + }, + { + "epoch": 12.728085867620752, + "grad_norm": 1.4477381706237793, + "learning_rate": 5.809265944645005e-07, + "loss": 0.2284, + "step": 14230 + }, + { + "epoch": 12.737030411449016, + "grad_norm": 1.5683680772781372, + "learning_rate": 5.806257521058965e-07, + "loss": 0.2225, + "step": 14240 + }, + { + "epoch": 12.745974955277282, + "grad_norm": 1.756728172302246, + "learning_rate": 5.803249097472924e-07, + "loss": 0.2287, + "step": 14250 + }, + { + "epoch": 12.754919499105545, + "grad_norm": 1.389137864112854, + "learning_rate": 5.800240673886883e-07, + "loss": 0.2249, + "step": 14260 + }, + { + "epoch": 12.763864042933811, + "grad_norm": 1.4172710180282593, + "learning_rate": 5.797232250300842e-07, + "loss": 0.2336, + "step": 14270 + }, + { + "epoch": 12.772808586762075, + "grad_norm": 1.6355493068695068, + "learning_rate": 5.794223826714802e-07, + "loss": 0.2387, + "step": 14280 + }, + { + "epoch": 12.78175313059034, + "grad_norm": 1.4162112474441528, + "learning_rate": 5.791215403128761e-07, + "loss": 0.2295, + "step": 14290 + }, + { + "epoch": 12.790697674418604, + "grad_norm": 1.554172396659851, + "learning_rate": 5.78820697954272e-07, + "loss": 0.2247, + "step": 14300 + }, + { + "epoch": 12.79964221824687, + "grad_norm": 1.6874934434890747, + "learning_rate": 5.785198555956678e-07, + "loss": 0.2308, + "step": 14310 + }, + { + "epoch": 12.808586762075134, + "grad_norm": 1.5708050727844238, + "learning_rate": 5.782190132370638e-07, + "loss": 0.2256, + "step": 14320 + }, + { + "epoch": 12.8175313059034, + "grad_norm": 1.644698977470398, + "learning_rate": 5.779181708784596e-07, + "loss": 0.2274, + "step": 14330 + }, + { + "epoch": 12.826475849731663, + "grad_norm": 1.6415210962295532, + "learning_rate": 5.776173285198555e-07, + "loss": 0.2334, + "step": 14340 + }, + { + "epoch": 12.835420393559929, + "grad_norm": 1.6948776245117188, + "learning_rate": 5.773164861612515e-07, + "loss": 0.2389, + "step": 14350 + }, + { + "epoch": 12.844364937388193, + "grad_norm": 1.391992449760437, + "learning_rate": 5.770156438026473e-07, + "loss": 0.2262, + "step": 14360 + }, + { + "epoch": 12.853309481216458, + "grad_norm": 1.5542418956756592, + "learning_rate": 5.767148014440433e-07, + "loss": 0.2391, + "step": 14370 + }, + { + "epoch": 12.862254025044722, + "grad_norm": 1.5921099185943604, + "learning_rate": 5.764139590854392e-07, + "loss": 0.2271, + "step": 14380 + }, + { + "epoch": 12.871198568872988, + "grad_norm": 1.3908007144927979, + "learning_rate": 5.761131167268352e-07, + "loss": 0.226, + "step": 14390 + }, + { + "epoch": 12.880143112701251, + "grad_norm": 1.539855718612671, + "learning_rate": 5.75812274368231e-07, + "loss": 0.2336, + "step": 14400 + }, + { + "epoch": 12.889087656529517, + "grad_norm": 1.658566951751709, + "learning_rate": 5.75511432009627e-07, + "loss": 0.2375, + "step": 14410 + }, + { + "epoch": 12.898032200357783, + "grad_norm": 1.6505448818206787, + "learning_rate": 5.752105896510228e-07, + "loss": 0.2309, + "step": 14420 + }, + { + "epoch": 12.906976744186046, + "grad_norm": 1.6353178024291992, + "learning_rate": 5.749097472924187e-07, + "loss": 0.2266, + "step": 14430 + }, + { + "epoch": 12.915921288014312, + "grad_norm": 1.521298885345459, + "learning_rate": 5.746089049338146e-07, + "loss": 0.2257, + "step": 14440 + }, + { + "epoch": 12.924865831842576, + "grad_norm": 1.3476217985153198, + "learning_rate": 5.743080625752106e-07, + "loss": 0.2366, + "step": 14450 + }, + { + "epoch": 12.933810375670841, + "grad_norm": 1.5443305969238281, + "learning_rate": 5.740072202166065e-07, + "loss": 0.2222, + "step": 14460 + }, + { + "epoch": 12.942754919499105, + "grad_norm": 1.4951808452606201, + "learning_rate": 5.737063778580024e-07, + "loss": 0.2311, + "step": 14470 + }, + { + "epoch": 12.95169946332737, + "grad_norm": 1.7990375757217407, + "learning_rate": 5.734055354993983e-07, + "loss": 0.2422, + "step": 14480 + }, + { + "epoch": 12.960644007155635, + "grad_norm": 1.728073000907898, + "learning_rate": 5.731046931407943e-07, + "loss": 0.2179, + "step": 14490 + }, + { + "epoch": 12.9695885509839, + "grad_norm": 2.882551908493042, + "learning_rate": 5.728038507821901e-07, + "loss": 0.2271, + "step": 14500 + }, + { + "epoch": 12.978533094812164, + "grad_norm": 1.6010873317718506, + "learning_rate": 5.72503008423586e-07, + "loss": 0.2243, + "step": 14510 + }, + { + "epoch": 12.98747763864043, + "grad_norm": 1.5712891817092896, + "learning_rate": 5.72202166064982e-07, + "loss": 0.2225, + "step": 14520 + }, + { + "epoch": 12.996422182468693, + "grad_norm": 1.4817034006118774, + "learning_rate": 5.719013237063777e-07, + "loss": 0.2265, + "step": 14530 + }, + { + "epoch": 13.0, + "eval_bleu": 75.1435, + "eval_gen_len": 74.9069, + "eval_loss": 0.17450717091560364, + "eval_runtime": 56.9118, + "eval_samples_per_second": 18.309, + "eval_steps_per_second": 0.193, + "step": 14534 + }, + { + "epoch": 13.005366726296959, + "grad_norm": 1.9442760944366455, + "learning_rate": 5.716004813477737e-07, + "loss": 0.2372, + "step": 14540 + }, + { + "epoch": 13.014311270125223, + "grad_norm": 1.7336536645889282, + "learning_rate": 5.712996389891696e-07, + "loss": 0.2355, + "step": 14550 + }, + { + "epoch": 13.023255813953488, + "grad_norm": 1.4192476272583008, + "learning_rate": 5.709987966305656e-07, + "loss": 0.2268, + "step": 14560 + }, + { + "epoch": 13.032200357781754, + "grad_norm": 1.541877269744873, + "learning_rate": 5.706979542719614e-07, + "loss": 0.2381, + "step": 14570 + }, + { + "epoch": 13.041144901610018, + "grad_norm": 1.6108826398849487, + "learning_rate": 5.703971119133574e-07, + "loss": 0.2286, + "step": 14580 + }, + { + "epoch": 13.050089445438283, + "grad_norm": 1.3249037265777588, + "learning_rate": 5.700962695547533e-07, + "loss": 0.2288, + "step": 14590 + }, + { + "epoch": 13.059033989266547, + "grad_norm": 1.6004801988601685, + "learning_rate": 5.697954271961492e-07, + "loss": 0.2288, + "step": 14600 + }, + { + "epoch": 13.067978533094813, + "grad_norm": 1.8404788970947266, + "learning_rate": 5.694945848375451e-07, + "loss": 0.2288, + "step": 14610 + }, + { + "epoch": 13.076923076923077, + "grad_norm": 1.4611536264419556, + "learning_rate": 5.691937424789411e-07, + "loss": 0.2241, + "step": 14620 + }, + { + "epoch": 13.085867620751342, + "grad_norm": 1.7837131023406982, + "learning_rate": 5.68892900120337e-07, + "loss": 0.2309, + "step": 14630 + }, + { + "epoch": 13.094812164579606, + "grad_norm": 1.7805612087249756, + "learning_rate": 5.685920577617328e-07, + "loss": 0.2299, + "step": 14640 + }, + { + "epoch": 13.103756708407872, + "grad_norm": 1.5109468698501587, + "learning_rate": 5.682912154031287e-07, + "loss": 0.2272, + "step": 14650 + }, + { + "epoch": 13.112701252236135, + "grad_norm": 1.4673620462417603, + "learning_rate": 5.679903730445247e-07, + "loss": 0.2296, + "step": 14660 + }, + { + "epoch": 13.121645796064401, + "grad_norm": 1.500759243965149, + "learning_rate": 5.676895306859205e-07, + "loss": 0.2218, + "step": 14670 + }, + { + "epoch": 13.130590339892665, + "grad_norm": 1.8135573863983154, + "learning_rate": 5.673886883273164e-07, + "loss": 0.2228, + "step": 14680 + }, + { + "epoch": 13.13953488372093, + "grad_norm": 1.4665645360946655, + "learning_rate": 5.670878459687124e-07, + "loss": 0.2258, + "step": 14690 + }, + { + "epoch": 13.148479427549194, + "grad_norm": 1.346288800239563, + "learning_rate": 5.667870036101082e-07, + "loss": 0.2231, + "step": 14700 + }, + { + "epoch": 13.15742397137746, + "grad_norm": 1.46870756149292, + "learning_rate": 5.664861612515042e-07, + "loss": 0.2237, + "step": 14710 + }, + { + "epoch": 13.166368515205724, + "grad_norm": 1.597507357597351, + "learning_rate": 5.661853188929001e-07, + "loss": 0.22, + "step": 14720 + }, + { + "epoch": 13.17531305903399, + "grad_norm": 1.7851216793060303, + "learning_rate": 5.658844765342961e-07, + "loss": 0.2346, + "step": 14730 + }, + { + "epoch": 13.184257602862255, + "grad_norm": 1.528058409690857, + "learning_rate": 5.655836341756919e-07, + "loss": 0.2262, + "step": 14740 + }, + { + "epoch": 13.193202146690519, + "grad_norm": 1.7863088846206665, + "learning_rate": 5.652827918170878e-07, + "loss": 0.2216, + "step": 14750 + }, + { + "epoch": 13.202146690518784, + "grad_norm": 1.3855292797088623, + "learning_rate": 5.649819494584837e-07, + "loss": 0.2202, + "step": 14760 + }, + { + "epoch": 13.211091234347048, + "grad_norm": 1.404234528541565, + "learning_rate": 5.646811070998796e-07, + "loss": 0.229, + "step": 14770 + }, + { + "epoch": 13.220035778175314, + "grad_norm": 1.9877007007598877, + "learning_rate": 5.643802647412755e-07, + "loss": 0.2174, + "step": 14780 + }, + { + "epoch": 13.228980322003578, + "grad_norm": 1.4733182191848755, + "learning_rate": 5.640794223826715e-07, + "loss": 0.2357, + "step": 14790 + }, + { + "epoch": 13.237924865831843, + "grad_norm": 1.4954806566238403, + "learning_rate": 5.637785800240674e-07, + "loss": 0.2372, + "step": 14800 + }, + { + "epoch": 13.246869409660107, + "grad_norm": 1.637521505355835, + "learning_rate": 5.634777376654633e-07, + "loss": 0.2226, + "step": 14810 + }, + { + "epoch": 13.255813953488373, + "grad_norm": 1.5723708868026733, + "learning_rate": 5.631768953068592e-07, + "loss": 0.2235, + "step": 14820 + }, + { + "epoch": 13.264758497316636, + "grad_norm": 1.7516061067581177, + "learning_rate": 5.628760529482552e-07, + "loss": 0.2335, + "step": 14830 + }, + { + "epoch": 13.273703041144902, + "grad_norm": 1.581664800643921, + "learning_rate": 5.62575210589651e-07, + "loss": 0.2274, + "step": 14840 + }, + { + "epoch": 13.282647584973166, + "grad_norm": 1.4926707744598389, + "learning_rate": 5.622743682310469e-07, + "loss": 0.2219, + "step": 14850 + }, + { + "epoch": 13.291592128801431, + "grad_norm": 1.4483482837677002, + "learning_rate": 5.619735258724428e-07, + "loss": 0.2319, + "step": 14860 + }, + { + "epoch": 13.300536672629695, + "grad_norm": 1.5603644847869873, + "learning_rate": 5.616726835138386e-07, + "loss": 0.2221, + "step": 14870 + }, + { + "epoch": 13.30948121645796, + "grad_norm": 1.8198426961898804, + "learning_rate": 5.613718411552346e-07, + "loss": 0.2286, + "step": 14880 + }, + { + "epoch": 13.318425760286225, + "grad_norm": 1.4116787910461426, + "learning_rate": 5.610709987966305e-07, + "loss": 0.2146, + "step": 14890 + }, + { + "epoch": 13.32737030411449, + "grad_norm": 1.5447922945022583, + "learning_rate": 5.607701564380265e-07, + "loss": 0.2247, + "step": 14900 + }, + { + "epoch": 13.336314847942756, + "grad_norm": 1.5551434755325317, + "learning_rate": 5.604693140794223e-07, + "loss": 0.231, + "step": 14910 + }, + { + "epoch": 13.34525939177102, + "grad_norm": 1.7243269681930542, + "learning_rate": 5.601684717208183e-07, + "loss": 0.2283, + "step": 14920 + }, + { + "epoch": 13.354203935599285, + "grad_norm": 1.6431998014450073, + "learning_rate": 5.598676293622142e-07, + "loss": 0.2137, + "step": 14930 + }, + { + "epoch": 13.363148479427549, + "grad_norm": 1.7388650178909302, + "learning_rate": 5.595667870036101e-07, + "loss": 0.2377, + "step": 14940 + }, + { + "epoch": 13.372093023255815, + "grad_norm": 1.2878618240356445, + "learning_rate": 5.59265944645006e-07, + "loss": 0.2393, + "step": 14950 + }, + { + "epoch": 13.381037567084078, + "grad_norm": 1.5231084823608398, + "learning_rate": 5.58965102286402e-07, + "loss": 0.2316, + "step": 14960 + }, + { + "epoch": 13.389982110912344, + "grad_norm": 1.5280569791793823, + "learning_rate": 5.586642599277977e-07, + "loss": 0.2186, + "step": 14970 + }, + { + "epoch": 13.398926654740608, + "grad_norm": 1.4062917232513428, + "learning_rate": 5.583634175691937e-07, + "loss": 0.2317, + "step": 14980 + }, + { + "epoch": 13.407871198568873, + "grad_norm": 1.6176033020019531, + "learning_rate": 5.580625752105896e-07, + "loss": 0.2296, + "step": 14990 + }, + { + "epoch": 13.416815742397137, + "grad_norm": 1.3912999629974365, + "learning_rate": 5.577617328519856e-07, + "loss": 0.2316, + "step": 15000 + }, + { + "epoch": 13.425760286225403, + "grad_norm": 1.6246581077575684, + "learning_rate": 5.574608904933814e-07, + "loss": 0.2355, + "step": 15010 + }, + { + "epoch": 13.434704830053667, + "grad_norm": 1.3514788150787354, + "learning_rate": 5.571600481347773e-07, + "loss": 0.2274, + "step": 15020 + }, + { + "epoch": 13.443649373881932, + "grad_norm": 1.55750572681427, + "learning_rate": 5.568592057761733e-07, + "loss": 0.2262, + "step": 15030 + }, + { + "epoch": 13.452593917710196, + "grad_norm": 1.4673680067062378, + "learning_rate": 5.565583634175691e-07, + "loss": 0.2322, + "step": 15040 + }, + { + "epoch": 13.461538461538462, + "grad_norm": 1.9114556312561035, + "learning_rate": 5.562575210589651e-07, + "loss": 0.2168, + "step": 15050 + }, + { + "epoch": 13.470483005366725, + "grad_norm": 1.3632735013961792, + "learning_rate": 5.55956678700361e-07, + "loss": 0.2199, + "step": 15060 + }, + { + "epoch": 13.479427549194991, + "grad_norm": 1.4581695795059204, + "learning_rate": 5.55655836341757e-07, + "loss": 0.225, + "step": 15070 + }, + { + "epoch": 13.488372093023255, + "grad_norm": 1.6625158786773682, + "learning_rate": 5.553549939831527e-07, + "loss": 0.2325, + "step": 15080 + }, + { + "epoch": 13.49731663685152, + "grad_norm": 1.43048894405365, + "learning_rate": 5.550541516245487e-07, + "loss": 0.2253, + "step": 15090 + }, + { + "epoch": 13.506261180679786, + "grad_norm": 1.6861284971237183, + "learning_rate": 5.547533092659446e-07, + "loss": 0.222, + "step": 15100 + }, + { + "epoch": 13.51520572450805, + "grad_norm": 1.7863863706588745, + "learning_rate": 5.544524669073405e-07, + "loss": 0.2173, + "step": 15110 + }, + { + "epoch": 13.524150268336316, + "grad_norm": 1.679726481437683, + "learning_rate": 5.541516245487364e-07, + "loss": 0.2273, + "step": 15120 + }, + { + "epoch": 13.53309481216458, + "grad_norm": 1.5793452262878418, + "learning_rate": 5.538507821901324e-07, + "loss": 0.2241, + "step": 15130 + }, + { + "epoch": 13.542039355992845, + "grad_norm": 1.9857158660888672, + "learning_rate": 5.535499398315282e-07, + "loss": 0.2256, + "step": 15140 + }, + { + "epoch": 13.550983899821109, + "grad_norm": 1.6583348512649536, + "learning_rate": 5.532490974729242e-07, + "loss": 0.2263, + "step": 15150 + }, + { + "epoch": 13.559928443649374, + "grad_norm": 1.7588074207305908, + "learning_rate": 5.529482551143201e-07, + "loss": 0.229, + "step": 15160 + }, + { + "epoch": 13.568872987477638, + "grad_norm": 1.573966145515442, + "learning_rate": 5.526474127557161e-07, + "loss": 0.2261, + "step": 15170 + }, + { + "epoch": 13.577817531305904, + "grad_norm": 1.895883321762085, + "learning_rate": 5.523465703971119e-07, + "loss": 0.222, + "step": 15180 + }, + { + "epoch": 13.586762075134168, + "grad_norm": 1.5346202850341797, + "learning_rate": 5.520457280385077e-07, + "loss": 0.2263, + "step": 15190 + }, + { + "epoch": 13.595706618962433, + "grad_norm": 1.743774175643921, + "learning_rate": 5.517448856799037e-07, + "loss": 0.2198, + "step": 15200 + }, + { + "epoch": 13.604651162790697, + "grad_norm": 1.6069917678833008, + "learning_rate": 5.514440433212995e-07, + "loss": 0.2261, + "step": 15210 + }, + { + "epoch": 13.613595706618963, + "grad_norm": 1.4142626523971558, + "learning_rate": 5.511432009626955e-07, + "loss": 0.2206, + "step": 15220 + }, + { + "epoch": 13.622540250447226, + "grad_norm": 1.695739984512329, + "learning_rate": 5.508423586040914e-07, + "loss": 0.2306, + "step": 15230 + }, + { + "epoch": 13.631484794275492, + "grad_norm": 1.6444977521896362, + "learning_rate": 5.505415162454874e-07, + "loss": 0.229, + "step": 15240 + }, + { + "epoch": 13.640429338103758, + "grad_norm": 1.3577944040298462, + "learning_rate": 5.502406738868832e-07, + "loss": 0.2212, + "step": 15250 + }, + { + "epoch": 13.649373881932021, + "grad_norm": 1.6013015508651733, + "learning_rate": 5.499398315282792e-07, + "loss": 0.2244, + "step": 15260 + }, + { + "epoch": 13.658318425760287, + "grad_norm": 1.581383228302002, + "learning_rate": 5.496389891696751e-07, + "loss": 0.2318, + "step": 15270 + }, + { + "epoch": 13.66726296958855, + "grad_norm": 1.4835501909255981, + "learning_rate": 5.49338146811071e-07, + "loss": 0.2114, + "step": 15280 + }, + { + "epoch": 13.676207513416816, + "grad_norm": 1.5943286418914795, + "learning_rate": 5.490373044524669e-07, + "loss": 0.2187, + "step": 15290 + }, + { + "epoch": 13.68515205724508, + "grad_norm": 2.1119771003723145, + "learning_rate": 5.487364620938629e-07, + "loss": 0.2243, + "step": 15300 + }, + { + "epoch": 13.694096601073346, + "grad_norm": 1.923747181892395, + "learning_rate": 5.484356197352586e-07, + "loss": 0.2276, + "step": 15310 + }, + { + "epoch": 13.70304114490161, + "grad_norm": 1.8697950839996338, + "learning_rate": 5.481347773766546e-07, + "loss": 0.2199, + "step": 15320 + }, + { + "epoch": 13.711985688729875, + "grad_norm": 1.5428868532180786, + "learning_rate": 5.478339350180505e-07, + "loss": 0.2234, + "step": 15330 + }, + { + "epoch": 13.720930232558139, + "grad_norm": 1.7273482084274292, + "learning_rate": 5.475330926594465e-07, + "loss": 0.2277, + "step": 15340 + }, + { + "epoch": 13.729874776386405, + "grad_norm": 1.4095451831817627, + "learning_rate": 5.472322503008423e-07, + "loss": 0.2252, + "step": 15350 + }, + { + "epoch": 13.738819320214668, + "grad_norm": 1.2859090566635132, + "learning_rate": 5.469314079422383e-07, + "loss": 0.2246, + "step": 15360 + }, + { + "epoch": 13.747763864042934, + "grad_norm": 2.154752254486084, + "learning_rate": 5.466305655836342e-07, + "loss": 0.2145, + "step": 15370 + }, + { + "epoch": 13.756708407871198, + "grad_norm": 1.4549967050552368, + "learning_rate": 5.4632972322503e-07, + "loss": 0.2267, + "step": 15380 + }, + { + "epoch": 13.765652951699463, + "grad_norm": 1.5076959133148193, + "learning_rate": 5.46028880866426e-07, + "loss": 0.2286, + "step": 15390 + }, + { + "epoch": 13.774597495527727, + "grad_norm": 1.331883430480957, + "learning_rate": 5.457280385078219e-07, + "loss": 0.2102, + "step": 15400 + }, + { + "epoch": 13.783542039355993, + "grad_norm": 1.76063871383667, + "learning_rate": 5.454271961492179e-07, + "loss": 0.2262, + "step": 15410 + }, + { + "epoch": 13.792486583184257, + "grad_norm": 1.3071162700653076, + "learning_rate": 5.451263537906136e-07, + "loss": 0.2228, + "step": 15420 + }, + { + "epoch": 13.801431127012522, + "grad_norm": 1.6594287157058716, + "learning_rate": 5.448255114320096e-07, + "loss": 0.2219, + "step": 15430 + }, + { + "epoch": 13.810375670840788, + "grad_norm": 1.4981613159179688, + "learning_rate": 5.445246690734055e-07, + "loss": 0.2246, + "step": 15440 + }, + { + "epoch": 13.819320214669052, + "grad_norm": 2.1496357917785645, + "learning_rate": 5.442238267148014e-07, + "loss": 0.2329, + "step": 15450 + }, + { + "epoch": 13.828264758497317, + "grad_norm": 1.5639127492904663, + "learning_rate": 5.439229843561973e-07, + "loss": 0.2211, + "step": 15460 + }, + { + "epoch": 13.837209302325581, + "grad_norm": 1.3146132230758667, + "learning_rate": 5.436221419975933e-07, + "loss": 0.2098, + "step": 15470 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 1.4802268743515015, + "learning_rate": 5.433212996389891e-07, + "loss": 0.2156, + "step": 15480 + }, + { + "epoch": 13.85509838998211, + "grad_norm": 1.6789542436599731, + "learning_rate": 5.430204572803851e-07, + "loss": 0.2182, + "step": 15490 + }, + { + "epoch": 13.864042933810376, + "grad_norm": 1.58533775806427, + "learning_rate": 5.42719614921781e-07, + "loss": 0.2153, + "step": 15500 + }, + { + "epoch": 13.87298747763864, + "grad_norm": 1.6321181058883667, + "learning_rate": 5.42418772563177e-07, + "loss": 0.2185, + "step": 15510 + }, + { + "epoch": 13.881932021466906, + "grad_norm": 1.8955652713775635, + "learning_rate": 5.421179302045727e-07, + "loss": 0.223, + "step": 15520 + }, + { + "epoch": 13.89087656529517, + "grad_norm": 1.695584774017334, + "learning_rate": 5.418170878459687e-07, + "loss": 0.2254, + "step": 15530 + }, + { + "epoch": 13.899821109123435, + "grad_norm": 2.4142327308654785, + "learning_rate": 5.415162454873646e-07, + "loss": 0.2355, + "step": 15540 + }, + { + "epoch": 13.908765652951699, + "grad_norm": 1.4618829488754272, + "learning_rate": 5.412154031287604e-07, + "loss": 0.2249, + "step": 15550 + }, + { + "epoch": 13.917710196779964, + "grad_norm": 2.5095059871673584, + "learning_rate": 5.409145607701564e-07, + "loss": 0.2154, + "step": 15560 + }, + { + "epoch": 13.926654740608228, + "grad_norm": 1.5938920974731445, + "learning_rate": 5.406137184115523e-07, + "loss": 0.2304, + "step": 15570 + }, + { + "epoch": 13.935599284436494, + "grad_norm": 1.4232935905456543, + "learning_rate": 5.403128760529483e-07, + "loss": 0.2188, + "step": 15580 + }, + { + "epoch": 13.94454382826476, + "grad_norm": 1.2760659456253052, + "learning_rate": 5.400120336943441e-07, + "loss": 0.2173, + "step": 15590 + }, + { + "epoch": 13.953488372093023, + "grad_norm": 1.420570731163025, + "learning_rate": 5.397111913357401e-07, + "loss": 0.2174, + "step": 15600 + }, + { + "epoch": 13.962432915921289, + "grad_norm": 1.5642368793487549, + "learning_rate": 5.39410348977136e-07, + "loss": 0.2208, + "step": 15610 + }, + { + "epoch": 13.971377459749553, + "grad_norm": 1.8670076131820679, + "learning_rate": 5.391095066185319e-07, + "loss": 0.2174, + "step": 15620 + }, + { + "epoch": 13.980322003577818, + "grad_norm": 1.752993106842041, + "learning_rate": 5.388086642599277e-07, + "loss": 0.2235, + "step": 15630 + }, + { + "epoch": 13.989266547406082, + "grad_norm": 1.523247480392456, + "learning_rate": 5.385078219013237e-07, + "loss": 0.2222, + "step": 15640 + }, + { + "epoch": 13.998211091234348, + "grad_norm": 1.7766495943069458, + "learning_rate": 5.382069795427195e-07, + "loss": 0.2152, + "step": 15650 + }, + { + "epoch": 14.0, + "eval_bleu": 75.7614, + "eval_gen_len": 74.7409, + "eval_loss": 0.16948764026165009, + "eval_runtime": 58.1609, + "eval_samples_per_second": 17.916, + "eval_steps_per_second": 0.189, + "step": 15652 + }, + { + "epoch": 14.007155635062611, + "grad_norm": 1.6034523248672485, + "learning_rate": 5.379061371841155e-07, + "loss": 0.1954, + "step": 15660 + }, + { + "epoch": 14.016100178890877, + "grad_norm": 1.594677209854126, + "learning_rate": 5.376052948255114e-07, + "loss": 0.2191, + "step": 15670 + }, + { + "epoch": 14.02504472271914, + "grad_norm": 1.6265920400619507, + "learning_rate": 5.373044524669074e-07, + "loss": 0.2293, + "step": 15680 + }, + { + "epoch": 14.033989266547406, + "grad_norm": 1.4514870643615723, + "learning_rate": 5.370036101083032e-07, + "loss": 0.2187, + "step": 15690 + }, + { + "epoch": 14.04293381037567, + "grad_norm": 1.4903329610824585, + "learning_rate": 5.367027677496992e-07, + "loss": 0.222, + "step": 15700 + }, + { + "epoch": 14.051878354203936, + "grad_norm": 1.57286536693573, + "learning_rate": 5.364019253910951e-07, + "loss": 0.2299, + "step": 15710 + }, + { + "epoch": 14.0608228980322, + "grad_norm": 1.7294665575027466, + "learning_rate": 5.361010830324909e-07, + "loss": 0.2163, + "step": 15720 + }, + { + "epoch": 14.069767441860465, + "grad_norm": 1.366902232170105, + "learning_rate": 5.358002406738869e-07, + "loss": 0.2179, + "step": 15730 + }, + { + "epoch": 14.078711985688729, + "grad_norm": 1.6594551801681519, + "learning_rate": 5.354993983152827e-07, + "loss": 0.2259, + "step": 15740 + }, + { + "epoch": 14.087656529516995, + "grad_norm": 1.7197548151016235, + "learning_rate": 5.351985559566786e-07, + "loss": 0.2286, + "step": 15750 + }, + { + "epoch": 14.09660107334526, + "grad_norm": 1.67202627658844, + "learning_rate": 5.348977135980745e-07, + "loss": 0.2189, + "step": 15760 + }, + { + "epoch": 14.105545617173524, + "grad_norm": 1.654624342918396, + "learning_rate": 5.345968712394705e-07, + "loss": 0.2223, + "step": 15770 + }, + { + "epoch": 14.11449016100179, + "grad_norm": 1.4540494680404663, + "learning_rate": 5.342960288808664e-07, + "loss": 0.2154, + "step": 15780 + }, + { + "epoch": 14.123434704830053, + "grad_norm": 1.5942274332046509, + "learning_rate": 5.339951865222623e-07, + "loss": 0.2292, + "step": 15790 + }, + { + "epoch": 14.132379248658319, + "grad_norm": 1.457651972770691, + "learning_rate": 5.336943441636582e-07, + "loss": 0.2127, + "step": 15800 + }, + { + "epoch": 14.141323792486583, + "grad_norm": 1.7262177467346191, + "learning_rate": 5.333935018050542e-07, + "loss": 0.2226, + "step": 15810 + }, + { + "epoch": 14.150268336314848, + "grad_norm": 1.687378168106079, + "learning_rate": 5.3309265944645e-07, + "loss": 0.2161, + "step": 15820 + }, + { + "epoch": 14.159212880143112, + "grad_norm": 1.799683928489685, + "learning_rate": 5.32791817087846e-07, + "loss": 0.2109, + "step": 15830 + }, + { + "epoch": 14.168157423971378, + "grad_norm": 1.6736897230148315, + "learning_rate": 5.324909747292419e-07, + "loss": 0.2216, + "step": 15840 + }, + { + "epoch": 14.177101967799642, + "grad_norm": 1.7621829509735107, + "learning_rate": 5.321901323706379e-07, + "loss": 0.2123, + "step": 15850 + }, + { + "epoch": 14.186046511627907, + "grad_norm": 1.7803364992141724, + "learning_rate": 5.318892900120336e-07, + "loss": 0.2205, + "step": 15860 + }, + { + "epoch": 14.194991055456171, + "grad_norm": 1.577661395072937, + "learning_rate": 5.315884476534296e-07, + "loss": 0.2099, + "step": 15870 + }, + { + "epoch": 14.203935599284437, + "grad_norm": 1.684728980064392, + "learning_rate": 5.312876052948255e-07, + "loss": 0.2161, + "step": 15880 + }, + { + "epoch": 14.2128801431127, + "grad_norm": 1.5282610654830933, + "learning_rate": 5.309867629362213e-07, + "loss": 0.2246, + "step": 15890 + }, + { + "epoch": 14.221824686940966, + "grad_norm": 1.5472359657287598, + "learning_rate": 5.306859205776173e-07, + "loss": 0.216, + "step": 15900 + }, + { + "epoch": 14.23076923076923, + "grad_norm": 1.4690499305725098, + "learning_rate": 5.303850782190132e-07, + "loss": 0.2126, + "step": 15910 + }, + { + "epoch": 14.239713774597496, + "grad_norm": 1.7932279109954834, + "learning_rate": 5.300842358604091e-07, + "loss": 0.2184, + "step": 15920 + }, + { + "epoch": 14.248658318425761, + "grad_norm": 1.4595445394515991, + "learning_rate": 5.29783393501805e-07, + "loss": 0.2213, + "step": 15930 + }, + { + "epoch": 14.257602862254025, + "grad_norm": 2.5874969959259033, + "learning_rate": 5.29482551143201e-07, + "loss": 0.2221, + "step": 15940 + }, + { + "epoch": 14.26654740608229, + "grad_norm": 1.4573915004730225, + "learning_rate": 5.291817087845969e-07, + "loss": 0.2122, + "step": 15950 + }, + { + "epoch": 14.275491949910554, + "grad_norm": 1.4587106704711914, + "learning_rate": 5.288808664259927e-07, + "loss": 0.2107, + "step": 15960 + }, + { + "epoch": 14.28443649373882, + "grad_norm": 2.2001304626464844, + "learning_rate": 5.285800240673886e-07, + "loss": 0.2255, + "step": 15970 + }, + { + "epoch": 14.293381037567084, + "grad_norm": 1.332115650177002, + "learning_rate": 5.282791817087846e-07, + "loss": 0.219, + "step": 15980 + }, + { + "epoch": 14.30232558139535, + "grad_norm": 1.5230737924575806, + "learning_rate": 5.279783393501804e-07, + "loss": 0.2204, + "step": 15990 + }, + { + "epoch": 14.311270125223613, + "grad_norm": 1.4917973279953003, + "learning_rate": 5.276774969915764e-07, + "loss": 0.2182, + "step": 16000 + }, + { + "epoch": 14.320214669051879, + "grad_norm": 1.415745496749878, + "learning_rate": 5.273766546329723e-07, + "loss": 0.2123, + "step": 16010 + }, + { + "epoch": 14.329159212880143, + "grad_norm": 1.3475311994552612, + "learning_rate": 5.270758122743683e-07, + "loss": 0.2265, + "step": 16020 + }, + { + "epoch": 14.338103756708408, + "grad_norm": 1.4397448301315308, + "learning_rate": 5.267749699157641e-07, + "loss": 0.2181, + "step": 16030 + }, + { + "epoch": 14.347048300536672, + "grad_norm": 1.4729828834533691, + "learning_rate": 5.264741275571601e-07, + "loss": 0.2118, + "step": 16040 + }, + { + "epoch": 14.355992844364938, + "grad_norm": 1.548867106437683, + "learning_rate": 5.26173285198556e-07, + "loss": 0.2201, + "step": 16050 + }, + { + "epoch": 14.364937388193201, + "grad_norm": 1.7978005409240723, + "learning_rate": 5.258724428399518e-07, + "loss": 0.216, + "step": 16060 + }, + { + "epoch": 14.373881932021467, + "grad_norm": 1.3418782949447632, + "learning_rate": 5.255716004813477e-07, + "loss": 0.2106, + "step": 16070 + }, + { + "epoch": 14.38282647584973, + "grad_norm": 1.7105077505111694, + "learning_rate": 5.252707581227436e-07, + "loss": 0.2216, + "step": 16080 + }, + { + "epoch": 14.391771019677996, + "grad_norm": 1.3112951517105103, + "learning_rate": 5.249699157641395e-07, + "loss": 0.2117, + "step": 16090 + }, + { + "epoch": 14.400715563506262, + "grad_norm": 1.4080175161361694, + "learning_rate": 5.246690734055354e-07, + "loss": 0.2246, + "step": 16100 + }, + { + "epoch": 14.409660107334526, + "grad_norm": 1.5949844121932983, + "learning_rate": 5.243682310469314e-07, + "loss": 0.2108, + "step": 16110 + }, + { + "epoch": 14.418604651162791, + "grad_norm": 1.4474152326583862, + "learning_rate": 5.240673886883273e-07, + "loss": 0.2204, + "step": 16120 + }, + { + "epoch": 14.427549194991055, + "grad_norm": 1.3992246389389038, + "learning_rate": 5.237665463297232e-07, + "loss": 0.2125, + "step": 16130 + }, + { + "epoch": 14.43649373881932, + "grad_norm": 1.6255208253860474, + "learning_rate": 5.234657039711191e-07, + "loss": 0.2149, + "step": 16140 + }, + { + "epoch": 14.445438282647585, + "grad_norm": 1.5684301853179932, + "learning_rate": 5.231648616125151e-07, + "loss": 0.2147, + "step": 16150 + }, + { + "epoch": 14.45438282647585, + "grad_norm": 1.4556910991668701, + "learning_rate": 5.228640192539109e-07, + "loss": 0.2263, + "step": 16160 + }, + { + "epoch": 14.463327370304114, + "grad_norm": 1.5484651327133179, + "learning_rate": 5.225631768953069e-07, + "loss": 0.2193, + "step": 16170 + }, + { + "epoch": 14.47227191413238, + "grad_norm": 1.8516258001327515, + "learning_rate": 5.222623345367027e-07, + "loss": 0.2235, + "step": 16180 + }, + { + "epoch": 14.481216457960643, + "grad_norm": 1.7158583402633667, + "learning_rate": 5.219614921780987e-07, + "loss": 0.22, + "step": 16190 + }, + { + "epoch": 14.490161001788909, + "grad_norm": 1.575059413909912, + "learning_rate": 5.216606498194945e-07, + "loss": 0.2194, + "step": 16200 + }, + { + "epoch": 14.499105545617173, + "grad_norm": 1.519271731376648, + "learning_rate": 5.213598074608905e-07, + "loss": 0.2137, + "step": 16210 + }, + { + "epoch": 14.508050089445439, + "grad_norm": 1.4205260276794434, + "learning_rate": 5.210589651022864e-07, + "loss": 0.2228, + "step": 16220 + }, + { + "epoch": 14.516994633273702, + "grad_norm": 1.2977843284606934, + "learning_rate": 5.207581227436822e-07, + "loss": 0.2162, + "step": 16230 + }, + { + "epoch": 14.525939177101968, + "grad_norm": 1.5938493013381958, + "learning_rate": 5.204572803850782e-07, + "loss": 0.2092, + "step": 16240 + }, + { + "epoch": 14.534883720930232, + "grad_norm": 1.624735713005066, + "learning_rate": 5.201564380264741e-07, + "loss": 0.2267, + "step": 16250 + }, + { + "epoch": 14.543828264758497, + "grad_norm": 1.8016725778579712, + "learning_rate": 5.1985559566787e-07, + "loss": 0.2169, + "step": 16260 + }, + { + "epoch": 14.552772808586763, + "grad_norm": 1.4361350536346436, + "learning_rate": 5.195547533092659e-07, + "loss": 0.2109, + "step": 16270 + }, + { + "epoch": 14.561717352415027, + "grad_norm": 1.4204323291778564, + "learning_rate": 5.192539109506619e-07, + "loss": 0.2088, + "step": 16280 + }, + { + "epoch": 14.570661896243292, + "grad_norm": 1.6258000135421753, + "learning_rate": 5.189530685920577e-07, + "loss": 0.2079, + "step": 16290 + }, + { + "epoch": 14.579606440071556, + "grad_norm": 1.4437638521194458, + "learning_rate": 5.186522262334536e-07, + "loss": 0.221, + "step": 16300 + }, + { + "epoch": 14.588550983899822, + "grad_norm": 1.4674285650253296, + "learning_rate": 5.183513838748495e-07, + "loss": 0.2204, + "step": 16310 + }, + { + "epoch": 14.597495527728086, + "grad_norm": 1.5488989353179932, + "learning_rate": 5.180505415162455e-07, + "loss": 0.2269, + "step": 16320 + }, + { + "epoch": 14.606440071556351, + "grad_norm": 1.4164832830429077, + "learning_rate": 5.177496991576413e-07, + "loss": 0.2116, + "step": 16330 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 1.3262627124786377, + "learning_rate": 5.174488567990373e-07, + "loss": 0.2127, + "step": 16340 + }, + { + "epoch": 14.62432915921288, + "grad_norm": 1.4488004446029663, + "learning_rate": 5.171480144404332e-07, + "loss": 0.2246, + "step": 16350 + }, + { + "epoch": 14.633273703041144, + "grad_norm": 1.6791784763336182, + "learning_rate": 5.168471720818292e-07, + "loss": 0.2244, + "step": 16360 + }, + { + "epoch": 14.64221824686941, + "grad_norm": 1.7339402437210083, + "learning_rate": 5.16546329723225e-07, + "loss": 0.2066, + "step": 16370 + }, + { + "epoch": 14.651162790697674, + "grad_norm": 1.5072243213653564, + "learning_rate": 5.16245487364621e-07, + "loss": 0.2042, + "step": 16380 + }, + { + "epoch": 14.66010733452594, + "grad_norm": 1.5121406316757202, + "learning_rate": 5.159446450060169e-07, + "loss": 0.2077, + "step": 16390 + }, + { + "epoch": 14.669051878354203, + "grad_norm": 1.6104305982589722, + "learning_rate": 5.156438026474126e-07, + "loss": 0.216, + "step": 16400 + }, + { + "epoch": 14.677996422182469, + "grad_norm": 1.7308858633041382, + "learning_rate": 5.153429602888086e-07, + "loss": 0.2214, + "step": 16410 + }, + { + "epoch": 14.686940966010733, + "grad_norm": 1.368651270866394, + "learning_rate": 5.150421179302045e-07, + "loss": 0.2125, + "step": 16420 + }, + { + "epoch": 14.695885509838998, + "grad_norm": 1.5493015050888062, + "learning_rate": 5.147412755716004e-07, + "loss": 0.2132, + "step": 16430 + }, + { + "epoch": 14.704830053667262, + "grad_norm": 1.538008451461792, + "learning_rate": 5.144404332129963e-07, + "loss": 0.2087, + "step": 16440 + }, + { + "epoch": 14.713774597495528, + "grad_norm": 1.774583101272583, + "learning_rate": 5.141395908543923e-07, + "loss": 0.2137, + "step": 16450 + }, + { + "epoch": 14.722719141323793, + "grad_norm": 1.5929735898971558, + "learning_rate": 5.138387484957882e-07, + "loss": 0.2266, + "step": 16460 + }, + { + "epoch": 14.731663685152057, + "grad_norm": 1.4200128316879272, + "learning_rate": 5.135379061371841e-07, + "loss": 0.2239, + "step": 16470 + }, + { + "epoch": 14.740608228980323, + "grad_norm": 2.4553580284118652, + "learning_rate": 5.1323706377858e-07, + "loss": 0.2186, + "step": 16480 + }, + { + "epoch": 14.749552772808586, + "grad_norm": 1.5825417041778564, + "learning_rate": 5.12936221419976e-07, + "loss": 0.2183, + "step": 16490 + }, + { + "epoch": 14.758497316636852, + "grad_norm": 2.1846418380737305, + "learning_rate": 5.126353790613718e-07, + "loss": 0.2109, + "step": 16500 + }, + { + "epoch": 14.767441860465116, + "grad_norm": 1.337199091911316, + "learning_rate": 5.123345367027678e-07, + "loss": 0.2029, + "step": 16510 + }, + { + "epoch": 14.776386404293381, + "grad_norm": 1.4627991914749146, + "learning_rate": 5.120336943441636e-07, + "loss": 0.213, + "step": 16520 + }, + { + "epoch": 14.785330948121645, + "grad_norm": 1.2725661993026733, + "learning_rate": 5.117328519855595e-07, + "loss": 0.2104, + "step": 16530 + }, + { + "epoch": 14.79427549194991, + "grad_norm": 1.5113191604614258, + "learning_rate": 5.114320096269554e-07, + "loss": 0.2164, + "step": 16540 + }, + { + "epoch": 14.803220035778175, + "grad_norm": 1.4444341659545898, + "learning_rate": 5.111311672683514e-07, + "loss": 0.218, + "step": 16550 + }, + { + "epoch": 14.81216457960644, + "grad_norm": 1.361047625541687, + "learning_rate": 5.108303249097473e-07, + "loss": 0.2149, + "step": 16560 + }, + { + "epoch": 14.821109123434704, + "grad_norm": 1.663375973701477, + "learning_rate": 5.105294825511431e-07, + "loss": 0.2115, + "step": 16570 + }, + { + "epoch": 14.83005366726297, + "grad_norm": 1.6099789142608643, + "learning_rate": 5.102286401925391e-07, + "loss": 0.2151, + "step": 16580 + }, + { + "epoch": 14.838998211091234, + "grad_norm": 1.769354224205017, + "learning_rate": 5.09927797833935e-07, + "loss": 0.2201, + "step": 16590 + }, + { + "epoch": 14.847942754919499, + "grad_norm": 1.6463778018951416, + "learning_rate": 5.096269554753309e-07, + "loss": 0.2144, + "step": 16600 + }, + { + "epoch": 14.856887298747765, + "grad_norm": 1.7376254796981812, + "learning_rate": 5.093261131167268e-07, + "loss": 0.2196, + "step": 16610 + }, + { + "epoch": 14.865831842576029, + "grad_norm": 2.0024073123931885, + "learning_rate": 5.090252707581228e-07, + "loss": 0.2124, + "step": 16620 + }, + { + "epoch": 14.874776386404294, + "grad_norm": 1.7807821035385132, + "learning_rate": 5.087244283995186e-07, + "loss": 0.2132, + "step": 16630 + }, + { + "epoch": 14.883720930232558, + "grad_norm": 1.5845977067947388, + "learning_rate": 5.084235860409145e-07, + "loss": 0.224, + "step": 16640 + }, + { + "epoch": 14.892665474060824, + "grad_norm": 1.4428369998931885, + "learning_rate": 5.081227436823104e-07, + "loss": 0.2047, + "step": 16650 + }, + { + "epoch": 14.901610017889087, + "grad_norm": 1.8869410753250122, + "learning_rate": 5.078219013237064e-07, + "loss": 0.2116, + "step": 16660 + }, + { + "epoch": 14.910554561717353, + "grad_norm": 1.3606252670288086, + "learning_rate": 5.075210589651022e-07, + "loss": 0.2137, + "step": 16670 + }, + { + "epoch": 14.919499105545617, + "grad_norm": 1.5670171976089478, + "learning_rate": 5.072202166064982e-07, + "loss": 0.205, + "step": 16680 + }, + { + "epoch": 14.928443649373882, + "grad_norm": 2.256553888320923, + "learning_rate": 5.069193742478941e-07, + "loss": 0.2021, + "step": 16690 + }, + { + "epoch": 14.937388193202146, + "grad_norm": 1.7542644739151, + "learning_rate": 5.0661853188929e-07, + "loss": 0.2003, + "step": 16700 + }, + { + "epoch": 14.946332737030412, + "grad_norm": 1.4539682865142822, + "learning_rate": 5.063176895306859e-07, + "loss": 0.2116, + "step": 16710 + }, + { + "epoch": 14.955277280858676, + "grad_norm": 1.3423739671707153, + "learning_rate": 5.060168471720819e-07, + "loss": 0.2261, + "step": 16720 + }, + { + "epoch": 14.964221824686941, + "grad_norm": 1.2797144651412964, + "learning_rate": 5.057160048134778e-07, + "loss": 0.2113, + "step": 16730 + }, + { + "epoch": 14.973166368515205, + "grad_norm": 1.4044950008392334, + "learning_rate": 5.054151624548736e-07, + "loss": 0.2081, + "step": 16740 + }, + { + "epoch": 14.98211091234347, + "grad_norm": 1.540293574333191, + "learning_rate": 5.051143200962695e-07, + "loss": 0.212, + "step": 16750 + }, + { + "epoch": 14.991055456171736, + "grad_norm": 1.5579257011413574, + "learning_rate": 5.048134777376654e-07, + "loss": 0.2068, + "step": 16760 + }, + { + "epoch": 15.0, + "grad_norm": 2.3970136642456055, + "learning_rate": 5.045126353790613e-07, + "loss": 0.2078, + "step": 16770 + }, + { + "epoch": 15.0, + "eval_bleu": 76.2353, + "eval_gen_len": 74.7092, + "eval_loss": 0.1641465276479721, + "eval_runtime": 55.922, + "eval_samples_per_second": 18.633, + "eval_steps_per_second": 0.197, + "step": 16770 + }, + { + "epoch": 15.008944543828266, + "grad_norm": 1.358554720878601, + "learning_rate": 5.042117930204572e-07, + "loss": 0.208, + "step": 16780 + }, + { + "epoch": 15.01788908765653, + "grad_norm": 1.2903168201446533, + "learning_rate": 5.039109506618532e-07, + "loss": 0.208, + "step": 16790 + }, + { + "epoch": 15.026833631484795, + "grad_norm": 1.5758551359176636, + "learning_rate": 5.036101083032491e-07, + "loss": 0.209, + "step": 16800 + }, + { + "epoch": 15.035778175313059, + "grad_norm": 1.582900047302246, + "learning_rate": 5.03309265944645e-07, + "loss": 0.2205, + "step": 16810 + }, + { + "epoch": 15.044722719141324, + "grad_norm": 1.4756799936294556, + "learning_rate": 5.030084235860409e-07, + "loss": 0.2061, + "step": 16820 + }, + { + "epoch": 15.053667262969588, + "grad_norm": 1.690066933631897, + "learning_rate": 5.027075812274369e-07, + "loss": 0.2104, + "step": 16830 + }, + { + "epoch": 15.062611806797854, + "grad_norm": 1.670901894569397, + "learning_rate": 5.024067388688326e-07, + "loss": 0.2067, + "step": 16840 + }, + { + "epoch": 15.071556350626118, + "grad_norm": 1.7854502201080322, + "learning_rate": 5.021058965102286e-07, + "loss": 0.2014, + "step": 16850 + }, + { + "epoch": 15.080500894454383, + "grad_norm": 1.5761840343475342, + "learning_rate": 5.018050541516245e-07, + "loss": 0.2229, + "step": 16860 + }, + { + "epoch": 15.089445438282647, + "grad_norm": 1.319814920425415, + "learning_rate": 5.015042117930204e-07, + "loss": 0.2138, + "step": 16870 + }, + { + "epoch": 15.098389982110913, + "grad_norm": 1.3874000310897827, + "learning_rate": 5.012033694344163e-07, + "loss": 0.2049, + "step": 16880 + }, + { + "epoch": 15.107334525939176, + "grad_norm": 1.3627005815505981, + "learning_rate": 5.009025270758123e-07, + "loss": 0.2059, + "step": 16890 + }, + { + "epoch": 15.116279069767442, + "grad_norm": 1.7481482028961182, + "learning_rate": 5.006016847172082e-07, + "loss": 0.2112, + "step": 16900 + }, + { + "epoch": 15.125223613595706, + "grad_norm": 1.55675208568573, + "learning_rate": 5.003008423586041e-07, + "loss": 0.2068, + "step": 16910 + }, + { + "epoch": 15.134168157423971, + "grad_norm": 1.743762731552124, + "learning_rate": 5e-07, + "loss": 0.211, + "step": 16920 + }, + { + "epoch": 15.143112701252235, + "grad_norm": 1.395524501800537, + "learning_rate": 4.996991576413959e-07, + "loss": 0.2188, + "step": 16930 + }, + { + "epoch": 15.152057245080501, + "grad_norm": 1.6198045015335083, + "learning_rate": 4.993983152827918e-07, + "loss": 0.2048, + "step": 16940 + }, + { + "epoch": 15.161001788908766, + "grad_norm": 1.6451237201690674, + "learning_rate": 4.990974729241876e-07, + "loss": 0.2193, + "step": 16950 + }, + { + "epoch": 15.16994633273703, + "grad_norm": 2.0471298694610596, + "learning_rate": 4.987966305655836e-07, + "loss": 0.2117, + "step": 16960 + }, + { + "epoch": 15.178890876565296, + "grad_norm": 1.4981588125228882, + "learning_rate": 4.984957882069795e-07, + "loss": 0.2121, + "step": 16970 + }, + { + "epoch": 15.18783542039356, + "grad_norm": 1.6169943809509277, + "learning_rate": 4.981949458483754e-07, + "loss": 0.2198, + "step": 16980 + }, + { + "epoch": 15.196779964221825, + "grad_norm": 1.5530139207839966, + "learning_rate": 4.978941034897713e-07, + "loss": 0.2116, + "step": 16990 + }, + { + "epoch": 15.20572450805009, + "grad_norm": 1.7657912969589233, + "learning_rate": 4.975932611311672e-07, + "loss": 0.2164, + "step": 17000 + }, + { + "epoch": 15.214669051878355, + "grad_norm": 1.5685510635375977, + "learning_rate": 4.972924187725632e-07, + "loss": 0.2272, + "step": 17010 + }, + { + "epoch": 15.223613595706619, + "grad_norm": 1.2543871402740479, + "learning_rate": 4.969915764139591e-07, + "loss": 0.2058, + "step": 17020 + }, + { + "epoch": 15.232558139534884, + "grad_norm": 1.641847848892212, + "learning_rate": 4.96690734055355e-07, + "loss": 0.2097, + "step": 17030 + }, + { + "epoch": 15.241502683363148, + "grad_norm": 2.023759365081787, + "learning_rate": 4.963898916967509e-07, + "loss": 0.2152, + "step": 17040 + }, + { + "epoch": 15.250447227191414, + "grad_norm": 1.667617917060852, + "learning_rate": 4.960890493381468e-07, + "loss": 0.2129, + "step": 17050 + }, + { + "epoch": 15.259391771019677, + "grad_norm": 3.800983428955078, + "learning_rate": 4.957882069795428e-07, + "loss": 0.2103, + "step": 17060 + }, + { + "epoch": 15.268336314847943, + "grad_norm": 1.4913359880447388, + "learning_rate": 4.954873646209386e-07, + "loss": 0.2056, + "step": 17070 + }, + { + "epoch": 15.277280858676207, + "grad_norm": 1.730721116065979, + "learning_rate": 4.951865222623345e-07, + "loss": 0.2079, + "step": 17080 + }, + { + "epoch": 15.286225402504472, + "grad_norm": 1.3525733947753906, + "learning_rate": 4.948856799037304e-07, + "loss": 0.2154, + "step": 17090 + }, + { + "epoch": 15.295169946332736, + "grad_norm": 1.395639181137085, + "learning_rate": 4.945848375451263e-07, + "loss": 0.2089, + "step": 17100 + }, + { + "epoch": 15.304114490161002, + "grad_norm": 1.5808838605880737, + "learning_rate": 4.942839951865222e-07, + "loss": 0.2113, + "step": 17110 + }, + { + "epoch": 15.313059033989267, + "grad_norm": 1.6018389463424683, + "learning_rate": 4.939831528279181e-07, + "loss": 0.2196, + "step": 17120 + }, + { + "epoch": 15.322003577817531, + "grad_norm": 1.5155175924301147, + "learning_rate": 4.936823104693141e-07, + "loss": 0.2124, + "step": 17130 + }, + { + "epoch": 15.330948121645797, + "grad_norm": 1.5393834114074707, + "learning_rate": 4.9338146811071e-07, + "loss": 0.2149, + "step": 17140 + }, + { + "epoch": 15.33989266547406, + "grad_norm": 1.7581160068511963, + "learning_rate": 4.930806257521059e-07, + "loss": 0.2066, + "step": 17150 + }, + { + "epoch": 15.348837209302326, + "grad_norm": 1.5426652431488037, + "learning_rate": 4.927797833935018e-07, + "loss": 0.2133, + "step": 17160 + }, + { + "epoch": 15.35778175313059, + "grad_norm": 1.506504774093628, + "learning_rate": 4.924789410348976e-07, + "loss": 0.2201, + "step": 17170 + }, + { + "epoch": 15.366726296958856, + "grad_norm": 1.3995765447616577, + "learning_rate": 4.921780986762936e-07, + "loss": 0.2069, + "step": 17180 + }, + { + "epoch": 15.37567084078712, + "grad_norm": 1.7154018878936768, + "learning_rate": 4.918772563176895e-07, + "loss": 0.2123, + "step": 17190 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 1.425632119178772, + "learning_rate": 4.915764139590854e-07, + "loss": 0.2014, + "step": 17200 + }, + { + "epoch": 15.393559928443649, + "grad_norm": 1.711829423904419, + "learning_rate": 4.912755716004813e-07, + "loss": 0.2097, + "step": 17210 + }, + { + "epoch": 15.402504472271914, + "grad_norm": 1.620375394821167, + "learning_rate": 4.909747292418772e-07, + "loss": 0.2028, + "step": 17220 + }, + { + "epoch": 15.411449016100178, + "grad_norm": 1.46085786819458, + "learning_rate": 4.906738868832732e-07, + "loss": 0.2123, + "step": 17230 + }, + { + "epoch": 15.420393559928444, + "grad_norm": 1.2808018922805786, + "learning_rate": 4.903730445246691e-07, + "loss": 0.2187, + "step": 17240 + }, + { + "epoch": 15.429338103756708, + "grad_norm": 1.4686278104782104, + "learning_rate": 4.90072202166065e-07, + "loss": 0.205, + "step": 17250 + }, + { + "epoch": 15.438282647584973, + "grad_norm": 1.3774062395095825, + "learning_rate": 4.897713598074609e-07, + "loss": 0.2052, + "step": 17260 + }, + { + "epoch": 15.447227191413237, + "grad_norm": 1.7220516204833984, + "learning_rate": 4.894705174488568e-07, + "loss": 0.2088, + "step": 17270 + }, + { + "epoch": 15.456171735241503, + "grad_norm": 1.4161098003387451, + "learning_rate": 4.891696750902526e-07, + "loss": 0.2077, + "step": 17280 + }, + { + "epoch": 15.465116279069768, + "grad_norm": 1.5691733360290527, + "learning_rate": 4.888688327316485e-07, + "loss": 0.2059, + "step": 17290 + }, + { + "epoch": 15.474060822898032, + "grad_norm": 1.6341298818588257, + "learning_rate": 4.885679903730445e-07, + "loss": 0.2114, + "step": 17300 + }, + { + "epoch": 15.483005366726298, + "grad_norm": 1.7802695035934448, + "learning_rate": 4.882671480144404e-07, + "loss": 0.2071, + "step": 17310 + }, + { + "epoch": 15.491949910554561, + "grad_norm": 1.389773964881897, + "learning_rate": 4.879663056558363e-07, + "loss": 0.2093, + "step": 17320 + }, + { + "epoch": 15.500894454382827, + "grad_norm": 1.3430007696151733, + "learning_rate": 4.876654632972322e-07, + "loss": 0.2018, + "step": 17330 + }, + { + "epoch": 15.509838998211091, + "grad_norm": 1.5305284261703491, + "learning_rate": 4.873646209386281e-07, + "loss": 0.2018, + "step": 17340 + }, + { + "epoch": 15.518783542039357, + "grad_norm": 1.3044167757034302, + "learning_rate": 4.870637785800241e-07, + "loss": 0.2153, + "step": 17350 + }, + { + "epoch": 15.52772808586762, + "grad_norm": 1.5357669591903687, + "learning_rate": 4.8676293622142e-07, + "loss": 0.2068, + "step": 17360 + }, + { + "epoch": 15.536672629695886, + "grad_norm": 1.3363947868347168, + "learning_rate": 4.864620938628159e-07, + "loss": 0.1996, + "step": 17370 + }, + { + "epoch": 15.54561717352415, + "grad_norm": 1.579582929611206, + "learning_rate": 4.861612515042118e-07, + "loss": 0.2071, + "step": 17380 + }, + { + "epoch": 15.554561717352415, + "grad_norm": 1.605961799621582, + "learning_rate": 4.858604091456076e-07, + "loss": 0.2046, + "step": 17390 + }, + { + "epoch": 15.56350626118068, + "grad_norm": 1.5769977569580078, + "learning_rate": 4.855595667870036e-07, + "loss": 0.2158, + "step": 17400 + }, + { + "epoch": 15.572450805008945, + "grad_norm": 1.3805499076843262, + "learning_rate": 4.852587244283995e-07, + "loss": 0.2031, + "step": 17410 + }, + { + "epoch": 15.581395348837209, + "grad_norm": 1.209307312965393, + "learning_rate": 4.849578820697954e-07, + "loss": 0.2177, + "step": 17420 + }, + { + "epoch": 15.590339892665474, + "grad_norm": 1.6182724237442017, + "learning_rate": 4.846570397111913e-07, + "loss": 0.2149, + "step": 17430 + }, + { + "epoch": 15.59928443649374, + "grad_norm": 1.6751196384429932, + "learning_rate": 4.843561973525872e-07, + "loss": 0.206, + "step": 17440 + }, + { + "epoch": 15.608228980322004, + "grad_norm": 1.4189718961715698, + "learning_rate": 4.840553549939831e-07, + "loss": 0.21, + "step": 17450 + }, + { + "epoch": 15.61717352415027, + "grad_norm": 1.3712016344070435, + "learning_rate": 4.83754512635379e-07, + "loss": 0.2078, + "step": 17460 + }, + { + "epoch": 15.626118067978533, + "grad_norm": 1.6174589395523071, + "learning_rate": 4.83453670276775e-07, + "loss": 0.2038, + "step": 17470 + }, + { + "epoch": 15.635062611806799, + "grad_norm": 1.3036216497421265, + "learning_rate": 4.831528279181709e-07, + "loss": 0.2121, + "step": 17480 + }, + { + "epoch": 15.644007155635062, + "grad_norm": 1.3612110614776611, + "learning_rate": 4.828519855595668e-07, + "loss": 0.2214, + "step": 17490 + }, + { + "epoch": 15.652951699463328, + "grad_norm": 1.6744191646575928, + "learning_rate": 4.825511432009626e-07, + "loss": 0.2136, + "step": 17500 + }, + { + "epoch": 15.661896243291592, + "grad_norm": 1.761144757270813, + "learning_rate": 4.822503008423585e-07, + "loss": 0.2136, + "step": 17510 + }, + { + "epoch": 15.670840787119857, + "grad_norm": 1.4766287803649902, + "learning_rate": 4.819494584837545e-07, + "loss": 0.2097, + "step": 17520 + }, + { + "epoch": 15.679785330948121, + "grad_norm": 1.473301887512207, + "learning_rate": 4.816486161251504e-07, + "loss": 0.2055, + "step": 17530 + }, + { + "epoch": 15.688729874776387, + "grad_norm": 1.5784001350402832, + "learning_rate": 4.813477737665463e-07, + "loss": 0.215, + "step": 17540 + }, + { + "epoch": 15.69767441860465, + "grad_norm": 1.6430200338363647, + "learning_rate": 4.810469314079422e-07, + "loss": 0.1925, + "step": 17550 + }, + { + "epoch": 15.706618962432916, + "grad_norm": 1.4598944187164307, + "learning_rate": 4.807460890493381e-07, + "loss": 0.2028, + "step": 17560 + }, + { + "epoch": 15.71556350626118, + "grad_norm": 1.5248929262161255, + "learning_rate": 4.804452466907341e-07, + "loss": 0.2104, + "step": 17570 + }, + { + "epoch": 15.724508050089446, + "grad_norm": 1.4362632036209106, + "learning_rate": 4.8014440433213e-07, + "loss": 0.2094, + "step": 17580 + }, + { + "epoch": 15.73345259391771, + "grad_norm": 1.6463420391082764, + "learning_rate": 4.798435619735259e-07, + "loss": 0.2037, + "step": 17590 + }, + { + "epoch": 15.742397137745975, + "grad_norm": 1.5646806955337524, + "learning_rate": 4.795427196149218e-07, + "loss": 0.2107, + "step": 17600 + }, + { + "epoch": 15.751341681574239, + "grad_norm": 1.5375139713287354, + "learning_rate": 4.792418772563177e-07, + "loss": 0.2101, + "step": 17610 + }, + { + "epoch": 15.760286225402504, + "grad_norm": 1.3467087745666504, + "learning_rate": 4.789410348977135e-07, + "loss": 0.2035, + "step": 17620 + }, + { + "epoch": 15.76923076923077, + "grad_norm": 1.3728797435760498, + "learning_rate": 4.786401925391094e-07, + "loss": 0.2005, + "step": 17630 + }, + { + "epoch": 15.778175313059034, + "grad_norm": 1.9395620822906494, + "learning_rate": 4.783393501805054e-07, + "loss": 0.2093, + "step": 17640 + }, + { + "epoch": 15.7871198568873, + "grad_norm": 1.927126407623291, + "learning_rate": 4.780385078219013e-07, + "loss": 0.1979, + "step": 17650 + }, + { + "epoch": 15.796064400715563, + "grad_norm": 1.429348349571228, + "learning_rate": 4.777376654632972e-07, + "loss": 0.212, + "step": 17660 + }, + { + "epoch": 15.805008944543829, + "grad_norm": 1.7249642610549927, + "learning_rate": 4.774368231046931e-07, + "loss": 0.2155, + "step": 17670 + }, + { + "epoch": 15.813953488372093, + "grad_norm": 1.4198932647705078, + "learning_rate": 4.77135980746089e-07, + "loss": 0.2043, + "step": 17680 + }, + { + "epoch": 15.822898032200358, + "grad_norm": 1.4026259183883667, + "learning_rate": 4.76835138387485e-07, + "loss": 0.2094, + "step": 17690 + }, + { + "epoch": 15.831842576028622, + "grad_norm": 1.456063151359558, + "learning_rate": 4.765342960288808e-07, + "loss": 0.2019, + "step": 17700 + }, + { + "epoch": 15.840787119856888, + "grad_norm": 1.901902437210083, + "learning_rate": 4.762334536702767e-07, + "loss": 0.2112, + "step": 17710 + }, + { + "epoch": 15.849731663685152, + "grad_norm": 1.650774359703064, + "learning_rate": 4.7593261131167265e-07, + "loss": 0.2015, + "step": 17720 + }, + { + "epoch": 15.858676207513417, + "grad_norm": 1.569238543510437, + "learning_rate": 4.7563176895306854e-07, + "loss": 0.2073, + "step": 17730 + }, + { + "epoch": 15.867620751341681, + "grad_norm": 1.3177183866500854, + "learning_rate": 4.753309265944645e-07, + "loss": 0.2041, + "step": 17740 + }, + { + "epoch": 15.876565295169947, + "grad_norm": 1.7612563371658325, + "learning_rate": 4.750300842358604e-07, + "loss": 0.2172, + "step": 17750 + }, + { + "epoch": 15.88550983899821, + "grad_norm": 1.9814246892929077, + "learning_rate": 4.7472924187725626e-07, + "loss": 0.2067, + "step": 17760 + }, + { + "epoch": 15.894454382826476, + "grad_norm": 1.4732369184494019, + "learning_rate": 4.744283995186522e-07, + "loss": 0.2022, + "step": 17770 + }, + { + "epoch": 15.903398926654742, + "grad_norm": 1.6204142570495605, + "learning_rate": 4.741275571600481e-07, + "loss": 0.1983, + "step": 17780 + }, + { + "epoch": 15.912343470483005, + "grad_norm": 1.36124849319458, + "learning_rate": 4.7382671480144404e-07, + "loss": 0.1996, + "step": 17790 + }, + { + "epoch": 15.921288014311271, + "grad_norm": 1.7553470134735107, + "learning_rate": 4.7352587244283993e-07, + "loss": 0.2141, + "step": 17800 + }, + { + "epoch": 15.930232558139535, + "grad_norm": 1.593579888343811, + "learning_rate": 4.7322503008423587e-07, + "loss": 0.204, + "step": 17810 + }, + { + "epoch": 15.9391771019678, + "grad_norm": 1.376990556716919, + "learning_rate": 4.7292418772563176e-07, + "loss": 0.2125, + "step": 17820 + }, + { + "epoch": 15.948121645796064, + "grad_norm": 1.6332093477249146, + "learning_rate": 4.7262334536702765e-07, + "loss": 0.2081, + "step": 17830 + }, + { + "epoch": 15.95706618962433, + "grad_norm": 1.4526787996292114, + "learning_rate": 4.723225030084236e-07, + "loss": 0.2089, + "step": 17840 + }, + { + "epoch": 15.966010733452594, + "grad_norm": 1.6975692510604858, + "learning_rate": 4.720216606498195e-07, + "loss": 0.2032, + "step": 17850 + }, + { + "epoch": 15.97495527728086, + "grad_norm": 1.3301687240600586, + "learning_rate": 4.7172081829121543e-07, + "loss": 0.2068, + "step": 17860 + }, + { + "epoch": 15.983899821109123, + "grad_norm": 1.5423060655593872, + "learning_rate": 4.714199759326113e-07, + "loss": 0.2013, + "step": 17870 + }, + { + "epoch": 15.992844364937389, + "grad_norm": 1.4738348722457886, + "learning_rate": 4.7111913357400715e-07, + "loss": 0.2048, + "step": 17880 + }, + { + "epoch": 16.0, + "eval_bleu": 76.7381, + "eval_gen_len": 74.7274, + "eval_loss": 0.15930308401584625, + "eval_runtime": 58.2522, + "eval_samples_per_second": 17.888, + "eval_steps_per_second": 0.189, + "step": 17888 + }, + { + "epoch": 16.001788908765654, + "grad_norm": 1.6148046255111694, + "learning_rate": 4.708182912154031e-07, + "loss": 0.2143, + "step": 17890 + }, + { + "epoch": 16.010733452593918, + "grad_norm": 1.4588137865066528, + "learning_rate": 4.70517448856799e-07, + "loss": 0.2069, + "step": 17900 + }, + { + "epoch": 16.019677996422182, + "grad_norm": 1.6100811958312988, + "learning_rate": 4.7021660649819493e-07, + "loss": 0.2009, + "step": 17910 + }, + { + "epoch": 16.028622540250446, + "grad_norm": 1.336479902267456, + "learning_rate": 4.699157641395908e-07, + "loss": 0.2167, + "step": 17920 + }, + { + "epoch": 16.037567084078713, + "grad_norm": 1.51796293258667, + "learning_rate": 4.696149217809867e-07, + "loss": 0.2081, + "step": 17930 + }, + { + "epoch": 16.046511627906977, + "grad_norm": 1.6503642797470093, + "learning_rate": 4.6931407942238265e-07, + "loss": 0.1997, + "step": 17940 + }, + { + "epoch": 16.05545617173524, + "grad_norm": 1.253343105316162, + "learning_rate": 4.6901323706377854e-07, + "loss": 0.201, + "step": 17950 + }, + { + "epoch": 16.064400715563508, + "grad_norm": 1.4972025156021118, + "learning_rate": 4.687123947051745e-07, + "loss": 0.2015, + "step": 17960 + }, + { + "epoch": 16.073345259391772, + "grad_norm": 1.5771032571792603, + "learning_rate": 4.684115523465704e-07, + "loss": 0.1969, + "step": 17970 + }, + { + "epoch": 16.082289803220036, + "grad_norm": 1.274448275566101, + "learning_rate": 4.6811070998796626e-07, + "loss": 0.2054, + "step": 17980 + }, + { + "epoch": 16.0912343470483, + "grad_norm": 1.4598562717437744, + "learning_rate": 4.678098676293622e-07, + "loss": 0.2019, + "step": 17990 + }, + { + "epoch": 16.100178890876567, + "grad_norm": 1.2179709672927856, + "learning_rate": 4.675090252707581e-07, + "loss": 0.2069, + "step": 18000 + }, + { + "epoch": 16.10912343470483, + "grad_norm": 1.5141541957855225, + "learning_rate": 4.6720818291215404e-07, + "loss": 0.2114, + "step": 18010 + }, + { + "epoch": 16.118067978533094, + "grad_norm": 1.5173187255859375, + "learning_rate": 4.6690734055354993e-07, + "loss": 0.2063, + "step": 18020 + }, + { + "epoch": 16.12701252236136, + "grad_norm": 1.3374969959259033, + "learning_rate": 4.6660649819494587e-07, + "loss": 0.2056, + "step": 18030 + }, + { + "epoch": 16.135957066189626, + "grad_norm": 1.6040619611740112, + "learning_rate": 4.6630565583634176e-07, + "loss": 0.2061, + "step": 18040 + }, + { + "epoch": 16.14490161001789, + "grad_norm": 1.3951270580291748, + "learning_rate": 4.660048134777376e-07, + "loss": 0.1981, + "step": 18050 + }, + { + "epoch": 16.153846153846153, + "grad_norm": 1.493987798690796, + "learning_rate": 4.6570397111913354e-07, + "loss": 0.2026, + "step": 18060 + }, + { + "epoch": 16.162790697674417, + "grad_norm": 1.9750702381134033, + "learning_rate": 4.6540312876052943e-07, + "loss": 0.2035, + "step": 18070 + }, + { + "epoch": 16.171735241502684, + "grad_norm": 1.3505460023880005, + "learning_rate": 4.651022864019254e-07, + "loss": 0.2043, + "step": 18080 + }, + { + "epoch": 16.18067978533095, + "grad_norm": 1.8226122856140137, + "learning_rate": 4.6480144404332127e-07, + "loss": 0.2089, + "step": 18090 + }, + { + "epoch": 16.189624329159212, + "grad_norm": 1.454171895980835, + "learning_rate": 4.6450060168471716e-07, + "loss": 0.1979, + "step": 18100 + }, + { + "epoch": 16.198568872987476, + "grad_norm": 1.459355115890503, + "learning_rate": 4.641997593261131e-07, + "loss": 0.2078, + "step": 18110 + }, + { + "epoch": 16.207513416815743, + "grad_norm": 1.3417216539382935, + "learning_rate": 4.63898916967509e-07, + "loss": 0.2161, + "step": 18120 + }, + { + "epoch": 16.216457960644007, + "grad_norm": 1.5160000324249268, + "learning_rate": 4.6359807460890493e-07, + "loss": 0.2021, + "step": 18130 + }, + { + "epoch": 16.22540250447227, + "grad_norm": 1.6167906522750854, + "learning_rate": 4.632972322503008e-07, + "loss": 0.2005, + "step": 18140 + }, + { + "epoch": 16.23434704830054, + "grad_norm": 1.8305431604385376, + "learning_rate": 4.629963898916967e-07, + "loss": 0.2182, + "step": 18150 + }, + { + "epoch": 16.243291592128802, + "grad_norm": 1.4843783378601074, + "learning_rate": 4.6269554753309265e-07, + "loss": 0.2062, + "step": 18160 + }, + { + "epoch": 16.252236135957066, + "grad_norm": 1.459753155708313, + "learning_rate": 4.6239470517448854e-07, + "loss": 0.2046, + "step": 18170 + }, + { + "epoch": 16.26118067978533, + "grad_norm": 1.5825345516204834, + "learning_rate": 4.620938628158845e-07, + "loss": 0.2117, + "step": 18180 + }, + { + "epoch": 16.270125223613597, + "grad_norm": 1.9152156114578247, + "learning_rate": 4.617930204572804e-07, + "loss": 0.2061, + "step": 18190 + }, + { + "epoch": 16.27906976744186, + "grad_norm": 1.5024219751358032, + "learning_rate": 4.614921780986763e-07, + "loss": 0.1946, + "step": 18200 + }, + { + "epoch": 16.288014311270125, + "grad_norm": 1.3793495893478394, + "learning_rate": 4.611913357400722e-07, + "loss": 0.2117, + "step": 18210 + }, + { + "epoch": 16.29695885509839, + "grad_norm": 1.4050053358078003, + "learning_rate": 4.6089049338146805e-07, + "loss": 0.1973, + "step": 18220 + }, + { + "epoch": 16.305903398926656, + "grad_norm": 1.4622000455856323, + "learning_rate": 4.60589651022864e-07, + "loss": 0.1954, + "step": 18230 + }, + { + "epoch": 16.31484794275492, + "grad_norm": 1.5552752017974854, + "learning_rate": 4.602888086642599e-07, + "loss": 0.2019, + "step": 18240 + }, + { + "epoch": 16.323792486583184, + "grad_norm": 1.2910131216049194, + "learning_rate": 4.599879663056558e-07, + "loss": 0.1979, + "step": 18250 + }, + { + "epoch": 16.332737030411447, + "grad_norm": 1.3379510641098022, + "learning_rate": 4.596871239470517e-07, + "loss": 0.2044, + "step": 18260 + }, + { + "epoch": 16.341681574239715, + "grad_norm": 1.603898048400879, + "learning_rate": 4.593862815884476e-07, + "loss": 0.2084, + "step": 18270 + }, + { + "epoch": 16.35062611806798, + "grad_norm": 1.4816044569015503, + "learning_rate": 4.5908543922984354e-07, + "loss": 0.2011, + "step": 18280 + }, + { + "epoch": 16.359570661896242, + "grad_norm": 1.563955307006836, + "learning_rate": 4.5878459687123943e-07, + "loss": 0.2062, + "step": 18290 + }, + { + "epoch": 16.36851520572451, + "grad_norm": 1.677878975868225, + "learning_rate": 4.584837545126354e-07, + "loss": 0.2114, + "step": 18300 + }, + { + "epoch": 16.377459749552774, + "grad_norm": 1.716578483581543, + "learning_rate": 4.5818291215403127e-07, + "loss": 0.2047, + "step": 18310 + }, + { + "epoch": 16.386404293381037, + "grad_norm": 1.7913365364074707, + "learning_rate": 4.5788206979542716e-07, + "loss": 0.1992, + "step": 18320 + }, + { + "epoch": 16.3953488372093, + "grad_norm": 1.5321120023727417, + "learning_rate": 4.575812274368231e-07, + "loss": 0.1967, + "step": 18330 + }, + { + "epoch": 16.40429338103757, + "grad_norm": 2.0473198890686035, + "learning_rate": 4.57280385078219e-07, + "loss": 0.2125, + "step": 18340 + }, + { + "epoch": 16.413237924865832, + "grad_norm": 1.355984091758728, + "learning_rate": 4.5697954271961493e-07, + "loss": 0.2019, + "step": 18350 + }, + { + "epoch": 16.422182468694096, + "grad_norm": 1.967839002609253, + "learning_rate": 4.566787003610108e-07, + "loss": 0.2122, + "step": 18360 + }, + { + "epoch": 16.43112701252236, + "grad_norm": 1.7413747310638428, + "learning_rate": 4.5637785800240676e-07, + "loss": 0.2055, + "step": 18370 + }, + { + "epoch": 16.440071556350627, + "grad_norm": 1.432185173034668, + "learning_rate": 4.5607701564380265e-07, + "loss": 0.2027, + "step": 18380 + }, + { + "epoch": 16.44901610017889, + "grad_norm": 1.5016663074493408, + "learning_rate": 4.557761732851985e-07, + "loss": 0.2012, + "step": 18390 + }, + { + "epoch": 16.457960644007155, + "grad_norm": 1.3006632328033447, + "learning_rate": 4.5547533092659443e-07, + "loss": 0.204, + "step": 18400 + }, + { + "epoch": 16.46690518783542, + "grad_norm": 1.7726274728775024, + "learning_rate": 4.551744885679903e-07, + "loss": 0.2034, + "step": 18410 + }, + { + "epoch": 16.475849731663686, + "grad_norm": 1.264997959136963, + "learning_rate": 4.5487364620938627e-07, + "loss": 0.1984, + "step": 18420 + }, + { + "epoch": 16.48479427549195, + "grad_norm": 1.3891260623931885, + "learning_rate": 4.5457280385078216e-07, + "loss": 0.197, + "step": 18430 + }, + { + "epoch": 16.493738819320214, + "grad_norm": 1.2195409536361694, + "learning_rate": 4.5427196149217805e-07, + "loss": 0.1939, + "step": 18440 + }, + { + "epoch": 16.502683363148478, + "grad_norm": 1.6551311016082764, + "learning_rate": 4.53971119133574e-07, + "loss": 0.2021, + "step": 18450 + }, + { + "epoch": 16.511627906976745, + "grad_norm": 1.1608327627182007, + "learning_rate": 4.536702767749699e-07, + "loss": 0.2095, + "step": 18460 + }, + { + "epoch": 16.52057245080501, + "grad_norm": 1.585875391960144, + "learning_rate": 4.533694344163658e-07, + "loss": 0.2021, + "step": 18470 + }, + { + "epoch": 16.529516994633273, + "grad_norm": 1.4391627311706543, + "learning_rate": 4.530685920577617e-07, + "loss": 0.2048, + "step": 18480 + }, + { + "epoch": 16.53846153846154, + "grad_norm": 1.7681891918182373, + "learning_rate": 4.527677496991576e-07, + "loss": 0.2034, + "step": 18490 + }, + { + "epoch": 16.547406082289804, + "grad_norm": 1.448768138885498, + "learning_rate": 4.5246690734055354e-07, + "loss": 0.2056, + "step": 18500 + }, + { + "epoch": 16.556350626118068, + "grad_norm": 1.363098382949829, + "learning_rate": 4.5216606498194943e-07, + "loss": 0.205, + "step": 18510 + }, + { + "epoch": 16.56529516994633, + "grad_norm": 1.4220198392868042, + "learning_rate": 4.518652226233454e-07, + "loss": 0.2126, + "step": 18520 + }, + { + "epoch": 16.5742397137746, + "grad_norm": 1.384798288345337, + "learning_rate": 4.5156438026474127e-07, + "loss": 0.2057, + "step": 18530 + }, + { + "epoch": 16.583184257602863, + "grad_norm": 1.5988340377807617, + "learning_rate": 4.5126353790613716e-07, + "loss": 0.1958, + "step": 18540 + }, + { + "epoch": 16.592128801431127, + "grad_norm": 1.342162013053894, + "learning_rate": 4.509626955475331e-07, + "loss": 0.1933, + "step": 18550 + }, + { + "epoch": 16.60107334525939, + "grad_norm": 1.457943320274353, + "learning_rate": 4.50661853188929e-07, + "loss": 0.1974, + "step": 18560 + }, + { + "epoch": 16.610017889087658, + "grad_norm": 1.8528119325637817, + "learning_rate": 4.503610108303249e-07, + "loss": 0.2047, + "step": 18570 + }, + { + "epoch": 16.61896243291592, + "grad_norm": 1.2887921333312988, + "learning_rate": 4.5006016847172077e-07, + "loss": 0.1988, + "step": 18580 + }, + { + "epoch": 16.627906976744185, + "grad_norm": 1.4914727210998535, + "learning_rate": 4.497593261131167e-07, + "loss": 0.1973, + "step": 18590 + }, + { + "epoch": 16.63685152057245, + "grad_norm": 1.537087082862854, + "learning_rate": 4.494584837545126e-07, + "loss": 0.1994, + "step": 18600 + }, + { + "epoch": 16.645796064400717, + "grad_norm": 1.5479522943496704, + "learning_rate": 4.491576413959085e-07, + "loss": 0.2152, + "step": 18610 + }, + { + "epoch": 16.65474060822898, + "grad_norm": 1.733384370803833, + "learning_rate": 4.4885679903730444e-07, + "loss": 0.1986, + "step": 18620 + }, + { + "epoch": 16.663685152057244, + "grad_norm": 1.5670526027679443, + "learning_rate": 4.485559566787003e-07, + "loss": 0.1961, + "step": 18630 + }, + { + "epoch": 16.67262969588551, + "grad_norm": 1.5489834547042847, + "learning_rate": 4.4825511432009627e-07, + "loss": 0.207, + "step": 18640 + }, + { + "epoch": 16.681574239713775, + "grad_norm": 1.5704835653305054, + "learning_rate": 4.4795427196149216e-07, + "loss": 0.1984, + "step": 18650 + }, + { + "epoch": 16.69051878354204, + "grad_norm": 2.105590581893921, + "learning_rate": 4.4765342960288805e-07, + "loss": 0.2051, + "step": 18660 + }, + { + "epoch": 16.699463327370303, + "grad_norm": 1.5057789087295532, + "learning_rate": 4.47352587244284e-07, + "loss": 0.2106, + "step": 18670 + }, + { + "epoch": 16.70840787119857, + "grad_norm": 1.5319921970367432, + "learning_rate": 4.470517448856799e-07, + "loss": 0.2026, + "step": 18680 + }, + { + "epoch": 16.717352415026834, + "grad_norm": 1.3663493394851685, + "learning_rate": 4.467509025270758e-07, + "loss": 0.1981, + "step": 18690 + }, + { + "epoch": 16.726296958855098, + "grad_norm": 1.5751093626022339, + "learning_rate": 4.464500601684717e-07, + "loss": 0.2094, + "step": 18700 + }, + { + "epoch": 16.735241502683362, + "grad_norm": 1.359885573387146, + "learning_rate": 4.461492178098676e-07, + "loss": 0.2007, + "step": 18710 + }, + { + "epoch": 16.74418604651163, + "grad_norm": 1.4744230508804321, + "learning_rate": 4.4584837545126355e-07, + "loss": 0.2027, + "step": 18720 + }, + { + "epoch": 16.753130590339893, + "grad_norm": 1.998220443725586, + "learning_rate": 4.4554753309265944e-07, + "loss": 0.1959, + "step": 18730 + }, + { + "epoch": 16.762075134168157, + "grad_norm": 1.3327491283416748, + "learning_rate": 4.452466907340554e-07, + "loss": 0.2062, + "step": 18740 + }, + { + "epoch": 16.77101967799642, + "grad_norm": 1.3492833375930786, + "learning_rate": 4.449458483754512e-07, + "loss": 0.2018, + "step": 18750 + }, + { + "epoch": 16.779964221824688, + "grad_norm": 1.4005316495895386, + "learning_rate": 4.4464500601684716e-07, + "loss": 0.2044, + "step": 18760 + }, + { + "epoch": 16.788908765652952, + "grad_norm": 1.7190395593643188, + "learning_rate": 4.4434416365824305e-07, + "loss": 0.2003, + "step": 18770 + }, + { + "epoch": 16.797853309481216, + "grad_norm": 1.4776062965393066, + "learning_rate": 4.4404332129963894e-07, + "loss": 0.1939, + "step": 18780 + }, + { + "epoch": 16.80679785330948, + "grad_norm": 1.5264639854431152, + "learning_rate": 4.437424789410349e-07, + "loss": 0.1953, + "step": 18790 + }, + { + "epoch": 16.815742397137747, + "grad_norm": 1.4779977798461914, + "learning_rate": 4.4344163658243077e-07, + "loss": 0.2013, + "step": 18800 + }, + { + "epoch": 16.82468694096601, + "grad_norm": 1.224633812904358, + "learning_rate": 4.431407942238267e-07, + "loss": 0.2037, + "step": 18810 + }, + { + "epoch": 16.833631484794275, + "grad_norm": 1.5299253463745117, + "learning_rate": 4.428399518652226e-07, + "loss": 0.199, + "step": 18820 + }, + { + "epoch": 16.842576028622542, + "grad_norm": 1.5062905550003052, + "learning_rate": 4.425391095066185e-07, + "loss": 0.194, + "step": 18830 + }, + { + "epoch": 16.851520572450806, + "grad_norm": 1.432369351387024, + "learning_rate": 4.4223826714801444e-07, + "loss": 0.2071, + "step": 18840 + }, + { + "epoch": 16.86046511627907, + "grad_norm": 1.5931068658828735, + "learning_rate": 4.419374247894103e-07, + "loss": 0.2024, + "step": 18850 + }, + { + "epoch": 16.869409660107333, + "grad_norm": 1.5152790546417236, + "learning_rate": 4.4163658243080627e-07, + "loss": 0.202, + "step": 18860 + }, + { + "epoch": 16.8783542039356, + "grad_norm": 1.5928902626037598, + "learning_rate": 4.4133574007220216e-07, + "loss": 0.1947, + "step": 18870 + }, + { + "epoch": 16.887298747763865, + "grad_norm": 1.2694294452667236, + "learning_rate": 4.4103489771359805e-07, + "loss": 0.197, + "step": 18880 + }, + { + "epoch": 16.89624329159213, + "grad_norm": 1.5884836912155151, + "learning_rate": 4.40734055354994e-07, + "loss": 0.2014, + "step": 18890 + }, + { + "epoch": 16.905187835420392, + "grad_norm": 1.3282958269119263, + "learning_rate": 4.404332129963899e-07, + "loss": 0.1959, + "step": 18900 + }, + { + "epoch": 16.91413237924866, + "grad_norm": 1.966212511062622, + "learning_rate": 4.401323706377858e-07, + "loss": 0.2147, + "step": 18910 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 1.6146342754364014, + "learning_rate": 4.3983152827918166e-07, + "loss": 0.1984, + "step": 18920 + }, + { + "epoch": 16.932021466905187, + "grad_norm": 1.3908056020736694, + "learning_rate": 4.3953068592057755e-07, + "loss": 0.2016, + "step": 18930 + }, + { + "epoch": 16.94096601073345, + "grad_norm": 1.724147915840149, + "learning_rate": 4.392298435619735e-07, + "loss": 0.2035, + "step": 18940 + }, + { + "epoch": 16.94991055456172, + "grad_norm": 1.3435845375061035, + "learning_rate": 4.389290012033694e-07, + "loss": 0.2024, + "step": 18950 + }, + { + "epoch": 16.958855098389982, + "grad_norm": 1.3221325874328613, + "learning_rate": 4.3862815884476533e-07, + "loss": 0.1984, + "step": 18960 + }, + { + "epoch": 16.967799642218246, + "grad_norm": 1.3773683309555054, + "learning_rate": 4.383273164861612e-07, + "loss": 0.198, + "step": 18970 + }, + { + "epoch": 16.97674418604651, + "grad_norm": 1.4278448820114136, + "learning_rate": 4.3802647412755716e-07, + "loss": 0.1952, + "step": 18980 + }, + { + "epoch": 16.985688729874777, + "grad_norm": 1.6305217742919922, + "learning_rate": 4.3772563176895305e-07, + "loss": 0.1977, + "step": 18990 + }, + { + "epoch": 16.99463327370304, + "grad_norm": 1.5182921886444092, + "learning_rate": 4.3742478941034894e-07, + "loss": 0.1975, + "step": 19000 + }, + { + "epoch": 17.0, + "eval_bleu": 76.9954, + "eval_gen_len": 74.7217, + "eval_loss": 0.1558741331100464, + "eval_runtime": 57.6067, + "eval_samples_per_second": 18.088, + "eval_steps_per_second": 0.191, + "step": 19006 + }, + { + "epoch": 17.003577817531305, + "grad_norm": 1.6046218872070312, + "learning_rate": 4.371239470517449e-07, + "loss": 0.2037, + "step": 19010 + }, + { + "epoch": 17.012522361359572, + "grad_norm": 1.2948042154312134, + "learning_rate": 4.3682310469314077e-07, + "loss": 0.2049, + "step": 19020 + }, + { + "epoch": 17.021466905187836, + "grad_norm": 1.4588265419006348, + "learning_rate": 4.365222623345367e-07, + "loss": 0.2005, + "step": 19030 + }, + { + "epoch": 17.0304114490161, + "grad_norm": 1.3486738204956055, + "learning_rate": 4.362214199759326e-07, + "loss": 0.1961, + "step": 19040 + }, + { + "epoch": 17.039355992844364, + "grad_norm": 1.3999135494232178, + "learning_rate": 4.359205776173285e-07, + "loss": 0.1944, + "step": 19050 + }, + { + "epoch": 17.04830053667263, + "grad_norm": 1.3097974061965942, + "learning_rate": 4.3561973525872444e-07, + "loss": 0.2025, + "step": 19060 + }, + { + "epoch": 17.057245080500895, + "grad_norm": 1.586107611656189, + "learning_rate": 4.3531889290012033e-07, + "loss": 0.2042, + "step": 19070 + }, + { + "epoch": 17.06618962432916, + "grad_norm": 1.2752659320831299, + "learning_rate": 4.3501805054151627e-07, + "loss": 0.1849, + "step": 19080 + }, + { + "epoch": 17.075134168157422, + "grad_norm": 1.2279033660888672, + "learning_rate": 4.347172081829121e-07, + "loss": 0.1934, + "step": 19090 + }, + { + "epoch": 17.08407871198569, + "grad_norm": 1.269182801246643, + "learning_rate": 4.34416365824308e-07, + "loss": 0.1994, + "step": 19100 + }, + { + "epoch": 17.093023255813954, + "grad_norm": 1.5643492937088013, + "learning_rate": 4.3411552346570394e-07, + "loss": 0.2004, + "step": 19110 + }, + { + "epoch": 17.101967799642217, + "grad_norm": 1.1503673791885376, + "learning_rate": 4.3381468110709983e-07, + "loss": 0.1979, + "step": 19120 + }, + { + "epoch": 17.11091234347048, + "grad_norm": 1.837729573249817, + "learning_rate": 4.3351383874849577e-07, + "loss": 0.2037, + "step": 19130 + }, + { + "epoch": 17.11985688729875, + "grad_norm": 1.4600396156311035, + "learning_rate": 4.3321299638989166e-07, + "loss": 0.2067, + "step": 19140 + }, + { + "epoch": 17.128801431127012, + "grad_norm": 1.4574378728866577, + "learning_rate": 4.329121540312876e-07, + "loss": 0.1919, + "step": 19150 + }, + { + "epoch": 17.137745974955276, + "grad_norm": 1.415816307067871, + "learning_rate": 4.326113116726835e-07, + "loss": 0.1992, + "step": 19160 + }, + { + "epoch": 17.146690518783544, + "grad_norm": 1.5313915014266968, + "learning_rate": 4.323104693140794e-07, + "loss": 0.1917, + "step": 19170 + }, + { + "epoch": 17.155635062611807, + "grad_norm": 1.6685739755630493, + "learning_rate": 4.3200962695547533e-07, + "loss": 0.1975, + "step": 19180 + }, + { + "epoch": 17.16457960644007, + "grad_norm": 1.4296932220458984, + "learning_rate": 4.317087845968712e-07, + "loss": 0.1997, + "step": 19190 + }, + { + "epoch": 17.173524150268335, + "grad_norm": 1.90603768825531, + "learning_rate": 4.3140794223826716e-07, + "loss": 0.2024, + "step": 19200 + }, + { + "epoch": 17.182468694096602, + "grad_norm": 1.4273818731307983, + "learning_rate": 4.3110709987966305e-07, + "loss": 0.1979, + "step": 19210 + }, + { + "epoch": 17.191413237924866, + "grad_norm": 1.6286354064941406, + "learning_rate": 4.3080625752105894e-07, + "loss": 0.2025, + "step": 19220 + }, + { + "epoch": 17.20035778175313, + "grad_norm": 1.6004372835159302, + "learning_rate": 4.305054151624549e-07, + "loss": 0.1967, + "step": 19230 + }, + { + "epoch": 17.209302325581394, + "grad_norm": 1.3184689283370972, + "learning_rate": 4.3020457280385077e-07, + "loss": 0.2022, + "step": 19240 + }, + { + "epoch": 17.21824686940966, + "grad_norm": 1.2840800285339355, + "learning_rate": 4.299037304452467e-07, + "loss": 0.1959, + "step": 19250 + }, + { + "epoch": 17.227191413237925, + "grad_norm": 1.484135389328003, + "learning_rate": 4.2960288808664255e-07, + "loss": 0.1954, + "step": 19260 + }, + { + "epoch": 17.23613595706619, + "grad_norm": 1.282882809638977, + "learning_rate": 4.2930204572803844e-07, + "loss": 0.2026, + "step": 19270 + }, + { + "epoch": 17.245080500894453, + "grad_norm": 1.2856765985488892, + "learning_rate": 4.290012033694344e-07, + "loss": 0.2099, + "step": 19280 + }, + { + "epoch": 17.25402504472272, + "grad_norm": 1.3113852739334106, + "learning_rate": 4.287003610108303e-07, + "loss": 0.1912, + "step": 19290 + }, + { + "epoch": 17.262969588550984, + "grad_norm": 1.4740384817123413, + "learning_rate": 4.283995186522262e-07, + "loss": 0.1943, + "step": 19300 + }, + { + "epoch": 17.271914132379248, + "grad_norm": 1.3779065608978271, + "learning_rate": 4.280986762936221e-07, + "loss": 0.1923, + "step": 19310 + }, + { + "epoch": 17.280858676207515, + "grad_norm": 1.5705339908599854, + "learning_rate": 4.27797833935018e-07, + "loss": 0.2067, + "step": 19320 + }, + { + "epoch": 17.28980322003578, + "grad_norm": 1.4014439582824707, + "learning_rate": 4.2749699157641394e-07, + "loss": 0.1893, + "step": 19330 + }, + { + "epoch": 17.298747763864043, + "grad_norm": 1.3906726837158203, + "learning_rate": 4.2719614921780983e-07, + "loss": 0.1976, + "step": 19340 + }, + { + "epoch": 17.307692307692307, + "grad_norm": 1.3222224712371826, + "learning_rate": 4.268953068592058e-07, + "loss": 0.1928, + "step": 19350 + }, + { + "epoch": 17.316636851520574, + "grad_norm": 1.4415886402130127, + "learning_rate": 4.2659446450060166e-07, + "loss": 0.1999, + "step": 19360 + }, + { + "epoch": 17.325581395348838, + "grad_norm": 1.462741494178772, + "learning_rate": 4.262936221419976e-07, + "loss": 0.1904, + "step": 19370 + }, + { + "epoch": 17.3345259391771, + "grad_norm": 1.4457674026489258, + "learning_rate": 4.259927797833935e-07, + "loss": 0.1976, + "step": 19380 + }, + { + "epoch": 17.343470483005365, + "grad_norm": 1.386067271232605, + "learning_rate": 4.256919374247894e-07, + "loss": 0.1916, + "step": 19390 + }, + { + "epoch": 17.352415026833633, + "grad_norm": 1.6763598918914795, + "learning_rate": 4.2539109506618533e-07, + "loss": 0.2001, + "step": 19400 + }, + { + "epoch": 17.361359570661897, + "grad_norm": 1.5524550676345825, + "learning_rate": 4.250902527075812e-07, + "loss": 0.2001, + "step": 19410 + }, + { + "epoch": 17.37030411449016, + "grad_norm": 1.7988560199737549, + "learning_rate": 4.2478941034897716e-07, + "loss": 0.1914, + "step": 19420 + }, + { + "epoch": 17.379248658318424, + "grad_norm": 1.5703943967819214, + "learning_rate": 4.2448856799037305e-07, + "loss": 0.194, + "step": 19430 + }, + { + "epoch": 17.38819320214669, + "grad_norm": 1.5991483926773071, + "learning_rate": 4.241877256317689e-07, + "loss": 0.194, + "step": 19440 + }, + { + "epoch": 17.397137745974955, + "grad_norm": 1.4655667543411255, + "learning_rate": 4.2388688327316483e-07, + "loss": 0.2006, + "step": 19450 + }, + { + "epoch": 17.40608228980322, + "grad_norm": 1.4785929918289185, + "learning_rate": 4.235860409145607e-07, + "loss": 0.1812, + "step": 19460 + }, + { + "epoch": 17.415026833631483, + "grad_norm": 1.3344368934631348, + "learning_rate": 4.2328519855595666e-07, + "loss": 0.1981, + "step": 19470 + }, + { + "epoch": 17.42397137745975, + "grad_norm": 1.6425262689590454, + "learning_rate": 4.2298435619735255e-07, + "loss": 0.2029, + "step": 19480 + }, + { + "epoch": 17.432915921288014, + "grad_norm": 1.941648006439209, + "learning_rate": 4.2268351383874844e-07, + "loss": 0.1979, + "step": 19490 + }, + { + "epoch": 17.441860465116278, + "grad_norm": 1.6927695274353027, + "learning_rate": 4.223826714801444e-07, + "loss": 0.2023, + "step": 19500 + }, + { + "epoch": 17.450805008944545, + "grad_norm": 1.4542479515075684, + "learning_rate": 4.220818291215403e-07, + "loss": 0.1982, + "step": 19510 + }, + { + "epoch": 17.45974955277281, + "grad_norm": 1.3546243906021118, + "learning_rate": 4.217809867629362e-07, + "loss": 0.1879, + "step": 19520 + }, + { + "epoch": 17.468694096601073, + "grad_norm": 1.59962797164917, + "learning_rate": 4.214801444043321e-07, + "loss": 0.2071, + "step": 19530 + }, + { + "epoch": 17.477638640429337, + "grad_norm": 1.761206030845642, + "learning_rate": 4.2117930204572805e-07, + "loss": 0.1929, + "step": 19540 + }, + { + "epoch": 17.486583184257604, + "grad_norm": 1.3420017957687378, + "learning_rate": 4.2087845968712394e-07, + "loss": 0.1885, + "step": 19550 + }, + { + "epoch": 17.495527728085868, + "grad_norm": 1.486976981163025, + "learning_rate": 4.2057761732851983e-07, + "loss": 0.1939, + "step": 19560 + }, + { + "epoch": 17.504472271914132, + "grad_norm": 1.3547134399414062, + "learning_rate": 4.202767749699158e-07, + "loss": 0.1894, + "step": 19570 + }, + { + "epoch": 17.513416815742396, + "grad_norm": 1.7011101245880127, + "learning_rate": 4.1997593261131166e-07, + "loss": 0.1934, + "step": 19580 + }, + { + "epoch": 17.522361359570663, + "grad_norm": 1.407077431678772, + "learning_rate": 4.196750902527076e-07, + "loss": 0.1917, + "step": 19590 + }, + { + "epoch": 17.531305903398927, + "grad_norm": 1.2789831161499023, + "learning_rate": 4.193742478941035e-07, + "loss": 0.1967, + "step": 19600 + }, + { + "epoch": 17.54025044722719, + "grad_norm": 1.906341314315796, + "learning_rate": 4.1907340553549933e-07, + "loss": 0.1965, + "step": 19610 + }, + { + "epoch": 17.549194991055455, + "grad_norm": 1.4115309715270996, + "learning_rate": 4.187725631768953e-07, + "loss": 0.1883, + "step": 19620 + }, + { + "epoch": 17.558139534883722, + "grad_norm": 1.4893302917480469, + "learning_rate": 4.1847172081829117e-07, + "loss": 0.2001, + "step": 19630 + }, + { + "epoch": 17.567084078711986, + "grad_norm": 1.6963932514190674, + "learning_rate": 4.181708784596871e-07, + "loss": 0.2085, + "step": 19640 + }, + { + "epoch": 17.57602862254025, + "grad_norm": 1.3404951095581055, + "learning_rate": 4.17870036101083e-07, + "loss": 0.2071, + "step": 19650 + }, + { + "epoch": 17.584973166368513, + "grad_norm": 1.547065019607544, + "learning_rate": 4.175691937424789e-07, + "loss": 0.198, + "step": 19660 + }, + { + "epoch": 17.59391771019678, + "grad_norm": 1.356144666671753, + "learning_rate": 4.1726835138387483e-07, + "loss": 0.2053, + "step": 19670 + }, + { + "epoch": 17.602862254025045, + "grad_norm": 1.4185278415679932, + "learning_rate": 4.169675090252707e-07, + "loss": 0.196, + "step": 19680 + }, + { + "epoch": 17.61180679785331, + "grad_norm": 1.4725350141525269, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.2051, + "step": 19690 + }, + { + "epoch": 17.620751341681576, + "grad_norm": 1.3277311325073242, + "learning_rate": 4.1636582430806256e-07, + "loss": 0.2059, + "step": 19700 + }, + { + "epoch": 17.62969588550984, + "grad_norm": 1.6677687168121338, + "learning_rate": 4.1606498194945845e-07, + "loss": 0.1958, + "step": 19710 + }, + { + "epoch": 17.638640429338103, + "grad_norm": 1.5741796493530273, + "learning_rate": 4.157641395908544e-07, + "loss": 0.1913, + "step": 19720 + }, + { + "epoch": 17.647584973166367, + "grad_norm": 1.4955782890319824, + "learning_rate": 4.154632972322503e-07, + "loss": 0.1906, + "step": 19730 + }, + { + "epoch": 17.656529516994635, + "grad_norm": 1.5030593872070312, + "learning_rate": 4.151624548736462e-07, + "loss": 0.1899, + "step": 19740 + }, + { + "epoch": 17.6654740608229, + "grad_norm": 1.590512752532959, + "learning_rate": 4.148616125150421e-07, + "loss": 0.2079, + "step": 19750 + }, + { + "epoch": 17.674418604651162, + "grad_norm": 1.3719285726547241, + "learning_rate": 4.1456077015643805e-07, + "loss": 0.2053, + "step": 19760 + }, + { + "epoch": 17.683363148479426, + "grad_norm": 1.3236194849014282, + "learning_rate": 4.1425992779783394e-07, + "loss": 0.188, + "step": 19770 + }, + { + "epoch": 17.692307692307693, + "grad_norm": 1.441819429397583, + "learning_rate": 4.139590854392298e-07, + "loss": 0.1843, + "step": 19780 + }, + { + "epoch": 17.701252236135957, + "grad_norm": 1.6539798974990845, + "learning_rate": 4.136582430806257e-07, + "loss": 0.1974, + "step": 19790 + }, + { + "epoch": 17.71019677996422, + "grad_norm": 1.5544081926345825, + "learning_rate": 4.133574007220216e-07, + "loss": 0.1954, + "step": 19800 + }, + { + "epoch": 17.719141323792485, + "grad_norm": 1.3864456415176392, + "learning_rate": 4.1305655836341756e-07, + "loss": 0.1954, + "step": 19810 + }, + { + "epoch": 17.728085867620752, + "grad_norm": 1.770103096961975, + "learning_rate": 4.1275571600481345e-07, + "loss": 0.2061, + "step": 19820 + }, + { + "epoch": 17.737030411449016, + "grad_norm": 1.5458662509918213, + "learning_rate": 4.1245487364620934e-07, + "loss": 0.1995, + "step": 19830 + }, + { + "epoch": 17.74597495527728, + "grad_norm": 1.2489945888519287, + "learning_rate": 4.121540312876053e-07, + "loss": 0.2056, + "step": 19840 + }, + { + "epoch": 17.754919499105547, + "grad_norm": 1.4535328149795532, + "learning_rate": 4.1185318892900117e-07, + "loss": 0.1998, + "step": 19850 + }, + { + "epoch": 17.76386404293381, + "grad_norm": 1.4486058950424194, + "learning_rate": 4.115523465703971e-07, + "loss": 0.1887, + "step": 19860 + }, + { + "epoch": 17.772808586762075, + "grad_norm": 1.2764486074447632, + "learning_rate": 4.11251504211793e-07, + "loss": 0.1917, + "step": 19870 + }, + { + "epoch": 17.78175313059034, + "grad_norm": 1.711034893989563, + "learning_rate": 4.109506618531889e-07, + "loss": 0.1966, + "step": 19880 + }, + { + "epoch": 17.790697674418606, + "grad_norm": 1.7573375701904297, + "learning_rate": 4.1064981949458483e-07, + "loss": 0.1929, + "step": 19890 + }, + { + "epoch": 17.79964221824687, + "grad_norm": 1.5140432119369507, + "learning_rate": 4.103489771359807e-07, + "loss": 0.1876, + "step": 19900 + }, + { + "epoch": 17.808586762075134, + "grad_norm": 1.3529492616653442, + "learning_rate": 4.1004813477737667e-07, + "loss": 0.1927, + "step": 19910 + }, + { + "epoch": 17.817531305903398, + "grad_norm": 1.3087401390075684, + "learning_rate": 4.0974729241877256e-07, + "loss": 0.2023, + "step": 19920 + }, + { + "epoch": 17.826475849731665, + "grad_norm": 1.36638343334198, + "learning_rate": 4.094464500601685e-07, + "loss": 0.2056, + "step": 19930 + }, + { + "epoch": 17.83542039355993, + "grad_norm": 1.648963212966919, + "learning_rate": 4.091456077015644e-07, + "loss": 0.1988, + "step": 19940 + }, + { + "epoch": 17.844364937388193, + "grad_norm": 1.9200732707977295, + "learning_rate": 4.088447653429602e-07, + "loss": 0.2034, + "step": 19950 + }, + { + "epoch": 17.853309481216456, + "grad_norm": 1.5758631229400635, + "learning_rate": 4.0854392298435617e-07, + "loss": 0.195, + "step": 19960 + }, + { + "epoch": 17.862254025044724, + "grad_norm": 1.334425449371338, + "learning_rate": 4.0824308062575206e-07, + "loss": 0.2079, + "step": 19970 + }, + { + "epoch": 17.871198568872988, + "grad_norm": 1.6201728582382202, + "learning_rate": 4.07942238267148e-07, + "loss": 0.1999, + "step": 19980 + }, + { + "epoch": 17.88014311270125, + "grad_norm": 1.4320831298828125, + "learning_rate": 4.076413959085439e-07, + "loss": 0.1943, + "step": 19990 + }, + { + "epoch": 17.88908765652952, + "grad_norm": 1.2403602600097656, + "learning_rate": 4.073405535499398e-07, + "loss": 0.1925, + "step": 20000 + }, + { + "epoch": 17.898032200357783, + "grad_norm": 1.2947001457214355, + "learning_rate": 4.070397111913357e-07, + "loss": 0.2029, + "step": 20010 + }, + { + "epoch": 17.906976744186046, + "grad_norm": 1.276208758354187, + "learning_rate": 4.067388688327316e-07, + "loss": 0.193, + "step": 20020 + }, + { + "epoch": 17.91592128801431, + "grad_norm": 1.4883465766906738, + "learning_rate": 4.0643802647412756e-07, + "loss": 0.201, + "step": 20030 + }, + { + "epoch": 17.924865831842578, + "grad_norm": 1.36917245388031, + "learning_rate": 4.0613718411552345e-07, + "loss": 0.2017, + "step": 20040 + }, + { + "epoch": 17.93381037567084, + "grad_norm": 1.5583710670471191, + "learning_rate": 4.0583634175691934e-07, + "loss": 0.198, + "step": 20050 + }, + { + "epoch": 17.942754919499105, + "grad_norm": 1.357555866241455, + "learning_rate": 4.055354993983153e-07, + "loss": 0.1875, + "step": 20060 + }, + { + "epoch": 17.95169946332737, + "grad_norm": 1.5523446798324585, + "learning_rate": 4.0523465703971117e-07, + "loss": 0.1954, + "step": 20070 + }, + { + "epoch": 17.960644007155636, + "grad_norm": 1.2269097566604614, + "learning_rate": 4.049338146811071e-07, + "loss": 0.1992, + "step": 20080 + }, + { + "epoch": 17.9695885509839, + "grad_norm": 1.7482787370681763, + "learning_rate": 4.04632972322503e-07, + "loss": 0.1998, + "step": 20090 + }, + { + "epoch": 17.978533094812164, + "grad_norm": 1.9512193202972412, + "learning_rate": 4.043321299638989e-07, + "loss": 0.2004, + "step": 20100 + }, + { + "epoch": 17.987477638640428, + "grad_norm": 1.7851731777191162, + "learning_rate": 4.0403128760529484e-07, + "loss": 0.1885, + "step": 20110 + }, + { + "epoch": 17.996422182468695, + "grad_norm": 1.3705447912216187, + "learning_rate": 4.037304452466907e-07, + "loss": 0.1943, + "step": 20120 + }, + { + "epoch": 18.0, + "eval_bleu": 77.421, + "eval_gen_len": 74.6641, + "eval_loss": 0.15242676436901093, + "eval_runtime": 57.5495, + "eval_samples_per_second": 18.106, + "eval_steps_per_second": 0.191, + "step": 20124 + }, + { + "epoch": 18.00536672629696, + "grad_norm": 1.3600645065307617, + "learning_rate": 4.034296028880866e-07, + "loss": 0.1936, + "step": 20130 + }, + { + "epoch": 18.014311270125223, + "grad_norm": 1.514121174812317, + "learning_rate": 4.031287605294825e-07, + "loss": 0.1874, + "step": 20140 + }, + { + "epoch": 18.023255813953487, + "grad_norm": 1.8172216415405273, + "learning_rate": 4.0282791817087845e-07, + "loss": 0.1923, + "step": 20150 + }, + { + "epoch": 18.032200357781754, + "grad_norm": 1.2660489082336426, + "learning_rate": 4.0252707581227434e-07, + "loss": 0.1933, + "step": 20160 + }, + { + "epoch": 18.041144901610018, + "grad_norm": 2.1290335655212402, + "learning_rate": 4.0222623345367023e-07, + "loss": 0.1979, + "step": 20170 + }, + { + "epoch": 18.05008944543828, + "grad_norm": 1.3781754970550537, + "learning_rate": 4.0192539109506617e-07, + "loss": 0.1975, + "step": 20180 + }, + { + "epoch": 18.05903398926655, + "grad_norm": 1.3132025003433228, + "learning_rate": 4.0162454873646206e-07, + "loss": 0.1993, + "step": 20190 + }, + { + "epoch": 18.067978533094813, + "grad_norm": 1.3299691677093506, + "learning_rate": 4.01323706377858e-07, + "loss": 0.1916, + "step": 20200 + }, + { + "epoch": 18.076923076923077, + "grad_norm": 1.939542293548584, + "learning_rate": 4.010228640192539e-07, + "loss": 0.1988, + "step": 20210 + }, + { + "epoch": 18.08586762075134, + "grad_norm": 1.4302418231964111, + "learning_rate": 4.007220216606498e-07, + "loss": 0.1948, + "step": 20220 + }, + { + "epoch": 18.094812164579608, + "grad_norm": 1.2856136560440063, + "learning_rate": 4.004211793020457e-07, + "loss": 0.1948, + "step": 20230 + }, + { + "epoch": 18.10375670840787, + "grad_norm": 1.3175781965255737, + "learning_rate": 4.001203369434416e-07, + "loss": 0.1963, + "step": 20240 + }, + { + "epoch": 18.112701252236135, + "grad_norm": 1.9059381484985352, + "learning_rate": 3.9981949458483756e-07, + "loss": 0.1893, + "step": 20250 + }, + { + "epoch": 18.1216457960644, + "grad_norm": 1.9008736610412598, + "learning_rate": 3.9951865222623345e-07, + "loss": 0.1902, + "step": 20260 + }, + { + "epoch": 18.130590339892667, + "grad_norm": 1.2346973419189453, + "learning_rate": 3.9921780986762934e-07, + "loss": 0.1934, + "step": 20270 + }, + { + "epoch": 18.13953488372093, + "grad_norm": 1.430349349975586, + "learning_rate": 3.989169675090253e-07, + "loss": 0.2063, + "step": 20280 + }, + { + "epoch": 18.148479427549194, + "grad_norm": 1.5357002019882202, + "learning_rate": 3.9861612515042117e-07, + "loss": 0.1951, + "step": 20290 + }, + { + "epoch": 18.157423971377458, + "grad_norm": 1.565921425819397, + "learning_rate": 3.9831528279181706e-07, + "loss": 0.2069, + "step": 20300 + }, + { + "epoch": 18.166368515205725, + "grad_norm": 1.4086253643035889, + "learning_rate": 3.9801444043321295e-07, + "loss": 0.1901, + "step": 20310 + }, + { + "epoch": 18.17531305903399, + "grad_norm": 1.5787662267684937, + "learning_rate": 3.977135980746089e-07, + "loss": 0.1959, + "step": 20320 + }, + { + "epoch": 18.184257602862253, + "grad_norm": 1.3202236890792847, + "learning_rate": 3.974127557160048e-07, + "loss": 0.1905, + "step": 20330 + }, + { + "epoch": 18.19320214669052, + "grad_norm": 1.4912652969360352, + "learning_rate": 3.9711191335740067e-07, + "loss": 0.197, + "step": 20340 + }, + { + "epoch": 18.202146690518784, + "grad_norm": 1.5808525085449219, + "learning_rate": 3.968110709987966e-07, + "loss": 0.2094, + "step": 20350 + }, + { + "epoch": 18.211091234347048, + "grad_norm": 1.6998366117477417, + "learning_rate": 3.965102286401925e-07, + "loss": 0.1879, + "step": 20360 + }, + { + "epoch": 18.220035778175312, + "grad_norm": 1.2697880268096924, + "learning_rate": 3.9620938628158845e-07, + "loss": 0.1995, + "step": 20370 + }, + { + "epoch": 18.22898032200358, + "grad_norm": 1.6824156045913696, + "learning_rate": 3.9590854392298434e-07, + "loss": 0.1952, + "step": 20380 + }, + { + "epoch": 18.237924865831843, + "grad_norm": 1.4301773309707642, + "learning_rate": 3.9560770156438023e-07, + "loss": 0.2053, + "step": 20390 + }, + { + "epoch": 18.246869409660107, + "grad_norm": 1.4164366722106934, + "learning_rate": 3.9530685920577617e-07, + "loss": 0.1886, + "step": 20400 + }, + { + "epoch": 18.25581395348837, + "grad_norm": 1.2958494424819946, + "learning_rate": 3.9500601684717206e-07, + "loss": 0.1816, + "step": 20410 + }, + { + "epoch": 18.264758497316638, + "grad_norm": 1.4988905191421509, + "learning_rate": 3.94705174488568e-07, + "loss": 0.1913, + "step": 20420 + }, + { + "epoch": 18.273703041144902, + "grad_norm": 1.38814115524292, + "learning_rate": 3.944043321299639e-07, + "loss": 0.1902, + "step": 20430 + }, + { + "epoch": 18.282647584973166, + "grad_norm": 1.3260778188705444, + "learning_rate": 3.941034897713598e-07, + "loss": 0.1885, + "step": 20440 + }, + { + "epoch": 18.29159212880143, + "grad_norm": 2.0683860778808594, + "learning_rate": 3.9380264741275573e-07, + "loss": 0.1929, + "step": 20450 + }, + { + "epoch": 18.300536672629697, + "grad_norm": 1.2236251831054688, + "learning_rate": 3.935018050541516e-07, + "loss": 0.1837, + "step": 20460 + }, + { + "epoch": 18.30948121645796, + "grad_norm": 1.4860965013504028, + "learning_rate": 3.9320096269554756e-07, + "loss": 0.2023, + "step": 20470 + }, + { + "epoch": 18.318425760286225, + "grad_norm": 1.5370088815689087, + "learning_rate": 3.929001203369434e-07, + "loss": 0.1958, + "step": 20480 + }, + { + "epoch": 18.32737030411449, + "grad_norm": 1.4012691974639893, + "learning_rate": 3.925992779783393e-07, + "loss": 0.1801, + "step": 20490 + }, + { + "epoch": 18.336314847942756, + "grad_norm": 1.4192802906036377, + "learning_rate": 3.9229843561973523e-07, + "loss": 0.1889, + "step": 20500 + }, + { + "epoch": 18.34525939177102, + "grad_norm": 1.4101192951202393, + "learning_rate": 3.919975932611311e-07, + "loss": 0.189, + "step": 20510 + }, + { + "epoch": 18.354203935599283, + "grad_norm": 1.4301940202713013, + "learning_rate": 3.9169675090252706e-07, + "loss": 0.2039, + "step": 20520 + }, + { + "epoch": 18.36314847942755, + "grad_norm": 1.3676986694335938, + "learning_rate": 3.9139590854392295e-07, + "loss": 0.1956, + "step": 20530 + }, + { + "epoch": 18.372093023255815, + "grad_norm": 1.1791801452636719, + "learning_rate": 3.910950661853189e-07, + "loss": 0.1897, + "step": 20540 + }, + { + "epoch": 18.38103756708408, + "grad_norm": 1.5008537769317627, + "learning_rate": 3.907942238267148e-07, + "loss": 0.198, + "step": 20550 + }, + { + "epoch": 18.389982110912342, + "grad_norm": 1.3062366247177124, + "learning_rate": 3.904933814681107e-07, + "loss": 0.1938, + "step": 20560 + }, + { + "epoch": 18.39892665474061, + "grad_norm": 1.6328556537628174, + "learning_rate": 3.901925391095066e-07, + "loss": 0.1902, + "step": 20570 + }, + { + "epoch": 18.407871198568873, + "grad_norm": 1.6620492935180664, + "learning_rate": 3.898916967509025e-07, + "loss": 0.1849, + "step": 20580 + }, + { + "epoch": 18.416815742397137, + "grad_norm": 1.5257842540740967, + "learning_rate": 3.8959085439229845e-07, + "loss": 0.1999, + "step": 20590 + }, + { + "epoch": 18.4257602862254, + "grad_norm": 1.7352404594421387, + "learning_rate": 3.8929001203369434e-07, + "loss": 0.203, + "step": 20600 + }, + { + "epoch": 18.43470483005367, + "grad_norm": 1.398389220237732, + "learning_rate": 3.8898916967509023e-07, + "loss": 0.1869, + "step": 20610 + }, + { + "epoch": 18.443649373881932, + "grad_norm": 1.4931424856185913, + "learning_rate": 3.8868832731648617e-07, + "loss": 0.1927, + "step": 20620 + }, + { + "epoch": 18.452593917710196, + "grad_norm": 1.3729106187820435, + "learning_rate": 3.8838748495788206e-07, + "loss": 0.191, + "step": 20630 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 1.6463043689727783, + "learning_rate": 3.88086642599278e-07, + "loss": 0.1883, + "step": 20640 + }, + { + "epoch": 18.470483005366727, + "grad_norm": 1.7752493619918823, + "learning_rate": 3.8778580024067384e-07, + "loss": 0.1994, + "step": 20650 + }, + { + "epoch": 18.47942754919499, + "grad_norm": 1.4545904397964478, + "learning_rate": 3.8748495788206973e-07, + "loss": 0.1951, + "step": 20660 + }, + { + "epoch": 18.488372093023255, + "grad_norm": 1.4974523782730103, + "learning_rate": 3.871841155234657e-07, + "loss": 0.1925, + "step": 20670 + }, + { + "epoch": 18.497316636851522, + "grad_norm": 1.3493813276290894, + "learning_rate": 3.8688327316486156e-07, + "loss": 0.1861, + "step": 20680 + }, + { + "epoch": 18.506261180679786, + "grad_norm": 1.1932425498962402, + "learning_rate": 3.865824308062575e-07, + "loss": 0.1933, + "step": 20690 + }, + { + "epoch": 18.51520572450805, + "grad_norm": 1.5326428413391113, + "learning_rate": 3.862815884476534e-07, + "loss": 0.1859, + "step": 20700 + }, + { + "epoch": 18.524150268336314, + "grad_norm": 1.3875932693481445, + "learning_rate": 3.8598074608904934e-07, + "loss": 0.1851, + "step": 20710 + }, + { + "epoch": 18.53309481216458, + "grad_norm": 1.3919353485107422, + "learning_rate": 3.8567990373044523e-07, + "loss": 0.2017, + "step": 20720 + }, + { + "epoch": 18.542039355992845, + "grad_norm": 1.5686875581741333, + "learning_rate": 3.853790613718411e-07, + "loss": 0.1971, + "step": 20730 + }, + { + "epoch": 18.55098389982111, + "grad_norm": 1.4372426271438599, + "learning_rate": 3.8507821901323706e-07, + "loss": 0.1991, + "step": 20740 + }, + { + "epoch": 18.559928443649373, + "grad_norm": 1.2772883176803589, + "learning_rate": 3.8477737665463295e-07, + "loss": 0.2028, + "step": 20750 + }, + { + "epoch": 18.56887298747764, + "grad_norm": 1.4629807472229004, + "learning_rate": 3.844765342960289e-07, + "loss": 0.1931, + "step": 20760 + }, + { + "epoch": 18.577817531305904, + "grad_norm": 1.483835220336914, + "learning_rate": 3.841756919374248e-07, + "loss": 0.1839, + "step": 20770 + }, + { + "epoch": 18.586762075134168, + "grad_norm": 1.3380502462387085, + "learning_rate": 3.838748495788207e-07, + "loss": 0.1942, + "step": 20780 + }, + { + "epoch": 18.59570661896243, + "grad_norm": 1.3125088214874268, + "learning_rate": 3.835740072202166e-07, + "loss": 0.1998, + "step": 20790 + }, + { + "epoch": 18.6046511627907, + "grad_norm": 1.4294259548187256, + "learning_rate": 3.832731648616125e-07, + "loss": 0.1932, + "step": 20800 + }, + { + "epoch": 18.613595706618963, + "grad_norm": 1.4665868282318115, + "learning_rate": 3.8297232250300845e-07, + "loss": 0.1929, + "step": 20810 + }, + { + "epoch": 18.622540250447226, + "grad_norm": 1.2699508666992188, + "learning_rate": 3.826714801444043e-07, + "loss": 0.1859, + "step": 20820 + }, + { + "epoch": 18.631484794275494, + "grad_norm": 1.6539772748947144, + "learning_rate": 3.823706377858002e-07, + "loss": 0.1875, + "step": 20830 + }, + { + "epoch": 18.640429338103758, + "grad_norm": 1.6890268325805664, + "learning_rate": 3.820697954271961e-07, + "loss": 0.1987, + "step": 20840 + }, + { + "epoch": 18.64937388193202, + "grad_norm": 1.480602741241455, + "learning_rate": 3.81768953068592e-07, + "loss": 0.1923, + "step": 20850 + }, + { + "epoch": 18.658318425760285, + "grad_norm": 1.5095171928405762, + "learning_rate": 3.8146811070998795e-07, + "loss": 0.1915, + "step": 20860 + }, + { + "epoch": 18.667262969588553, + "grad_norm": 1.3432358503341675, + "learning_rate": 3.8116726835138384e-07, + "loss": 0.1903, + "step": 20870 + }, + { + "epoch": 18.676207513416816, + "grad_norm": 1.7593339681625366, + "learning_rate": 3.8086642599277973e-07, + "loss": 0.1968, + "step": 20880 + }, + { + "epoch": 18.68515205724508, + "grad_norm": 1.431167721748352, + "learning_rate": 3.805655836341757e-07, + "loss": 0.195, + "step": 20890 + }, + { + "epoch": 18.694096601073344, + "grad_norm": 1.451204538345337, + "learning_rate": 3.8026474127557157e-07, + "loss": 0.1934, + "step": 20900 + }, + { + "epoch": 18.70304114490161, + "grad_norm": 1.2916609048843384, + "learning_rate": 3.799638989169675e-07, + "loss": 0.1894, + "step": 20910 + }, + { + "epoch": 18.711985688729875, + "grad_norm": 1.6644831895828247, + "learning_rate": 3.796630565583634e-07, + "loss": 0.1839, + "step": 20920 + }, + { + "epoch": 18.72093023255814, + "grad_norm": 1.2776670455932617, + "learning_rate": 3.7936221419975934e-07, + "loss": 0.1877, + "step": 20930 + }, + { + "epoch": 18.729874776386403, + "grad_norm": 1.3584338426589966, + "learning_rate": 3.7906137184115523e-07, + "loss": 0.1946, + "step": 20940 + }, + { + "epoch": 18.73881932021467, + "grad_norm": 1.5859999656677246, + "learning_rate": 3.787605294825511e-07, + "loss": 0.1917, + "step": 20950 + }, + { + "epoch": 18.747763864042934, + "grad_norm": 1.5412558317184448, + "learning_rate": 3.7845968712394706e-07, + "loss": 0.1902, + "step": 20960 + }, + { + "epoch": 18.756708407871198, + "grad_norm": 1.4061285257339478, + "learning_rate": 3.7815884476534295e-07, + "loss": 0.1922, + "step": 20970 + }, + { + "epoch": 18.76565295169946, + "grad_norm": 1.3371307849884033, + "learning_rate": 3.778580024067389e-07, + "loss": 0.1851, + "step": 20980 + }, + { + "epoch": 18.77459749552773, + "grad_norm": 2.0113933086395264, + "learning_rate": 3.7755716004813473e-07, + "loss": 0.1969, + "step": 20990 + }, + { + "epoch": 18.783542039355993, + "grad_norm": 1.2509182691574097, + "learning_rate": 3.772563176895306e-07, + "loss": 0.1946, + "step": 21000 + }, + { + "epoch": 18.792486583184257, + "grad_norm": 1.297754168510437, + "learning_rate": 3.7695547533092657e-07, + "loss": 0.2006, + "step": 21010 + }, + { + "epoch": 18.801431127012524, + "grad_norm": 1.376678705215454, + "learning_rate": 3.7665463297232246e-07, + "loss": 0.1799, + "step": 21020 + }, + { + "epoch": 18.810375670840788, + "grad_norm": 1.495245099067688, + "learning_rate": 3.763537906137184e-07, + "loss": 0.1919, + "step": 21030 + }, + { + "epoch": 18.81932021466905, + "grad_norm": 1.2178292274475098, + "learning_rate": 3.760529482551143e-07, + "loss": 0.1972, + "step": 21040 + }, + { + "epoch": 18.828264758497316, + "grad_norm": 1.226501703262329, + "learning_rate": 3.757521058965102e-07, + "loss": 0.1968, + "step": 21050 + }, + { + "epoch": 18.837209302325583, + "grad_norm": 1.2760741710662842, + "learning_rate": 3.754512635379061e-07, + "loss": 0.1909, + "step": 21060 + }, + { + "epoch": 18.846153846153847, + "grad_norm": 1.7594233751296997, + "learning_rate": 3.75150421179302e-07, + "loss": 0.193, + "step": 21070 + }, + { + "epoch": 18.85509838998211, + "grad_norm": 1.4522331953048706, + "learning_rate": 3.7484957882069795e-07, + "loss": 0.1961, + "step": 21080 + }, + { + "epoch": 18.864042933810374, + "grad_norm": 1.6493743658065796, + "learning_rate": 3.7454873646209384e-07, + "loss": 0.1892, + "step": 21090 + }, + { + "epoch": 18.87298747763864, + "grad_norm": 1.2935068607330322, + "learning_rate": 3.742478941034898e-07, + "loss": 0.1832, + "step": 21100 + }, + { + "epoch": 18.881932021466906, + "grad_norm": 1.1401342153549194, + "learning_rate": 3.739470517448857e-07, + "loss": 0.1793, + "step": 21110 + }, + { + "epoch": 18.89087656529517, + "grad_norm": 1.3705753087997437, + "learning_rate": 3.7364620938628157e-07, + "loss": 0.1945, + "step": 21120 + }, + { + "epoch": 18.899821109123433, + "grad_norm": 1.6233444213867188, + "learning_rate": 3.733453670276775e-07, + "loss": 0.1961, + "step": 21130 + }, + { + "epoch": 18.9087656529517, + "grad_norm": 1.460810661315918, + "learning_rate": 3.730445246690734e-07, + "loss": 0.1934, + "step": 21140 + }, + { + "epoch": 18.917710196779964, + "grad_norm": 1.409800410270691, + "learning_rate": 3.7274368231046934e-07, + "loss": 0.1817, + "step": 21150 + }, + { + "epoch": 18.926654740608228, + "grad_norm": 1.3818022012710571, + "learning_rate": 3.7244283995186523e-07, + "loss": 0.1972, + "step": 21160 + }, + { + "epoch": 18.935599284436492, + "grad_norm": 1.642561912536621, + "learning_rate": 3.7214199759326107e-07, + "loss": 0.1905, + "step": 21170 + }, + { + "epoch": 18.94454382826476, + "grad_norm": 1.5107719898223877, + "learning_rate": 3.71841155234657e-07, + "loss": 0.1916, + "step": 21180 + }, + { + "epoch": 18.953488372093023, + "grad_norm": 1.3990639448165894, + "learning_rate": 3.715403128760529e-07, + "loss": 0.196, + "step": 21190 + }, + { + "epoch": 18.962432915921287, + "grad_norm": 1.2956585884094238, + "learning_rate": 3.7123947051744884e-07, + "loss": 0.1875, + "step": 21200 + }, + { + "epoch": 18.971377459749554, + "grad_norm": 1.4675569534301758, + "learning_rate": 3.7093862815884473e-07, + "loss": 0.183, + "step": 21210 + }, + { + "epoch": 18.980322003577818, + "grad_norm": 1.4663444757461548, + "learning_rate": 3.706377858002406e-07, + "loss": 0.1845, + "step": 21220 + }, + { + "epoch": 18.989266547406082, + "grad_norm": 1.0990031957626343, + "learning_rate": 3.7033694344163657e-07, + "loss": 0.1918, + "step": 21230 + }, + { + "epoch": 18.998211091234346, + "grad_norm": 1.2096024751663208, + "learning_rate": 3.7003610108303246e-07, + "loss": 0.1987, + "step": 21240 + }, + { + "epoch": 19.0, + "eval_bleu": 77.8231, + "eval_gen_len": 74.6833, + "eval_loss": 0.14946186542510986, + "eval_runtime": 56.7661, + "eval_samples_per_second": 18.356, + "eval_steps_per_second": 0.194, + "step": 21242 + }, + { + "epoch": 19.007155635062613, + "grad_norm": 1.557692050933838, + "learning_rate": 3.697352587244284e-07, + "loss": 0.1898, + "step": 21250 + }, + { + "epoch": 19.016100178890877, + "grad_norm": 1.4740185737609863, + "learning_rate": 3.694344163658243e-07, + "loss": 0.187, + "step": 21260 + }, + { + "epoch": 19.02504472271914, + "grad_norm": 1.4456875324249268, + "learning_rate": 3.6913357400722023e-07, + "loss": 0.1924, + "step": 21270 + }, + { + "epoch": 19.033989266547405, + "grad_norm": 1.546985149383545, + "learning_rate": 3.688327316486161e-07, + "loss": 0.1892, + "step": 21280 + }, + { + "epoch": 19.042933810375672, + "grad_norm": 1.4747453927993774, + "learning_rate": 3.68531889290012e-07, + "loss": 0.1934, + "step": 21290 + }, + { + "epoch": 19.051878354203936, + "grad_norm": 1.3266552686691284, + "learning_rate": 3.6823104693140796e-07, + "loss": 0.2022, + "step": 21300 + }, + { + "epoch": 19.0608228980322, + "grad_norm": 1.3492190837860107, + "learning_rate": 3.6793020457280385e-07, + "loss": 0.1921, + "step": 21310 + }, + { + "epoch": 19.069767441860463, + "grad_norm": 1.6898155212402344, + "learning_rate": 3.676293622141998e-07, + "loss": 0.1947, + "step": 21320 + }, + { + "epoch": 19.07871198568873, + "grad_norm": 1.310855507850647, + "learning_rate": 3.673285198555957e-07, + "loss": 0.1927, + "step": 21330 + }, + { + "epoch": 19.087656529516995, + "grad_norm": 1.9882363080978394, + "learning_rate": 3.670276774969915e-07, + "loss": 0.1936, + "step": 21340 + }, + { + "epoch": 19.09660107334526, + "grad_norm": 1.7846614122390747, + "learning_rate": 3.6672683513838746e-07, + "loss": 0.1872, + "step": 21350 + }, + { + "epoch": 19.105545617173526, + "grad_norm": 1.561082124710083, + "learning_rate": 3.6642599277978335e-07, + "loss": 0.1938, + "step": 21360 + }, + { + "epoch": 19.11449016100179, + "grad_norm": 1.4885318279266357, + "learning_rate": 3.661251504211793e-07, + "loss": 0.1832, + "step": 21370 + }, + { + "epoch": 19.123434704830053, + "grad_norm": 1.2708408832550049, + "learning_rate": 3.658243080625752e-07, + "loss": 0.1891, + "step": 21380 + }, + { + "epoch": 19.132379248658317, + "grad_norm": 1.5973031520843506, + "learning_rate": 3.6552346570397107e-07, + "loss": 0.1862, + "step": 21390 + }, + { + "epoch": 19.141323792486585, + "grad_norm": 1.3048038482666016, + "learning_rate": 3.65222623345367e-07, + "loss": 0.1994, + "step": 21400 + }, + { + "epoch": 19.15026833631485, + "grad_norm": 1.4048810005187988, + "learning_rate": 3.649217809867629e-07, + "loss": 0.194, + "step": 21410 + }, + { + "epoch": 19.159212880143112, + "grad_norm": 1.5730185508728027, + "learning_rate": 3.6462093862815885e-07, + "loss": 0.189, + "step": 21420 + }, + { + "epoch": 19.168157423971376, + "grad_norm": 1.3440287113189697, + "learning_rate": 3.6432009626955474e-07, + "loss": 0.1819, + "step": 21430 + }, + { + "epoch": 19.177101967799643, + "grad_norm": 1.4186955690383911, + "learning_rate": 3.640192539109506e-07, + "loss": 0.1939, + "step": 21440 + }, + { + "epoch": 19.186046511627907, + "grad_norm": 1.454852819442749, + "learning_rate": 3.6371841155234657e-07, + "loss": 0.199, + "step": 21450 + }, + { + "epoch": 19.19499105545617, + "grad_norm": 1.2862147092819214, + "learning_rate": 3.6341756919374246e-07, + "loss": 0.1878, + "step": 21460 + }, + { + "epoch": 19.203935599284435, + "grad_norm": 1.3074766397476196, + "learning_rate": 3.631167268351384e-07, + "loss": 0.1871, + "step": 21470 + }, + { + "epoch": 19.212880143112702, + "grad_norm": 1.3759242296218872, + "learning_rate": 3.628158844765343e-07, + "loss": 0.1939, + "step": 21480 + }, + { + "epoch": 19.221824686940966, + "grad_norm": 1.3743611574172974, + "learning_rate": 3.6251504211793023e-07, + "loss": 0.1854, + "step": 21490 + }, + { + "epoch": 19.23076923076923, + "grad_norm": 1.6088826656341553, + "learning_rate": 3.622141997593261e-07, + "loss": 0.1998, + "step": 21500 + }, + { + "epoch": 19.239713774597494, + "grad_norm": 1.5923534631729126, + "learning_rate": 3.6191335740072196e-07, + "loss": 0.195, + "step": 21510 + }, + { + "epoch": 19.24865831842576, + "grad_norm": 1.2471575736999512, + "learning_rate": 3.616125150421179e-07, + "loss": 0.185, + "step": 21520 + }, + { + "epoch": 19.257602862254025, + "grad_norm": 1.300024390220642, + "learning_rate": 3.613116726835138e-07, + "loss": 0.1857, + "step": 21530 + }, + { + "epoch": 19.26654740608229, + "grad_norm": 1.4788153171539307, + "learning_rate": 3.6101083032490974e-07, + "loss": 0.1841, + "step": 21540 + }, + { + "epoch": 19.275491949910556, + "grad_norm": 1.4675010442733765, + "learning_rate": 3.607099879663056e-07, + "loss": 0.1875, + "step": 21550 + }, + { + "epoch": 19.28443649373882, + "grad_norm": 1.3291878700256348, + "learning_rate": 3.604091456077015e-07, + "loss": 0.185, + "step": 21560 + }, + { + "epoch": 19.293381037567084, + "grad_norm": 1.4330105781555176, + "learning_rate": 3.6010830324909746e-07, + "loss": 0.1791, + "step": 21570 + }, + { + "epoch": 19.302325581395348, + "grad_norm": 1.32407808303833, + "learning_rate": 3.5980746089049335e-07, + "loss": 0.1924, + "step": 21580 + }, + { + "epoch": 19.311270125223615, + "grad_norm": 1.3751063346862793, + "learning_rate": 3.595066185318893e-07, + "loss": 0.191, + "step": 21590 + }, + { + "epoch": 19.32021466905188, + "grad_norm": 1.3922836780548096, + "learning_rate": 3.592057761732852e-07, + "loss": 0.1774, + "step": 21600 + }, + { + "epoch": 19.329159212880143, + "grad_norm": 1.4643456935882568, + "learning_rate": 3.5890493381468107e-07, + "loss": 0.1879, + "step": 21610 + }, + { + "epoch": 19.338103756708406, + "grad_norm": 1.7122917175292969, + "learning_rate": 3.58604091456077e-07, + "loss": 0.1908, + "step": 21620 + }, + { + "epoch": 19.347048300536674, + "grad_norm": 1.4380452632904053, + "learning_rate": 3.583032490974729e-07, + "loss": 0.1887, + "step": 21630 + }, + { + "epoch": 19.355992844364938, + "grad_norm": 1.6291236877441406, + "learning_rate": 3.5800240673886885e-07, + "loss": 0.1882, + "step": 21640 + }, + { + "epoch": 19.3649373881932, + "grad_norm": 1.417327642440796, + "learning_rate": 3.5770156438026474e-07, + "loss": 0.1911, + "step": 21650 + }, + { + "epoch": 19.373881932021465, + "grad_norm": 1.411293864250183, + "learning_rate": 3.574007220216607e-07, + "loss": 0.1828, + "step": 21660 + }, + { + "epoch": 19.382826475849733, + "grad_norm": 1.677729606628418, + "learning_rate": 3.5709987966305657e-07, + "loss": 0.1905, + "step": 21670 + }, + { + "epoch": 19.391771019677996, + "grad_norm": 1.2971285581588745, + "learning_rate": 3.567990373044524e-07, + "loss": 0.1822, + "step": 21680 + }, + { + "epoch": 19.40071556350626, + "grad_norm": 1.531691312789917, + "learning_rate": 3.5649819494584835e-07, + "loss": 0.1938, + "step": 21690 + }, + { + "epoch": 19.409660107334528, + "grad_norm": 1.55259108543396, + "learning_rate": 3.5619735258724424e-07, + "loss": 0.1893, + "step": 21700 + }, + { + "epoch": 19.41860465116279, + "grad_norm": 1.436160683631897, + "learning_rate": 3.558965102286402e-07, + "loss": 0.193, + "step": 21710 + }, + { + "epoch": 19.427549194991055, + "grad_norm": 1.3020164966583252, + "learning_rate": 3.5559566787003607e-07, + "loss": 0.1882, + "step": 21720 + }, + { + "epoch": 19.43649373881932, + "grad_norm": 1.7475745677947998, + "learning_rate": 3.5529482551143196e-07, + "loss": 0.1977, + "step": 21730 + }, + { + "epoch": 19.445438282647586, + "grad_norm": 1.4146769046783447, + "learning_rate": 3.549939831528279e-07, + "loss": 0.1994, + "step": 21740 + }, + { + "epoch": 19.45438282647585, + "grad_norm": 1.409281849861145, + "learning_rate": 3.546931407942238e-07, + "loss": 0.1884, + "step": 21750 + }, + { + "epoch": 19.463327370304114, + "grad_norm": 1.7853525876998901, + "learning_rate": 3.5439229843561974e-07, + "loss": 0.1893, + "step": 21760 + }, + { + "epoch": 19.472271914132378, + "grad_norm": 1.4283171892166138, + "learning_rate": 3.5409145607701563e-07, + "loss": 0.1919, + "step": 21770 + }, + { + "epoch": 19.481216457960645, + "grad_norm": 1.1920448541641235, + "learning_rate": 3.537906137184115e-07, + "loss": 0.1875, + "step": 21780 + }, + { + "epoch": 19.49016100178891, + "grad_norm": 1.3144375085830688, + "learning_rate": 3.5348977135980746e-07, + "loss": 0.1907, + "step": 21790 + }, + { + "epoch": 19.499105545617173, + "grad_norm": 1.510340929031372, + "learning_rate": 3.5318892900120335e-07, + "loss": 0.1969, + "step": 21800 + }, + { + "epoch": 19.508050089445437, + "grad_norm": 1.3244644403457642, + "learning_rate": 3.528880866425993e-07, + "loss": 0.1915, + "step": 21810 + }, + { + "epoch": 19.516994633273704, + "grad_norm": 1.3958264589309692, + "learning_rate": 3.525872442839952e-07, + "loss": 0.1891, + "step": 21820 + }, + { + "epoch": 19.525939177101968, + "grad_norm": 1.2823486328125, + "learning_rate": 3.5228640192539107e-07, + "loss": 0.1744, + "step": 21830 + }, + { + "epoch": 19.53488372093023, + "grad_norm": 1.3641853332519531, + "learning_rate": 3.51985559566787e-07, + "loss": 0.1906, + "step": 21840 + }, + { + "epoch": 19.543828264758496, + "grad_norm": 1.4786450862884521, + "learning_rate": 3.516847172081829e-07, + "loss": 0.1857, + "step": 21850 + }, + { + "epoch": 19.552772808586763, + "grad_norm": 1.5867925882339478, + "learning_rate": 3.513838748495788e-07, + "loss": 0.1959, + "step": 21860 + }, + { + "epoch": 19.561717352415027, + "grad_norm": 1.313033938407898, + "learning_rate": 3.510830324909747e-07, + "loss": 0.1939, + "step": 21870 + }, + { + "epoch": 19.57066189624329, + "grad_norm": 1.3654066324234009, + "learning_rate": 3.5078219013237063e-07, + "loss": 0.1905, + "step": 21880 + }, + { + "epoch": 19.579606440071558, + "grad_norm": 1.2623218297958374, + "learning_rate": 3.504813477737665e-07, + "loss": 0.1893, + "step": 21890 + }, + { + "epoch": 19.58855098389982, + "grad_norm": 1.386467456817627, + "learning_rate": 3.501805054151624e-07, + "loss": 0.1815, + "step": 21900 + }, + { + "epoch": 19.597495527728086, + "grad_norm": 1.5557708740234375, + "learning_rate": 3.4987966305655835e-07, + "loss": 0.1917, + "step": 21910 + }, + { + "epoch": 19.60644007155635, + "grad_norm": 1.7414817810058594, + "learning_rate": 3.4957882069795424e-07, + "loss": 0.1895, + "step": 21920 + }, + { + "epoch": 19.615384615384617, + "grad_norm": 1.3845630884170532, + "learning_rate": 3.492779783393502e-07, + "loss": 0.1854, + "step": 21930 + }, + { + "epoch": 19.62432915921288, + "grad_norm": 1.4406414031982422, + "learning_rate": 3.4897713598074607e-07, + "loss": 0.194, + "step": 21940 + }, + { + "epoch": 19.633273703041144, + "grad_norm": 1.2725870609283447, + "learning_rate": 3.4867629362214196e-07, + "loss": 0.1874, + "step": 21950 + }, + { + "epoch": 19.642218246869408, + "grad_norm": 1.4587557315826416, + "learning_rate": 3.483754512635379e-07, + "loss": 0.181, + "step": 21960 + }, + { + "epoch": 19.651162790697676, + "grad_norm": 1.490944743156433, + "learning_rate": 3.480746089049338e-07, + "loss": 0.1975, + "step": 21970 + }, + { + "epoch": 19.66010733452594, + "grad_norm": 1.3130711317062378, + "learning_rate": 3.4777376654632974e-07, + "loss": 0.1851, + "step": 21980 + }, + { + "epoch": 19.669051878354203, + "grad_norm": 1.3030730485916138, + "learning_rate": 3.4747292418772563e-07, + "loss": 0.189, + "step": 21990 + }, + { + "epoch": 19.677996422182467, + "grad_norm": 1.2224133014678955, + "learning_rate": 3.471720818291215e-07, + "loss": 0.1839, + "step": 22000 + }, + { + "epoch": 19.686940966010734, + "grad_norm": 1.2977553606033325, + "learning_rate": 3.4687123947051746e-07, + "loss": 0.1919, + "step": 22010 + }, + { + "epoch": 19.695885509838998, + "grad_norm": 1.4645330905914307, + "learning_rate": 3.4657039711191335e-07, + "loss": 0.1899, + "step": 22020 + }, + { + "epoch": 19.704830053667262, + "grad_norm": 1.3017830848693848, + "learning_rate": 3.4626955475330924e-07, + "loss": 0.1785, + "step": 22030 + }, + { + "epoch": 19.71377459749553, + "grad_norm": 1.2776682376861572, + "learning_rate": 3.4596871239470513e-07, + "loss": 0.1886, + "step": 22040 + }, + { + "epoch": 19.722719141323793, + "grad_norm": 1.4559818506240845, + "learning_rate": 3.456678700361011e-07, + "loss": 0.1801, + "step": 22050 + }, + { + "epoch": 19.731663685152057, + "grad_norm": 1.3212449550628662, + "learning_rate": 3.4536702767749696e-07, + "loss": 0.188, + "step": 22060 + }, + { + "epoch": 19.74060822898032, + "grad_norm": 1.4003950357437134, + "learning_rate": 3.4506618531889285e-07, + "loss": 0.1869, + "step": 22070 + }, + { + "epoch": 19.74955277280859, + "grad_norm": 1.3492873907089233, + "learning_rate": 3.447653429602888e-07, + "loss": 0.1887, + "step": 22080 + }, + { + "epoch": 19.758497316636852, + "grad_norm": 1.3928955793380737, + "learning_rate": 3.444645006016847e-07, + "loss": 0.1918, + "step": 22090 + }, + { + "epoch": 19.767441860465116, + "grad_norm": 1.2398922443389893, + "learning_rate": 3.4416365824308063e-07, + "loss": 0.1765, + "step": 22100 + }, + { + "epoch": 19.77638640429338, + "grad_norm": 1.2678223848342896, + "learning_rate": 3.438628158844765e-07, + "loss": 0.1883, + "step": 22110 + }, + { + "epoch": 19.785330948121647, + "grad_norm": 1.7187891006469727, + "learning_rate": 3.435619735258724e-07, + "loss": 0.19, + "step": 22120 + }, + { + "epoch": 19.79427549194991, + "grad_norm": 1.3717964887619019, + "learning_rate": 3.4326113116726835e-07, + "loss": 0.1804, + "step": 22130 + }, + { + "epoch": 19.803220035778175, + "grad_norm": 1.3578808307647705, + "learning_rate": 3.4296028880866424e-07, + "loss": 0.1908, + "step": 22140 + }, + { + "epoch": 19.81216457960644, + "grad_norm": 1.4975496530532837, + "learning_rate": 3.426594464500602e-07, + "loss": 0.194, + "step": 22150 + }, + { + "epoch": 19.821109123434706, + "grad_norm": 1.504811406135559, + "learning_rate": 3.423586040914561e-07, + "loss": 0.1988, + "step": 22160 + }, + { + "epoch": 19.83005366726297, + "grad_norm": 1.393159031867981, + "learning_rate": 3.4205776173285196e-07, + "loss": 0.1858, + "step": 22170 + }, + { + "epoch": 19.838998211091234, + "grad_norm": 1.8559398651123047, + "learning_rate": 3.417569193742479e-07, + "loss": 0.1897, + "step": 22180 + }, + { + "epoch": 19.8479427549195, + "grad_norm": 1.650848627090454, + "learning_rate": 3.414560770156438e-07, + "loss": 0.1931, + "step": 22190 + }, + { + "epoch": 19.856887298747765, + "grad_norm": 1.3729771375656128, + "learning_rate": 3.4115523465703974e-07, + "loss": 0.1888, + "step": 22200 + }, + { + "epoch": 19.86583184257603, + "grad_norm": 1.2925593852996826, + "learning_rate": 3.408543922984356e-07, + "loss": 0.1862, + "step": 22210 + }, + { + "epoch": 19.874776386404292, + "grad_norm": 1.4130479097366333, + "learning_rate": 3.4055354993983147e-07, + "loss": 0.1857, + "step": 22220 + }, + { + "epoch": 19.88372093023256, + "grad_norm": 1.4012774229049683, + "learning_rate": 3.402527075812274e-07, + "loss": 0.1892, + "step": 22230 + }, + { + "epoch": 19.892665474060824, + "grad_norm": 1.2611286640167236, + "learning_rate": 3.399518652226233e-07, + "loss": 0.1854, + "step": 22240 + }, + { + "epoch": 19.901610017889087, + "grad_norm": 1.2847506999969482, + "learning_rate": 3.3965102286401924e-07, + "loss": 0.1821, + "step": 22250 + }, + { + "epoch": 19.91055456171735, + "grad_norm": 1.5255744457244873, + "learning_rate": 3.3935018050541513e-07, + "loss": 0.1842, + "step": 22260 + }, + { + "epoch": 19.91949910554562, + "grad_norm": 1.539740800857544, + "learning_rate": 3.390493381468111e-07, + "loss": 0.1875, + "step": 22270 + }, + { + "epoch": 19.928443649373882, + "grad_norm": 1.4575883150100708, + "learning_rate": 3.3874849578820696e-07, + "loss": 0.187, + "step": 22280 + }, + { + "epoch": 19.937388193202146, + "grad_norm": 1.5852409601211548, + "learning_rate": 3.3844765342960285e-07, + "loss": 0.1976, + "step": 22290 + }, + { + "epoch": 19.94633273703041, + "grad_norm": 1.4154672622680664, + "learning_rate": 3.381468110709988e-07, + "loss": 0.1851, + "step": 22300 + }, + { + "epoch": 19.955277280858677, + "grad_norm": 1.5170589685440063, + "learning_rate": 3.378459687123947e-07, + "loss": 0.1902, + "step": 22310 + }, + { + "epoch": 19.96422182468694, + "grad_norm": 1.44078528881073, + "learning_rate": 3.3754512635379063e-07, + "loss": 0.1863, + "step": 22320 + }, + { + "epoch": 19.973166368515205, + "grad_norm": 1.4237490892410278, + "learning_rate": 3.372442839951865e-07, + "loss": 0.1852, + "step": 22330 + }, + { + "epoch": 19.98211091234347, + "grad_norm": 1.545115351676941, + "learning_rate": 3.369434416365824e-07, + "loss": 0.1785, + "step": 22340 + }, + { + "epoch": 19.991055456171736, + "grad_norm": 1.4572222232818604, + "learning_rate": 3.3664259927797835e-07, + "loss": 0.1818, + "step": 22350 + }, + { + "epoch": 20.0, + "grad_norm": 2.268296241760254, + "learning_rate": 3.3634175691937424e-07, + "loss": 0.1855, + "step": 22360 + }, + { + "epoch": 20.0, + "eval_bleu": 78.0784, + "eval_gen_len": 74.6804, + "eval_loss": 0.14719858765602112, + "eval_runtime": 59.9182, + "eval_samples_per_second": 17.39, + "eval_steps_per_second": 0.184, + "step": 22360 + }, + { + "epoch": 20.0, + "step": 22360, + "total_flos": 1.544729654989947e+17, + "train_loss": 0.0, + "train_runtime": 0.2648, + "train_samples_per_second": 12659125.519, + "train_steps_per_second": 63331.882 + } + ], + "logging_steps": 10, + "max_steps": 16770, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 2, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.544729654989947e+17, + "train_batch_size": 100, + "trial_name": null, + "trial_params": null +}