{ "best_metric": 1.578300952911377, "best_model_checkpoint": "en-to-lg-ufal/checkpoint-19400", "epoch": 4.999355753124597, "eval_steps": 500, "global_step": 19400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012884937508053087, "grad_norm": 0.9295567870140076, "learning_rate": 1.9949484536082476e-05, "loss": 3.2843, "step": 50 }, { "epoch": 0.025769875016106173, "grad_norm": 1.0073925256729126, "learning_rate": 1.9897938144329896e-05, "loss": 3.1584, "step": 100 }, { "epoch": 0.038654812524159254, "grad_norm": 1.3255535364151, "learning_rate": 1.984639175257732e-05, "loss": 3.0963, "step": 150 }, { "epoch": 0.051539750032212346, "grad_norm": 1.086804747581482, "learning_rate": 1.9794845360824745e-05, "loss": 2.9216, "step": 200 }, { "epoch": 0.06442468754026542, "grad_norm": 0.6704310178756714, "learning_rate": 1.9743298969072166e-05, "loss": 2.8337, "step": 250 }, { "epoch": 0.07730962504831851, "grad_norm": 0.837518572807312, "learning_rate": 1.969278350515464e-05, "loss": 2.7777, "step": 300 }, { "epoch": 0.09019456255637161, "grad_norm": 0.7694929242134094, "learning_rate": 1.9641237113402064e-05, "loss": 2.7939, "step": 350 }, { "epoch": 0.10307950006442469, "grad_norm": 1.0657893419265747, "learning_rate": 1.9589690721649485e-05, "loss": 2.7371, "step": 400 }, { "epoch": 0.11596443757247778, "grad_norm": 0.7758269906044006, "learning_rate": 1.953814432989691e-05, "loss": 2.6623, "step": 450 }, { "epoch": 0.12884937508053085, "grad_norm": 0.746475100517273, "learning_rate": 1.948659793814433e-05, "loss": 2.6365, "step": 500 }, { "epoch": 0.14173431258858393, "grad_norm": 0.8395822048187256, "learning_rate": 1.9435051546391754e-05, "loss": 2.6025, "step": 550 }, { "epoch": 0.15461925009663702, "grad_norm": 1.061213493347168, "learning_rate": 1.938350515463918e-05, "loss": 2.5525, "step": 600 }, { "epoch": 0.16750418760469013, "grad_norm": 0.8460017442703247, "learning_rate": 1.93319587628866e-05, "loss": 2.554, "step": 650 }, { "epoch": 0.18038912511274321, "grad_norm": 0.8066027164459229, "learning_rate": 1.9280412371134024e-05, "loss": 2.4868, "step": 700 }, { "epoch": 0.1932740626207963, "grad_norm": 0.8909623622894287, "learning_rate": 1.9228865979381445e-05, "loss": 2.5062, "step": 750 }, { "epoch": 0.20615900012884938, "grad_norm": 0.9352700114250183, "learning_rate": 1.9177319587628865e-05, "loss": 2.4822, "step": 800 }, { "epoch": 0.21904393763690247, "grad_norm": 1.2132598161697388, "learning_rate": 1.912577319587629e-05, "loss": 2.408, "step": 850 }, { "epoch": 0.23192887514495555, "grad_norm": 1.0032589435577393, "learning_rate": 1.907422680412371e-05, "loss": 2.436, "step": 900 }, { "epoch": 0.24481381265300864, "grad_norm": 1.0249050855636597, "learning_rate": 1.9022680412371135e-05, "loss": 2.4474, "step": 950 }, { "epoch": 0.2576987501610617, "grad_norm": 0.8265942335128784, "learning_rate": 1.897113402061856e-05, "loss": 2.4005, "step": 1000 }, { "epoch": 0.2705836876691148, "grad_norm": 0.9384586215019226, "learning_rate": 1.891958762886598e-05, "loss": 2.2971, "step": 1050 }, { "epoch": 0.28346862517716787, "grad_norm": 0.9439546465873718, "learning_rate": 1.8868041237113404e-05, "loss": 2.3563, "step": 1100 }, { "epoch": 0.29635356268522095, "grad_norm": 0.9652894139289856, "learning_rate": 1.8816494845360825e-05, "loss": 2.2672, "step": 1150 }, { "epoch": 0.30923850019327404, "grad_norm": 0.8074690103530884, "learning_rate": 1.876494845360825e-05, "loss": 2.319, "step": 1200 }, { "epoch": 0.3221234377013272, "grad_norm": 0.8441233038902283, "learning_rate": 1.8713402061855674e-05, "loss": 2.3201, "step": 1250 }, { "epoch": 0.33500837520938026, "grad_norm": 1.0416673421859741, "learning_rate": 1.8661855670103094e-05, "loss": 2.2416, "step": 1300 }, { "epoch": 0.34789331271743335, "grad_norm": 1.100706696510315, "learning_rate": 1.861030927835052e-05, "loss": 2.2427, "step": 1350 }, { "epoch": 0.36077825022548643, "grad_norm": 0.9824424386024475, "learning_rate": 1.855876288659794e-05, "loss": 2.196, "step": 1400 }, { "epoch": 0.3736631877335395, "grad_norm": 0.9210222363471985, "learning_rate": 1.850721649484536e-05, "loss": 2.2371, "step": 1450 }, { "epoch": 0.3865481252415926, "grad_norm": 1.0536917448043823, "learning_rate": 1.8455670103092785e-05, "loss": 2.2008, "step": 1500 }, { "epoch": 0.3994330627496457, "grad_norm": 1.0008552074432373, "learning_rate": 1.8404123711340206e-05, "loss": 2.1586, "step": 1550 }, { "epoch": 0.41231800025769877, "grad_norm": 0.8722209334373474, "learning_rate": 1.835257731958763e-05, "loss": 2.1753, "step": 1600 }, { "epoch": 0.42520293776575185, "grad_norm": 0.7571916580200195, "learning_rate": 1.8301030927835054e-05, "loss": 2.1598, "step": 1650 }, { "epoch": 0.43808787527380494, "grad_norm": 1.054757833480835, "learning_rate": 1.8249484536082475e-05, "loss": 2.18, "step": 1700 }, { "epoch": 0.450972812781858, "grad_norm": 0.8249649405479431, "learning_rate": 1.81979381443299e-05, "loss": 2.1078, "step": 1750 }, { "epoch": 0.4638577502899111, "grad_norm": 1.7045085430145264, "learning_rate": 1.814639175257732e-05, "loss": 2.211, "step": 1800 }, { "epoch": 0.4767426877979642, "grad_norm": 1.0341459512710571, "learning_rate": 1.8094845360824744e-05, "loss": 2.1393, "step": 1850 }, { "epoch": 0.4896276253060173, "grad_norm": 0.9365245699882507, "learning_rate": 1.804329896907217e-05, "loss": 2.1602, "step": 1900 }, { "epoch": 0.5025125628140703, "grad_norm": 1.2039780616760254, "learning_rate": 1.799175257731959e-05, "loss": 2.1431, "step": 1950 }, { "epoch": 0.5153975003221234, "grad_norm": 0.7472810745239258, "learning_rate": 1.7940206185567014e-05, "loss": 2.1242, "step": 2000 }, { "epoch": 0.5282824378301765, "grad_norm": 0.8963159918785095, "learning_rate": 1.7888659793814435e-05, "loss": 2.1757, "step": 2050 }, { "epoch": 0.5411673753382296, "grad_norm": 1.0002330541610718, "learning_rate": 1.7837113402061855e-05, "loss": 2.1329, "step": 2100 }, { "epoch": 0.5540523128462826, "grad_norm": 0.944322943687439, "learning_rate": 1.778556701030928e-05, "loss": 2.1254, "step": 2150 }, { "epoch": 0.5669372503543357, "grad_norm": 1.0756226778030396, "learning_rate": 1.77340206185567e-05, "loss": 2.1479, "step": 2200 }, { "epoch": 0.5798221878623888, "grad_norm": 0.9357224106788635, "learning_rate": 1.7682474226804125e-05, "loss": 2.142, "step": 2250 }, { "epoch": 0.5927071253704419, "grad_norm": 0.9683809876441956, "learning_rate": 1.763092783505155e-05, "loss": 2.1744, "step": 2300 }, { "epoch": 0.605592062878495, "grad_norm": 0.9993259310722351, "learning_rate": 1.757938144329897e-05, "loss": 2.0672, "step": 2350 }, { "epoch": 0.6184770003865481, "grad_norm": 1.168818473815918, "learning_rate": 1.7527835051546394e-05, "loss": 2.1412, "step": 2400 }, { "epoch": 0.6313619378946013, "grad_norm": 1.0189549922943115, "learning_rate": 1.7476288659793815e-05, "loss": 2.079, "step": 2450 }, { "epoch": 0.6442468754026544, "grad_norm": 0.935614824295044, "learning_rate": 1.742474226804124e-05, "loss": 2.0944, "step": 2500 }, { "epoch": 0.6571318129107074, "grad_norm": 0.9308194518089294, "learning_rate": 1.7373195876288664e-05, "loss": 2.0767, "step": 2550 }, { "epoch": 0.6700167504187605, "grad_norm": 0.9042763113975525, "learning_rate": 1.7321649484536084e-05, "loss": 2.0909, "step": 2600 }, { "epoch": 0.6829016879268136, "grad_norm": 0.9609789252281189, "learning_rate": 1.7270103092783505e-05, "loss": 2.1105, "step": 2650 }, { "epoch": 0.6957866254348667, "grad_norm": 1.844524621963501, "learning_rate": 1.721855670103093e-05, "loss": 2.1664, "step": 2700 }, { "epoch": 0.7086715629429198, "grad_norm": 1.3245840072631836, "learning_rate": 1.716701030927835e-05, "loss": 2.0887, "step": 2750 }, { "epoch": 0.7215565004509729, "grad_norm": 0.9674375057220459, "learning_rate": 1.7115463917525775e-05, "loss": 2.0579, "step": 2800 }, { "epoch": 0.734441437959026, "grad_norm": 1.1117796897888184, "learning_rate": 1.7063917525773196e-05, "loss": 2.0793, "step": 2850 }, { "epoch": 0.747326375467079, "grad_norm": 1.099692702293396, "learning_rate": 1.701237113402062e-05, "loss": 2.0568, "step": 2900 }, { "epoch": 0.7602113129751321, "grad_norm": 1.1181882619857788, "learning_rate": 1.6961855670103094e-05, "loss": 2.1385, "step": 2950 }, { "epoch": 0.7730962504831852, "grad_norm": 1.0414690971374512, "learning_rate": 1.6911340206185568e-05, "loss": 2.0686, "step": 3000 }, { "epoch": 0.7859811879912383, "grad_norm": 0.9875026345252991, "learning_rate": 1.6859793814432992e-05, "loss": 2.0503, "step": 3050 }, { "epoch": 0.7988661254992914, "grad_norm": 1.0894653797149658, "learning_rate": 1.6808247422680413e-05, "loss": 2.0283, "step": 3100 }, { "epoch": 0.8117510630073445, "grad_norm": 0.9688855409622192, "learning_rate": 1.6756701030927837e-05, "loss": 2.0442, "step": 3150 }, { "epoch": 0.8246360005153975, "grad_norm": 0.8581517338752747, "learning_rate": 1.6705154639175258e-05, "loss": 2.0938, "step": 3200 }, { "epoch": 0.8375209380234506, "grad_norm": 1.0975722074508667, "learning_rate": 1.6653608247422682e-05, "loss": 2.0493, "step": 3250 }, { "epoch": 0.8504058755315037, "grad_norm": 0.9611416459083557, "learning_rate": 1.6602061855670103e-05, "loss": 2.0434, "step": 3300 }, { "epoch": 0.8632908130395568, "grad_norm": 0.9956973195075989, "learning_rate": 1.6550515463917527e-05, "loss": 2.0556, "step": 3350 }, { "epoch": 0.8761757505476099, "grad_norm": 1.0307831764221191, "learning_rate": 1.6498969072164948e-05, "loss": 2.0353, "step": 3400 }, { "epoch": 0.889060688055663, "grad_norm": 0.9811009168624878, "learning_rate": 1.6447422680412372e-05, "loss": 2.0574, "step": 3450 }, { "epoch": 0.901945625563716, "grad_norm": 1.0394349098205566, "learning_rate": 1.6395876288659797e-05, "loss": 2.0544, "step": 3500 }, { "epoch": 0.9148305630717691, "grad_norm": 0.9915798902511597, "learning_rate": 1.6344329896907218e-05, "loss": 2.0685, "step": 3550 }, { "epoch": 0.9277155005798222, "grad_norm": 0.8833404183387756, "learning_rate": 1.6292783505154642e-05, "loss": 2.0734, "step": 3600 }, { "epoch": 0.9406004380878753, "grad_norm": 0.9089716672897339, "learning_rate": 1.6241237113402063e-05, "loss": 2.0531, "step": 3650 }, { "epoch": 0.9534853755959284, "grad_norm": 0.9168672561645508, "learning_rate": 1.6189690721649487e-05, "loss": 2.008, "step": 3700 }, { "epoch": 0.9663703131039815, "grad_norm": 0.9824495911598206, "learning_rate": 1.6138144329896908e-05, "loss": 2.0488, "step": 3750 }, { "epoch": 0.9792552506120346, "grad_norm": 1.2295233011245728, "learning_rate": 1.6086597938144332e-05, "loss": 2.0184, "step": 3800 }, { "epoch": 0.9921401881200876, "grad_norm": 0.9734981656074524, "learning_rate": 1.6035051546391753e-05, "loss": 2.0659, "step": 3850 }, { "epoch": 0.9998711506249195, "eval_bleu": 18.1461, "eval_gen_len": 45.4751, "eval_loss": 1.7681266069412231, "eval_runtime": 2364.104, "eval_samples_per_second": 6.565, "eval_steps_per_second": 0.41, "step": 3880 }, { "epoch": 1.0050251256281406, "grad_norm": 1.2103891372680664, "learning_rate": 1.5983505154639177e-05, "loss": 2.0579, "step": 3900 }, { "epoch": 1.0179100631361937, "grad_norm": 0.983440637588501, "learning_rate": 1.5931958762886598e-05, "loss": 1.9984, "step": 3950 }, { "epoch": 1.0307950006442468, "grad_norm": 1.0848294496536255, "learning_rate": 1.5880412371134022e-05, "loss": 2.0428, "step": 4000 }, { "epoch": 1.0436799381522999, "grad_norm": 1.143467903137207, "learning_rate": 1.5828865979381443e-05, "loss": 2.0236, "step": 4050 }, { "epoch": 1.056564875660353, "grad_norm": 0.9652382731437683, "learning_rate": 1.5777319587628867e-05, "loss": 2.0219, "step": 4100 }, { "epoch": 1.069449813168406, "grad_norm": 1.0381783246994019, "learning_rate": 1.5725773195876292e-05, "loss": 2.0466, "step": 4150 }, { "epoch": 1.0823347506764591, "grad_norm": 1.0150736570358276, "learning_rate": 1.5674226804123713e-05, "loss": 1.9878, "step": 4200 }, { "epoch": 1.0952196881845122, "grad_norm": 1.0291893482208252, "learning_rate": 1.5622680412371137e-05, "loss": 2.0637, "step": 4250 }, { "epoch": 1.1081046256925653, "grad_norm": 1.5909788608551025, "learning_rate": 1.5571134020618558e-05, "loss": 1.9356, "step": 4300 }, { "epoch": 1.1209895632006184, "grad_norm": 1.1417341232299805, "learning_rate": 1.551958762886598e-05, "loss": 2.0089, "step": 4350 }, { "epoch": 1.1338745007086715, "grad_norm": 1.0295405387878418, "learning_rate": 1.5468041237113403e-05, "loss": 2.0486, "step": 4400 }, { "epoch": 1.1467594382167245, "grad_norm": 1.1314431428909302, "learning_rate": 1.5416494845360827e-05, "loss": 1.9785, "step": 4450 }, { "epoch": 1.1596443757247776, "grad_norm": 0.9702105522155762, "learning_rate": 1.5364948453608248e-05, "loss": 1.9474, "step": 4500 }, { "epoch": 1.1725293132328307, "grad_norm": 1.9038368463516235, "learning_rate": 1.5313402061855672e-05, "loss": 2.0264, "step": 4550 }, { "epoch": 1.1854142507408838, "grad_norm": 1.0110225677490234, "learning_rate": 1.5261855670103093e-05, "loss": 2.0689, "step": 4600 }, { "epoch": 1.1982991882489369, "grad_norm": 0.9490695595741272, "learning_rate": 1.5210309278350517e-05, "loss": 1.9998, "step": 4650 }, { "epoch": 1.21118412575699, "grad_norm": 1.0398602485656738, "learning_rate": 1.515876288659794e-05, "loss": 1.9622, "step": 4700 }, { "epoch": 1.224069063265043, "grad_norm": 1.088680624961853, "learning_rate": 1.5107216494845362e-05, "loss": 2.0096, "step": 4750 }, { "epoch": 1.2369540007730961, "grad_norm": 1.0555858612060547, "learning_rate": 1.5055670103092785e-05, "loss": 1.9494, "step": 4800 }, { "epoch": 1.2498389382811492, "grad_norm": 1.1039170026779175, "learning_rate": 1.5004123711340208e-05, "loss": 2.0132, "step": 4850 }, { "epoch": 1.2627238757892023, "grad_norm": 1.069201946258545, "learning_rate": 1.4952577319587632e-05, "loss": 2.043, "step": 4900 }, { "epoch": 1.2756088132972554, "grad_norm": 17.700525283813477, "learning_rate": 1.4901030927835051e-05, "loss": 1.9629, "step": 4950 }, { "epoch": 1.2884937508053085, "grad_norm": 0.9558641314506531, "learning_rate": 1.4849484536082475e-05, "loss": 1.9488, "step": 5000 }, { "epoch": 1.3013786883133616, "grad_norm": 1.0301564931869507, "learning_rate": 1.4797938144329898e-05, "loss": 2.0172, "step": 5050 }, { "epoch": 1.3142636258214146, "grad_norm": 1.017805814743042, "learning_rate": 1.474639175257732e-05, "loss": 2.04, "step": 5100 }, { "epoch": 1.3271485633294677, "grad_norm": 0.8880634903907776, "learning_rate": 1.4694845360824743e-05, "loss": 2.0109, "step": 5150 }, { "epoch": 1.3400335008375208, "grad_norm": 0.9009851813316345, "learning_rate": 1.4643298969072166e-05, "loss": 1.9195, "step": 5200 }, { "epoch": 1.352918438345574, "grad_norm": 0.9800803661346436, "learning_rate": 1.459175257731959e-05, "loss": 2.006, "step": 5250 }, { "epoch": 1.365803375853627, "grad_norm": 0.9194086194038391, "learning_rate": 1.4540206185567012e-05, "loss": 1.927, "step": 5300 }, { "epoch": 1.37868831336168, "grad_norm": 1.1191316843032837, "learning_rate": 1.4488659793814435e-05, "loss": 2.0157, "step": 5350 }, { "epoch": 1.3915732508697332, "grad_norm": 1.159419298171997, "learning_rate": 1.4437113402061857e-05, "loss": 1.9806, "step": 5400 }, { "epoch": 1.4044581883777862, "grad_norm": 1.0512197017669678, "learning_rate": 1.438556701030928e-05, "loss": 1.9496, "step": 5450 }, { "epoch": 1.4173431258858393, "grad_norm": 1.125361442565918, "learning_rate": 1.4334020618556703e-05, "loss": 1.9632, "step": 5500 }, { "epoch": 1.4302280633938924, "grad_norm": 1.0887775421142578, "learning_rate": 1.4282474226804123e-05, "loss": 1.928, "step": 5550 }, { "epoch": 1.4431130009019455, "grad_norm": 1.0122510194778442, "learning_rate": 1.4230927835051546e-05, "loss": 1.9677, "step": 5600 }, { "epoch": 1.4559979384099986, "grad_norm": 0.9938000440597534, "learning_rate": 1.417938144329897e-05, "loss": 1.9492, "step": 5650 }, { "epoch": 1.4688828759180517, "grad_norm": 1.1632938385009766, "learning_rate": 1.4127835051546393e-05, "loss": 1.9629, "step": 5700 }, { "epoch": 1.4817678134261048, "grad_norm": 0.9199478626251221, "learning_rate": 1.4076288659793815e-05, "loss": 1.999, "step": 5750 }, { "epoch": 1.4946527509341578, "grad_norm": 1.1851013898849487, "learning_rate": 1.4024742268041238e-05, "loss": 1.964, "step": 5800 }, { "epoch": 1.507537688442211, "grad_norm": 1.541742205619812, "learning_rate": 1.397319587628866e-05, "loss": 2.0018, "step": 5850 }, { "epoch": 1.520422625950264, "grad_norm": 1.3189605474472046, "learning_rate": 1.3921649484536083e-05, "loss": 1.9245, "step": 5900 }, { "epoch": 1.533307563458317, "grad_norm": 0.995586097240448, "learning_rate": 1.3870103092783507e-05, "loss": 2.0303, "step": 5950 }, { "epoch": 1.5461925009663702, "grad_norm": 1.0431631803512573, "learning_rate": 1.381855670103093e-05, "loss": 1.9044, "step": 6000 }, { "epoch": 1.5590774384744233, "grad_norm": 1.0870169401168823, "learning_rate": 1.3767010309278352e-05, "loss": 2.0023, "step": 6050 }, { "epoch": 1.5719623759824763, "grad_norm": 0.921909511089325, "learning_rate": 1.3715463917525775e-05, "loss": 1.9186, "step": 6100 }, { "epoch": 1.5848473134905294, "grad_norm": 1.1961994171142578, "learning_rate": 1.3663917525773196e-05, "loss": 1.9109, "step": 6150 }, { "epoch": 1.5977322509985825, "grad_norm": 1.0308939218521118, "learning_rate": 1.3612371134020618e-05, "loss": 1.9146, "step": 6200 }, { "epoch": 1.6106171885066356, "grad_norm": 2.139348030090332, "learning_rate": 1.3560824742268041e-05, "loss": 1.9541, "step": 6250 }, { "epoch": 1.6235021260146887, "grad_norm": 1.0335361957550049, "learning_rate": 1.3509278350515465e-05, "loss": 2.0021, "step": 6300 }, { "epoch": 1.6363870635227418, "grad_norm": 1.0377309322357178, "learning_rate": 1.3457731958762888e-05, "loss": 1.9546, "step": 6350 }, { "epoch": 1.6492720010307949, "grad_norm": 0.9605665802955627, "learning_rate": 1.340618556701031e-05, "loss": 1.9477, "step": 6400 }, { "epoch": 1.662156938538848, "grad_norm": 1.1977301836013794, "learning_rate": 1.3354639175257733e-05, "loss": 1.928, "step": 6450 }, { "epoch": 1.675041876046901, "grad_norm": 0.9851065874099731, "learning_rate": 1.3303092783505156e-05, "loss": 1.9224, "step": 6500 }, { "epoch": 1.687926813554954, "grad_norm": 0.8746098875999451, "learning_rate": 1.3251546391752578e-05, "loss": 1.9798, "step": 6550 }, { "epoch": 1.7008117510630072, "grad_norm": 0.9900497794151306, "learning_rate": 1.3200000000000002e-05, "loss": 1.9244, "step": 6600 }, { "epoch": 1.7136966885710603, "grad_norm": 0.9657949805259705, "learning_rate": 1.3148453608247425e-05, "loss": 1.9565, "step": 6650 }, { "epoch": 1.7265816260791134, "grad_norm": 1.1366913318634033, "learning_rate": 1.3096907216494847e-05, "loss": 1.9724, "step": 6700 }, { "epoch": 1.7394665635871664, "grad_norm": 1.15602445602417, "learning_rate": 1.3045360824742268e-05, "loss": 1.927, "step": 6750 }, { "epoch": 1.7523515010952195, "grad_norm": 0.9413411021232605, "learning_rate": 1.2993814432989691e-05, "loss": 1.9347, "step": 6800 }, { "epoch": 1.7652364386032726, "grad_norm": 0.9607951641082764, "learning_rate": 1.2942268041237113e-05, "loss": 1.9495, "step": 6850 }, { "epoch": 1.7781213761113257, "grad_norm": 1.155685305595398, "learning_rate": 1.2890721649484536e-05, "loss": 1.9498, "step": 6900 }, { "epoch": 1.7910063136193788, "grad_norm": 1.8821039199829102, "learning_rate": 1.283917525773196e-05, "loss": 1.9411, "step": 6950 }, { "epoch": 1.8038912511274319, "grad_norm": 1.2264201641082764, "learning_rate": 1.2787628865979383e-05, "loss": 1.9488, "step": 7000 }, { "epoch": 1.816776188635485, "grad_norm": 0.9997029304504395, "learning_rate": 1.2736082474226805e-05, "loss": 1.9162, "step": 7050 }, { "epoch": 1.829661126143538, "grad_norm": 1.1943738460540771, "learning_rate": 1.2684536082474228e-05, "loss": 1.961, "step": 7100 }, { "epoch": 1.8425460636515911, "grad_norm": 1.1875113248825073, "learning_rate": 1.263298969072165e-05, "loss": 1.9256, "step": 7150 }, { "epoch": 1.8554310011596442, "grad_norm": 1.0550329685211182, "learning_rate": 1.2581443298969073e-05, "loss": 1.9519, "step": 7200 }, { "epoch": 1.8683159386676973, "grad_norm": 1.3292375802993774, "learning_rate": 1.2529896907216497e-05, "loss": 1.9341, "step": 7250 }, { "epoch": 1.8812008761757506, "grad_norm": 1.0914188623428345, "learning_rate": 1.247835051546392e-05, "loss": 1.894, "step": 7300 }, { "epoch": 1.8940858136838037, "grad_norm": 1.1687994003295898, "learning_rate": 1.242680412371134e-05, "loss": 1.9044, "step": 7350 }, { "epoch": 1.9069707511918568, "grad_norm": 1.0040736198425293, "learning_rate": 1.2376288659793816e-05, "loss": 1.919, "step": 7400 }, { "epoch": 1.9198556886999099, "grad_norm": 1.0108208656311035, "learning_rate": 1.2324742268041239e-05, "loss": 1.8785, "step": 7450 }, { "epoch": 1.932740626207963, "grad_norm": 1.0039801597595215, "learning_rate": 1.2273195876288662e-05, "loss": 1.8761, "step": 7500 }, { "epoch": 1.945625563716016, "grad_norm": 1.0580838918685913, "learning_rate": 1.2221649484536084e-05, "loss": 1.8699, "step": 7550 }, { "epoch": 1.9585105012240691, "grad_norm": 1.1629561185836792, "learning_rate": 1.2170103092783505e-05, "loss": 1.927, "step": 7600 }, { "epoch": 1.9713954387321222, "grad_norm": 0.908470094203949, "learning_rate": 1.2118556701030928e-05, "loss": 1.9778, "step": 7650 }, { "epoch": 1.9842803762401753, "grad_norm": 1.1411256790161133, "learning_rate": 1.206701030927835e-05, "loss": 1.9035, "step": 7700 }, { "epoch": 1.9971653137482284, "grad_norm": 0.9729508757591248, "learning_rate": 1.2015463917525774e-05, "loss": 1.9071, "step": 7750 }, { "epoch": 2.0, "eval_bleu": 19.4026, "eval_gen_len": 45.2194, "eval_loss": 1.662958025932312, "eval_runtime": 2310.4182, "eval_samples_per_second": 6.717, "eval_steps_per_second": 0.42, "step": 7761 }, { "epoch": 2.0100502512562812, "grad_norm": 1.2045557498931885, "learning_rate": 1.1963917525773197e-05, "loss": 1.8937, "step": 7800 }, { "epoch": 2.0229351887643343, "grad_norm": 0.9809572696685791, "learning_rate": 1.191237113402062e-05, "loss": 1.9757, "step": 7850 }, { "epoch": 2.0358201262723874, "grad_norm": 1.0871580839157104, "learning_rate": 1.1860824742268042e-05, "loss": 1.9366, "step": 7900 }, { "epoch": 2.0487050637804405, "grad_norm": 1.1801034212112427, "learning_rate": 1.1809278350515465e-05, "loss": 1.92, "step": 7950 }, { "epoch": 2.0615900012884936, "grad_norm": 1.118897557258606, "learning_rate": 1.1757731958762887e-05, "loss": 1.9593, "step": 8000 }, { "epoch": 2.0744749387965467, "grad_norm": 1.1403993368148804, "learning_rate": 1.1706185567010311e-05, "loss": 1.8699, "step": 8050 }, { "epoch": 2.0873598763045997, "grad_norm": 3.7084872722625732, "learning_rate": 1.1654639175257734e-05, "loss": 1.8686, "step": 8100 }, { "epoch": 2.100244813812653, "grad_norm": 1.135362982749939, "learning_rate": 1.1603092783505157e-05, "loss": 1.9068, "step": 8150 }, { "epoch": 2.113129751320706, "grad_norm": 1.5067939758300781, "learning_rate": 1.1551546391752577e-05, "loss": 1.9196, "step": 8200 }, { "epoch": 2.126014688828759, "grad_norm": 1.0155831575393677, "learning_rate": 1.15e-05, "loss": 1.8711, "step": 8250 }, { "epoch": 2.138899626336812, "grad_norm": 1.2201752662658691, "learning_rate": 1.1448453608247423e-05, "loss": 1.8985, "step": 8300 }, { "epoch": 2.151784563844865, "grad_norm": 1.0999071598052979, "learning_rate": 1.1396907216494845e-05, "loss": 1.8978, "step": 8350 }, { "epoch": 2.1646695013529182, "grad_norm": 1.1638729572296143, "learning_rate": 1.134536082474227e-05, "loss": 1.8593, "step": 8400 }, { "epoch": 2.1775544388609713, "grad_norm": 1.1784203052520752, "learning_rate": 1.1293814432989692e-05, "loss": 1.8663, "step": 8450 }, { "epoch": 2.1904393763690244, "grad_norm": 1.026315450668335, "learning_rate": 1.1242268041237115e-05, "loss": 1.8285, "step": 8500 }, { "epoch": 2.2033243138770775, "grad_norm": 0.9852938652038574, "learning_rate": 1.1190721649484537e-05, "loss": 1.9048, "step": 8550 }, { "epoch": 2.2162092513851306, "grad_norm": 1.052303671836853, "learning_rate": 1.113917525773196e-05, "loss": 1.8808, "step": 8600 }, { "epoch": 2.2290941888931837, "grad_norm": 5.098678112030029, "learning_rate": 1.1087628865979382e-05, "loss": 1.8679, "step": 8650 }, { "epoch": 2.2419791264012368, "grad_norm": 1.1427991390228271, "learning_rate": 1.1036082474226806e-05, "loss": 1.9065, "step": 8700 }, { "epoch": 2.25486406390929, "grad_norm": 3.338353395462036, "learning_rate": 1.0984536082474229e-05, "loss": 1.8915, "step": 8750 }, { "epoch": 2.267749001417343, "grad_norm": 1.2049264907836914, "learning_rate": 1.093298969072165e-05, "loss": 1.9018, "step": 8800 }, { "epoch": 2.280633938925396, "grad_norm": 1.0751088857650757, "learning_rate": 1.0881443298969072e-05, "loss": 1.8914, "step": 8850 }, { "epoch": 2.293518876433449, "grad_norm": 0.9149059653282166, "learning_rate": 1.0829896907216495e-05, "loss": 1.8717, "step": 8900 }, { "epoch": 2.306403813941502, "grad_norm": 1.0154598951339722, "learning_rate": 1.0778350515463918e-05, "loss": 1.8979, "step": 8950 }, { "epoch": 2.3192887514495553, "grad_norm": 1.198451042175293, "learning_rate": 1.072680412371134e-05, "loss": 1.8987, "step": 9000 }, { "epoch": 2.3321736889576083, "grad_norm": 1.0957529544830322, "learning_rate": 1.0675257731958764e-05, "loss": 1.892, "step": 9050 }, { "epoch": 2.3450586264656614, "grad_norm": 1.055180311203003, "learning_rate": 1.0623711340206187e-05, "loss": 1.8694, "step": 9100 }, { "epoch": 2.3579435639737145, "grad_norm": 1.094000220298767, "learning_rate": 1.057216494845361e-05, "loss": 1.9105, "step": 9150 }, { "epoch": 2.3708285014817676, "grad_norm": 1.0135473012924194, "learning_rate": 1.0520618556701032e-05, "loss": 1.892, "step": 9200 }, { "epoch": 2.3837134389898207, "grad_norm": 1.2554734945297241, "learning_rate": 1.0469072164948455e-05, "loss": 1.9458, "step": 9250 }, { "epoch": 2.3965983764978738, "grad_norm": 1.312153697013855, "learning_rate": 1.0417525773195877e-05, "loss": 1.9138, "step": 9300 }, { "epoch": 2.409483314005927, "grad_norm": 1.0656461715698242, "learning_rate": 1.0367010309278351e-05, "loss": 1.931, "step": 9350 }, { "epoch": 2.42236825151398, "grad_norm": 1.4363470077514648, "learning_rate": 1.0315463917525774e-05, "loss": 1.8815, "step": 9400 }, { "epoch": 2.435253189022033, "grad_norm": 1.2165566682815552, "learning_rate": 1.0263917525773196e-05, "loss": 1.8774, "step": 9450 }, { "epoch": 2.448138126530086, "grad_norm": 1.2879478931427002, "learning_rate": 1.021237113402062e-05, "loss": 1.9324, "step": 9500 }, { "epoch": 2.461023064038139, "grad_norm": 1.1694004535675049, "learning_rate": 1.0160824742268043e-05, "loss": 1.8595, "step": 9550 }, { "epoch": 2.4739080015461923, "grad_norm": 1.1013145446777344, "learning_rate": 1.0109278350515466e-05, "loss": 1.8523, "step": 9600 }, { "epoch": 2.4867929390542454, "grad_norm": 1.0102790594100952, "learning_rate": 1.0057731958762887e-05, "loss": 1.8239, "step": 9650 }, { "epoch": 2.4996778765622985, "grad_norm": 2.130802869796753, "learning_rate": 1.0006185567010309e-05, "loss": 1.9528, "step": 9700 }, { "epoch": 2.5125628140703515, "grad_norm": 1.3339593410491943, "learning_rate": 9.954639175257733e-06, "loss": 1.8553, "step": 9750 }, { "epoch": 2.5254477515784046, "grad_norm": 1.0675934553146362, "learning_rate": 9.903092783505154e-06, "loss": 1.8547, "step": 9800 }, { "epoch": 2.5383326890864577, "grad_norm": 1.111695408821106, "learning_rate": 9.851546391752578e-06, "loss": 1.81, "step": 9850 }, { "epoch": 2.551217626594511, "grad_norm": 1.3534823656082153, "learning_rate": 9.800000000000001e-06, "loss": 1.8789, "step": 9900 }, { "epoch": 2.564102564102564, "grad_norm": 1.138906717300415, "learning_rate": 9.748453608247424e-06, "loss": 1.8355, "step": 9950 }, { "epoch": 2.576987501610617, "grad_norm": 1.183947205543518, "learning_rate": 9.696907216494846e-06, "loss": 1.8563, "step": 10000 }, { "epoch": 2.58987243911867, "grad_norm": 0.9370452165603638, "learning_rate": 9.645360824742269e-06, "loss": 1.8845, "step": 10050 }, { "epoch": 2.602757376626723, "grad_norm": 1.1878234148025513, "learning_rate": 9.593814432989691e-06, "loss": 1.9203, "step": 10100 }, { "epoch": 2.615642314134776, "grad_norm": 1.0847914218902588, "learning_rate": 9.542268041237114e-06, "loss": 1.8816, "step": 10150 }, { "epoch": 2.6285272516428293, "grad_norm": 1.2127825021743774, "learning_rate": 9.490721649484536e-06, "loss": 1.899, "step": 10200 }, { "epoch": 2.6414121891508824, "grad_norm": 1.0567888021469116, "learning_rate": 9.439175257731959e-06, "loss": 1.8433, "step": 10250 }, { "epoch": 2.6542971266589355, "grad_norm": 1.1838716268539429, "learning_rate": 9.387628865979383e-06, "loss": 1.8931, "step": 10300 }, { "epoch": 2.6671820641669886, "grad_norm": 1.1113821268081665, "learning_rate": 9.336082474226806e-06, "loss": 1.9041, "step": 10350 }, { "epoch": 2.6800670016750416, "grad_norm": 1.538613200187683, "learning_rate": 9.284536082474228e-06, "loss": 1.9085, "step": 10400 }, { "epoch": 2.6929519391830947, "grad_norm": 1.1364761590957642, "learning_rate": 9.23298969072165e-06, "loss": 1.8925, "step": 10450 }, { "epoch": 2.705836876691148, "grad_norm": 1.1954172849655151, "learning_rate": 9.181443298969073e-06, "loss": 1.8608, "step": 10500 }, { "epoch": 2.718721814199201, "grad_norm": 0.8984624147415161, "learning_rate": 9.129896907216496e-06, "loss": 1.823, "step": 10550 }, { "epoch": 2.731606751707254, "grad_norm": 1.0665663480758667, "learning_rate": 9.078350515463919e-06, "loss": 1.9052, "step": 10600 }, { "epoch": 2.744491689215307, "grad_norm": 1.2751344442367554, "learning_rate": 9.026804123711341e-06, "loss": 1.8573, "step": 10650 }, { "epoch": 2.75737662672336, "grad_norm": 0.964619517326355, "learning_rate": 8.975257731958764e-06, "loss": 1.9581, "step": 10700 }, { "epoch": 2.7702615642314132, "grad_norm": 1.1248286962509155, "learning_rate": 8.923711340206186e-06, "loss": 1.9004, "step": 10750 }, { "epoch": 2.7831465017394663, "grad_norm": 1.2471715211868286, "learning_rate": 8.872164948453609e-06, "loss": 1.8969, "step": 10800 }, { "epoch": 2.7960314392475194, "grad_norm": 1.3639956712722778, "learning_rate": 8.820618556701031e-06, "loss": 1.8493, "step": 10850 }, { "epoch": 2.8089163767555725, "grad_norm": 1.1183199882507324, "learning_rate": 8.769072164948454e-06, "loss": 1.8888, "step": 10900 }, { "epoch": 2.8218013142636256, "grad_norm": 1.3162132501602173, "learning_rate": 8.717525773195877e-06, "loss": 1.8567, "step": 10950 }, { "epoch": 2.8346862517716787, "grad_norm": 1.2056224346160889, "learning_rate": 8.6659793814433e-06, "loss": 1.8687, "step": 11000 }, { "epoch": 2.8475711892797317, "grad_norm": 1.285947322845459, "learning_rate": 8.614432989690722e-06, "loss": 1.8864, "step": 11050 }, { "epoch": 2.860456126787785, "grad_norm": 1.292939305305481, "learning_rate": 8.562886597938144e-06, "loss": 1.9056, "step": 11100 }, { "epoch": 2.873341064295838, "grad_norm": 1.2155085802078247, "learning_rate": 8.511340206185568e-06, "loss": 1.8699, "step": 11150 }, { "epoch": 2.886226001803891, "grad_norm": 1.4173967838287354, "learning_rate": 8.459793814432991e-06, "loss": 1.8581, "step": 11200 }, { "epoch": 2.899110939311944, "grad_norm": 1.0226136445999146, "learning_rate": 8.408247422680414e-06, "loss": 1.8491, "step": 11250 }, { "epoch": 2.911995876819997, "grad_norm": 1.2074532508850098, "learning_rate": 8.356701030927836e-06, "loss": 1.8493, "step": 11300 }, { "epoch": 2.9248808143280502, "grad_norm": 1.0812984704971313, "learning_rate": 8.305154639175259e-06, "loss": 1.8839, "step": 11350 }, { "epoch": 2.9377657518361033, "grad_norm": 1.3052395582199097, "learning_rate": 8.253608247422681e-06, "loss": 1.869, "step": 11400 }, { "epoch": 2.9506506893441564, "grad_norm": 1.0708857774734497, "learning_rate": 8.202061855670104e-06, "loss": 1.855, "step": 11450 }, { "epoch": 2.9635356268522095, "grad_norm": 0.9860512614250183, "learning_rate": 8.150515463917526e-06, "loss": 1.8535, "step": 11500 }, { "epoch": 2.9764205643602626, "grad_norm": 0.9245162010192871, "learning_rate": 8.098969072164949e-06, "loss": 1.8647, "step": 11550 }, { "epoch": 2.9893055018683157, "grad_norm": 1.1266101598739624, "learning_rate": 8.047422680412372e-06, "loss": 1.8646, "step": 11600 }, { "epoch": 2.9998711506249194, "eval_bleu": 19.9649, "eval_gen_len": 45.274, "eval_loss": 1.6131339073181152, "eval_runtime": 2307.2556, "eval_samples_per_second": 6.727, "eval_steps_per_second": 0.42, "step": 11641 }, { "epoch": 3.002190439376369, "grad_norm": 0.924880862236023, "learning_rate": 7.995876288659794e-06, "loss": 1.8485, "step": 11650 }, { "epoch": 3.0150753768844223, "grad_norm": 1.0113394260406494, "learning_rate": 7.944329896907217e-06, "loss": 1.8349, "step": 11700 }, { "epoch": 3.0279603143924754, "grad_norm": 1.1226181983947754, "learning_rate": 7.89278350515464e-06, "loss": 1.8707, "step": 11750 }, { "epoch": 3.0408452519005285, "grad_norm": 1.0973234176635742, "learning_rate": 7.841237113402062e-06, "loss": 1.8104, "step": 11800 }, { "epoch": 3.0537301894085815, "grad_norm": 1.2233961820602417, "learning_rate": 7.789690721649486e-06, "loss": 1.8668, "step": 11850 }, { "epoch": 3.0666151269166346, "grad_norm": 1.1300643682479858, "learning_rate": 7.738144329896909e-06, "loss": 1.8661, "step": 11900 }, { "epoch": 3.0795000644246877, "grad_norm": 1.1732138395309448, "learning_rate": 7.68659793814433e-06, "loss": 1.8213, "step": 11950 }, { "epoch": 3.092385001932741, "grad_norm": 1.459231972694397, "learning_rate": 7.635051546391754e-06, "loss": 1.8533, "step": 12000 }, { "epoch": 3.105269939440794, "grad_norm": 1.353126049041748, "learning_rate": 7.5835051546391755e-06, "loss": 1.8838, "step": 12050 }, { "epoch": 3.118154876948847, "grad_norm": 0.9796210527420044, "learning_rate": 7.531958762886599e-06, "loss": 1.8079, "step": 12100 }, { "epoch": 3.1310398144569, "grad_norm": 1.1230041980743408, "learning_rate": 7.4804123711340214e-06, "loss": 1.9103, "step": 12150 }, { "epoch": 3.143924751964953, "grad_norm": 1.3261069059371948, "learning_rate": 7.428865979381444e-06, "loss": 1.845, "step": 12200 }, { "epoch": 3.156809689473006, "grad_norm": 1.0127289295196533, "learning_rate": 7.377319587628866e-06, "loss": 1.8408, "step": 12250 }, { "epoch": 3.1696946269810593, "grad_norm": 1.1761748790740967, "learning_rate": 7.325773195876289e-06, "loss": 1.855, "step": 12300 }, { "epoch": 3.1825795644891124, "grad_norm": 1.1443302631378174, "learning_rate": 7.274226804123712e-06, "loss": 1.8234, "step": 12350 }, { "epoch": 3.1954645019971655, "grad_norm": 1.1420938968658447, "learning_rate": 7.222680412371135e-06, "loss": 1.9063, "step": 12400 }, { "epoch": 3.2083494395052186, "grad_norm": 0.9729594588279724, "learning_rate": 7.171134020618558e-06, "loss": 1.9181, "step": 12450 }, { "epoch": 3.2212343770132716, "grad_norm": 1.2192091941833496, "learning_rate": 7.11958762886598e-06, "loss": 1.9125, "step": 12500 }, { "epoch": 3.2341193145213247, "grad_norm": 1.335284948348999, "learning_rate": 7.068041237113402e-06, "loss": 1.8348, "step": 12550 }, { "epoch": 3.247004252029378, "grad_norm": 1.5923230648040771, "learning_rate": 7.016494845360825e-06, "loss": 1.8523, "step": 12600 }, { "epoch": 3.259889189537431, "grad_norm": 1.3718382120132446, "learning_rate": 6.964948453608248e-06, "loss": 1.8404, "step": 12650 }, { "epoch": 3.272774127045484, "grad_norm": 1.3347010612487793, "learning_rate": 6.9134020618556705e-06, "loss": 1.852, "step": 12700 }, { "epoch": 3.285659064553537, "grad_norm": 1.3411351442337036, "learning_rate": 6.861855670103094e-06, "loss": 1.8505, "step": 12750 }, { "epoch": 3.29854400206159, "grad_norm": 1.2156487703323364, "learning_rate": 6.8103092783505165e-06, "loss": 1.8589, "step": 12800 }, { "epoch": 3.3114289395696432, "grad_norm": 1.1836252212524414, "learning_rate": 6.758762886597938e-06, "loss": 1.8518, "step": 12850 }, { "epoch": 3.3243138770776963, "grad_norm": 1.3949558734893799, "learning_rate": 6.707216494845361e-06, "loss": 1.8459, "step": 12900 }, { "epoch": 3.3371988145857494, "grad_norm": 1.0333205461502075, "learning_rate": 6.655670103092784e-06, "loss": 1.86, "step": 12950 }, { "epoch": 3.3500837520938025, "grad_norm": 1.0828937292099, "learning_rate": 6.604123711340207e-06, "loss": 1.8463, "step": 13000 }, { "epoch": 3.3629686896018556, "grad_norm": 1.1059962511062622, "learning_rate": 6.552577319587629e-06, "loss": 1.8608, "step": 13050 }, { "epoch": 3.3758536271099087, "grad_norm": 1.025884747505188, "learning_rate": 6.501030927835053e-06, "loss": 1.8371, "step": 13100 }, { "epoch": 3.3887385646179617, "grad_norm": 1.0845879316329956, "learning_rate": 6.449484536082474e-06, "loss": 1.8205, "step": 13150 }, { "epoch": 3.401623502126015, "grad_norm": 0.9505090713500977, "learning_rate": 6.397938144329897e-06, "loss": 1.8467, "step": 13200 }, { "epoch": 3.414508439634068, "grad_norm": 1.1278256177902222, "learning_rate": 6.34639175257732e-06, "loss": 1.8406, "step": 13250 }, { "epoch": 3.427393377142121, "grad_norm": 1.0838017463684082, "learning_rate": 6.294845360824743e-06, "loss": 1.8284, "step": 13300 }, { "epoch": 3.440278314650174, "grad_norm": 1.212337851524353, "learning_rate": 6.2432989690721655e-06, "loss": 1.8202, "step": 13350 }, { "epoch": 3.453163252158227, "grad_norm": 1.0882956981658936, "learning_rate": 6.191752577319589e-06, "loss": 1.8493, "step": 13400 }, { "epoch": 3.4660481896662803, "grad_norm": 1.213768482208252, "learning_rate": 6.140206185567011e-06, "loss": 1.8518, "step": 13450 }, { "epoch": 3.4789331271743333, "grad_norm": 1.2855191230773926, "learning_rate": 6.088659793814433e-06, "loss": 1.8574, "step": 13500 }, { "epoch": 3.4918180646823864, "grad_norm": 1.0810940265655518, "learning_rate": 6.037113402061856e-06, "loss": 1.8766, "step": 13550 }, { "epoch": 3.5047030021904395, "grad_norm": 1.15432608127594, "learning_rate": 5.985567010309279e-06, "loss": 1.836, "step": 13600 }, { "epoch": 3.5175879396984926, "grad_norm": 1.468928337097168, "learning_rate": 5.934020618556702e-06, "loss": 1.8165, "step": 13650 }, { "epoch": 3.5304728772065457, "grad_norm": 1.6314187049865723, "learning_rate": 5.882474226804124e-06, "loss": 1.8725, "step": 13700 }, { "epoch": 3.5433578147145988, "grad_norm": 1.1987876892089844, "learning_rate": 5.830927835051546e-06, "loss": 1.8486, "step": 13750 }, { "epoch": 3.556242752222652, "grad_norm": 1.1263744831085205, "learning_rate": 5.779381443298969e-06, "loss": 1.8739, "step": 13800 }, { "epoch": 3.569127689730705, "grad_norm": 1.2357795238494873, "learning_rate": 5.727835051546392e-06, "loss": 1.848, "step": 13850 }, { "epoch": 3.582012627238758, "grad_norm": 1.228352427482605, "learning_rate": 5.6762886597938145e-06, "loss": 1.8785, "step": 13900 }, { "epoch": 3.594897564746811, "grad_norm": 1.0710021257400513, "learning_rate": 5.624742268041238e-06, "loss": 1.8108, "step": 13950 }, { "epoch": 3.607782502254864, "grad_norm": 0.9839572906494141, "learning_rate": 5.5731958762886605e-06, "loss": 1.8779, "step": 14000 }, { "epoch": 3.6206674397629173, "grad_norm": 1.1732807159423828, "learning_rate": 5.521649484536082e-06, "loss": 1.8692, "step": 14050 }, { "epoch": 3.6335523772709704, "grad_norm": 1.0930730104446411, "learning_rate": 5.470103092783506e-06, "loss": 1.7912, "step": 14100 }, { "epoch": 3.6464373147790234, "grad_norm": 1.1306408643722534, "learning_rate": 5.418556701030928e-06, "loss": 1.7992, "step": 14150 }, { "epoch": 3.6593222522870765, "grad_norm": 1.171573281288147, "learning_rate": 5.367010309278351e-06, "loss": 1.8286, "step": 14200 }, { "epoch": 3.6722071897951296, "grad_norm": 1.033572793006897, "learning_rate": 5.315463917525774e-06, "loss": 1.8168, "step": 14250 }, { "epoch": 3.6850921273031827, "grad_norm": 1.109149694442749, "learning_rate": 5.263917525773197e-06, "loss": 1.8229, "step": 14300 }, { "epoch": 3.697977064811236, "grad_norm": 1.085472822189331, "learning_rate": 5.2123711340206184e-06, "loss": 1.7883, "step": 14350 }, { "epoch": 3.710862002319289, "grad_norm": 1.0914117097854614, "learning_rate": 5.160824742268041e-06, "loss": 1.8375, "step": 14400 }, { "epoch": 3.723746939827342, "grad_norm": 1.3772042989730835, "learning_rate": 5.110309278350516e-06, "loss": 1.8259, "step": 14450 }, { "epoch": 3.736631877335395, "grad_norm": 0.9119631052017212, "learning_rate": 5.058762886597939e-06, "loss": 1.8679, "step": 14500 }, { "epoch": 3.749516814843448, "grad_norm": 1.1717164516448975, "learning_rate": 5.007216494845362e-06, "loss": 1.8524, "step": 14550 }, { "epoch": 3.762401752351501, "grad_norm": 1.131783127784729, "learning_rate": 4.955670103092784e-06, "loss": 1.8081, "step": 14600 }, { "epoch": 3.7752866898595543, "grad_norm": 1.1898800134658813, "learning_rate": 4.904123711340207e-06, "loss": 1.8172, "step": 14650 }, { "epoch": 3.7881716273676074, "grad_norm": 1.0781954526901245, "learning_rate": 4.8525773195876294e-06, "loss": 1.8365, "step": 14700 }, { "epoch": 3.8010565648756605, "grad_norm": 1.1128448247909546, "learning_rate": 4.801030927835052e-06, "loss": 1.8904, "step": 14750 }, { "epoch": 3.8139415023837135, "grad_norm": 1.0720164775848389, "learning_rate": 4.7494845360824746e-06, "loss": 1.8521, "step": 14800 }, { "epoch": 3.8268264398917666, "grad_norm": 1.0853550434112549, "learning_rate": 4.697938144329897e-06, "loss": 1.7686, "step": 14850 }, { "epoch": 3.8397113773998197, "grad_norm": 0.9527387619018555, "learning_rate": 4.64639175257732e-06, "loss": 1.8529, "step": 14900 }, { "epoch": 3.852596314907873, "grad_norm": 1.3065271377563477, "learning_rate": 4.594845360824743e-06, "loss": 1.8569, "step": 14950 }, { "epoch": 3.865481252415926, "grad_norm": 1.2607804536819458, "learning_rate": 4.543298969072165e-06, "loss": 1.8417, "step": 15000 }, { "epoch": 3.878366189923979, "grad_norm": 1.089626669883728, "learning_rate": 4.491752577319588e-06, "loss": 1.8491, "step": 15050 }, { "epoch": 3.891251127432032, "grad_norm": 1.2275793552398682, "learning_rate": 4.440206185567011e-06, "loss": 1.8475, "step": 15100 }, { "epoch": 3.904136064940085, "grad_norm": 1.2494066953659058, "learning_rate": 4.388659793814433e-06, "loss": 1.8095, "step": 15150 }, { "epoch": 3.9170210024481382, "grad_norm": 1.0344122648239136, "learning_rate": 4.337113402061856e-06, "loss": 1.8177, "step": 15200 }, { "epoch": 3.9299059399561913, "grad_norm": 1.20706307888031, "learning_rate": 4.285567010309279e-06, "loss": 1.7988, "step": 15250 }, { "epoch": 3.9427908774642444, "grad_norm": 1.0796010494232178, "learning_rate": 4.234020618556701e-06, "loss": 1.8232, "step": 15300 }, { "epoch": 3.9556758149722975, "grad_norm": 1.2336502075195312, "learning_rate": 4.1824742268041245e-06, "loss": 1.8759, "step": 15350 }, { "epoch": 3.9685607524803506, "grad_norm": 1.0533246994018555, "learning_rate": 4.130927835051547e-06, "loss": 1.8084, "step": 15400 }, { "epoch": 3.9814456899884036, "grad_norm": 1.1837642192840576, "learning_rate": 4.07938144329897e-06, "loss": 1.8939, "step": 15450 }, { "epoch": 3.9943306274964567, "grad_norm": 1.1679872274398804, "learning_rate": 4.027835051546392e-06, "loss": 1.8759, "step": 15500 }, { "epoch": 4.0, "eval_bleu": 20.3549, "eval_gen_len": 45.3627, "eval_loss": 1.5858944654464722, "eval_runtime": 2286.6294, "eval_samples_per_second": 6.787, "eval_steps_per_second": 0.424, "step": 15522 }, { "epoch": 4.007215565004509, "grad_norm": 1.6103421449661255, "learning_rate": 3.976288659793815e-06, "loss": 1.8672, "step": 15550 }, { "epoch": 4.0201005025125625, "grad_norm": 1.074686050415039, "learning_rate": 3.924742268041237e-06, "loss": 1.8157, "step": 15600 }, { "epoch": 4.0329854400206155, "grad_norm": 1.11213219165802, "learning_rate": 3.87319587628866e-06, "loss": 1.8017, "step": 15650 }, { "epoch": 4.045870377528669, "grad_norm": 1.1408663988113403, "learning_rate": 3.821649484536083e-06, "loss": 1.7926, "step": 15700 }, { "epoch": 4.058755315036722, "grad_norm": 1.0430666208267212, "learning_rate": 3.7701030927835054e-06, "loss": 1.7889, "step": 15750 }, { "epoch": 4.071640252544775, "grad_norm": 1.1866077184677124, "learning_rate": 3.718556701030928e-06, "loss": 1.7969, "step": 15800 }, { "epoch": 4.084525190052828, "grad_norm": 1.362838625907898, "learning_rate": 3.667010309278351e-06, "loss": 1.8444, "step": 15850 }, { "epoch": 4.097410127560881, "grad_norm": 1.0319502353668213, "learning_rate": 3.6154639175257735e-06, "loss": 1.8415, "step": 15900 }, { "epoch": 4.110295065068934, "grad_norm": 1.0608545541763306, "learning_rate": 3.563917525773196e-06, "loss": 1.8614, "step": 15950 }, { "epoch": 4.123180002576987, "grad_norm": 1.0609222650527954, "learning_rate": 3.512371134020619e-06, "loss": 1.8395, "step": 16000 }, { "epoch": 4.13606494008504, "grad_norm": 1.0247282981872559, "learning_rate": 3.460824742268041e-06, "loss": 1.8515, "step": 16050 }, { "epoch": 4.148949877593093, "grad_norm": 1.1988474130630493, "learning_rate": 3.409278350515464e-06, "loss": 1.8241, "step": 16100 }, { "epoch": 4.161834815101146, "grad_norm": 1.164461374282837, "learning_rate": 3.357731958762887e-06, "loss": 1.8444, "step": 16150 }, { "epoch": 4.1747197526091995, "grad_norm": 1.090458631515503, "learning_rate": 3.3061855670103093e-06, "loss": 1.844, "step": 16200 }, { "epoch": 4.187604690117253, "grad_norm": 1.4075111150741577, "learning_rate": 3.2546391752577323e-06, "loss": 1.8042, "step": 16250 }, { "epoch": 4.200489627625306, "grad_norm": 1.1348927021026611, "learning_rate": 3.2030927835051553e-06, "loss": 1.7964, "step": 16300 }, { "epoch": 4.213374565133359, "grad_norm": 1.3190233707427979, "learning_rate": 3.1515463917525774e-06, "loss": 1.8265, "step": 16350 }, { "epoch": 4.226259502641412, "grad_norm": 1.0522139072418213, "learning_rate": 3.1000000000000004e-06, "loss": 1.8129, "step": 16400 }, { "epoch": 4.239144440149465, "grad_norm": 1.0048468112945557, "learning_rate": 3.048453608247423e-06, "loss": 1.8452, "step": 16450 }, { "epoch": 4.252029377657518, "grad_norm": 1.180903434753418, "learning_rate": 2.9969072164948455e-06, "loss": 1.8357, "step": 16500 }, { "epoch": 4.264914315165571, "grad_norm": 1.297938585281372, "learning_rate": 2.945360824742268e-06, "loss": 1.8436, "step": 16550 }, { "epoch": 4.277799252673624, "grad_norm": 1.0574970245361328, "learning_rate": 2.893814432989691e-06, "loss": 1.8493, "step": 16600 }, { "epoch": 4.290684190181677, "grad_norm": 1.082375168800354, "learning_rate": 2.8422680412371136e-06, "loss": 1.8424, "step": 16650 }, { "epoch": 4.30356912768973, "grad_norm": 0.9978700280189514, "learning_rate": 2.790721649484536e-06, "loss": 1.865, "step": 16700 }, { "epoch": 4.316454065197783, "grad_norm": 1.1757255792617798, "learning_rate": 2.739175257731959e-06, "loss": 1.8743, "step": 16750 }, { "epoch": 4.3293390027058365, "grad_norm": 1.1812617778778076, "learning_rate": 2.6876288659793813e-06, "loss": 1.8218, "step": 16800 }, { "epoch": 4.34222394021389, "grad_norm": 1.126605749130249, "learning_rate": 2.6360824742268043e-06, "loss": 1.8278, "step": 16850 }, { "epoch": 4.355108877721943, "grad_norm": 1.1985175609588623, "learning_rate": 2.5845360824742273e-06, "loss": 1.8033, "step": 16900 }, { "epoch": 4.367993815229996, "grad_norm": 1.1603890657424927, "learning_rate": 2.5329896907216494e-06, "loss": 1.8361, "step": 16950 }, { "epoch": 4.380878752738049, "grad_norm": 1.209686517715454, "learning_rate": 2.4814432989690724e-06, "loss": 1.8762, "step": 17000 }, { "epoch": 4.393763690246102, "grad_norm": 1.105088233947754, "learning_rate": 2.429896907216495e-06, "loss": 1.8506, "step": 17050 }, { "epoch": 4.406648627754155, "grad_norm": 1.1008309125900269, "learning_rate": 2.378350515463918e-06, "loss": 1.8256, "step": 17100 }, { "epoch": 4.419533565262208, "grad_norm": 1.3243298530578613, "learning_rate": 2.3268041237113405e-06, "loss": 1.8009, "step": 17150 }, { "epoch": 4.432418502770261, "grad_norm": 1.0257518291473389, "learning_rate": 2.275257731958763e-06, "loss": 1.8524, "step": 17200 }, { "epoch": 4.445303440278314, "grad_norm": 0.9655742645263672, "learning_rate": 2.2237113402061856e-06, "loss": 1.8044, "step": 17250 }, { "epoch": 4.458188377786367, "grad_norm": 1.0721204280853271, "learning_rate": 2.172164948453608e-06, "loss": 1.8242, "step": 17300 }, { "epoch": 4.47107331529442, "grad_norm": 1.227541208267212, "learning_rate": 2.120618556701031e-06, "loss": 1.8186, "step": 17350 }, { "epoch": 4.4839582528024735, "grad_norm": 1.0894291400909424, "learning_rate": 2.070103092783505e-06, "loss": 1.7768, "step": 17400 }, { "epoch": 4.496843190310527, "grad_norm": 1.004269003868103, "learning_rate": 2.0185567010309277e-06, "loss": 1.8019, "step": 17450 }, { "epoch": 4.50972812781858, "grad_norm": 1.1534968614578247, "learning_rate": 1.9670103092783507e-06, "loss": 1.8244, "step": 17500 }, { "epoch": 4.522613065326633, "grad_norm": 1.3757740259170532, "learning_rate": 1.9154639175257733e-06, "loss": 1.8458, "step": 17550 }, { "epoch": 4.535498002834686, "grad_norm": 1.184401035308838, "learning_rate": 1.8639175257731958e-06, "loss": 1.829, "step": 17600 }, { "epoch": 4.548382940342739, "grad_norm": 1.2132309675216675, "learning_rate": 1.8123711340206188e-06, "loss": 1.8529, "step": 17650 }, { "epoch": 4.561267877850792, "grad_norm": 1.0804411172866821, "learning_rate": 1.7608247422680414e-06, "loss": 1.8534, "step": 17700 }, { "epoch": 4.574152815358845, "grad_norm": 1.5346250534057617, "learning_rate": 1.709278350515464e-06, "loss": 1.8277, "step": 17750 }, { "epoch": 4.587037752866898, "grad_norm": 1.1482338905334473, "learning_rate": 1.6577319587628867e-06, "loss": 1.803, "step": 17800 }, { "epoch": 4.599922690374951, "grad_norm": 1.171758770942688, "learning_rate": 1.6061855670103093e-06, "loss": 1.83, "step": 17850 }, { "epoch": 4.612807627883004, "grad_norm": 1.3336864709854126, "learning_rate": 1.554639175257732e-06, "loss": 1.8386, "step": 17900 }, { "epoch": 4.625692565391057, "grad_norm": 1.0265740156173706, "learning_rate": 1.5030927835051548e-06, "loss": 1.8072, "step": 17950 }, { "epoch": 4.6385775028991105, "grad_norm": 0.9137164950370789, "learning_rate": 1.4515463917525774e-06, "loss": 1.8189, "step": 18000 }, { "epoch": 4.651462440407164, "grad_norm": 1.2193052768707275, "learning_rate": 1.4000000000000001e-06, "loss": 1.8489, "step": 18050 }, { "epoch": 4.664347377915217, "grad_norm": 1.1527855396270752, "learning_rate": 1.348453608247423e-06, "loss": 1.7922, "step": 18100 }, { "epoch": 4.67723231542327, "grad_norm": 0.9622436761856079, "learning_rate": 1.2969072164948455e-06, "loss": 1.81, "step": 18150 }, { "epoch": 4.690117252931323, "grad_norm": 1.173771858215332, "learning_rate": 1.245360824742268e-06, "loss": 1.8375, "step": 18200 }, { "epoch": 4.703002190439376, "grad_norm": 1.1019172668457031, "learning_rate": 1.1938144329896908e-06, "loss": 1.836, "step": 18250 }, { "epoch": 4.715887127947429, "grad_norm": 1.419956088066101, "learning_rate": 1.1422680412371134e-06, "loss": 1.7774, "step": 18300 }, { "epoch": 4.728772065455482, "grad_norm": 0.9891812205314636, "learning_rate": 1.0907216494845362e-06, "loss": 1.7989, "step": 18350 }, { "epoch": 4.741657002963535, "grad_norm": 1.0100419521331787, "learning_rate": 1.039175257731959e-06, "loss": 1.8439, "step": 18400 }, { "epoch": 4.754541940471588, "grad_norm": 1.1432263851165771, "learning_rate": 9.876288659793815e-07, "loss": 1.8154, "step": 18450 }, { "epoch": 4.767426877979641, "grad_norm": 1.1132447719573975, "learning_rate": 9.360824742268042e-07, "loss": 1.8817, "step": 18500 }, { "epoch": 4.7803118154876945, "grad_norm": 1.086591362953186, "learning_rate": 8.845360824742269e-07, "loss": 1.8116, "step": 18550 }, { "epoch": 4.7931967529957475, "grad_norm": 1.0462530851364136, "learning_rate": 8.329896907216496e-07, "loss": 1.8432, "step": 18600 }, { "epoch": 4.806081690503801, "grad_norm": 1.0310077667236328, "learning_rate": 7.814432989690722e-07, "loss": 1.825, "step": 18650 }, { "epoch": 4.818966628011854, "grad_norm": 1.0306345224380493, "learning_rate": 7.298969072164949e-07, "loss": 1.8085, "step": 18700 }, { "epoch": 4.831851565519907, "grad_norm": 1.4541102647781372, "learning_rate": 6.783505154639176e-07, "loss": 1.8276, "step": 18750 }, { "epoch": 4.84473650302796, "grad_norm": 1.2799372673034668, "learning_rate": 6.268041237113402e-07, "loss": 1.8876, "step": 18800 }, { "epoch": 4.857621440536013, "grad_norm": 1.0106414556503296, "learning_rate": 5.75257731958763e-07, "loss": 1.7739, "step": 18850 }, { "epoch": 4.870506378044066, "grad_norm": 1.11018967628479, "learning_rate": 5.237113402061856e-07, "loss": 1.8272, "step": 18900 }, { "epoch": 4.883391315552119, "grad_norm": 1.587908387184143, "learning_rate": 4.7216494845360834e-07, "loss": 1.8069, "step": 18950 }, { "epoch": 4.896276253060172, "grad_norm": 1.1059330701828003, "learning_rate": 4.2061855670103096e-07, "loss": 1.85, "step": 19000 }, { "epoch": 4.909161190568225, "grad_norm": 1.2126922607421875, "learning_rate": 3.690721649484536e-07, "loss": 1.8446, "step": 19050 }, { "epoch": 4.922046128076278, "grad_norm": 1.0558154582977295, "learning_rate": 3.1752577319587635e-07, "loss": 1.8169, "step": 19100 }, { "epoch": 4.9349310655843315, "grad_norm": 1.0007387399673462, "learning_rate": 2.65979381443299e-07, "loss": 1.813, "step": 19150 }, { "epoch": 4.947816003092385, "grad_norm": 1.1500272750854492, "learning_rate": 2.1443298969072168e-07, "loss": 1.8094, "step": 19200 }, { "epoch": 4.960700940600438, "grad_norm": 1.0796419382095337, "learning_rate": 1.6288659793814433e-07, "loss": 1.7628, "step": 19250 }, { "epoch": 4.973585878108491, "grad_norm": 1.1715208292007446, "learning_rate": 1.1134020618556701e-07, "loss": 1.8339, "step": 19300 }, { "epoch": 4.986470815616544, "grad_norm": 1.0896481275558472, "learning_rate": 5.97938144329897e-08, "loss": 1.8318, "step": 19350 }, { "epoch": 4.999355753124597, "grad_norm": 1.2462085485458374, "learning_rate": 8.247422680412371e-09, "loss": 1.8287, "step": 19400 }, { "epoch": 4.999355753124597, "eval_bleu": 20.4616, "eval_gen_len": 45.2528, "eval_loss": 1.578300952911377, "eval_runtime": 2280.7257, "eval_samples_per_second": 6.805, "eval_steps_per_second": 0.425, "step": 19400 } ], "logging_steps": 50, "max_steps": 19400, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.497673924411392e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }