diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,37311 @@ +{ + "best_metric": 0.29454880952835083, + "best_model_checkpoint": "./cifar100_outputs/checkpoint-47817", + "epoch": 10.0, + "eval_steps": 500, + "global_step": 53130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 2.1900806427001953, + "learning_rate": 1.9996235648409565e-05, + "loss": 4.6367, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 2.6411144733428955, + "learning_rate": 1.9992471296819125e-05, + "loss": 4.5998, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 2.594529151916504, + "learning_rate": 1.9988706945228688e-05, + "loss": 4.5914, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 2.5973572731018066, + "learning_rate": 1.9984942593638247e-05, + "loss": 4.6124, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 2.887483596801758, + "learning_rate": 1.998117824204781e-05, + "loss": 4.6123, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 2.584811210632324, + "learning_rate": 1.997741389045737e-05, + "loss": 4.6138, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 2.9465954303741455, + "learning_rate": 1.9973649538866934e-05, + "loss": 4.5905, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 2.4353935718536377, + "learning_rate": 1.9969885187276493e-05, + "loss": 4.5707, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 2.8148584365844727, + "learning_rate": 1.9966120835686053e-05, + "loss": 4.5688, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 2.962184429168701, + "learning_rate": 1.9962356484095616e-05, + "loss": 4.5784, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 2.464444637298584, + "learning_rate": 1.9958592132505176e-05, + "loss": 4.5557, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 3.1651523113250732, + "learning_rate": 1.995482778091474e-05, + "loss": 4.5485, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 2.603081226348877, + "learning_rate": 1.99510634293243e-05, + "loss": 4.5642, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 2.5633766651153564, + "learning_rate": 1.9947299077733862e-05, + "loss": 4.5619, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 3.072338342666626, + "learning_rate": 1.9943534726143422e-05, + "loss": 4.5694, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 2.9631261825561523, + "learning_rate": 1.9939770374552985e-05, + "loss": 4.554, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 2.998347043991089, + "learning_rate": 1.9936006022962545e-05, + "loss": 4.5413, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 2.716461420059204, + "learning_rate": 1.9932241671372108e-05, + "loss": 4.5095, + "step": 180 + }, + { + "epoch": 0.04, + "grad_norm": 2.613055944442749, + "learning_rate": 1.992847731978167e-05, + "loss": 4.5277, + "step": 190 + }, + { + "epoch": 0.04, + "grad_norm": 2.56919002532959, + "learning_rate": 1.992471296819123e-05, + "loss": 4.5149, + "step": 200 + }, + { + "epoch": 0.04, + "grad_norm": 3.0990872383117676, + "learning_rate": 1.9920948616600794e-05, + "loss": 4.5029, + "step": 210 + }, + { + "epoch": 0.04, + "grad_norm": 2.658292770385742, + "learning_rate": 1.9917184265010354e-05, + "loss": 4.4927, + "step": 220 + }, + { + "epoch": 0.04, + "grad_norm": 2.613175868988037, + "learning_rate": 1.9913419913419917e-05, + "loss": 4.46, + "step": 230 + }, + { + "epoch": 0.05, + "grad_norm": 2.9422857761383057, + "learning_rate": 1.9909655561829477e-05, + "loss": 4.4698, + "step": 240 + }, + { + "epoch": 0.05, + "grad_norm": 3.384212017059326, + "learning_rate": 1.990589121023904e-05, + "loss": 4.4693, + "step": 250 + }, + { + "epoch": 0.05, + "grad_norm": 2.8845982551574707, + "learning_rate": 1.99021268586486e-05, + "loss": 4.4814, + "step": 260 + }, + { + "epoch": 0.05, + "grad_norm": 2.9230496883392334, + "learning_rate": 1.989836250705816e-05, + "loss": 4.4504, + "step": 270 + }, + { + "epoch": 0.05, + "grad_norm": 3.3872103691101074, + "learning_rate": 1.9894598155467723e-05, + "loss": 4.4218, + "step": 280 + }, + { + "epoch": 0.05, + "grad_norm": 3.033067226409912, + "learning_rate": 1.9890833803877282e-05, + "loss": 4.4298, + "step": 290 + }, + { + "epoch": 0.06, + "grad_norm": 2.819579601287842, + "learning_rate": 1.9887069452286845e-05, + "loss": 4.4118, + "step": 300 + }, + { + "epoch": 0.06, + "grad_norm": 2.8507134914398193, + "learning_rate": 1.9883305100696405e-05, + "loss": 4.4365, + "step": 310 + }, + { + "epoch": 0.06, + "grad_norm": 2.762634038925171, + "learning_rate": 1.987954074910597e-05, + "loss": 4.4232, + "step": 320 + }, + { + "epoch": 0.06, + "grad_norm": 3.0568439960479736, + "learning_rate": 1.9875776397515528e-05, + "loss": 4.3622, + "step": 330 + }, + { + "epoch": 0.06, + "grad_norm": 2.941974639892578, + "learning_rate": 1.987201204592509e-05, + "loss": 4.4076, + "step": 340 + }, + { + "epoch": 0.07, + "grad_norm": 3.0038816928863525, + "learning_rate": 1.986824769433465e-05, + "loss": 4.3652, + "step": 350 + }, + { + "epoch": 0.07, + "grad_norm": 2.681337356567383, + "learning_rate": 1.9864483342744214e-05, + "loss": 4.3819, + "step": 360 + }, + { + "epoch": 0.07, + "grad_norm": 2.8940606117248535, + "learning_rate": 1.9860718991153774e-05, + "loss": 4.3638, + "step": 370 + }, + { + "epoch": 0.07, + "grad_norm": 3.30637526512146, + "learning_rate": 1.9856954639563337e-05, + "loss": 4.385, + "step": 380 + }, + { + "epoch": 0.07, + "grad_norm": 3.1481170654296875, + "learning_rate": 1.98531902879729e-05, + "loss": 4.3957, + "step": 390 + }, + { + "epoch": 0.08, + "grad_norm": 2.795309543609619, + "learning_rate": 1.984942593638246e-05, + "loss": 4.3601, + "step": 400 + }, + { + "epoch": 0.08, + "grad_norm": 3.1323978900909424, + "learning_rate": 1.9845661584792023e-05, + "loss": 4.3536, + "step": 410 + }, + { + "epoch": 0.08, + "grad_norm": 3.0077526569366455, + "learning_rate": 1.9841897233201583e-05, + "loss": 4.3306, + "step": 420 + }, + { + "epoch": 0.08, + "grad_norm": 3.095212936401367, + "learning_rate": 1.9838132881611146e-05, + "loss": 4.3473, + "step": 430 + }, + { + "epoch": 0.08, + "grad_norm": 3.200291633605957, + "learning_rate": 1.9834368530020706e-05, + "loss": 4.2868, + "step": 440 + }, + { + "epoch": 0.08, + "grad_norm": 3.210118532180786, + "learning_rate": 1.9830604178430266e-05, + "loss": 4.3114, + "step": 450 + }, + { + "epoch": 0.09, + "grad_norm": 3.286947011947632, + "learning_rate": 1.982683982683983e-05, + "loss": 4.3222, + "step": 460 + }, + { + "epoch": 0.09, + "grad_norm": 2.8678014278411865, + "learning_rate": 1.982307547524939e-05, + "loss": 4.2983, + "step": 470 + }, + { + "epoch": 0.09, + "grad_norm": 3.726501703262329, + "learning_rate": 1.9819311123658952e-05, + "loss": 4.2952, + "step": 480 + }, + { + "epoch": 0.09, + "grad_norm": 2.8604493141174316, + "learning_rate": 1.981554677206851e-05, + "loss": 4.2516, + "step": 490 + }, + { + "epoch": 0.09, + "grad_norm": 2.8486504554748535, + "learning_rate": 1.9811782420478075e-05, + "loss": 4.2516, + "step": 500 + }, + { + "epoch": 0.1, + "grad_norm": 2.9079558849334717, + "learning_rate": 1.9808018068887634e-05, + "loss": 4.2404, + "step": 510 + }, + { + "epoch": 0.1, + "grad_norm": 3.027830123901367, + "learning_rate": 1.9804253717297198e-05, + "loss": 4.2424, + "step": 520 + }, + { + "epoch": 0.1, + "grad_norm": 3.154351234436035, + "learning_rate": 1.9800489365706757e-05, + "loss": 4.21, + "step": 530 + }, + { + "epoch": 0.1, + "grad_norm": 3.0818393230438232, + "learning_rate": 1.979672501411632e-05, + "loss": 4.242, + "step": 540 + }, + { + "epoch": 0.1, + "grad_norm": 3.877248525619507, + "learning_rate": 1.979296066252588e-05, + "loss": 4.2019, + "step": 550 + }, + { + "epoch": 0.11, + "grad_norm": 3.109707832336426, + "learning_rate": 1.9789196310935443e-05, + "loss": 4.1811, + "step": 560 + }, + { + "epoch": 0.11, + "grad_norm": 3.032790184020996, + "learning_rate": 1.9785431959345007e-05, + "loss": 4.1958, + "step": 570 + }, + { + "epoch": 0.11, + "grad_norm": 3.237541913986206, + "learning_rate": 1.9781667607754566e-05, + "loss": 4.1735, + "step": 580 + }, + { + "epoch": 0.11, + "grad_norm": 3.851644992828369, + "learning_rate": 1.977790325616413e-05, + "loss": 4.1478, + "step": 590 + }, + { + "epoch": 0.11, + "grad_norm": 2.9260008335113525, + "learning_rate": 1.977413890457369e-05, + "loss": 4.1436, + "step": 600 + }, + { + "epoch": 0.11, + "grad_norm": 3.1182501316070557, + "learning_rate": 1.977037455298325e-05, + "loss": 4.1768, + "step": 610 + }, + { + "epoch": 0.12, + "grad_norm": 3.236636161804199, + "learning_rate": 1.9766610201392812e-05, + "loss": 4.1488, + "step": 620 + }, + { + "epoch": 0.12, + "grad_norm": 2.9359562397003174, + "learning_rate": 1.9762845849802372e-05, + "loss": 4.1619, + "step": 630 + }, + { + "epoch": 0.12, + "grad_norm": 2.9437806606292725, + "learning_rate": 1.9759081498211935e-05, + "loss": 4.092, + "step": 640 + }, + { + "epoch": 0.12, + "grad_norm": 3.566126585006714, + "learning_rate": 1.9755317146621495e-05, + "loss": 4.1258, + "step": 650 + }, + { + "epoch": 0.12, + "grad_norm": 3.742216110229492, + "learning_rate": 1.9751552795031058e-05, + "loss": 4.0587, + "step": 660 + }, + { + "epoch": 0.13, + "grad_norm": 2.9798004627227783, + "learning_rate": 1.9747788443440618e-05, + "loss": 4.0615, + "step": 670 + }, + { + "epoch": 0.13, + "grad_norm": 3.074636697769165, + "learning_rate": 1.974402409185018e-05, + "loss": 4.0414, + "step": 680 + }, + { + "epoch": 0.13, + "grad_norm": 3.1997222900390625, + "learning_rate": 1.974025974025974e-05, + "loss": 4.0918, + "step": 690 + }, + { + "epoch": 0.13, + "grad_norm": 3.321722984313965, + "learning_rate": 1.9736495388669304e-05, + "loss": 4.0527, + "step": 700 + }, + { + "epoch": 0.13, + "grad_norm": 3.0019032955169678, + "learning_rate": 1.9732731037078864e-05, + "loss": 3.9662, + "step": 710 + }, + { + "epoch": 0.14, + "grad_norm": 2.8338370323181152, + "learning_rate": 1.9728966685488427e-05, + "loss": 4.0178, + "step": 720 + }, + { + "epoch": 0.14, + "grad_norm": 3.2571487426757812, + "learning_rate": 1.9725202333897987e-05, + "loss": 4.0006, + "step": 730 + }, + { + "epoch": 0.14, + "grad_norm": 2.9352447986602783, + "learning_rate": 1.972143798230755e-05, + "loss": 4.0078, + "step": 740 + }, + { + "epoch": 0.14, + "grad_norm": 4.793808460235596, + "learning_rate": 1.9717673630717113e-05, + "loss": 4.0501, + "step": 750 + }, + { + "epoch": 0.14, + "grad_norm": 6.661139488220215, + "learning_rate": 1.9713909279126673e-05, + "loss": 3.9689, + "step": 760 + }, + { + "epoch": 0.14, + "grad_norm": 3.5628387928009033, + "learning_rate": 1.9710144927536236e-05, + "loss": 3.9937, + "step": 770 + }, + { + "epoch": 0.15, + "grad_norm": 3.0853545665740967, + "learning_rate": 1.9706380575945796e-05, + "loss": 3.9623, + "step": 780 + }, + { + "epoch": 0.15, + "grad_norm": 3.3460278511047363, + "learning_rate": 1.9702616224355355e-05, + "loss": 4.0009, + "step": 790 + }, + { + "epoch": 0.15, + "grad_norm": 3.079176425933838, + "learning_rate": 1.969885187276492e-05, + "loss": 3.9128, + "step": 800 + }, + { + "epoch": 0.15, + "grad_norm": 3.0661251544952393, + "learning_rate": 1.9695087521174478e-05, + "loss": 3.9185, + "step": 810 + }, + { + "epoch": 0.15, + "grad_norm": 2.881580352783203, + "learning_rate": 1.969132316958404e-05, + "loss": 3.8848, + "step": 820 + }, + { + "epoch": 0.16, + "grad_norm": 4.219837665557861, + "learning_rate": 1.96875588179936e-05, + "loss": 3.9397, + "step": 830 + }, + { + "epoch": 0.16, + "grad_norm": 3.501627206802368, + "learning_rate": 1.9683794466403164e-05, + "loss": 3.9385, + "step": 840 + }, + { + "epoch": 0.16, + "grad_norm": 3.5525853633880615, + "learning_rate": 1.9680030114812724e-05, + "loss": 3.9145, + "step": 850 + }, + { + "epoch": 0.16, + "grad_norm": 3.5347719192504883, + "learning_rate": 1.9676265763222287e-05, + "loss": 3.9678, + "step": 860 + }, + { + "epoch": 0.16, + "grad_norm": 5.558157920837402, + "learning_rate": 1.9672501411631847e-05, + "loss": 3.8212, + "step": 870 + }, + { + "epoch": 0.17, + "grad_norm": 3.4307472705841064, + "learning_rate": 1.966873706004141e-05, + "loss": 3.8341, + "step": 880 + }, + { + "epoch": 0.17, + "grad_norm": 4.318007469177246, + "learning_rate": 1.966497270845097e-05, + "loss": 3.8516, + "step": 890 + }, + { + "epoch": 0.17, + "grad_norm": 4.074690341949463, + "learning_rate": 1.9661208356860533e-05, + "loss": 3.8625, + "step": 900 + }, + { + "epoch": 0.17, + "grad_norm": 3.376126289367676, + "learning_rate": 1.9657444005270093e-05, + "loss": 3.8391, + "step": 910 + }, + { + "epoch": 0.17, + "grad_norm": 5.645514011383057, + "learning_rate": 1.9653679653679656e-05, + "loss": 3.8403, + "step": 920 + }, + { + "epoch": 0.18, + "grad_norm": 3.457108497619629, + "learning_rate": 1.964991530208922e-05, + "loss": 3.8908, + "step": 930 + }, + { + "epoch": 0.18, + "grad_norm": 2.9738640785217285, + "learning_rate": 1.964615095049878e-05, + "loss": 3.7746, + "step": 940 + }, + { + "epoch": 0.18, + "grad_norm": 4.399070739746094, + "learning_rate": 1.9642386598908342e-05, + "loss": 3.7559, + "step": 950 + }, + { + "epoch": 0.18, + "grad_norm": 4.6740546226501465, + "learning_rate": 1.96386222473179e-05, + "loss": 3.7348, + "step": 960 + }, + { + "epoch": 0.18, + "grad_norm": 3.3767216205596924, + "learning_rate": 1.963485789572746e-05, + "loss": 3.6984, + "step": 970 + }, + { + "epoch": 0.18, + "grad_norm": 3.3283941745758057, + "learning_rate": 1.9631093544137025e-05, + "loss": 3.756, + "step": 980 + }, + { + "epoch": 0.19, + "grad_norm": 8.651248931884766, + "learning_rate": 1.9627329192546585e-05, + "loss": 3.7782, + "step": 990 + }, + { + "epoch": 0.19, + "grad_norm": 4.265860080718994, + "learning_rate": 1.9623564840956148e-05, + "loss": 3.787, + "step": 1000 + }, + { + "epoch": 0.19, + "grad_norm": 3.8226921558380127, + "learning_rate": 1.9619800489365707e-05, + "loss": 3.8101, + "step": 1010 + }, + { + "epoch": 0.19, + "grad_norm": 3.2111263275146484, + "learning_rate": 1.961603613777527e-05, + "loss": 3.737, + "step": 1020 + }, + { + "epoch": 0.19, + "grad_norm": 5.9517107009887695, + "learning_rate": 1.961227178618483e-05, + "loss": 3.7126, + "step": 1030 + }, + { + "epoch": 0.2, + "grad_norm": 5.050525665283203, + "learning_rate": 1.9608507434594394e-05, + "loss": 3.7004, + "step": 1040 + }, + { + "epoch": 0.2, + "grad_norm": 2.8831429481506348, + "learning_rate": 1.9604743083003953e-05, + "loss": 3.6664, + "step": 1050 + }, + { + "epoch": 0.2, + "grad_norm": 4.123471260070801, + "learning_rate": 1.9600978731413516e-05, + "loss": 3.7305, + "step": 1060 + }, + { + "epoch": 0.2, + "grad_norm": 3.836909770965576, + "learning_rate": 1.9597214379823076e-05, + "loss": 3.7419, + "step": 1070 + }, + { + "epoch": 0.2, + "grad_norm": 3.690175771713257, + "learning_rate": 1.959345002823264e-05, + "loss": 3.6338, + "step": 1080 + }, + { + "epoch": 0.21, + "grad_norm": 3.598996639251709, + "learning_rate": 1.95896856766422e-05, + "loss": 3.7188, + "step": 1090 + }, + { + "epoch": 0.21, + "grad_norm": 5.010788917541504, + "learning_rate": 1.9585921325051762e-05, + "loss": 3.7787, + "step": 1100 + }, + { + "epoch": 0.21, + "grad_norm": 4.156946182250977, + "learning_rate": 1.9582156973461322e-05, + "loss": 3.6394, + "step": 1110 + }, + { + "epoch": 0.21, + "grad_norm": 3.7809722423553467, + "learning_rate": 1.9578392621870885e-05, + "loss": 3.6945, + "step": 1120 + }, + { + "epoch": 0.21, + "grad_norm": 7.646225929260254, + "learning_rate": 1.957462827028045e-05, + "loss": 3.5862, + "step": 1130 + }, + { + "epoch": 0.21, + "grad_norm": 3.2124147415161133, + "learning_rate": 1.9570863918690005e-05, + "loss": 3.6443, + "step": 1140 + }, + { + "epoch": 0.22, + "grad_norm": 8.297073364257812, + "learning_rate": 1.9567099567099568e-05, + "loss": 3.7029, + "step": 1150 + }, + { + "epoch": 0.22, + "grad_norm": 4.9652862548828125, + "learning_rate": 1.956333521550913e-05, + "loss": 3.6561, + "step": 1160 + }, + { + "epoch": 0.22, + "grad_norm": 5.712035179138184, + "learning_rate": 1.955957086391869e-05, + "loss": 3.5105, + "step": 1170 + }, + { + "epoch": 0.22, + "grad_norm": 3.600637912750244, + "learning_rate": 1.9555806512328254e-05, + "loss": 3.5531, + "step": 1180 + }, + { + "epoch": 0.22, + "grad_norm": 4.265107154846191, + "learning_rate": 1.9552042160737814e-05, + "loss": 3.6188, + "step": 1190 + }, + { + "epoch": 0.23, + "grad_norm": 6.627386093139648, + "learning_rate": 1.9548277809147377e-05, + "loss": 3.5911, + "step": 1200 + }, + { + "epoch": 0.23, + "grad_norm": 4.785702228546143, + "learning_rate": 1.9544513457556937e-05, + "loss": 3.5837, + "step": 1210 + }, + { + "epoch": 0.23, + "grad_norm": 3.3369293212890625, + "learning_rate": 1.95407491059665e-05, + "loss": 3.6222, + "step": 1220 + }, + { + "epoch": 0.23, + "grad_norm": 3.7859039306640625, + "learning_rate": 1.953698475437606e-05, + "loss": 3.5098, + "step": 1230 + }, + { + "epoch": 0.23, + "grad_norm": 3.6498477458953857, + "learning_rate": 1.9533220402785623e-05, + "loss": 3.4831, + "step": 1240 + }, + { + "epoch": 0.24, + "grad_norm": 4.522767543792725, + "learning_rate": 1.9529456051195183e-05, + "loss": 3.6801, + "step": 1250 + }, + { + "epoch": 0.24, + "grad_norm": 3.2972137928009033, + "learning_rate": 1.9525691699604746e-05, + "loss": 3.3404, + "step": 1260 + }, + { + "epoch": 0.24, + "grad_norm": 8.638528823852539, + "learning_rate": 1.9521927348014305e-05, + "loss": 3.5866, + "step": 1270 + }, + { + "epoch": 0.24, + "grad_norm": 6.175476551055908, + "learning_rate": 1.951816299642387e-05, + "loss": 3.509, + "step": 1280 + }, + { + "epoch": 0.24, + "grad_norm": 3.4966530799865723, + "learning_rate": 1.951439864483343e-05, + "loss": 3.4481, + "step": 1290 + }, + { + "epoch": 0.24, + "grad_norm": 3.191817283630371, + "learning_rate": 1.951063429324299e-05, + "loss": 3.622, + "step": 1300 + }, + { + "epoch": 0.25, + "grad_norm": 4.43389368057251, + "learning_rate": 1.950686994165255e-05, + "loss": 3.5212, + "step": 1310 + }, + { + "epoch": 0.25, + "grad_norm": 7.5795392990112305, + "learning_rate": 1.950310559006211e-05, + "loss": 3.6054, + "step": 1320 + }, + { + "epoch": 0.25, + "grad_norm": 3.8179938793182373, + "learning_rate": 1.9499341238471674e-05, + "loss": 3.3792, + "step": 1330 + }, + { + "epoch": 0.25, + "grad_norm": 4.200837135314941, + "learning_rate": 1.9495576886881234e-05, + "loss": 3.5091, + "step": 1340 + }, + { + "epoch": 0.25, + "grad_norm": 3.6297414302825928, + "learning_rate": 1.9491812535290797e-05, + "loss": 3.4068, + "step": 1350 + }, + { + "epoch": 0.26, + "grad_norm": 3.7957377433776855, + "learning_rate": 1.948804818370036e-05, + "loss": 3.4862, + "step": 1360 + }, + { + "epoch": 0.26, + "grad_norm": 3.713080644607544, + "learning_rate": 1.948428383210992e-05, + "loss": 3.4861, + "step": 1370 + }, + { + "epoch": 0.26, + "grad_norm": 5.54100227355957, + "learning_rate": 1.9480519480519483e-05, + "loss": 3.4529, + "step": 1380 + }, + { + "epoch": 0.26, + "grad_norm": 3.8583154678344727, + "learning_rate": 1.9476755128929043e-05, + "loss": 3.4531, + "step": 1390 + }, + { + "epoch": 0.26, + "grad_norm": 4.6709675788879395, + "learning_rate": 1.9472990777338606e-05, + "loss": 3.4798, + "step": 1400 + }, + { + "epoch": 0.27, + "grad_norm": 3.9218077659606934, + "learning_rate": 1.9469226425748166e-05, + "loss": 3.4589, + "step": 1410 + }, + { + "epoch": 0.27, + "grad_norm": 3.0315423011779785, + "learning_rate": 1.946546207415773e-05, + "loss": 3.3474, + "step": 1420 + }, + { + "epoch": 0.27, + "grad_norm": 4.207186222076416, + "learning_rate": 1.946169772256729e-05, + "loss": 3.3297, + "step": 1430 + }, + { + "epoch": 0.27, + "grad_norm": 3.950998544692993, + "learning_rate": 1.9457933370976852e-05, + "loss": 3.4374, + "step": 1440 + }, + { + "epoch": 0.27, + "grad_norm": 4.596767902374268, + "learning_rate": 1.9454169019386412e-05, + "loss": 3.417, + "step": 1450 + }, + { + "epoch": 0.27, + "grad_norm": 4.453456878662109, + "learning_rate": 1.9450404667795975e-05, + "loss": 3.2764, + "step": 1460 + }, + { + "epoch": 0.28, + "grad_norm": 4.382648944854736, + "learning_rate": 1.9446640316205535e-05, + "loss": 3.4189, + "step": 1470 + }, + { + "epoch": 0.28, + "grad_norm": 9.689038276672363, + "learning_rate": 1.9442875964615098e-05, + "loss": 3.4015, + "step": 1480 + }, + { + "epoch": 0.28, + "grad_norm": 4.036463737487793, + "learning_rate": 1.9439111613024658e-05, + "loss": 3.3156, + "step": 1490 + }, + { + "epoch": 0.28, + "grad_norm": 3.8814964294433594, + "learning_rate": 1.9435347261434217e-05, + "loss": 3.1903, + "step": 1500 + }, + { + "epoch": 0.28, + "grad_norm": 4.564530849456787, + "learning_rate": 1.943158290984378e-05, + "loss": 3.3632, + "step": 1510 + }, + { + "epoch": 0.29, + "grad_norm": 4.282833576202393, + "learning_rate": 1.942781855825334e-05, + "loss": 3.4336, + "step": 1520 + }, + { + "epoch": 0.29, + "grad_norm": 4.326153755187988, + "learning_rate": 1.9424054206662903e-05, + "loss": 3.4133, + "step": 1530 + }, + { + "epoch": 0.29, + "grad_norm": 8.867426872253418, + "learning_rate": 1.9420289855072467e-05, + "loss": 3.3124, + "step": 1540 + }, + { + "epoch": 0.29, + "grad_norm": 4.446542739868164, + "learning_rate": 1.9416525503482026e-05, + "loss": 3.2558, + "step": 1550 + }, + { + "epoch": 0.29, + "grad_norm": 10.327835083007812, + "learning_rate": 1.941276115189159e-05, + "loss": 3.3257, + "step": 1560 + }, + { + "epoch": 0.3, + "grad_norm": 4.354964256286621, + "learning_rate": 1.940899680030115e-05, + "loss": 3.2928, + "step": 1570 + }, + { + "epoch": 0.3, + "grad_norm": 4.222498893737793, + "learning_rate": 1.9405232448710712e-05, + "loss": 3.3359, + "step": 1580 + }, + { + "epoch": 0.3, + "grad_norm": 3.6612141132354736, + "learning_rate": 1.9401468097120272e-05, + "loss": 3.2173, + "step": 1590 + }, + { + "epoch": 0.3, + "grad_norm": 3.745842695236206, + "learning_rate": 1.9397703745529835e-05, + "loss": 3.1372, + "step": 1600 + }, + { + "epoch": 0.3, + "grad_norm": 8.753840446472168, + "learning_rate": 1.9393939393939395e-05, + "loss": 3.321, + "step": 1610 + }, + { + "epoch": 0.3, + "grad_norm": 4.043251037597656, + "learning_rate": 1.9390175042348958e-05, + "loss": 3.3141, + "step": 1620 + }, + { + "epoch": 0.31, + "grad_norm": 7.991616249084473, + "learning_rate": 1.9386410690758518e-05, + "loss": 3.2403, + "step": 1630 + }, + { + "epoch": 0.31, + "grad_norm": 4.049126148223877, + "learning_rate": 1.938264633916808e-05, + "loss": 3.2457, + "step": 1640 + }, + { + "epoch": 0.31, + "grad_norm": 3.2904117107391357, + "learning_rate": 1.937888198757764e-05, + "loss": 3.3252, + "step": 1650 + }, + { + "epoch": 0.31, + "grad_norm": 4.309184551239014, + "learning_rate": 1.93751176359872e-05, + "loss": 3.1543, + "step": 1660 + }, + { + "epoch": 0.31, + "grad_norm": 3.324444532394409, + "learning_rate": 1.9371353284396764e-05, + "loss": 3.3012, + "step": 1670 + }, + { + "epoch": 0.32, + "grad_norm": 4.101102352142334, + "learning_rate": 1.9367588932806324e-05, + "loss": 3.2415, + "step": 1680 + }, + { + "epoch": 0.32, + "grad_norm": 4.955221652984619, + "learning_rate": 1.9363824581215887e-05, + "loss": 3.1479, + "step": 1690 + }, + { + "epoch": 0.32, + "grad_norm": 7.092071056365967, + "learning_rate": 1.9360060229625447e-05, + "loss": 3.2071, + "step": 1700 + }, + { + "epoch": 0.32, + "grad_norm": 4.205142021179199, + "learning_rate": 1.935629587803501e-05, + "loss": 3.1963, + "step": 1710 + }, + { + "epoch": 0.32, + "grad_norm": 4.192953109741211, + "learning_rate": 1.9352531526444573e-05, + "loss": 3.0651, + "step": 1720 + }, + { + "epoch": 0.33, + "grad_norm": 4.651326656341553, + "learning_rate": 1.9348767174854133e-05, + "loss": 3.3063, + "step": 1730 + }, + { + "epoch": 0.33, + "grad_norm": 5.780113697052002, + "learning_rate": 1.9345002823263696e-05, + "loss": 3.1667, + "step": 1740 + }, + { + "epoch": 0.33, + "grad_norm": 5.112650394439697, + "learning_rate": 1.9341238471673256e-05, + "loss": 3.2213, + "step": 1750 + }, + { + "epoch": 0.33, + "grad_norm": 5.035951137542725, + "learning_rate": 1.933747412008282e-05, + "loss": 3.2067, + "step": 1760 + }, + { + "epoch": 0.33, + "grad_norm": 4.030355930328369, + "learning_rate": 1.933370976849238e-05, + "loss": 3.1413, + "step": 1770 + }, + { + "epoch": 0.34, + "grad_norm": 6.994670391082764, + "learning_rate": 1.932994541690194e-05, + "loss": 3.1571, + "step": 1780 + }, + { + "epoch": 0.34, + "grad_norm": 6.542640686035156, + "learning_rate": 1.93261810653115e-05, + "loss": 3.0721, + "step": 1790 + }, + { + "epoch": 0.34, + "grad_norm": 3.707923173904419, + "learning_rate": 1.9322416713721065e-05, + "loss": 3.0077, + "step": 1800 + }, + { + "epoch": 0.34, + "grad_norm": 9.954961776733398, + "learning_rate": 1.9318652362130624e-05, + "loss": 3.1853, + "step": 1810 + }, + { + "epoch": 0.34, + "grad_norm": 4.040740013122559, + "learning_rate": 1.9314888010540187e-05, + "loss": 3.1621, + "step": 1820 + }, + { + "epoch": 0.34, + "grad_norm": 3.269940137863159, + "learning_rate": 1.9311123658949747e-05, + "loss": 3.0776, + "step": 1830 + }, + { + "epoch": 0.35, + "grad_norm": 6.951628684997559, + "learning_rate": 1.9307359307359307e-05, + "loss": 3.0865, + "step": 1840 + }, + { + "epoch": 0.35, + "grad_norm": 4.079604625701904, + "learning_rate": 1.930359495576887e-05, + "loss": 3.1071, + "step": 1850 + }, + { + "epoch": 0.35, + "grad_norm": 6.9071431159973145, + "learning_rate": 1.929983060417843e-05, + "loss": 3.1679, + "step": 1860 + }, + { + "epoch": 0.35, + "grad_norm": 4.998239994049072, + "learning_rate": 1.9296066252587993e-05, + "loss": 2.9328, + "step": 1870 + }, + { + "epoch": 0.35, + "grad_norm": 7.7196455001831055, + "learning_rate": 1.9292301900997553e-05, + "loss": 3.1693, + "step": 1880 + }, + { + "epoch": 0.36, + "grad_norm": 6.726539611816406, + "learning_rate": 1.9288537549407116e-05, + "loss": 3.119, + "step": 1890 + }, + { + "epoch": 0.36, + "grad_norm": 5.929136276245117, + "learning_rate": 1.9284773197816676e-05, + "loss": 3.0396, + "step": 1900 + }, + { + "epoch": 0.36, + "grad_norm": 3.6181488037109375, + "learning_rate": 1.928100884622624e-05, + "loss": 2.9703, + "step": 1910 + }, + { + "epoch": 0.36, + "grad_norm": 4.3397040367126465, + "learning_rate": 1.9277244494635802e-05, + "loss": 3.1033, + "step": 1920 + }, + { + "epoch": 0.36, + "grad_norm": 7.7239484786987305, + "learning_rate": 1.9273480143045362e-05, + "loss": 3.001, + "step": 1930 + }, + { + "epoch": 0.37, + "grad_norm": 3.317354440689087, + "learning_rate": 1.9269715791454925e-05, + "loss": 2.9419, + "step": 1940 + }, + { + "epoch": 0.37, + "grad_norm": 9.748729705810547, + "learning_rate": 1.9265951439864485e-05, + "loss": 3.1594, + "step": 1950 + }, + { + "epoch": 0.37, + "grad_norm": 4.6273064613342285, + "learning_rate": 1.9262187088274048e-05, + "loss": 2.9821, + "step": 1960 + }, + { + "epoch": 0.37, + "grad_norm": 8.60408878326416, + "learning_rate": 1.9258422736683608e-05, + "loss": 3.1565, + "step": 1970 + }, + { + "epoch": 0.37, + "grad_norm": 5.767194747924805, + "learning_rate": 1.925465838509317e-05, + "loss": 2.9479, + "step": 1980 + }, + { + "epoch": 0.37, + "grad_norm": 13.294413566589355, + "learning_rate": 1.925089403350273e-05, + "loss": 3.0082, + "step": 1990 + }, + { + "epoch": 0.38, + "grad_norm": 5.2884979248046875, + "learning_rate": 1.9247129681912294e-05, + "loss": 2.9004, + "step": 2000 + }, + { + "epoch": 0.38, + "grad_norm": 6.24321174621582, + "learning_rate": 1.9243365330321854e-05, + "loss": 2.8892, + "step": 2010 + }, + { + "epoch": 0.38, + "grad_norm": 4.944282054901123, + "learning_rate": 1.9239600978731413e-05, + "loss": 3.0362, + "step": 2020 + }, + { + "epoch": 0.38, + "grad_norm": 4.824367046356201, + "learning_rate": 1.9235836627140976e-05, + "loss": 3.0755, + "step": 2030 + }, + { + "epoch": 0.38, + "grad_norm": 5.201323509216309, + "learning_rate": 1.9232072275550536e-05, + "loss": 2.9256, + "step": 2040 + }, + { + "epoch": 0.39, + "grad_norm": 6.465519428253174, + "learning_rate": 1.92283079239601e-05, + "loss": 2.8423, + "step": 2050 + }, + { + "epoch": 0.39, + "grad_norm": 4.563433647155762, + "learning_rate": 1.922454357236966e-05, + "loss": 3.063, + "step": 2060 + }, + { + "epoch": 0.39, + "grad_norm": 5.595127582550049, + "learning_rate": 1.9220779220779222e-05, + "loss": 2.89, + "step": 2070 + }, + { + "epoch": 0.39, + "grad_norm": 5.29541540145874, + "learning_rate": 1.9217014869188782e-05, + "loss": 2.9488, + "step": 2080 + }, + { + "epoch": 0.39, + "grad_norm": 10.494131088256836, + "learning_rate": 1.9213250517598345e-05, + "loss": 2.9473, + "step": 2090 + }, + { + "epoch": 0.4, + "grad_norm": 7.640602111816406, + "learning_rate": 1.920948616600791e-05, + "loss": 2.9225, + "step": 2100 + }, + { + "epoch": 0.4, + "grad_norm": 3.552557945251465, + "learning_rate": 1.9205721814417468e-05, + "loss": 2.9173, + "step": 2110 + }, + { + "epoch": 0.4, + "grad_norm": 4.50587272644043, + "learning_rate": 1.920195746282703e-05, + "loss": 2.8535, + "step": 2120 + }, + { + "epoch": 0.4, + "grad_norm": 7.200937271118164, + "learning_rate": 1.919819311123659e-05, + "loss": 2.9673, + "step": 2130 + }, + { + "epoch": 0.4, + "grad_norm": 4.5765485763549805, + "learning_rate": 1.9194428759646154e-05, + "loss": 2.8064, + "step": 2140 + }, + { + "epoch": 0.4, + "grad_norm": 5.502497673034668, + "learning_rate": 1.9190664408055714e-05, + "loss": 2.8947, + "step": 2150 + }, + { + "epoch": 0.41, + "grad_norm": 19.691614151000977, + "learning_rate": 1.9186900056465277e-05, + "loss": 3.028, + "step": 2160 + }, + { + "epoch": 0.41, + "grad_norm": 3.6259846687316895, + "learning_rate": 1.9183135704874837e-05, + "loss": 2.8456, + "step": 2170 + }, + { + "epoch": 0.41, + "grad_norm": 4.246203422546387, + "learning_rate": 1.9179371353284397e-05, + "loss": 2.8903, + "step": 2180 + }, + { + "epoch": 0.41, + "grad_norm": 5.460795879364014, + "learning_rate": 1.917560700169396e-05, + "loss": 2.9129, + "step": 2190 + }, + { + "epoch": 0.41, + "grad_norm": 5.957250118255615, + "learning_rate": 1.917184265010352e-05, + "loss": 2.8096, + "step": 2200 + }, + { + "epoch": 0.42, + "grad_norm": 9.38149642944336, + "learning_rate": 1.9168078298513083e-05, + "loss": 2.9384, + "step": 2210 + }, + { + "epoch": 0.42, + "grad_norm": 3.8041586875915527, + "learning_rate": 1.9164313946922643e-05, + "loss": 2.7674, + "step": 2220 + }, + { + "epoch": 0.42, + "grad_norm": 4.8668904304504395, + "learning_rate": 1.9160549595332206e-05, + "loss": 2.7463, + "step": 2230 + }, + { + "epoch": 0.42, + "grad_norm": 4.3397698402404785, + "learning_rate": 1.9156785243741765e-05, + "loss": 2.8838, + "step": 2240 + }, + { + "epoch": 0.42, + "grad_norm": 10.79181957244873, + "learning_rate": 1.915302089215133e-05, + "loss": 2.8304, + "step": 2250 + }, + { + "epoch": 0.43, + "grad_norm": 6.324551582336426, + "learning_rate": 1.914925654056089e-05, + "loss": 2.8287, + "step": 2260 + }, + { + "epoch": 0.43, + "grad_norm": 5.550736427307129, + "learning_rate": 1.914549218897045e-05, + "loss": 2.7671, + "step": 2270 + }, + { + "epoch": 0.43, + "grad_norm": 5.587412357330322, + "learning_rate": 1.9141727837380015e-05, + "loss": 3.0649, + "step": 2280 + }, + { + "epoch": 0.43, + "grad_norm": 4.18621301651001, + "learning_rate": 1.9137963485789574e-05, + "loss": 2.863, + "step": 2290 + }, + { + "epoch": 0.43, + "grad_norm": 6.899294376373291, + "learning_rate": 1.9134199134199138e-05, + "loss": 2.9058, + "step": 2300 + }, + { + "epoch": 0.43, + "grad_norm": 4.400335788726807, + "learning_rate": 1.9130434782608697e-05, + "loss": 2.7415, + "step": 2310 + }, + { + "epoch": 0.44, + "grad_norm": 18.84127426147461, + "learning_rate": 1.912667043101826e-05, + "loss": 2.8094, + "step": 2320 + }, + { + "epoch": 0.44, + "grad_norm": 11.325419425964355, + "learning_rate": 1.912290607942782e-05, + "loss": 2.7769, + "step": 2330 + }, + { + "epoch": 0.44, + "grad_norm": 11.26507568359375, + "learning_rate": 1.9119141727837383e-05, + "loss": 2.8841, + "step": 2340 + }, + { + "epoch": 0.44, + "grad_norm": 6.831374168395996, + "learning_rate": 1.9115377376246943e-05, + "loss": 2.832, + "step": 2350 + }, + { + "epoch": 0.44, + "grad_norm": 14.735857963562012, + "learning_rate": 1.9111613024656503e-05, + "loss": 2.7335, + "step": 2360 + }, + { + "epoch": 0.45, + "grad_norm": 8.749835968017578, + "learning_rate": 1.9107848673066066e-05, + "loss": 2.7282, + "step": 2370 + }, + { + "epoch": 0.45, + "grad_norm": 4.772495269775391, + "learning_rate": 1.9104084321475626e-05, + "loss": 2.6789, + "step": 2380 + }, + { + "epoch": 0.45, + "grad_norm": 3.5235466957092285, + "learning_rate": 1.910031996988519e-05, + "loss": 2.8611, + "step": 2390 + }, + { + "epoch": 0.45, + "grad_norm": 3.360028028488159, + "learning_rate": 1.909655561829475e-05, + "loss": 2.6936, + "step": 2400 + }, + { + "epoch": 0.45, + "grad_norm": 7.748390197753906, + "learning_rate": 1.9092791266704312e-05, + "loss": 2.6904, + "step": 2410 + }, + { + "epoch": 0.46, + "grad_norm": 6.634457588195801, + "learning_rate": 1.9089026915113872e-05, + "loss": 2.9342, + "step": 2420 + }, + { + "epoch": 0.46, + "grad_norm": 8.747628211975098, + "learning_rate": 1.9085262563523435e-05, + "loss": 2.7683, + "step": 2430 + }, + { + "epoch": 0.46, + "grad_norm": 3.515625476837158, + "learning_rate": 1.9081498211932995e-05, + "loss": 2.7442, + "step": 2440 + }, + { + "epoch": 0.46, + "grad_norm": 3.4130609035491943, + "learning_rate": 1.9077733860342558e-05, + "loss": 2.6693, + "step": 2450 + }, + { + "epoch": 0.46, + "grad_norm": 4.92776346206665, + "learning_rate": 1.907396950875212e-05, + "loss": 2.733, + "step": 2460 + }, + { + "epoch": 0.46, + "grad_norm": 11.010396957397461, + "learning_rate": 1.907020515716168e-05, + "loss": 2.6891, + "step": 2470 + }, + { + "epoch": 0.47, + "grad_norm": 3.8040144443511963, + "learning_rate": 1.9066440805571244e-05, + "loss": 2.5664, + "step": 2480 + }, + { + "epoch": 0.47, + "grad_norm": 6.726337909698486, + "learning_rate": 1.9062676453980804e-05, + "loss": 2.7416, + "step": 2490 + }, + { + "epoch": 0.47, + "grad_norm": 7.524297714233398, + "learning_rate": 1.9058912102390367e-05, + "loss": 2.6905, + "step": 2500 + }, + { + "epoch": 0.47, + "grad_norm": 3.6792001724243164, + "learning_rate": 1.9055147750799927e-05, + "loss": 2.5483, + "step": 2510 + }, + { + "epoch": 0.47, + "grad_norm": 6.578388690948486, + "learning_rate": 1.905138339920949e-05, + "loss": 2.7227, + "step": 2520 + }, + { + "epoch": 0.48, + "grad_norm": 2.9775335788726807, + "learning_rate": 1.904761904761905e-05, + "loss": 2.5024, + "step": 2530 + }, + { + "epoch": 0.48, + "grad_norm": 10.870952606201172, + "learning_rate": 1.904385469602861e-05, + "loss": 2.622, + "step": 2540 + }, + { + "epoch": 0.48, + "grad_norm": 8.206267356872559, + "learning_rate": 1.9040090344438172e-05, + "loss": 2.8171, + "step": 2550 + }, + { + "epoch": 0.48, + "grad_norm": 6.320977210998535, + "learning_rate": 1.9036325992847732e-05, + "loss": 2.6359, + "step": 2560 + }, + { + "epoch": 0.48, + "grad_norm": 7.017043113708496, + "learning_rate": 1.9032561641257295e-05, + "loss": 2.6382, + "step": 2570 + }, + { + "epoch": 0.49, + "grad_norm": 4.2981061935424805, + "learning_rate": 1.9028797289666855e-05, + "loss": 2.5545, + "step": 2580 + }, + { + "epoch": 0.49, + "grad_norm": 4.075060844421387, + "learning_rate": 1.9025032938076418e-05, + "loss": 2.6773, + "step": 2590 + }, + { + "epoch": 0.49, + "grad_norm": 3.952000379562378, + "learning_rate": 1.9021268586485978e-05, + "loss": 2.4791, + "step": 2600 + }, + { + "epoch": 0.49, + "grad_norm": 4.052979469299316, + "learning_rate": 1.901750423489554e-05, + "loss": 2.7196, + "step": 2610 + }, + { + "epoch": 0.49, + "grad_norm": 4.638850212097168, + "learning_rate": 1.90137398833051e-05, + "loss": 2.7139, + "step": 2620 + }, + { + "epoch": 0.5, + "grad_norm": 7.109199523925781, + "learning_rate": 1.9009975531714664e-05, + "loss": 2.5936, + "step": 2630 + }, + { + "epoch": 0.5, + "grad_norm": 4.101054668426514, + "learning_rate": 1.9006211180124224e-05, + "loss": 2.5606, + "step": 2640 + }, + { + "epoch": 0.5, + "grad_norm": 6.660862445831299, + "learning_rate": 1.9002446828533787e-05, + "loss": 2.5895, + "step": 2650 + }, + { + "epoch": 0.5, + "grad_norm": 5.114608287811279, + "learning_rate": 1.899868247694335e-05, + "loss": 2.4963, + "step": 2660 + }, + { + "epoch": 0.5, + "grad_norm": 11.300898551940918, + "learning_rate": 1.899491812535291e-05, + "loss": 2.7821, + "step": 2670 + }, + { + "epoch": 0.5, + "grad_norm": 5.744706630706787, + "learning_rate": 1.8991153773762473e-05, + "loss": 2.5695, + "step": 2680 + }, + { + "epoch": 0.51, + "grad_norm": 7.060439586639404, + "learning_rate": 1.8987389422172033e-05, + "loss": 2.4313, + "step": 2690 + }, + { + "epoch": 0.51, + "grad_norm": 4.207651138305664, + "learning_rate": 1.8983625070581596e-05, + "loss": 2.4718, + "step": 2700 + }, + { + "epoch": 0.51, + "grad_norm": 6.556578636169434, + "learning_rate": 1.8979860718991156e-05, + "loss": 2.5789, + "step": 2710 + }, + { + "epoch": 0.51, + "grad_norm": 7.473084449768066, + "learning_rate": 1.8976096367400716e-05, + "loss": 2.6073, + "step": 2720 + }, + { + "epoch": 0.51, + "grad_norm": 8.552806854248047, + "learning_rate": 1.897233201581028e-05, + "loss": 2.568, + "step": 2730 + }, + { + "epoch": 0.52, + "grad_norm": 8.12427043914795, + "learning_rate": 1.896856766421984e-05, + "loss": 2.4706, + "step": 2740 + }, + { + "epoch": 0.52, + "grad_norm": 7.1217474937438965, + "learning_rate": 1.89648033126294e-05, + "loss": 2.4005, + "step": 2750 + }, + { + "epoch": 0.52, + "grad_norm": 3.9127049446105957, + "learning_rate": 1.896103896103896e-05, + "loss": 2.517, + "step": 2760 + }, + { + "epoch": 0.52, + "grad_norm": 15.194258689880371, + "learning_rate": 1.8957274609448525e-05, + "loss": 2.5601, + "step": 2770 + }, + { + "epoch": 0.52, + "grad_norm": 12.40319538116455, + "learning_rate": 1.8953510257858084e-05, + "loss": 2.3259, + "step": 2780 + }, + { + "epoch": 0.53, + "grad_norm": 10.481034278869629, + "learning_rate": 1.8949745906267647e-05, + "loss": 2.4099, + "step": 2790 + }, + { + "epoch": 0.53, + "grad_norm": 4.578127861022949, + "learning_rate": 1.8945981554677207e-05, + "loss": 2.416, + "step": 2800 + }, + { + "epoch": 0.53, + "grad_norm": 15.104098320007324, + "learning_rate": 1.894221720308677e-05, + "loss": 2.4252, + "step": 2810 + }, + { + "epoch": 0.53, + "grad_norm": 7.515500545501709, + "learning_rate": 1.893845285149633e-05, + "loss": 2.4741, + "step": 2820 + }, + { + "epoch": 0.53, + "grad_norm": 3.786322593688965, + "learning_rate": 1.8934688499905893e-05, + "loss": 2.5193, + "step": 2830 + }, + { + "epoch": 0.53, + "grad_norm": 3.358415365219116, + "learning_rate": 1.8930924148315456e-05, + "loss": 2.603, + "step": 2840 + }, + { + "epoch": 0.54, + "grad_norm": 7.374852180480957, + "learning_rate": 1.8927159796725016e-05, + "loss": 2.5356, + "step": 2850 + }, + { + "epoch": 0.54, + "grad_norm": 8.016167640686035, + "learning_rate": 1.892339544513458e-05, + "loss": 2.4606, + "step": 2860 + }, + { + "epoch": 0.54, + "grad_norm": 5.327517509460449, + "learning_rate": 1.891963109354414e-05, + "loss": 2.3619, + "step": 2870 + }, + { + "epoch": 0.54, + "grad_norm": 9.286544799804688, + "learning_rate": 1.89158667419537e-05, + "loss": 2.4749, + "step": 2880 + }, + { + "epoch": 0.54, + "grad_norm": 7.052702903747559, + "learning_rate": 1.8912102390363262e-05, + "loss": 2.5101, + "step": 2890 + }, + { + "epoch": 0.55, + "grad_norm": 24.803468704223633, + "learning_rate": 1.8908338038772822e-05, + "loss": 2.4248, + "step": 2900 + }, + { + "epoch": 0.55, + "grad_norm": 2.5042014122009277, + "learning_rate": 1.8904573687182385e-05, + "loss": 2.3821, + "step": 2910 + }, + { + "epoch": 0.55, + "grad_norm": 6.960639476776123, + "learning_rate": 1.8900809335591945e-05, + "loss": 2.3184, + "step": 2920 + }, + { + "epoch": 0.55, + "grad_norm": 8.575072288513184, + "learning_rate": 1.8897044984001508e-05, + "loss": 2.5079, + "step": 2930 + }, + { + "epoch": 0.55, + "grad_norm": 9.817319869995117, + "learning_rate": 1.8893280632411068e-05, + "loss": 2.3782, + "step": 2940 + }, + { + "epoch": 0.56, + "grad_norm": 11.542531967163086, + "learning_rate": 1.888951628082063e-05, + "loss": 2.3612, + "step": 2950 + }, + { + "epoch": 0.56, + "grad_norm": 8.432719230651855, + "learning_rate": 1.888575192923019e-05, + "loss": 2.6094, + "step": 2960 + }, + { + "epoch": 0.56, + "grad_norm": 7.157316207885742, + "learning_rate": 1.8881987577639754e-05, + "loss": 2.3173, + "step": 2970 + }, + { + "epoch": 0.56, + "grad_norm": 7.01808500289917, + "learning_rate": 1.8878223226049314e-05, + "loss": 2.4611, + "step": 2980 + }, + { + "epoch": 0.56, + "grad_norm": 7.0088324546813965, + "learning_rate": 1.8874458874458877e-05, + "loss": 2.3906, + "step": 2990 + }, + { + "epoch": 0.56, + "grad_norm": 5.18332052230835, + "learning_rate": 1.8870694522868436e-05, + "loss": 2.699, + "step": 3000 + }, + { + "epoch": 0.57, + "grad_norm": 11.170907974243164, + "learning_rate": 1.8866930171278e-05, + "loss": 2.1945, + "step": 3010 + }, + { + "epoch": 0.57, + "grad_norm": 8.540265083312988, + "learning_rate": 1.8863165819687563e-05, + "loss": 2.3214, + "step": 3020 + }, + { + "epoch": 0.57, + "grad_norm": 6.900252342224121, + "learning_rate": 1.8859401468097123e-05, + "loss": 2.4009, + "step": 3030 + }, + { + "epoch": 0.57, + "grad_norm": 7.43867826461792, + "learning_rate": 1.8855637116506686e-05, + "loss": 2.5143, + "step": 3040 + }, + { + "epoch": 0.57, + "grad_norm": 5.265262126922607, + "learning_rate": 1.8851872764916242e-05, + "loss": 2.3603, + "step": 3050 + }, + { + "epoch": 0.58, + "grad_norm": 7.436113357543945, + "learning_rate": 1.8848108413325805e-05, + "loss": 2.1487, + "step": 3060 + }, + { + "epoch": 0.58, + "grad_norm": 6.59679651260376, + "learning_rate": 1.884434406173537e-05, + "loss": 2.3253, + "step": 3070 + }, + { + "epoch": 0.58, + "grad_norm": 5.955831527709961, + "learning_rate": 1.8840579710144928e-05, + "loss": 2.2582, + "step": 3080 + }, + { + "epoch": 0.58, + "grad_norm": 8.666378021240234, + "learning_rate": 1.883681535855449e-05, + "loss": 2.2552, + "step": 3090 + }, + { + "epoch": 0.58, + "grad_norm": 8.003193855285645, + "learning_rate": 1.883305100696405e-05, + "loss": 2.2303, + "step": 3100 + }, + { + "epoch": 0.59, + "grad_norm": 6.098687648773193, + "learning_rate": 1.8829286655373614e-05, + "loss": 2.4797, + "step": 3110 + }, + { + "epoch": 0.59, + "grad_norm": 4.299673557281494, + "learning_rate": 1.8825522303783174e-05, + "loss": 2.3256, + "step": 3120 + }, + { + "epoch": 0.59, + "grad_norm": 5.408268451690674, + "learning_rate": 1.8821757952192737e-05, + "loss": 2.2441, + "step": 3130 + }, + { + "epoch": 0.59, + "grad_norm": 8.5181303024292, + "learning_rate": 1.8817993600602297e-05, + "loss": 2.2284, + "step": 3140 + }, + { + "epoch": 0.59, + "grad_norm": 9.531221389770508, + "learning_rate": 1.881422924901186e-05, + "loss": 2.3657, + "step": 3150 + }, + { + "epoch": 0.59, + "grad_norm": 5.5880889892578125, + "learning_rate": 1.881046489742142e-05, + "loss": 2.3858, + "step": 3160 + }, + { + "epoch": 0.6, + "grad_norm": 8.181473731994629, + "learning_rate": 1.8806700545830983e-05, + "loss": 2.3284, + "step": 3170 + }, + { + "epoch": 0.6, + "grad_norm": 11.959000587463379, + "learning_rate": 1.8802936194240543e-05, + "loss": 2.265, + "step": 3180 + }, + { + "epoch": 0.6, + "grad_norm": 4.405294418334961, + "learning_rate": 1.8799171842650106e-05, + "loss": 2.4588, + "step": 3190 + }, + { + "epoch": 0.6, + "grad_norm": 5.867427825927734, + "learning_rate": 1.8795407491059666e-05, + "loss": 2.2892, + "step": 3200 + }, + { + "epoch": 0.6, + "grad_norm": 4.539100646972656, + "learning_rate": 1.879164313946923e-05, + "loss": 2.033, + "step": 3210 + }, + { + "epoch": 0.61, + "grad_norm": 9.679889678955078, + "learning_rate": 1.8787878787878792e-05, + "loss": 2.344, + "step": 3220 + }, + { + "epoch": 0.61, + "grad_norm": 9.749494552612305, + "learning_rate": 1.878411443628835e-05, + "loss": 2.1338, + "step": 3230 + }, + { + "epoch": 0.61, + "grad_norm": 11.74128246307373, + "learning_rate": 1.878035008469791e-05, + "loss": 2.4541, + "step": 3240 + }, + { + "epoch": 0.61, + "grad_norm": 18.931474685668945, + "learning_rate": 1.8776585733107475e-05, + "loss": 2.2487, + "step": 3250 + }, + { + "epoch": 0.61, + "grad_norm": 3.824002265930176, + "learning_rate": 1.8772821381517034e-05, + "loss": 2.407, + "step": 3260 + }, + { + "epoch": 0.62, + "grad_norm": 8.731551170349121, + "learning_rate": 1.8769057029926598e-05, + "loss": 2.2347, + "step": 3270 + }, + { + "epoch": 0.62, + "grad_norm": 4.22593879699707, + "learning_rate": 1.8765292678336157e-05, + "loss": 2.0125, + "step": 3280 + }, + { + "epoch": 0.62, + "grad_norm": 5.457214832305908, + "learning_rate": 1.876152832674572e-05, + "loss": 2.1522, + "step": 3290 + }, + { + "epoch": 0.62, + "grad_norm": 9.144113540649414, + "learning_rate": 1.875776397515528e-05, + "loss": 2.4606, + "step": 3300 + }, + { + "epoch": 0.62, + "grad_norm": 3.9879465103149414, + "learning_rate": 1.8753999623564843e-05, + "loss": 2.0853, + "step": 3310 + }, + { + "epoch": 0.62, + "grad_norm": 8.082098960876465, + "learning_rate": 1.8750235271974403e-05, + "loss": 2.1247, + "step": 3320 + }, + { + "epoch": 0.63, + "grad_norm": 13.393769264221191, + "learning_rate": 1.8746470920383966e-05, + "loss": 2.1507, + "step": 3330 + }, + { + "epoch": 0.63, + "grad_norm": 3.922086477279663, + "learning_rate": 1.8742706568793526e-05, + "loss": 2.2208, + "step": 3340 + }, + { + "epoch": 0.63, + "grad_norm": 7.737308025360107, + "learning_rate": 1.873894221720309e-05, + "loss": 2.2848, + "step": 3350 + }, + { + "epoch": 0.63, + "grad_norm": 18.515275955200195, + "learning_rate": 1.873517786561265e-05, + "loss": 2.0712, + "step": 3360 + }, + { + "epoch": 0.63, + "grad_norm": 9.918493270874023, + "learning_rate": 1.8731413514022212e-05, + "loss": 2.1302, + "step": 3370 + }, + { + "epoch": 0.64, + "grad_norm": 3.196310043334961, + "learning_rate": 1.8727649162431772e-05, + "loss": 2.1687, + "step": 3380 + }, + { + "epoch": 0.64, + "grad_norm": 5.755160808563232, + "learning_rate": 1.8723884810841335e-05, + "loss": 2.3199, + "step": 3390 + }, + { + "epoch": 0.64, + "grad_norm": 6.229613780975342, + "learning_rate": 1.8720120459250895e-05, + "loss": 2.0273, + "step": 3400 + }, + { + "epoch": 0.64, + "grad_norm": 4.063007831573486, + "learning_rate": 1.8716356107660455e-05, + "loss": 2.2429, + "step": 3410 + }, + { + "epoch": 0.64, + "grad_norm": 4.2730302810668945, + "learning_rate": 1.8712591756070018e-05, + "loss": 2.2487, + "step": 3420 + }, + { + "epoch": 0.65, + "grad_norm": 5.997701644897461, + "learning_rate": 1.8708827404479578e-05, + "loss": 2.288, + "step": 3430 + }, + { + "epoch": 0.65, + "grad_norm": 13.065136909484863, + "learning_rate": 1.870506305288914e-05, + "loss": 2.2238, + "step": 3440 + }, + { + "epoch": 0.65, + "grad_norm": 7.368946075439453, + "learning_rate": 1.8701298701298704e-05, + "loss": 2.2514, + "step": 3450 + }, + { + "epoch": 0.65, + "grad_norm": 5.36475944519043, + "learning_rate": 1.8697534349708264e-05, + "loss": 2.1219, + "step": 3460 + }, + { + "epoch": 0.65, + "grad_norm": 6.188838958740234, + "learning_rate": 1.8693769998117827e-05, + "loss": 2.2102, + "step": 3470 + }, + { + "epoch": 0.65, + "grad_norm": 12.989961624145508, + "learning_rate": 1.8690005646527387e-05, + "loss": 2.2069, + "step": 3480 + }, + { + "epoch": 0.66, + "grad_norm": 6.382188320159912, + "learning_rate": 1.868624129493695e-05, + "loss": 2.0466, + "step": 3490 + }, + { + "epoch": 0.66, + "grad_norm": 6.724734306335449, + "learning_rate": 1.868247694334651e-05, + "loss": 2.0416, + "step": 3500 + }, + { + "epoch": 0.66, + "grad_norm": 10.628301620483398, + "learning_rate": 1.8678712591756073e-05, + "loss": 2.3185, + "step": 3510 + }, + { + "epoch": 0.66, + "grad_norm": 3.1391384601593018, + "learning_rate": 1.8674948240165632e-05, + "loss": 1.9034, + "step": 3520 + }, + { + "epoch": 0.66, + "grad_norm": 9.9121732711792, + "learning_rate": 1.8671183888575196e-05, + "loss": 2.1419, + "step": 3530 + }, + { + "epoch": 0.67, + "grad_norm": 11.588180541992188, + "learning_rate": 1.8667419536984755e-05, + "loss": 2.1772, + "step": 3540 + }, + { + "epoch": 0.67, + "grad_norm": 8.538999557495117, + "learning_rate": 1.866365518539432e-05, + "loss": 2.1558, + "step": 3550 + }, + { + "epoch": 0.67, + "grad_norm": 6.513119697570801, + "learning_rate": 1.8659890833803878e-05, + "loss": 2.2185, + "step": 3560 + }, + { + "epoch": 0.67, + "grad_norm": 9.248276710510254, + "learning_rate": 1.865612648221344e-05, + "loss": 2.0644, + "step": 3570 + }, + { + "epoch": 0.67, + "grad_norm": 8.285011291503906, + "learning_rate": 1.8652362130623e-05, + "loss": 2.1054, + "step": 3580 + }, + { + "epoch": 0.68, + "grad_norm": 6.439683437347412, + "learning_rate": 1.864859777903256e-05, + "loss": 2.123, + "step": 3590 + }, + { + "epoch": 0.68, + "grad_norm": 8.896665573120117, + "learning_rate": 1.8644833427442124e-05, + "loss": 2.1271, + "step": 3600 + }, + { + "epoch": 0.68, + "grad_norm": 10.368461608886719, + "learning_rate": 1.8641069075851684e-05, + "loss": 1.907, + "step": 3610 + }, + { + "epoch": 0.68, + "grad_norm": 3.7022368907928467, + "learning_rate": 1.8637304724261247e-05, + "loss": 1.9676, + "step": 3620 + }, + { + "epoch": 0.68, + "grad_norm": 11.112764358520508, + "learning_rate": 1.863354037267081e-05, + "loss": 1.9633, + "step": 3630 + }, + { + "epoch": 0.69, + "grad_norm": 10.671046257019043, + "learning_rate": 1.862977602108037e-05, + "loss": 2.1857, + "step": 3640 + }, + { + "epoch": 0.69, + "grad_norm": 4.5004682540893555, + "learning_rate": 1.8626011669489933e-05, + "loss": 1.9923, + "step": 3650 + }, + { + "epoch": 0.69, + "grad_norm": 27.06603240966797, + "learning_rate": 1.8622247317899493e-05, + "loss": 2.2683, + "step": 3660 + }, + { + "epoch": 0.69, + "grad_norm": 10.998562812805176, + "learning_rate": 1.8618482966309056e-05, + "loss": 2.1817, + "step": 3670 + }, + { + "epoch": 0.69, + "grad_norm": 6.846683979034424, + "learning_rate": 1.8614718614718616e-05, + "loss": 2.0747, + "step": 3680 + }, + { + "epoch": 0.69, + "grad_norm": 8.303011894226074, + "learning_rate": 1.861095426312818e-05, + "loss": 2.0505, + "step": 3690 + }, + { + "epoch": 0.7, + "grad_norm": 14.131027221679688, + "learning_rate": 1.860718991153774e-05, + "loss": 2.143, + "step": 3700 + }, + { + "epoch": 0.7, + "grad_norm": 6.333893775939941, + "learning_rate": 1.8603425559947302e-05, + "loss": 2.0173, + "step": 3710 + }, + { + "epoch": 0.7, + "grad_norm": 13.341752052307129, + "learning_rate": 1.859966120835686e-05, + "loss": 2.0513, + "step": 3720 + }, + { + "epoch": 0.7, + "grad_norm": 8.092610359191895, + "learning_rate": 1.8595896856766425e-05, + "loss": 1.966, + "step": 3730 + }, + { + "epoch": 0.7, + "grad_norm": 6.7696356773376465, + "learning_rate": 1.8592132505175985e-05, + "loss": 2.0219, + "step": 3740 + }, + { + "epoch": 0.71, + "grad_norm": 5.491209030151367, + "learning_rate": 1.8588368153585544e-05, + "loss": 2.062, + "step": 3750 + }, + { + "epoch": 0.71, + "grad_norm": 8.02304458618164, + "learning_rate": 1.8584603801995107e-05, + "loss": 2.1111, + "step": 3760 + }, + { + "epoch": 0.71, + "grad_norm": 8.15708065032959, + "learning_rate": 1.8580839450404667e-05, + "loss": 1.8681, + "step": 3770 + }, + { + "epoch": 0.71, + "grad_norm": 14.145309448242188, + "learning_rate": 1.857707509881423e-05, + "loss": 1.9796, + "step": 3780 + }, + { + "epoch": 0.71, + "grad_norm": 3.182771682739258, + "learning_rate": 1.857331074722379e-05, + "loss": 2.2717, + "step": 3790 + }, + { + "epoch": 0.72, + "grad_norm": 6.4951090812683105, + "learning_rate": 1.8569546395633353e-05, + "loss": 2.2107, + "step": 3800 + }, + { + "epoch": 0.72, + "grad_norm": 15.642072677612305, + "learning_rate": 1.8565782044042916e-05, + "loss": 2.1561, + "step": 3810 + }, + { + "epoch": 0.72, + "grad_norm": 3.078226327896118, + "learning_rate": 1.8562017692452476e-05, + "loss": 1.9989, + "step": 3820 + }, + { + "epoch": 0.72, + "grad_norm": 4.49596643447876, + "learning_rate": 1.855825334086204e-05, + "loss": 1.9954, + "step": 3830 + }, + { + "epoch": 0.72, + "grad_norm": 11.259866714477539, + "learning_rate": 1.85544889892716e-05, + "loss": 1.8364, + "step": 3840 + }, + { + "epoch": 0.72, + "grad_norm": 9.790380477905273, + "learning_rate": 1.8550724637681162e-05, + "loss": 2.0872, + "step": 3850 + }, + { + "epoch": 0.73, + "grad_norm": 10.723867416381836, + "learning_rate": 1.8546960286090722e-05, + "loss": 1.9063, + "step": 3860 + }, + { + "epoch": 0.73, + "grad_norm": 5.424127101898193, + "learning_rate": 1.8543195934500285e-05, + "loss": 1.9285, + "step": 3870 + }, + { + "epoch": 0.73, + "grad_norm": 6.509077548980713, + "learning_rate": 1.8539431582909845e-05, + "loss": 1.9326, + "step": 3880 + }, + { + "epoch": 0.73, + "grad_norm": 9.685775756835938, + "learning_rate": 1.8535667231319408e-05, + "loss": 2.2902, + "step": 3890 + }, + { + "epoch": 0.73, + "grad_norm": 11.874910354614258, + "learning_rate": 1.8531902879728968e-05, + "loss": 1.8581, + "step": 3900 + }, + { + "epoch": 0.74, + "grad_norm": 6.72626256942749, + "learning_rate": 1.852813852813853e-05, + "loss": 2.0148, + "step": 3910 + }, + { + "epoch": 0.74, + "grad_norm": 21.550472259521484, + "learning_rate": 1.852437417654809e-05, + "loss": 1.9228, + "step": 3920 + }, + { + "epoch": 0.74, + "grad_norm": 10.180908203125, + "learning_rate": 1.852060982495765e-05, + "loss": 1.94, + "step": 3930 + }, + { + "epoch": 0.74, + "grad_norm": 2.4823896884918213, + "learning_rate": 1.8516845473367214e-05, + "loss": 1.8551, + "step": 3940 + }, + { + "epoch": 0.74, + "grad_norm": 37.56202697753906, + "learning_rate": 1.8513081121776774e-05, + "loss": 2.1671, + "step": 3950 + }, + { + "epoch": 0.75, + "grad_norm": 13.235817909240723, + "learning_rate": 1.8509316770186337e-05, + "loss": 1.9974, + "step": 3960 + }, + { + "epoch": 0.75, + "grad_norm": 13.139102935791016, + "learning_rate": 1.8505552418595896e-05, + "loss": 2.0052, + "step": 3970 + }, + { + "epoch": 0.75, + "grad_norm": 4.566539287567139, + "learning_rate": 1.850178806700546e-05, + "loss": 1.8644, + "step": 3980 + }, + { + "epoch": 0.75, + "grad_norm": 12.045722007751465, + "learning_rate": 1.849802371541502e-05, + "loss": 1.9597, + "step": 3990 + }, + { + "epoch": 0.75, + "grad_norm": 9.117792129516602, + "learning_rate": 1.8494259363824583e-05, + "loss": 1.9613, + "step": 4000 + }, + { + "epoch": 0.75, + "grad_norm": 30.70831298828125, + "learning_rate": 1.8490495012234146e-05, + "loss": 1.9947, + "step": 4010 + }, + { + "epoch": 0.76, + "grad_norm": 8.068875312805176, + "learning_rate": 1.8486730660643705e-05, + "loss": 2.0513, + "step": 4020 + }, + { + "epoch": 0.76, + "grad_norm": 6.033820629119873, + "learning_rate": 1.848296630905327e-05, + "loss": 2.1018, + "step": 4030 + }, + { + "epoch": 0.76, + "grad_norm": 7.325448036193848, + "learning_rate": 1.847920195746283e-05, + "loss": 2.0123, + "step": 4040 + }, + { + "epoch": 0.76, + "grad_norm": 4.722583770751953, + "learning_rate": 1.847543760587239e-05, + "loss": 2.086, + "step": 4050 + }, + { + "epoch": 0.76, + "grad_norm": 5.869627952575684, + "learning_rate": 1.847167325428195e-05, + "loss": 1.8988, + "step": 4060 + }, + { + "epoch": 0.77, + "grad_norm": 5.616646766662598, + "learning_rate": 1.8467908902691514e-05, + "loss": 1.9823, + "step": 4070 + }, + { + "epoch": 0.77, + "grad_norm": 7.294058799743652, + "learning_rate": 1.8464144551101074e-05, + "loss": 1.97, + "step": 4080 + }, + { + "epoch": 0.77, + "grad_norm": 3.1566827297210693, + "learning_rate": 1.8460380199510637e-05, + "loss": 1.9226, + "step": 4090 + }, + { + "epoch": 0.77, + "grad_norm": 8.810595512390137, + "learning_rate": 1.8456615847920197e-05, + "loss": 2.0752, + "step": 4100 + }, + { + "epoch": 0.77, + "grad_norm": 8.686856269836426, + "learning_rate": 1.8452851496329757e-05, + "loss": 2.0722, + "step": 4110 + }, + { + "epoch": 0.78, + "grad_norm": 8.495172500610352, + "learning_rate": 1.844908714473932e-05, + "loss": 1.7657, + "step": 4120 + }, + { + "epoch": 0.78, + "grad_norm": 7.293805122375488, + "learning_rate": 1.844532279314888e-05, + "loss": 1.6995, + "step": 4130 + }, + { + "epoch": 0.78, + "grad_norm": 8.600557327270508, + "learning_rate": 1.8441558441558443e-05, + "loss": 1.7201, + "step": 4140 + }, + { + "epoch": 0.78, + "grad_norm": 9.685958862304688, + "learning_rate": 1.8437794089968003e-05, + "loss": 1.7472, + "step": 4150 + }, + { + "epoch": 0.78, + "grad_norm": 20.721675872802734, + "learning_rate": 1.8434029738377566e-05, + "loss": 1.8939, + "step": 4160 + }, + { + "epoch": 0.78, + "grad_norm": 12.649245262145996, + "learning_rate": 1.8430265386787126e-05, + "loss": 1.8974, + "step": 4170 + }, + { + "epoch": 0.79, + "grad_norm": 17.670217514038086, + "learning_rate": 1.842650103519669e-05, + "loss": 1.7713, + "step": 4180 + }, + { + "epoch": 0.79, + "grad_norm": 13.489444732666016, + "learning_rate": 1.8422736683606252e-05, + "loss": 2.0021, + "step": 4190 + }, + { + "epoch": 0.79, + "grad_norm": 15.630731582641602, + "learning_rate": 1.8418972332015812e-05, + "loss": 1.9024, + "step": 4200 + }, + { + "epoch": 0.79, + "grad_norm": 11.856411933898926, + "learning_rate": 1.8415207980425375e-05, + "loss": 2.1115, + "step": 4210 + }, + { + "epoch": 0.79, + "grad_norm": 8.349586486816406, + "learning_rate": 1.8411443628834935e-05, + "loss": 1.7986, + "step": 4220 + }, + { + "epoch": 0.8, + "grad_norm": 10.628458023071289, + "learning_rate": 1.8407679277244498e-05, + "loss": 1.7519, + "step": 4230 + }, + { + "epoch": 0.8, + "grad_norm": 5.205367565155029, + "learning_rate": 1.8403914925654058e-05, + "loss": 1.8736, + "step": 4240 + }, + { + "epoch": 0.8, + "grad_norm": 11.882181167602539, + "learning_rate": 1.840015057406362e-05, + "loss": 1.9139, + "step": 4250 + }, + { + "epoch": 0.8, + "grad_norm": 8.525898933410645, + "learning_rate": 1.839638622247318e-05, + "loss": 2.0817, + "step": 4260 + }, + { + "epoch": 0.8, + "grad_norm": 11.283459663391113, + "learning_rate": 1.839262187088274e-05, + "loss": 1.8557, + "step": 4270 + }, + { + "epoch": 0.81, + "grad_norm": 12.506168365478516, + "learning_rate": 1.8388857519292303e-05, + "loss": 1.9177, + "step": 4280 + }, + { + "epoch": 0.81, + "grad_norm": 9.862067222595215, + "learning_rate": 1.8385093167701863e-05, + "loss": 2.0223, + "step": 4290 + }, + { + "epoch": 0.81, + "grad_norm": 6.25103759765625, + "learning_rate": 1.8381328816111426e-05, + "loss": 2.045, + "step": 4300 + }, + { + "epoch": 0.81, + "grad_norm": 7.032151699066162, + "learning_rate": 1.8377564464520986e-05, + "loss": 1.8646, + "step": 4310 + }, + { + "epoch": 0.81, + "grad_norm": 9.486144065856934, + "learning_rate": 1.837380011293055e-05, + "loss": 1.6828, + "step": 4320 + }, + { + "epoch": 0.81, + "grad_norm": 10.718717575073242, + "learning_rate": 1.837003576134011e-05, + "loss": 2.1125, + "step": 4330 + }, + { + "epoch": 0.82, + "grad_norm": 12.391996383666992, + "learning_rate": 1.8366271409749672e-05, + "loss": 1.7862, + "step": 4340 + }, + { + "epoch": 0.82, + "grad_norm": 9.340251922607422, + "learning_rate": 1.8362507058159232e-05, + "loss": 1.8666, + "step": 4350 + }, + { + "epoch": 0.82, + "grad_norm": 13.326081275939941, + "learning_rate": 1.8358742706568795e-05, + "loss": 1.7263, + "step": 4360 + }, + { + "epoch": 0.82, + "grad_norm": 7.439601898193359, + "learning_rate": 1.8354978354978358e-05, + "loss": 1.9045, + "step": 4370 + }, + { + "epoch": 0.82, + "grad_norm": 7.711633682250977, + "learning_rate": 1.8351214003387918e-05, + "loss": 1.6444, + "step": 4380 + }, + { + "epoch": 0.83, + "grad_norm": 27.964046478271484, + "learning_rate": 1.834744965179748e-05, + "loss": 1.9252, + "step": 4390 + }, + { + "epoch": 0.83, + "grad_norm": 23.466896057128906, + "learning_rate": 1.834368530020704e-05, + "loss": 1.9257, + "step": 4400 + }, + { + "epoch": 0.83, + "grad_norm": 8.017694473266602, + "learning_rate": 1.8339920948616604e-05, + "loss": 1.8272, + "step": 4410 + }, + { + "epoch": 0.83, + "grad_norm": 9.13419246673584, + "learning_rate": 1.8336156597026164e-05, + "loss": 1.8617, + "step": 4420 + }, + { + "epoch": 0.83, + "grad_norm": 4.761314392089844, + "learning_rate": 1.8332392245435727e-05, + "loss": 1.6775, + "step": 4430 + }, + { + "epoch": 0.84, + "grad_norm": 19.422325134277344, + "learning_rate": 1.8328627893845287e-05, + "loss": 2.015, + "step": 4440 + }, + { + "epoch": 0.84, + "grad_norm": 7.556143760681152, + "learning_rate": 1.8324863542254847e-05, + "loss": 1.7302, + "step": 4450 + }, + { + "epoch": 0.84, + "grad_norm": 13.221558570861816, + "learning_rate": 1.832109919066441e-05, + "loss": 1.6569, + "step": 4460 + }, + { + "epoch": 0.84, + "grad_norm": 8.56251335144043, + "learning_rate": 1.831733483907397e-05, + "loss": 1.7976, + "step": 4470 + }, + { + "epoch": 0.84, + "grad_norm": 15.487319946289062, + "learning_rate": 1.8313570487483533e-05, + "loss": 2.0049, + "step": 4480 + }, + { + "epoch": 0.85, + "grad_norm": 12.269038200378418, + "learning_rate": 1.8309806135893092e-05, + "loss": 1.6322, + "step": 4490 + }, + { + "epoch": 0.85, + "grad_norm": 10.545343399047852, + "learning_rate": 1.8306041784302656e-05, + "loss": 1.7523, + "step": 4500 + }, + { + "epoch": 0.85, + "grad_norm": 11.680045127868652, + "learning_rate": 1.8302277432712215e-05, + "loss": 1.7489, + "step": 4510 + }, + { + "epoch": 0.85, + "grad_norm": 13.965753555297852, + "learning_rate": 1.829851308112178e-05, + "loss": 1.7063, + "step": 4520 + }, + { + "epoch": 0.85, + "grad_norm": 14.397643089294434, + "learning_rate": 1.8294748729531338e-05, + "loss": 1.8438, + "step": 4530 + }, + { + "epoch": 0.85, + "grad_norm": 15.942564010620117, + "learning_rate": 1.82909843779409e-05, + "loss": 1.8897, + "step": 4540 + }, + { + "epoch": 0.86, + "grad_norm": 4.38701868057251, + "learning_rate": 1.8287220026350465e-05, + "loss": 1.8604, + "step": 4550 + }, + { + "epoch": 0.86, + "grad_norm": 2.3334455490112305, + "learning_rate": 1.8283455674760024e-05, + "loss": 1.6651, + "step": 4560 + }, + { + "epoch": 0.86, + "grad_norm": 12.184940338134766, + "learning_rate": 1.8279691323169587e-05, + "loss": 2.0425, + "step": 4570 + }, + { + "epoch": 0.86, + "grad_norm": 29.6932430267334, + "learning_rate": 1.8275926971579147e-05, + "loss": 1.7136, + "step": 4580 + }, + { + "epoch": 0.86, + "grad_norm": 9.31380844116211, + "learning_rate": 1.827216261998871e-05, + "loss": 1.6088, + "step": 4590 + }, + { + "epoch": 0.87, + "grad_norm": 6.644775390625, + "learning_rate": 1.826839826839827e-05, + "loss": 1.7825, + "step": 4600 + }, + { + "epoch": 0.87, + "grad_norm": 14.836319923400879, + "learning_rate": 1.8264633916807833e-05, + "loss": 1.7212, + "step": 4610 + }, + { + "epoch": 0.87, + "grad_norm": 10.639488220214844, + "learning_rate": 1.8260869565217393e-05, + "loss": 1.6093, + "step": 4620 + }, + { + "epoch": 0.87, + "grad_norm": 10.97711181640625, + "learning_rate": 1.8257105213626953e-05, + "loss": 1.6979, + "step": 4630 + }, + { + "epoch": 0.87, + "grad_norm": 18.360660552978516, + "learning_rate": 1.8253340862036516e-05, + "loss": 1.8738, + "step": 4640 + }, + { + "epoch": 0.88, + "grad_norm": 9.36877727508545, + "learning_rate": 1.8249576510446076e-05, + "loss": 1.6484, + "step": 4650 + }, + { + "epoch": 0.88, + "grad_norm": 13.112272262573242, + "learning_rate": 1.824581215885564e-05, + "loss": 1.6616, + "step": 4660 + }, + { + "epoch": 0.88, + "grad_norm": 6.932171821594238, + "learning_rate": 1.82420478072652e-05, + "loss": 1.6586, + "step": 4670 + }, + { + "epoch": 0.88, + "grad_norm": 8.852755546569824, + "learning_rate": 1.8238283455674762e-05, + "loss": 1.8107, + "step": 4680 + }, + { + "epoch": 0.88, + "grad_norm": 7.760582447052002, + "learning_rate": 1.823451910408432e-05, + "loss": 1.7808, + "step": 4690 + }, + { + "epoch": 0.88, + "grad_norm": 8.635812759399414, + "learning_rate": 1.8230754752493885e-05, + "loss": 1.6713, + "step": 4700 + }, + { + "epoch": 0.89, + "grad_norm": 16.030855178833008, + "learning_rate": 1.8226990400903445e-05, + "loss": 1.6924, + "step": 4710 + }, + { + "epoch": 0.89, + "grad_norm": 16.42034149169922, + "learning_rate": 1.8223226049313008e-05, + "loss": 1.8233, + "step": 4720 + }, + { + "epoch": 0.89, + "grad_norm": 7.589072227478027, + "learning_rate": 1.8219461697722567e-05, + "loss": 1.4428, + "step": 4730 + }, + { + "epoch": 0.89, + "grad_norm": 14.824429512023926, + "learning_rate": 1.821569734613213e-05, + "loss": 1.7412, + "step": 4740 + }, + { + "epoch": 0.89, + "grad_norm": 2.6960065364837646, + "learning_rate": 1.8211932994541694e-05, + "loss": 1.7846, + "step": 4750 + }, + { + "epoch": 0.9, + "grad_norm": 9.252869606018066, + "learning_rate": 1.8208168642951254e-05, + "loss": 1.6731, + "step": 4760 + }, + { + "epoch": 0.9, + "grad_norm": 12.624210357666016, + "learning_rate": 1.8204404291360817e-05, + "loss": 1.7592, + "step": 4770 + }, + { + "epoch": 0.9, + "grad_norm": 5.478265762329102, + "learning_rate": 1.8200639939770376e-05, + "loss": 1.6657, + "step": 4780 + }, + { + "epoch": 0.9, + "grad_norm": 11.12108039855957, + "learning_rate": 1.819687558817994e-05, + "loss": 1.6616, + "step": 4790 + }, + { + "epoch": 0.9, + "grad_norm": 5.339282035827637, + "learning_rate": 1.81931112365895e-05, + "loss": 1.5382, + "step": 4800 + }, + { + "epoch": 0.91, + "grad_norm": 30.758148193359375, + "learning_rate": 1.818934688499906e-05, + "loss": 1.7256, + "step": 4810 + }, + { + "epoch": 0.91, + "grad_norm": 3.9476451873779297, + "learning_rate": 1.8185582533408622e-05, + "loss": 1.5978, + "step": 4820 + }, + { + "epoch": 0.91, + "grad_norm": 2.4273171424865723, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.4934, + "step": 4830 + }, + { + "epoch": 0.91, + "grad_norm": 12.683277130126953, + "learning_rate": 1.8178053830227745e-05, + "loss": 1.8227, + "step": 4840 + }, + { + "epoch": 0.91, + "grad_norm": 7.911778450012207, + "learning_rate": 1.8174289478637305e-05, + "loss": 1.5653, + "step": 4850 + }, + { + "epoch": 0.91, + "grad_norm": 30.177553176879883, + "learning_rate": 1.8170525127046868e-05, + "loss": 1.6419, + "step": 4860 + }, + { + "epoch": 0.92, + "grad_norm": 22.67075538635254, + "learning_rate": 1.8166760775456428e-05, + "loss": 1.8511, + "step": 4870 + }, + { + "epoch": 0.92, + "grad_norm": 16.48545265197754, + "learning_rate": 1.816299642386599e-05, + "loss": 1.2912, + "step": 4880 + }, + { + "epoch": 0.92, + "grad_norm": 13.775650978088379, + "learning_rate": 1.815923207227555e-05, + "loss": 1.7698, + "step": 4890 + }, + { + "epoch": 0.92, + "grad_norm": 6.288990020751953, + "learning_rate": 1.8155467720685114e-05, + "loss": 1.5706, + "step": 4900 + }, + { + "epoch": 0.92, + "grad_norm": 17.974271774291992, + "learning_rate": 1.8151703369094674e-05, + "loss": 1.6064, + "step": 4910 + }, + { + "epoch": 0.93, + "grad_norm": 28.941722869873047, + "learning_rate": 1.8147939017504237e-05, + "loss": 1.6713, + "step": 4920 + }, + { + "epoch": 0.93, + "grad_norm": 11.917637825012207, + "learning_rate": 1.81441746659138e-05, + "loss": 1.6204, + "step": 4930 + }, + { + "epoch": 0.93, + "grad_norm": 24.066226959228516, + "learning_rate": 1.814041031432336e-05, + "loss": 1.7583, + "step": 4940 + }, + { + "epoch": 0.93, + "grad_norm": 9.762685775756836, + "learning_rate": 1.8136645962732923e-05, + "loss": 1.4719, + "step": 4950 + }, + { + "epoch": 0.93, + "grad_norm": 3.6561665534973145, + "learning_rate": 1.8132881611142483e-05, + "loss": 1.5726, + "step": 4960 + }, + { + "epoch": 0.94, + "grad_norm": 8.348560333251953, + "learning_rate": 1.8129117259552043e-05, + "loss": 1.6183, + "step": 4970 + }, + { + "epoch": 0.94, + "grad_norm": 13.388601303100586, + "learning_rate": 1.8125352907961606e-05, + "loss": 1.614, + "step": 4980 + }, + { + "epoch": 0.94, + "grad_norm": 5.767553806304932, + "learning_rate": 1.8121588556371165e-05, + "loss": 1.5365, + "step": 4990 + }, + { + "epoch": 0.94, + "grad_norm": 8.167011260986328, + "learning_rate": 1.811782420478073e-05, + "loss": 1.6598, + "step": 5000 + }, + { + "epoch": 0.94, + "grad_norm": 33.257511138916016, + "learning_rate": 1.811405985319029e-05, + "loss": 1.9184, + "step": 5010 + }, + { + "epoch": 0.94, + "grad_norm": 21.02366065979004, + "learning_rate": 1.811029550159985e-05, + "loss": 1.7108, + "step": 5020 + }, + { + "epoch": 0.95, + "grad_norm": 9.491157531738281, + "learning_rate": 1.810653115000941e-05, + "loss": 1.5522, + "step": 5030 + }, + { + "epoch": 0.95, + "grad_norm": 13.046422004699707, + "learning_rate": 1.8102766798418974e-05, + "loss": 1.7517, + "step": 5040 + }, + { + "epoch": 0.95, + "grad_norm": 11.477202415466309, + "learning_rate": 1.8099002446828534e-05, + "loss": 1.6329, + "step": 5050 + }, + { + "epoch": 0.95, + "grad_norm": 12.039795875549316, + "learning_rate": 1.8095238095238097e-05, + "loss": 1.7396, + "step": 5060 + }, + { + "epoch": 0.95, + "grad_norm": 10.725845336914062, + "learning_rate": 1.8091473743647657e-05, + "loss": 1.5248, + "step": 5070 + }, + { + "epoch": 0.96, + "grad_norm": 9.213160514831543, + "learning_rate": 1.808770939205722e-05, + "loss": 1.5598, + "step": 5080 + }, + { + "epoch": 0.96, + "grad_norm": 5.273130893707275, + "learning_rate": 1.808394504046678e-05, + "loss": 1.5013, + "step": 5090 + }, + { + "epoch": 0.96, + "grad_norm": 4.758931636810303, + "learning_rate": 1.8080180688876343e-05, + "loss": 1.6013, + "step": 5100 + }, + { + "epoch": 0.96, + "grad_norm": 12.065877914428711, + "learning_rate": 1.8076416337285906e-05, + "loss": 1.5301, + "step": 5110 + }, + { + "epoch": 0.96, + "grad_norm": 6.5146260261535645, + "learning_rate": 1.8072651985695466e-05, + "loss": 1.5861, + "step": 5120 + }, + { + "epoch": 0.97, + "grad_norm": 8.56678295135498, + "learning_rate": 1.806888763410503e-05, + "loss": 1.5615, + "step": 5130 + }, + { + "epoch": 0.97, + "grad_norm": 12.599604606628418, + "learning_rate": 1.8065123282514586e-05, + "loss": 1.5666, + "step": 5140 + }, + { + "epoch": 0.97, + "grad_norm": 5.8899827003479, + "learning_rate": 1.806135893092415e-05, + "loss": 1.544, + "step": 5150 + }, + { + "epoch": 0.97, + "grad_norm": 12.435726165771484, + "learning_rate": 1.8057594579333712e-05, + "loss": 1.4651, + "step": 5160 + }, + { + "epoch": 0.97, + "grad_norm": 17.321407318115234, + "learning_rate": 1.8053830227743272e-05, + "loss": 1.7957, + "step": 5170 + }, + { + "epoch": 0.97, + "grad_norm": 9.785209655761719, + "learning_rate": 1.8050065876152835e-05, + "loss": 1.5213, + "step": 5180 + }, + { + "epoch": 0.98, + "grad_norm": 5.476792812347412, + "learning_rate": 1.8046301524562395e-05, + "loss": 1.4325, + "step": 5190 + }, + { + "epoch": 0.98, + "grad_norm": 8.630148887634277, + "learning_rate": 1.8042537172971958e-05, + "loss": 1.4048, + "step": 5200 + }, + { + "epoch": 0.98, + "grad_norm": 18.134206771850586, + "learning_rate": 1.8038772821381518e-05, + "loss": 1.6637, + "step": 5210 + }, + { + "epoch": 0.98, + "grad_norm": 41.112239837646484, + "learning_rate": 1.803500846979108e-05, + "loss": 1.4036, + "step": 5220 + }, + { + "epoch": 0.98, + "grad_norm": 3.7763662338256836, + "learning_rate": 1.803124411820064e-05, + "loss": 1.7532, + "step": 5230 + }, + { + "epoch": 0.99, + "grad_norm": 24.516996383666992, + "learning_rate": 1.8027479766610204e-05, + "loss": 1.4504, + "step": 5240 + }, + { + "epoch": 0.99, + "grad_norm": 18.604209899902344, + "learning_rate": 1.8023715415019763e-05, + "loss": 1.509, + "step": 5250 + }, + { + "epoch": 0.99, + "grad_norm": 6.184398651123047, + "learning_rate": 1.8019951063429327e-05, + "loss": 1.4567, + "step": 5260 + }, + { + "epoch": 0.99, + "grad_norm": 7.315374374389648, + "learning_rate": 1.8016186711838886e-05, + "loss": 1.5755, + "step": 5270 + }, + { + "epoch": 0.99, + "grad_norm": 8.645417213439941, + "learning_rate": 1.801242236024845e-05, + "loss": 1.6434, + "step": 5280 + }, + { + "epoch": 1.0, + "grad_norm": 12.885428428649902, + "learning_rate": 1.800865800865801e-05, + "loss": 1.4227, + "step": 5290 + }, + { + "epoch": 1.0, + "grad_norm": 3.746971607208252, + "learning_rate": 1.8004893657067572e-05, + "loss": 1.2959, + "step": 5300 + }, + { + "epoch": 1.0, + "grad_norm": 5.56287145614624, + "learning_rate": 1.8001129305477136e-05, + "loss": 1.3866, + "step": 5310 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.8746666666666667, + "eval_loss": 1.0967950820922852, + "eval_runtime": 33.6773, + "eval_samples_per_second": 222.702, + "eval_steps_per_second": 27.853, + "step": 5313 + }, + { + "epoch": 1.0, + "grad_norm": 5.499802112579346, + "learning_rate": 1.7997364953886692e-05, + "loss": 1.676, + "step": 5320 + }, + { + "epoch": 1.0, + "grad_norm": 16.520263671875, + "learning_rate": 1.7993600602296255e-05, + "loss": 1.5193, + "step": 5330 + }, + { + "epoch": 1.01, + "grad_norm": 10.004297256469727, + "learning_rate": 1.7989836250705818e-05, + "loss": 1.5083, + "step": 5340 + }, + { + "epoch": 1.01, + "grad_norm": 25.948406219482422, + "learning_rate": 1.7986071899115378e-05, + "loss": 1.2522, + "step": 5350 + }, + { + "epoch": 1.01, + "grad_norm": 38.36083221435547, + "learning_rate": 1.798230754752494e-05, + "loss": 1.3645, + "step": 5360 + }, + { + "epoch": 1.01, + "grad_norm": 7.533431529998779, + "learning_rate": 1.79785431959345e-05, + "loss": 1.3528, + "step": 5370 + }, + { + "epoch": 1.01, + "grad_norm": 6.777375221252441, + "learning_rate": 1.7974778844344064e-05, + "loss": 1.5204, + "step": 5380 + }, + { + "epoch": 1.01, + "grad_norm": 8.859736442565918, + "learning_rate": 1.7971014492753624e-05, + "loss": 1.3621, + "step": 5390 + }, + { + "epoch": 1.02, + "grad_norm": 14.00033187866211, + "learning_rate": 1.7967250141163187e-05, + "loss": 1.6082, + "step": 5400 + }, + { + "epoch": 1.02, + "grad_norm": 33.97581481933594, + "learning_rate": 1.7963485789572747e-05, + "loss": 1.6728, + "step": 5410 + }, + { + "epoch": 1.02, + "grad_norm": 15.276714324951172, + "learning_rate": 1.795972143798231e-05, + "loss": 1.3271, + "step": 5420 + }, + { + "epoch": 1.02, + "grad_norm": 9.051756858825684, + "learning_rate": 1.795595708639187e-05, + "loss": 1.6152, + "step": 5430 + }, + { + "epoch": 1.02, + "grad_norm": 14.027779579162598, + "learning_rate": 1.7952192734801433e-05, + "loss": 1.3804, + "step": 5440 + }, + { + "epoch": 1.03, + "grad_norm": 20.677108764648438, + "learning_rate": 1.7948428383210993e-05, + "loss": 1.4668, + "step": 5450 + }, + { + "epoch": 1.03, + "grad_norm": 7.414674758911133, + "learning_rate": 1.7944664031620556e-05, + "loss": 1.3469, + "step": 5460 + }, + { + "epoch": 1.03, + "grad_norm": 8.591064453125, + "learning_rate": 1.7940899680030116e-05, + "loss": 1.0537, + "step": 5470 + }, + { + "epoch": 1.03, + "grad_norm": 14.56933307647705, + "learning_rate": 1.793713532843968e-05, + "loss": 1.2944, + "step": 5480 + }, + { + "epoch": 1.03, + "grad_norm": 9.722732543945312, + "learning_rate": 1.793337097684924e-05, + "loss": 1.7081, + "step": 5490 + }, + { + "epoch": 1.04, + "grad_norm": 22.593751907348633, + "learning_rate": 1.7929606625258798e-05, + "loss": 1.4611, + "step": 5500 + }, + { + "epoch": 1.04, + "grad_norm": 4.620069980621338, + "learning_rate": 1.792584227366836e-05, + "loss": 1.3241, + "step": 5510 + }, + { + "epoch": 1.04, + "grad_norm": 18.490928649902344, + "learning_rate": 1.792207792207792e-05, + "loss": 1.2878, + "step": 5520 + }, + { + "epoch": 1.04, + "grad_norm": 4.842093467712402, + "learning_rate": 1.7918313570487484e-05, + "loss": 1.3626, + "step": 5530 + }, + { + "epoch": 1.04, + "grad_norm": 2.8665945529937744, + "learning_rate": 1.7914549218897047e-05, + "loss": 1.3749, + "step": 5540 + }, + { + "epoch": 1.04, + "grad_norm": 8.311807632446289, + "learning_rate": 1.7910784867306607e-05, + "loss": 1.0602, + "step": 5550 + }, + { + "epoch": 1.05, + "grad_norm": 11.766958236694336, + "learning_rate": 1.790702051571617e-05, + "loss": 1.2955, + "step": 5560 + }, + { + "epoch": 1.05, + "grad_norm": 4.863016605377197, + "learning_rate": 1.790325616412573e-05, + "loss": 1.7003, + "step": 5570 + }, + { + "epoch": 1.05, + "grad_norm": 18.07254409790039, + "learning_rate": 1.7899491812535293e-05, + "loss": 1.3364, + "step": 5580 + }, + { + "epoch": 1.05, + "grad_norm": 4.024524211883545, + "learning_rate": 1.7895727460944853e-05, + "loss": 1.3305, + "step": 5590 + }, + { + "epoch": 1.05, + "grad_norm": 15.396920204162598, + "learning_rate": 1.7891963109354416e-05, + "loss": 1.309, + "step": 5600 + }, + { + "epoch": 1.06, + "grad_norm": 9.364221572875977, + "learning_rate": 1.7888198757763976e-05, + "loss": 1.4572, + "step": 5610 + }, + { + "epoch": 1.06, + "grad_norm": 21.09225082397461, + "learning_rate": 1.788443440617354e-05, + "loss": 1.4568, + "step": 5620 + }, + { + "epoch": 1.06, + "grad_norm": 37.32011795043945, + "learning_rate": 1.78806700545831e-05, + "loss": 1.5728, + "step": 5630 + }, + { + "epoch": 1.06, + "grad_norm": 6.8537468910217285, + "learning_rate": 1.7876905702992662e-05, + "loss": 1.4453, + "step": 5640 + }, + { + "epoch": 1.06, + "grad_norm": 10.12431812286377, + "learning_rate": 1.7873141351402222e-05, + "loss": 1.4191, + "step": 5650 + }, + { + "epoch": 1.07, + "grad_norm": 6.774322032928467, + "learning_rate": 1.7869376999811785e-05, + "loss": 1.3566, + "step": 5660 + }, + { + "epoch": 1.07, + "grad_norm": 9.487374305725098, + "learning_rate": 1.7865612648221345e-05, + "loss": 1.2845, + "step": 5670 + }, + { + "epoch": 1.07, + "grad_norm": 8.616371154785156, + "learning_rate": 1.7861848296630905e-05, + "loss": 1.2827, + "step": 5680 + }, + { + "epoch": 1.07, + "grad_norm": 18.26116943359375, + "learning_rate": 1.7858083945040468e-05, + "loss": 1.279, + "step": 5690 + }, + { + "epoch": 1.07, + "grad_norm": 6.658090114593506, + "learning_rate": 1.7854319593450027e-05, + "loss": 1.1976, + "step": 5700 + }, + { + "epoch": 1.07, + "grad_norm": 2.6320950984954834, + "learning_rate": 1.785055524185959e-05, + "loss": 1.3875, + "step": 5710 + }, + { + "epoch": 1.08, + "grad_norm": 13.106246948242188, + "learning_rate": 1.7846790890269154e-05, + "loss": 1.42, + "step": 5720 + }, + { + "epoch": 1.08, + "grad_norm": 7.456079006195068, + "learning_rate": 1.7843026538678714e-05, + "loss": 1.106, + "step": 5730 + }, + { + "epoch": 1.08, + "grad_norm": 13.17140007019043, + "learning_rate": 1.7839262187088277e-05, + "loss": 1.153, + "step": 5740 + }, + { + "epoch": 1.08, + "grad_norm": 28.36896324157715, + "learning_rate": 1.7835497835497836e-05, + "loss": 1.6023, + "step": 5750 + }, + { + "epoch": 1.08, + "grad_norm": 13.630576133728027, + "learning_rate": 1.78317334839074e-05, + "loss": 1.5223, + "step": 5760 + }, + { + "epoch": 1.09, + "grad_norm": 3.064805030822754, + "learning_rate": 1.782796913231696e-05, + "loss": 1.3474, + "step": 5770 + }, + { + "epoch": 1.09, + "grad_norm": 16.060394287109375, + "learning_rate": 1.7824204780726523e-05, + "loss": 1.3724, + "step": 5780 + }, + { + "epoch": 1.09, + "grad_norm": 8.598823547363281, + "learning_rate": 1.7820440429136082e-05, + "loss": 1.2749, + "step": 5790 + }, + { + "epoch": 1.09, + "grad_norm": 15.959213256835938, + "learning_rate": 1.7816676077545645e-05, + "loss": 1.5515, + "step": 5800 + }, + { + "epoch": 1.09, + "grad_norm": 11.059527397155762, + "learning_rate": 1.7812911725955205e-05, + "loss": 1.3217, + "step": 5810 + }, + { + "epoch": 1.1, + "grad_norm": 14.561382293701172, + "learning_rate": 1.780914737436477e-05, + "loss": 1.4639, + "step": 5820 + }, + { + "epoch": 1.1, + "grad_norm": 7.75567626953125, + "learning_rate": 1.7805383022774328e-05, + "loss": 1.4613, + "step": 5830 + }, + { + "epoch": 1.1, + "grad_norm": 5.322535037994385, + "learning_rate": 1.7801618671183888e-05, + "loss": 1.3931, + "step": 5840 + }, + { + "epoch": 1.1, + "grad_norm": 8.112421035766602, + "learning_rate": 1.779785431959345e-05, + "loss": 1.3793, + "step": 5850 + }, + { + "epoch": 1.1, + "grad_norm": 10.282398223876953, + "learning_rate": 1.779408996800301e-05, + "loss": 1.4861, + "step": 5860 + }, + { + "epoch": 1.1, + "grad_norm": 16.15000343322754, + "learning_rate": 1.7790325616412574e-05, + "loss": 1.4151, + "step": 5870 + }, + { + "epoch": 1.11, + "grad_norm": 16.440019607543945, + "learning_rate": 1.7786561264822134e-05, + "loss": 1.4436, + "step": 5880 + }, + { + "epoch": 1.11, + "grad_norm": 8.281665802001953, + "learning_rate": 1.7782796913231697e-05, + "loss": 1.416, + "step": 5890 + }, + { + "epoch": 1.11, + "grad_norm": 6.84842586517334, + "learning_rate": 1.777903256164126e-05, + "loss": 1.1374, + "step": 5900 + }, + { + "epoch": 1.11, + "grad_norm": 6.96786642074585, + "learning_rate": 1.777526821005082e-05, + "loss": 1.0473, + "step": 5910 + }, + { + "epoch": 1.11, + "grad_norm": 17.65937042236328, + "learning_rate": 1.7771503858460383e-05, + "loss": 1.1968, + "step": 5920 + }, + { + "epoch": 1.12, + "grad_norm": 7.487143516540527, + "learning_rate": 1.7767739506869943e-05, + "loss": 1.4706, + "step": 5930 + }, + { + "epoch": 1.12, + "grad_norm": 16.491477966308594, + "learning_rate": 1.7763975155279506e-05, + "loss": 1.3157, + "step": 5940 + }, + { + "epoch": 1.12, + "grad_norm": 6.16838264465332, + "learning_rate": 1.7760210803689066e-05, + "loss": 1.1594, + "step": 5950 + }, + { + "epoch": 1.12, + "grad_norm": 27.0855712890625, + "learning_rate": 1.775644645209863e-05, + "loss": 1.3721, + "step": 5960 + }, + { + "epoch": 1.12, + "grad_norm": 7.84747314453125, + "learning_rate": 1.775268210050819e-05, + "loss": 1.3545, + "step": 5970 + }, + { + "epoch": 1.13, + "grad_norm": 12.929715156555176, + "learning_rate": 1.7748917748917752e-05, + "loss": 1.2785, + "step": 5980 + }, + { + "epoch": 1.13, + "grad_norm": 9.559564590454102, + "learning_rate": 1.774515339732731e-05, + "loss": 1.242, + "step": 5990 + }, + { + "epoch": 1.13, + "grad_norm": 8.087242126464844, + "learning_rate": 1.7741389045736875e-05, + "loss": 1.1266, + "step": 6000 + }, + { + "epoch": 1.13, + "grad_norm": 9.025975227355957, + "learning_rate": 1.7737624694146434e-05, + "loss": 1.4599, + "step": 6010 + }, + { + "epoch": 1.13, + "grad_norm": 8.053915023803711, + "learning_rate": 1.7733860342555994e-05, + "loss": 0.8851, + "step": 6020 + }, + { + "epoch": 1.13, + "grad_norm": 11.262309074401855, + "learning_rate": 1.7730095990965557e-05, + "loss": 1.1816, + "step": 6030 + }, + { + "epoch": 1.14, + "grad_norm": 8.548508644104004, + "learning_rate": 1.7726331639375117e-05, + "loss": 1.2975, + "step": 6040 + }, + { + "epoch": 1.14, + "grad_norm": 6.463540554046631, + "learning_rate": 1.772256728778468e-05, + "loss": 1.1834, + "step": 6050 + }, + { + "epoch": 1.14, + "grad_norm": 8.140536308288574, + "learning_rate": 1.771880293619424e-05, + "loss": 1.0621, + "step": 6060 + }, + { + "epoch": 1.14, + "grad_norm": 3.9653189182281494, + "learning_rate": 1.7715038584603803e-05, + "loss": 1.1374, + "step": 6070 + }, + { + "epoch": 1.14, + "grad_norm": 16.860267639160156, + "learning_rate": 1.7711274233013366e-05, + "loss": 1.3068, + "step": 6080 + }, + { + "epoch": 1.15, + "grad_norm": 7.63691520690918, + "learning_rate": 1.7707509881422926e-05, + "loss": 1.1172, + "step": 6090 + }, + { + "epoch": 1.15, + "grad_norm": 12.925235748291016, + "learning_rate": 1.770374552983249e-05, + "loss": 0.7959, + "step": 6100 + }, + { + "epoch": 1.15, + "grad_norm": 27.463420867919922, + "learning_rate": 1.769998117824205e-05, + "loss": 1.3255, + "step": 6110 + }, + { + "epoch": 1.15, + "grad_norm": 9.057517051696777, + "learning_rate": 1.7696216826651612e-05, + "loss": 1.4139, + "step": 6120 + }, + { + "epoch": 1.15, + "grad_norm": 5.9874982833862305, + "learning_rate": 1.7692452475061172e-05, + "loss": 1.3465, + "step": 6130 + }, + { + "epoch": 1.16, + "grad_norm": 16.482505798339844, + "learning_rate": 1.7688688123470735e-05, + "loss": 0.9673, + "step": 6140 + }, + { + "epoch": 1.16, + "grad_norm": 4.9901885986328125, + "learning_rate": 1.7684923771880295e-05, + "loss": 1.3453, + "step": 6150 + }, + { + "epoch": 1.16, + "grad_norm": 18.844236373901367, + "learning_rate": 1.7681159420289858e-05, + "loss": 1.1033, + "step": 6160 + }, + { + "epoch": 1.16, + "grad_norm": 18.83871841430664, + "learning_rate": 1.7677395068699418e-05, + "loss": 1.4439, + "step": 6170 + }, + { + "epoch": 1.16, + "grad_norm": 10.778878211975098, + "learning_rate": 1.767363071710898e-05, + "loss": 1.2751, + "step": 6180 + }, + { + "epoch": 1.17, + "grad_norm": 26.04317283630371, + "learning_rate": 1.766986636551854e-05, + "loss": 1.1188, + "step": 6190 + }, + { + "epoch": 1.17, + "grad_norm": 14.1301851272583, + "learning_rate": 1.76661020139281e-05, + "loss": 1.376, + "step": 6200 + }, + { + "epoch": 1.17, + "grad_norm": 8.929794311523438, + "learning_rate": 1.7662337662337664e-05, + "loss": 1.2898, + "step": 6210 + }, + { + "epoch": 1.17, + "grad_norm": 24.299001693725586, + "learning_rate": 1.7658573310747223e-05, + "loss": 1.1609, + "step": 6220 + }, + { + "epoch": 1.17, + "grad_norm": 15.158361434936523, + "learning_rate": 1.7654808959156787e-05, + "loss": 1.2065, + "step": 6230 + }, + { + "epoch": 1.17, + "grad_norm": 7.862684726715088, + "learning_rate": 1.7651044607566346e-05, + "loss": 1.302, + "step": 6240 + }, + { + "epoch": 1.18, + "grad_norm": 4.036644458770752, + "learning_rate": 1.764728025597591e-05, + "loss": 0.9381, + "step": 6250 + }, + { + "epoch": 1.18, + "grad_norm": 17.739803314208984, + "learning_rate": 1.764351590438547e-05, + "loss": 0.9986, + "step": 6260 + }, + { + "epoch": 1.18, + "grad_norm": 23.925811767578125, + "learning_rate": 1.7639751552795032e-05, + "loss": 1.4922, + "step": 6270 + }, + { + "epoch": 1.18, + "grad_norm": 5.70884895324707, + "learning_rate": 1.7635987201204596e-05, + "loss": 1.2282, + "step": 6280 + }, + { + "epoch": 1.18, + "grad_norm": 10.17802619934082, + "learning_rate": 1.7632222849614155e-05, + "loss": 1.3497, + "step": 6290 + }, + { + "epoch": 1.19, + "grad_norm": 7.948269844055176, + "learning_rate": 1.762845849802372e-05, + "loss": 1.0691, + "step": 6300 + }, + { + "epoch": 1.19, + "grad_norm": 19.185148239135742, + "learning_rate": 1.7624694146433278e-05, + "loss": 1.206, + "step": 6310 + }, + { + "epoch": 1.19, + "grad_norm": 19.54599952697754, + "learning_rate": 1.762092979484284e-05, + "loss": 1.2538, + "step": 6320 + }, + { + "epoch": 1.19, + "grad_norm": 5.534640789031982, + "learning_rate": 1.76171654432524e-05, + "loss": 0.9516, + "step": 6330 + }, + { + "epoch": 1.19, + "grad_norm": 10.249228477478027, + "learning_rate": 1.7613401091661964e-05, + "loss": 1.1624, + "step": 6340 + }, + { + "epoch": 1.2, + "grad_norm": 17.44942283630371, + "learning_rate": 1.7609636740071524e-05, + "loss": 1.4045, + "step": 6350 + }, + { + "epoch": 1.2, + "grad_norm": 8.629911422729492, + "learning_rate": 1.7605872388481084e-05, + "loss": 1.3624, + "step": 6360 + }, + { + "epoch": 1.2, + "grad_norm": 14.105700492858887, + "learning_rate": 1.7602108036890647e-05, + "loss": 1.2062, + "step": 6370 + }, + { + "epoch": 1.2, + "grad_norm": 9.206585884094238, + "learning_rate": 1.7598343685300207e-05, + "loss": 1.4023, + "step": 6380 + }, + { + "epoch": 1.2, + "grad_norm": 34.754150390625, + "learning_rate": 1.759457933370977e-05, + "loss": 1.1324, + "step": 6390 + }, + { + "epoch": 1.2, + "grad_norm": 21.429977416992188, + "learning_rate": 1.759081498211933e-05, + "loss": 1.3611, + "step": 6400 + }, + { + "epoch": 1.21, + "grad_norm": 13.360370635986328, + "learning_rate": 1.7587050630528893e-05, + "loss": 1.2232, + "step": 6410 + }, + { + "epoch": 1.21, + "grad_norm": 7.976438522338867, + "learning_rate": 1.7583286278938453e-05, + "loss": 1.2726, + "step": 6420 + }, + { + "epoch": 1.21, + "grad_norm": 3.3813695907592773, + "learning_rate": 1.7579521927348016e-05, + "loss": 1.3038, + "step": 6430 + }, + { + "epoch": 1.21, + "grad_norm": 4.168612480163574, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.0494, + "step": 6440 + }, + { + "epoch": 1.21, + "grad_norm": 4.296903133392334, + "learning_rate": 1.757199322416714e-05, + "loss": 1.3312, + "step": 6450 + }, + { + "epoch": 1.22, + "grad_norm": 9.388733863830566, + "learning_rate": 1.7568228872576702e-05, + "loss": 1.0723, + "step": 6460 + }, + { + "epoch": 1.22, + "grad_norm": 12.61970043182373, + "learning_rate": 1.756446452098626e-05, + "loss": 1.1115, + "step": 6470 + }, + { + "epoch": 1.22, + "grad_norm": 7.065197944641113, + "learning_rate": 1.7560700169395825e-05, + "loss": 1.173, + "step": 6480 + }, + { + "epoch": 1.22, + "grad_norm": 20.004718780517578, + "learning_rate": 1.7556935817805385e-05, + "loss": 1.0418, + "step": 6490 + }, + { + "epoch": 1.22, + "grad_norm": 5.888309001922607, + "learning_rate": 1.7553171466214948e-05, + "loss": 1.2062, + "step": 6500 + }, + { + "epoch": 1.23, + "grad_norm": 31.428281784057617, + "learning_rate": 1.7549407114624507e-05, + "loss": 1.3629, + "step": 6510 + }, + { + "epoch": 1.23, + "grad_norm": 37.77729034423828, + "learning_rate": 1.754564276303407e-05, + "loss": 1.1825, + "step": 6520 + }, + { + "epoch": 1.23, + "grad_norm": 3.3252882957458496, + "learning_rate": 1.754187841144363e-05, + "loss": 1.3147, + "step": 6530 + }, + { + "epoch": 1.23, + "grad_norm": 14.839897155761719, + "learning_rate": 1.753811405985319e-05, + "loss": 1.2901, + "step": 6540 + }, + { + "epoch": 1.23, + "grad_norm": 13.03376293182373, + "learning_rate": 1.7534349708262753e-05, + "loss": 0.8962, + "step": 6550 + }, + { + "epoch": 1.23, + "grad_norm": 15.040424346923828, + "learning_rate": 1.7530585356672313e-05, + "loss": 1.3194, + "step": 6560 + }, + { + "epoch": 1.24, + "grad_norm": 12.113794326782227, + "learning_rate": 1.7526821005081876e-05, + "loss": 1.1243, + "step": 6570 + }, + { + "epoch": 1.24, + "grad_norm": 13.116803169250488, + "learning_rate": 1.7523056653491436e-05, + "loss": 0.9943, + "step": 6580 + }, + { + "epoch": 1.24, + "grad_norm": 6.835216522216797, + "learning_rate": 1.7519292301901e-05, + "loss": 1.2691, + "step": 6590 + }, + { + "epoch": 1.24, + "grad_norm": 10.555135726928711, + "learning_rate": 1.751552795031056e-05, + "loss": 0.9653, + "step": 6600 + }, + { + "epoch": 1.24, + "grad_norm": 20.608491897583008, + "learning_rate": 1.7511763598720122e-05, + "loss": 1.2499, + "step": 6610 + }, + { + "epoch": 1.25, + "grad_norm": 10.590780258178711, + "learning_rate": 1.7507999247129682e-05, + "loss": 1.1381, + "step": 6620 + }, + { + "epoch": 1.25, + "grad_norm": 1.9735676050186157, + "learning_rate": 1.7504234895539245e-05, + "loss": 0.9156, + "step": 6630 + }, + { + "epoch": 1.25, + "grad_norm": 10.672266006469727, + "learning_rate": 1.7500470543948808e-05, + "loss": 1.3334, + "step": 6640 + }, + { + "epoch": 1.25, + "grad_norm": 16.784568786621094, + "learning_rate": 1.7496706192358368e-05, + "loss": 1.4105, + "step": 6650 + }, + { + "epoch": 1.25, + "grad_norm": 14.953975677490234, + "learning_rate": 1.749294184076793e-05, + "loss": 1.0408, + "step": 6660 + }, + { + "epoch": 1.26, + "grad_norm": 14.848883628845215, + "learning_rate": 1.748917748917749e-05, + "loss": 1.1514, + "step": 6670 + }, + { + "epoch": 1.26, + "grad_norm": 13.122929573059082, + "learning_rate": 1.7485413137587054e-05, + "loss": 1.1443, + "step": 6680 + }, + { + "epoch": 1.26, + "grad_norm": 20.22705078125, + "learning_rate": 1.7481648785996614e-05, + "loss": 1.0515, + "step": 6690 + }, + { + "epoch": 1.26, + "grad_norm": 11.579082489013672, + "learning_rate": 1.7477884434406177e-05, + "loss": 1.3199, + "step": 6700 + }, + { + "epoch": 1.26, + "grad_norm": 20.08841323852539, + "learning_rate": 1.7474120082815737e-05, + "loss": 1.0948, + "step": 6710 + }, + { + "epoch": 1.26, + "grad_norm": 23.67656135559082, + "learning_rate": 1.7470355731225296e-05, + "loss": 1.158, + "step": 6720 + }, + { + "epoch": 1.27, + "grad_norm": 11.358781814575195, + "learning_rate": 1.746659137963486e-05, + "loss": 1.1823, + "step": 6730 + }, + { + "epoch": 1.27, + "grad_norm": 11.688865661621094, + "learning_rate": 1.746282702804442e-05, + "loss": 1.3804, + "step": 6740 + }, + { + "epoch": 1.27, + "grad_norm": 10.149205207824707, + "learning_rate": 1.7459062676453983e-05, + "loss": 0.9094, + "step": 6750 + }, + { + "epoch": 1.27, + "grad_norm": 9.609593391418457, + "learning_rate": 1.7455298324863542e-05, + "loss": 1.2674, + "step": 6760 + }, + { + "epoch": 1.27, + "grad_norm": 15.846113204956055, + "learning_rate": 1.7451533973273105e-05, + "loss": 1.2045, + "step": 6770 + }, + { + "epoch": 1.28, + "grad_norm": 14.391237258911133, + "learning_rate": 1.7447769621682665e-05, + "loss": 1.0173, + "step": 6780 + }, + { + "epoch": 1.28, + "grad_norm": 18.767101287841797, + "learning_rate": 1.744400527009223e-05, + "loss": 1.1205, + "step": 6790 + }, + { + "epoch": 1.28, + "grad_norm": 8.494179725646973, + "learning_rate": 1.7440240918501788e-05, + "loss": 1.0519, + "step": 6800 + }, + { + "epoch": 1.28, + "grad_norm": 7.717184543609619, + "learning_rate": 1.743647656691135e-05, + "loss": 1.0126, + "step": 6810 + }, + { + "epoch": 1.28, + "grad_norm": 19.109561920166016, + "learning_rate": 1.743271221532091e-05, + "loss": 0.9986, + "step": 6820 + }, + { + "epoch": 1.29, + "grad_norm": 7.908644199371338, + "learning_rate": 1.7428947863730474e-05, + "loss": 0.9898, + "step": 6830 + }, + { + "epoch": 1.29, + "grad_norm": 7.440939426422119, + "learning_rate": 1.7425183512140037e-05, + "loss": 1.1074, + "step": 6840 + }, + { + "epoch": 1.29, + "grad_norm": 6.752496719360352, + "learning_rate": 1.7421419160549597e-05, + "loss": 1.1364, + "step": 6850 + }, + { + "epoch": 1.29, + "grad_norm": 16.471498489379883, + "learning_rate": 1.741765480895916e-05, + "loss": 1.385, + "step": 6860 + }, + { + "epoch": 1.29, + "grad_norm": 5.562114715576172, + "learning_rate": 1.741389045736872e-05, + "loss": 1.0686, + "step": 6870 + }, + { + "epoch": 1.29, + "grad_norm": 7.111942768096924, + "learning_rate": 1.7410126105778283e-05, + "loss": 1.0202, + "step": 6880 + }, + { + "epoch": 1.3, + "grad_norm": 26.472091674804688, + "learning_rate": 1.7406361754187843e-05, + "loss": 1.2902, + "step": 6890 + }, + { + "epoch": 1.3, + "grad_norm": 9.381438255310059, + "learning_rate": 1.7402597402597403e-05, + "loss": 1.2116, + "step": 6900 + }, + { + "epoch": 1.3, + "grad_norm": 61.49063491821289, + "learning_rate": 1.7398833051006966e-05, + "loss": 1.4989, + "step": 6910 + }, + { + "epoch": 1.3, + "grad_norm": 20.3765869140625, + "learning_rate": 1.7395068699416526e-05, + "loss": 0.8747, + "step": 6920 + }, + { + "epoch": 1.3, + "grad_norm": 16.292556762695312, + "learning_rate": 1.739130434782609e-05, + "loss": 1.1964, + "step": 6930 + }, + { + "epoch": 1.31, + "grad_norm": 6.413318634033203, + "learning_rate": 1.738753999623565e-05, + "loss": 0.9384, + "step": 6940 + }, + { + "epoch": 1.31, + "grad_norm": 11.494325637817383, + "learning_rate": 1.7383775644645212e-05, + "loss": 1.4534, + "step": 6950 + }, + { + "epoch": 1.31, + "grad_norm": 16.83256721496582, + "learning_rate": 1.738001129305477e-05, + "loss": 1.1801, + "step": 6960 + }, + { + "epoch": 1.31, + "grad_norm": 10.633999824523926, + "learning_rate": 1.7376246941464335e-05, + "loss": 1.3302, + "step": 6970 + }, + { + "epoch": 1.31, + "grad_norm": 23.226150512695312, + "learning_rate": 1.7372482589873894e-05, + "loss": 1.1845, + "step": 6980 + }, + { + "epoch": 1.32, + "grad_norm": 14.6954345703125, + "learning_rate": 1.7368718238283458e-05, + "loss": 1.2775, + "step": 6990 + }, + { + "epoch": 1.32, + "grad_norm": 11.616198539733887, + "learning_rate": 1.7364953886693017e-05, + "loss": 0.9493, + "step": 7000 + }, + { + "epoch": 1.32, + "grad_norm": 5.6046929359436035, + "learning_rate": 1.736118953510258e-05, + "loss": 1.2158, + "step": 7010 + }, + { + "epoch": 1.32, + "grad_norm": 7.577897071838379, + "learning_rate": 1.7357425183512144e-05, + "loss": 1.092, + "step": 7020 + }, + { + "epoch": 1.32, + "grad_norm": 14.711727142333984, + "learning_rate": 1.7353660831921703e-05, + "loss": 1.3447, + "step": 7030 + }, + { + "epoch": 1.33, + "grad_norm": 22.08068084716797, + "learning_rate": 1.7349896480331267e-05, + "loss": 0.9474, + "step": 7040 + }, + { + "epoch": 1.33, + "grad_norm": 5.154607772827148, + "learning_rate": 1.7346132128740826e-05, + "loss": 1.0093, + "step": 7050 + }, + { + "epoch": 1.33, + "grad_norm": 7.215937614440918, + "learning_rate": 1.7342367777150386e-05, + "loss": 1.178, + "step": 7060 + }, + { + "epoch": 1.33, + "grad_norm": 3.9813942909240723, + "learning_rate": 1.733860342555995e-05, + "loss": 1.1342, + "step": 7070 + }, + { + "epoch": 1.33, + "grad_norm": 8.915732383728027, + "learning_rate": 1.733483907396951e-05, + "loss": 1.1109, + "step": 7080 + }, + { + "epoch": 1.33, + "grad_norm": 1.5986876487731934, + "learning_rate": 1.7331074722379072e-05, + "loss": 1.1772, + "step": 7090 + }, + { + "epoch": 1.34, + "grad_norm": 5.484806537628174, + "learning_rate": 1.7327310370788632e-05, + "loss": 1.1488, + "step": 7100 + }, + { + "epoch": 1.34, + "grad_norm": 15.327614784240723, + "learning_rate": 1.7323546019198195e-05, + "loss": 1.1301, + "step": 7110 + }, + { + "epoch": 1.34, + "grad_norm": 8.505776405334473, + "learning_rate": 1.7319781667607755e-05, + "loss": 1.1446, + "step": 7120 + }, + { + "epoch": 1.34, + "grad_norm": 11.313884735107422, + "learning_rate": 1.7316017316017318e-05, + "loss": 1.0336, + "step": 7130 + }, + { + "epoch": 1.34, + "grad_norm": 8.289346694946289, + "learning_rate": 1.7312252964426878e-05, + "loss": 0.9494, + "step": 7140 + }, + { + "epoch": 1.35, + "grad_norm": 14.154651641845703, + "learning_rate": 1.730848861283644e-05, + "loss": 0.8998, + "step": 7150 + }, + { + "epoch": 1.35, + "grad_norm": 8.944982528686523, + "learning_rate": 1.7304724261246e-05, + "loss": 0.9689, + "step": 7160 + }, + { + "epoch": 1.35, + "grad_norm": 9.77503490447998, + "learning_rate": 1.7300959909655564e-05, + "loss": 1.0465, + "step": 7170 + }, + { + "epoch": 1.35, + "grad_norm": 28.841171264648438, + "learning_rate": 1.7297195558065124e-05, + "loss": 1.2161, + "step": 7180 + }, + { + "epoch": 1.35, + "grad_norm": 8.071671485900879, + "learning_rate": 1.7293431206474687e-05, + "loss": 1.1167, + "step": 7190 + }, + { + "epoch": 1.36, + "grad_norm": 11.291550636291504, + "learning_rate": 1.728966685488425e-05, + "loss": 1.2706, + "step": 7200 + }, + { + "epoch": 1.36, + "grad_norm": 13.803507804870605, + "learning_rate": 1.728590250329381e-05, + "loss": 1.1404, + "step": 7210 + }, + { + "epoch": 1.36, + "grad_norm": 9.958276748657227, + "learning_rate": 1.7282138151703373e-05, + "loss": 1.2336, + "step": 7220 + }, + { + "epoch": 1.36, + "grad_norm": 8.445544242858887, + "learning_rate": 1.727837380011293e-05, + "loss": 1.16, + "step": 7230 + }, + { + "epoch": 1.36, + "grad_norm": 20.41545867919922, + "learning_rate": 1.7274609448522492e-05, + "loss": 1.3856, + "step": 7240 + }, + { + "epoch": 1.36, + "grad_norm": 9.721564292907715, + "learning_rate": 1.7270845096932056e-05, + "loss": 1.0896, + "step": 7250 + }, + { + "epoch": 1.37, + "grad_norm": 18.577259063720703, + "learning_rate": 1.7267080745341615e-05, + "loss": 1.181, + "step": 7260 + }, + { + "epoch": 1.37, + "grad_norm": 24.051130294799805, + "learning_rate": 1.726331639375118e-05, + "loss": 1.0202, + "step": 7270 + }, + { + "epoch": 1.37, + "grad_norm": 12.581955909729004, + "learning_rate": 1.7259552042160738e-05, + "loss": 1.0709, + "step": 7280 + }, + { + "epoch": 1.37, + "grad_norm": 6.8670854568481445, + "learning_rate": 1.72557876905703e-05, + "loss": 0.9722, + "step": 7290 + }, + { + "epoch": 1.37, + "grad_norm": 9.2810640335083, + "learning_rate": 1.725202333897986e-05, + "loss": 1.1988, + "step": 7300 + }, + { + "epoch": 1.38, + "grad_norm": 8.035595893859863, + "learning_rate": 1.7248258987389424e-05, + "loss": 1.059, + "step": 7310 + }, + { + "epoch": 1.38, + "grad_norm": 5.461163520812988, + "learning_rate": 1.7244494635798984e-05, + "loss": 1.1715, + "step": 7320 + }, + { + "epoch": 1.38, + "grad_norm": 6.015925407409668, + "learning_rate": 1.7240730284208547e-05, + "loss": 0.8844, + "step": 7330 + }, + { + "epoch": 1.38, + "grad_norm": 17.79153060913086, + "learning_rate": 1.7236965932618107e-05, + "loss": 1.1316, + "step": 7340 + }, + { + "epoch": 1.38, + "grad_norm": 65.43628692626953, + "learning_rate": 1.723320158102767e-05, + "loss": 0.9798, + "step": 7350 + }, + { + "epoch": 1.39, + "grad_norm": 12.030508041381836, + "learning_rate": 1.722943722943723e-05, + "loss": 1.093, + "step": 7360 + }, + { + "epoch": 1.39, + "grad_norm": 5.5272297859191895, + "learning_rate": 1.7225672877846793e-05, + "loss": 0.882, + "step": 7370 + }, + { + "epoch": 1.39, + "grad_norm": 6.576135635375977, + "learning_rate": 1.7221908526256356e-05, + "loss": 1.0853, + "step": 7380 + }, + { + "epoch": 1.39, + "grad_norm": 13.047865867614746, + "learning_rate": 1.7218144174665916e-05, + "loss": 1.0521, + "step": 7390 + }, + { + "epoch": 1.39, + "grad_norm": 10.292452812194824, + "learning_rate": 1.721437982307548e-05, + "loss": 1.3856, + "step": 7400 + }, + { + "epoch": 1.39, + "grad_norm": 4.0016326904296875, + "learning_rate": 1.7210615471485036e-05, + "loss": 0.9876, + "step": 7410 + }, + { + "epoch": 1.4, + "grad_norm": 10.368799209594727, + "learning_rate": 1.72068511198946e-05, + "loss": 0.7977, + "step": 7420 + }, + { + "epoch": 1.4, + "grad_norm": 12.0331449508667, + "learning_rate": 1.7203086768304162e-05, + "loss": 1.0317, + "step": 7430 + }, + { + "epoch": 1.4, + "grad_norm": 4.535419940948486, + "learning_rate": 1.719932241671372e-05, + "loss": 1.2162, + "step": 7440 + }, + { + "epoch": 1.4, + "grad_norm": 4.63490104675293, + "learning_rate": 1.7195558065123285e-05, + "loss": 0.9855, + "step": 7450 + }, + { + "epoch": 1.4, + "grad_norm": 12.621866226196289, + "learning_rate": 1.7191793713532845e-05, + "loss": 1.1798, + "step": 7460 + }, + { + "epoch": 1.41, + "grad_norm": 6.641757488250732, + "learning_rate": 1.7188029361942408e-05, + "loss": 1.2643, + "step": 7470 + }, + { + "epoch": 1.41, + "grad_norm": 4.4592604637146, + "learning_rate": 1.7184265010351967e-05, + "loss": 0.8314, + "step": 7480 + }, + { + "epoch": 1.41, + "grad_norm": 22.9270076751709, + "learning_rate": 1.718050065876153e-05, + "loss": 1.2297, + "step": 7490 + }, + { + "epoch": 1.41, + "grad_norm": 13.613977432250977, + "learning_rate": 1.717673630717109e-05, + "loss": 0.957, + "step": 7500 + }, + { + "epoch": 1.41, + "grad_norm": 18.736854553222656, + "learning_rate": 1.7172971955580654e-05, + "loss": 1.0854, + "step": 7510 + }, + { + "epoch": 1.42, + "grad_norm": 15.172490119934082, + "learning_rate": 1.7169207603990213e-05, + "loss": 0.9997, + "step": 7520 + }, + { + "epoch": 1.42, + "grad_norm": 14.035980224609375, + "learning_rate": 1.7165443252399776e-05, + "loss": 0.9522, + "step": 7530 + }, + { + "epoch": 1.42, + "grad_norm": 6.1426310539245605, + "learning_rate": 1.7161678900809336e-05, + "loss": 0.8932, + "step": 7540 + }, + { + "epoch": 1.42, + "grad_norm": 4.456810474395752, + "learning_rate": 1.71579145492189e-05, + "loss": 1.0575, + "step": 7550 + }, + { + "epoch": 1.42, + "grad_norm": 18.907272338867188, + "learning_rate": 1.715415019762846e-05, + "loss": 0.9667, + "step": 7560 + }, + { + "epoch": 1.42, + "grad_norm": 24.286773681640625, + "learning_rate": 1.7150385846038022e-05, + "loss": 1.1699, + "step": 7570 + }, + { + "epoch": 1.43, + "grad_norm": 6.49676513671875, + "learning_rate": 1.7146621494447582e-05, + "loss": 1.1024, + "step": 7580 + }, + { + "epoch": 1.43, + "grad_norm": 13.635810852050781, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.9841, + "step": 7590 + }, + { + "epoch": 1.43, + "grad_norm": 13.655981063842773, + "learning_rate": 1.7139092791266705e-05, + "loss": 1.0624, + "step": 7600 + }, + { + "epoch": 1.43, + "grad_norm": 3.580477714538574, + "learning_rate": 1.7135328439676268e-05, + "loss": 0.943, + "step": 7610 + }, + { + "epoch": 1.43, + "grad_norm": 19.59247589111328, + "learning_rate": 1.7131564088085828e-05, + "loss": 0.9837, + "step": 7620 + }, + { + "epoch": 1.44, + "grad_norm": 3.9449715614318848, + "learning_rate": 1.712779973649539e-05, + "loss": 1.0507, + "step": 7630 + }, + { + "epoch": 1.44, + "grad_norm": 10.523272514343262, + "learning_rate": 1.712403538490495e-05, + "loss": 1.0263, + "step": 7640 + }, + { + "epoch": 1.44, + "grad_norm": 14.362699508666992, + "learning_rate": 1.7120271033314514e-05, + "loss": 1.1896, + "step": 7650 + }, + { + "epoch": 1.44, + "grad_norm": 2.5842766761779785, + "learning_rate": 1.7116506681724074e-05, + "loss": 1.0641, + "step": 7660 + }, + { + "epoch": 1.44, + "grad_norm": 17.754718780517578, + "learning_rate": 1.7112742330133637e-05, + "loss": 0.9287, + "step": 7670 + }, + { + "epoch": 1.45, + "grad_norm": 10.024535179138184, + "learning_rate": 1.7108977978543197e-05, + "loss": 1.2929, + "step": 7680 + }, + { + "epoch": 1.45, + "grad_norm": 14.328852653503418, + "learning_rate": 1.710521362695276e-05, + "loss": 1.2506, + "step": 7690 + }, + { + "epoch": 1.45, + "grad_norm": 3.0855348110198975, + "learning_rate": 1.710144927536232e-05, + "loss": 0.9465, + "step": 7700 + }, + { + "epoch": 1.45, + "grad_norm": 3.1114163398742676, + "learning_rate": 1.7097684923771883e-05, + "loss": 1.1016, + "step": 7710 + }, + { + "epoch": 1.45, + "grad_norm": 7.601757526397705, + "learning_rate": 1.7093920572181443e-05, + "loss": 1.1521, + "step": 7720 + }, + { + "epoch": 1.45, + "grad_norm": 8.17392349243164, + "learning_rate": 1.7090156220591006e-05, + "loss": 1.1438, + "step": 7730 + }, + { + "epoch": 1.46, + "grad_norm": 14.822709083557129, + "learning_rate": 1.7086391869000565e-05, + "loss": 1.0884, + "step": 7740 + }, + { + "epoch": 1.46, + "grad_norm": 12.926187515258789, + "learning_rate": 1.708262751741013e-05, + "loss": 1.2607, + "step": 7750 + }, + { + "epoch": 1.46, + "grad_norm": 13.08968734741211, + "learning_rate": 1.707886316581969e-05, + "loss": 1.1789, + "step": 7760 + }, + { + "epoch": 1.46, + "grad_norm": 13.06160831451416, + "learning_rate": 1.7075098814229248e-05, + "loss": 1.2074, + "step": 7770 + }, + { + "epoch": 1.46, + "grad_norm": 9.818836212158203, + "learning_rate": 1.707133446263881e-05, + "loss": 1.2368, + "step": 7780 + }, + { + "epoch": 1.47, + "grad_norm": 15.557538032531738, + "learning_rate": 1.706757011104837e-05, + "loss": 1.1518, + "step": 7790 + }, + { + "epoch": 1.47, + "grad_norm": 12.945401191711426, + "learning_rate": 1.7063805759457934e-05, + "loss": 0.9489, + "step": 7800 + }, + { + "epoch": 1.47, + "grad_norm": 8.303050994873047, + "learning_rate": 1.7060041407867497e-05, + "loss": 1.0736, + "step": 7810 + }, + { + "epoch": 1.47, + "grad_norm": 19.958223342895508, + "learning_rate": 1.7056277056277057e-05, + "loss": 1.3085, + "step": 7820 + }, + { + "epoch": 1.47, + "grad_norm": 13.087350845336914, + "learning_rate": 1.705251270468662e-05, + "loss": 1.2354, + "step": 7830 + }, + { + "epoch": 1.48, + "grad_norm": 5.501480579376221, + "learning_rate": 1.704874835309618e-05, + "loss": 1.1402, + "step": 7840 + }, + { + "epoch": 1.48, + "grad_norm": 16.800434112548828, + "learning_rate": 1.7044984001505743e-05, + "loss": 1.176, + "step": 7850 + }, + { + "epoch": 1.48, + "grad_norm": 15.918781280517578, + "learning_rate": 1.7041219649915303e-05, + "loss": 1.0657, + "step": 7860 + }, + { + "epoch": 1.48, + "grad_norm": 10.953459739685059, + "learning_rate": 1.7037455298324866e-05, + "loss": 0.9552, + "step": 7870 + }, + { + "epoch": 1.48, + "grad_norm": 11.856039047241211, + "learning_rate": 1.7033690946734426e-05, + "loss": 0.9727, + "step": 7880 + }, + { + "epoch": 1.49, + "grad_norm": 6.3316755294799805, + "learning_rate": 1.702992659514399e-05, + "loss": 0.9975, + "step": 7890 + }, + { + "epoch": 1.49, + "grad_norm": 54.56752014160156, + "learning_rate": 1.702616224355355e-05, + "loss": 1.0682, + "step": 7900 + }, + { + "epoch": 1.49, + "grad_norm": 16.042255401611328, + "learning_rate": 1.7022397891963112e-05, + "loss": 1.2348, + "step": 7910 + }, + { + "epoch": 1.49, + "grad_norm": 14.975419044494629, + "learning_rate": 1.7018633540372672e-05, + "loss": 1.0173, + "step": 7920 + }, + { + "epoch": 1.49, + "grad_norm": 11.27600383758545, + "learning_rate": 1.701486918878223e-05, + "loss": 1.1986, + "step": 7930 + }, + { + "epoch": 1.49, + "grad_norm": 25.487232208251953, + "learning_rate": 1.7011104837191795e-05, + "loss": 1.5951, + "step": 7940 + }, + { + "epoch": 1.5, + "grad_norm": 19.516244888305664, + "learning_rate": 1.7007340485601354e-05, + "loss": 1.0892, + "step": 7950 + }, + { + "epoch": 1.5, + "grad_norm": 1.6688463687896729, + "learning_rate": 1.7003576134010918e-05, + "loss": 0.8421, + "step": 7960 + }, + { + "epoch": 1.5, + "grad_norm": 7.081192493438721, + "learning_rate": 1.6999811782420477e-05, + "loss": 1.0691, + "step": 7970 + }, + { + "epoch": 1.5, + "grad_norm": 10.31060791015625, + "learning_rate": 1.699604743083004e-05, + "loss": 0.8777, + "step": 7980 + }, + { + "epoch": 1.5, + "grad_norm": 7.9369659423828125, + "learning_rate": 1.6992283079239604e-05, + "loss": 0.9824, + "step": 7990 + }, + { + "epoch": 1.51, + "grad_norm": 13.844595909118652, + "learning_rate": 1.6988518727649163e-05, + "loss": 0.9572, + "step": 8000 + }, + { + "epoch": 1.51, + "grad_norm": 32.09144592285156, + "learning_rate": 1.6984754376058727e-05, + "loss": 1.0308, + "step": 8010 + }, + { + "epoch": 1.51, + "grad_norm": 11.795945167541504, + "learning_rate": 1.6980990024468286e-05, + "loss": 1.1595, + "step": 8020 + }, + { + "epoch": 1.51, + "grad_norm": 22.064987182617188, + "learning_rate": 1.697722567287785e-05, + "loss": 0.7747, + "step": 8030 + }, + { + "epoch": 1.51, + "grad_norm": 8.580666542053223, + "learning_rate": 1.697346132128741e-05, + "loss": 0.9154, + "step": 8040 + }, + { + "epoch": 1.52, + "grad_norm": 9.91849136352539, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.2361, + "step": 8050 + }, + { + "epoch": 1.52, + "grad_norm": 13.927072525024414, + "learning_rate": 1.6965932618106532e-05, + "loss": 0.8912, + "step": 8060 + }, + { + "epoch": 1.52, + "grad_norm": 7.275015830993652, + "learning_rate": 1.6962168266516095e-05, + "loss": 1.1334, + "step": 8070 + }, + { + "epoch": 1.52, + "grad_norm": 24.040142059326172, + "learning_rate": 1.6958403914925655e-05, + "loss": 0.9843, + "step": 8080 + }, + { + "epoch": 1.52, + "grad_norm": 22.510576248168945, + "learning_rate": 1.6954639563335218e-05, + "loss": 0.9083, + "step": 8090 + }, + { + "epoch": 1.52, + "grad_norm": 7.329137325286865, + "learning_rate": 1.6950875211744778e-05, + "loss": 1.0092, + "step": 8100 + }, + { + "epoch": 1.53, + "grad_norm": 5.5893330574035645, + "learning_rate": 1.6947110860154338e-05, + "loss": 0.9781, + "step": 8110 + }, + { + "epoch": 1.53, + "grad_norm": 22.0462703704834, + "learning_rate": 1.69433465085639e-05, + "loss": 0.9981, + "step": 8120 + }, + { + "epoch": 1.53, + "grad_norm": 10.5774564743042, + "learning_rate": 1.693958215697346e-05, + "loss": 1.0343, + "step": 8130 + }, + { + "epoch": 1.53, + "grad_norm": 15.137490272521973, + "learning_rate": 1.6935817805383024e-05, + "loss": 1.0412, + "step": 8140 + }, + { + "epoch": 1.53, + "grad_norm": 16.09028434753418, + "learning_rate": 1.6932053453792584e-05, + "loss": 0.8931, + "step": 8150 + }, + { + "epoch": 1.54, + "grad_norm": 3.9936368465423584, + "learning_rate": 1.6928289102202147e-05, + "loss": 0.787, + "step": 8160 + }, + { + "epoch": 1.54, + "grad_norm": 20.518054962158203, + "learning_rate": 1.692452475061171e-05, + "loss": 1.2866, + "step": 8170 + }, + { + "epoch": 1.54, + "grad_norm": 6.0525689125061035, + "learning_rate": 1.692076039902127e-05, + "loss": 1.1158, + "step": 8180 + }, + { + "epoch": 1.54, + "grad_norm": 9.403233528137207, + "learning_rate": 1.6916996047430833e-05, + "loss": 1.0815, + "step": 8190 + }, + { + "epoch": 1.54, + "grad_norm": 22.706296920776367, + "learning_rate": 1.6913231695840393e-05, + "loss": 1.0685, + "step": 8200 + }, + { + "epoch": 1.55, + "grad_norm": 10.937347412109375, + "learning_rate": 1.6909467344249956e-05, + "loss": 1.1039, + "step": 8210 + }, + { + "epoch": 1.55, + "grad_norm": 20.09309959411621, + "learning_rate": 1.6905702992659516e-05, + "loss": 0.9035, + "step": 8220 + }, + { + "epoch": 1.55, + "grad_norm": 8.693171501159668, + "learning_rate": 1.690193864106908e-05, + "loss": 1.0463, + "step": 8230 + }, + { + "epoch": 1.55, + "grad_norm": 15.067790031433105, + "learning_rate": 1.689817428947864e-05, + "loss": 1.1981, + "step": 8240 + }, + { + "epoch": 1.55, + "grad_norm": 5.470821857452393, + "learning_rate": 1.68944099378882e-05, + "loss": 1.1446, + "step": 8250 + }, + { + "epoch": 1.55, + "grad_norm": 7.007253646850586, + "learning_rate": 1.689064558629776e-05, + "loss": 1.101, + "step": 8260 + }, + { + "epoch": 1.56, + "grad_norm": 7.471108913421631, + "learning_rate": 1.6886881234707325e-05, + "loss": 0.7692, + "step": 8270 + }, + { + "epoch": 1.56, + "grad_norm": 10.229903221130371, + "learning_rate": 1.6883116883116884e-05, + "loss": 0.9428, + "step": 8280 + }, + { + "epoch": 1.56, + "grad_norm": 13.042211532592773, + "learning_rate": 1.6879352531526444e-05, + "loss": 1.2651, + "step": 8290 + }, + { + "epoch": 1.56, + "grad_norm": 9.159402847290039, + "learning_rate": 1.6875588179936007e-05, + "loss": 0.7305, + "step": 8300 + }, + { + "epoch": 1.56, + "grad_norm": 7.143734931945801, + "learning_rate": 1.6871823828345567e-05, + "loss": 1.3042, + "step": 8310 + }, + { + "epoch": 1.57, + "grad_norm": 5.597439765930176, + "learning_rate": 1.686805947675513e-05, + "loss": 0.7694, + "step": 8320 + }, + { + "epoch": 1.57, + "grad_norm": 24.404653549194336, + "learning_rate": 1.686429512516469e-05, + "loss": 0.7511, + "step": 8330 + }, + { + "epoch": 1.57, + "grad_norm": 3.326371908187866, + "learning_rate": 1.6860530773574253e-05, + "loss": 0.8211, + "step": 8340 + }, + { + "epoch": 1.57, + "grad_norm": 16.690340042114258, + "learning_rate": 1.6856766421983813e-05, + "loss": 1.0918, + "step": 8350 + }, + { + "epoch": 1.57, + "grad_norm": 3.873743772506714, + "learning_rate": 1.6853002070393376e-05, + "loss": 0.8984, + "step": 8360 + }, + { + "epoch": 1.58, + "grad_norm": 38.66802978515625, + "learning_rate": 1.684923771880294e-05, + "loss": 0.6514, + "step": 8370 + }, + { + "epoch": 1.58, + "grad_norm": 23.641817092895508, + "learning_rate": 1.68454733672125e-05, + "loss": 0.9783, + "step": 8380 + }, + { + "epoch": 1.58, + "grad_norm": 14.112013816833496, + "learning_rate": 1.6841709015622062e-05, + "loss": 0.8311, + "step": 8390 + }, + { + "epoch": 1.58, + "grad_norm": 5.782837867736816, + "learning_rate": 1.6837944664031622e-05, + "loss": 0.9418, + "step": 8400 + }, + { + "epoch": 1.58, + "grad_norm": 5.840491771697998, + "learning_rate": 1.6834180312441185e-05, + "loss": 0.9358, + "step": 8410 + }, + { + "epoch": 1.58, + "grad_norm": 28.026111602783203, + "learning_rate": 1.6830415960850745e-05, + "loss": 1.5066, + "step": 8420 + }, + { + "epoch": 1.59, + "grad_norm": 13.618431091308594, + "learning_rate": 1.6826651609260308e-05, + "loss": 1.0053, + "step": 8430 + }, + { + "epoch": 1.59, + "grad_norm": 8.274946212768555, + "learning_rate": 1.6822887257669868e-05, + "loss": 1.0545, + "step": 8440 + }, + { + "epoch": 1.59, + "grad_norm": 19.648679733276367, + "learning_rate": 1.6819122906079427e-05, + "loss": 0.8776, + "step": 8450 + }, + { + "epoch": 1.59, + "grad_norm": 15.531411170959473, + "learning_rate": 1.681535855448899e-05, + "loss": 1.0128, + "step": 8460 + }, + { + "epoch": 1.59, + "grad_norm": 4.063401222229004, + "learning_rate": 1.681159420289855e-05, + "loss": 1.0426, + "step": 8470 + }, + { + "epoch": 1.6, + "grad_norm": 11.043712615966797, + "learning_rate": 1.6807829851308114e-05, + "loss": 0.7918, + "step": 8480 + }, + { + "epoch": 1.6, + "grad_norm": 8.745186805725098, + "learning_rate": 1.6804065499717673e-05, + "loss": 0.8653, + "step": 8490 + }, + { + "epoch": 1.6, + "grad_norm": 13.314781188964844, + "learning_rate": 1.6800301148127236e-05, + "loss": 0.904, + "step": 8500 + }, + { + "epoch": 1.6, + "grad_norm": 4.7882208824157715, + "learning_rate": 1.6796536796536796e-05, + "loss": 0.8867, + "step": 8510 + }, + { + "epoch": 1.6, + "grad_norm": 11.435511589050293, + "learning_rate": 1.679277244494636e-05, + "loss": 1.0631, + "step": 8520 + }, + { + "epoch": 1.61, + "grad_norm": 30.225162506103516, + "learning_rate": 1.678900809335592e-05, + "loss": 1.235, + "step": 8530 + }, + { + "epoch": 1.61, + "grad_norm": 17.80291175842285, + "learning_rate": 1.6785243741765482e-05, + "loss": 1.2388, + "step": 8540 + }, + { + "epoch": 1.61, + "grad_norm": 19.908802032470703, + "learning_rate": 1.6781479390175045e-05, + "loss": 1.1334, + "step": 8550 + }, + { + "epoch": 1.61, + "grad_norm": 4.993206977844238, + "learning_rate": 1.6777715038584605e-05, + "loss": 0.9747, + "step": 8560 + }, + { + "epoch": 1.61, + "grad_norm": 27.299789428710938, + "learning_rate": 1.677395068699417e-05, + "loss": 1.1113, + "step": 8570 + }, + { + "epoch": 1.61, + "grad_norm": 12.851706504821777, + "learning_rate": 1.6770186335403728e-05, + "loss": 0.907, + "step": 8580 + }, + { + "epoch": 1.62, + "grad_norm": 24.122529983520508, + "learning_rate": 1.676642198381329e-05, + "loss": 0.9915, + "step": 8590 + }, + { + "epoch": 1.62, + "grad_norm": 22.42790985107422, + "learning_rate": 1.676265763222285e-05, + "loss": 0.9087, + "step": 8600 + }, + { + "epoch": 1.62, + "grad_norm": 10.046234130859375, + "learning_rate": 1.6758893280632414e-05, + "loss": 0.9194, + "step": 8610 + }, + { + "epoch": 1.62, + "grad_norm": 11.75545883178711, + "learning_rate": 1.6755128929041974e-05, + "loss": 1.2203, + "step": 8620 + }, + { + "epoch": 1.62, + "grad_norm": 12.967751502990723, + "learning_rate": 1.6751364577451534e-05, + "loss": 1.1011, + "step": 8630 + }, + { + "epoch": 1.63, + "grad_norm": 6.418219566345215, + "learning_rate": 1.6747600225861097e-05, + "loss": 1.0657, + "step": 8640 + }, + { + "epoch": 1.63, + "grad_norm": 39.12499237060547, + "learning_rate": 1.6743835874270657e-05, + "loss": 1.2006, + "step": 8650 + }, + { + "epoch": 1.63, + "grad_norm": 2.25285267829895, + "learning_rate": 1.674007152268022e-05, + "loss": 0.8309, + "step": 8660 + }, + { + "epoch": 1.63, + "grad_norm": 8.486536026000977, + "learning_rate": 1.673630717108978e-05, + "loss": 0.9349, + "step": 8670 + }, + { + "epoch": 1.63, + "grad_norm": 3.4122467041015625, + "learning_rate": 1.6732542819499343e-05, + "loss": 1.1027, + "step": 8680 + }, + { + "epoch": 1.64, + "grad_norm": 11.690104484558105, + "learning_rate": 1.6728778467908903e-05, + "loss": 0.9033, + "step": 8690 + }, + { + "epoch": 1.64, + "grad_norm": 16.22601318359375, + "learning_rate": 1.6725014116318466e-05, + "loss": 1.0882, + "step": 8700 + }, + { + "epoch": 1.64, + "grad_norm": 20.739831924438477, + "learning_rate": 1.6721249764728025e-05, + "loss": 0.9375, + "step": 8710 + }, + { + "epoch": 1.64, + "grad_norm": 15.394549369812012, + "learning_rate": 1.671748541313759e-05, + "loss": 0.7345, + "step": 8720 + }, + { + "epoch": 1.64, + "grad_norm": 26.133543014526367, + "learning_rate": 1.6713721061547152e-05, + "loss": 0.64, + "step": 8730 + }, + { + "epoch": 1.65, + "grad_norm": 13.604958534240723, + "learning_rate": 1.670995670995671e-05, + "loss": 0.9136, + "step": 8740 + }, + { + "epoch": 1.65, + "grad_norm": 42.65677261352539, + "learning_rate": 1.6706192358366275e-05, + "loss": 1.0659, + "step": 8750 + }, + { + "epoch": 1.65, + "grad_norm": 8.75528621673584, + "learning_rate": 1.6702428006775834e-05, + "loss": 0.8149, + "step": 8760 + }, + { + "epoch": 1.65, + "grad_norm": 5.066274642944336, + "learning_rate": 1.6698663655185398e-05, + "loss": 1.0136, + "step": 8770 + }, + { + "epoch": 1.65, + "grad_norm": 22.15322494506836, + "learning_rate": 1.6694899303594957e-05, + "loss": 1.0216, + "step": 8780 + }, + { + "epoch": 1.65, + "grad_norm": 14.151029586791992, + "learning_rate": 1.669113495200452e-05, + "loss": 1.1095, + "step": 8790 + }, + { + "epoch": 1.66, + "grad_norm": 6.58624267578125, + "learning_rate": 1.668737060041408e-05, + "loss": 0.8408, + "step": 8800 + }, + { + "epoch": 1.66, + "grad_norm": 25.148658752441406, + "learning_rate": 1.668360624882364e-05, + "loss": 1.0391, + "step": 8810 + }, + { + "epoch": 1.66, + "grad_norm": 6.866160869598389, + "learning_rate": 1.6679841897233203e-05, + "loss": 0.9131, + "step": 8820 + }, + { + "epoch": 1.66, + "grad_norm": 32.80194854736328, + "learning_rate": 1.6676077545642763e-05, + "loss": 0.8875, + "step": 8830 + }, + { + "epoch": 1.66, + "grad_norm": 3.0241827964782715, + "learning_rate": 1.6672313194052326e-05, + "loss": 0.7572, + "step": 8840 + }, + { + "epoch": 1.67, + "grad_norm": 14.243484497070312, + "learning_rate": 1.6668548842461886e-05, + "loss": 0.9565, + "step": 8850 + }, + { + "epoch": 1.67, + "grad_norm": 18.31822395324707, + "learning_rate": 1.666478449087145e-05, + "loss": 1.0248, + "step": 8860 + }, + { + "epoch": 1.67, + "grad_norm": 18.321674346923828, + "learning_rate": 1.666102013928101e-05, + "loss": 0.9848, + "step": 8870 + }, + { + "epoch": 1.67, + "grad_norm": 14.554122924804688, + "learning_rate": 1.6657255787690572e-05, + "loss": 0.9658, + "step": 8880 + }, + { + "epoch": 1.67, + "grad_norm": 12.484946250915527, + "learning_rate": 1.6653491436100132e-05, + "loss": 0.8559, + "step": 8890 + }, + { + "epoch": 1.68, + "grad_norm": 5.653412342071533, + "learning_rate": 1.6649727084509695e-05, + "loss": 0.9627, + "step": 8900 + }, + { + "epoch": 1.68, + "grad_norm": 7.155862331390381, + "learning_rate": 1.6645962732919258e-05, + "loss": 0.8445, + "step": 8910 + }, + { + "epoch": 1.68, + "grad_norm": 12.255889892578125, + "learning_rate": 1.6642198381328818e-05, + "loss": 0.7791, + "step": 8920 + }, + { + "epoch": 1.68, + "grad_norm": 8.364027976989746, + "learning_rate": 1.663843402973838e-05, + "loss": 1.0761, + "step": 8930 + }, + { + "epoch": 1.68, + "grad_norm": 19.489355087280273, + "learning_rate": 1.663466967814794e-05, + "loss": 0.8686, + "step": 8940 + }, + { + "epoch": 1.68, + "grad_norm": 12.537185668945312, + "learning_rate": 1.6630905326557504e-05, + "loss": 0.9145, + "step": 8950 + }, + { + "epoch": 1.69, + "grad_norm": 26.021825790405273, + "learning_rate": 1.6627140974967064e-05, + "loss": 0.8122, + "step": 8960 + }, + { + "epoch": 1.69, + "grad_norm": 6.992552757263184, + "learning_rate": 1.6623376623376627e-05, + "loss": 0.7718, + "step": 8970 + }, + { + "epoch": 1.69, + "grad_norm": 10.073301315307617, + "learning_rate": 1.6619612271786187e-05, + "loss": 0.9453, + "step": 8980 + }, + { + "epoch": 1.69, + "grad_norm": 25.186731338500977, + "learning_rate": 1.6615847920195746e-05, + "loss": 0.7758, + "step": 8990 + }, + { + "epoch": 1.69, + "grad_norm": 15.207765579223633, + "learning_rate": 1.661208356860531e-05, + "loss": 1.0578, + "step": 9000 + }, + { + "epoch": 1.7, + "grad_norm": 14.172220230102539, + "learning_rate": 1.660831921701487e-05, + "loss": 1.0341, + "step": 9010 + }, + { + "epoch": 1.7, + "grad_norm": 10.55916976928711, + "learning_rate": 1.6604554865424432e-05, + "loss": 0.9354, + "step": 9020 + }, + { + "epoch": 1.7, + "grad_norm": 13.917641639709473, + "learning_rate": 1.6600790513833992e-05, + "loss": 1.136, + "step": 9030 + }, + { + "epoch": 1.7, + "grad_norm": 11.619650840759277, + "learning_rate": 1.6597026162243555e-05, + "loss": 0.7212, + "step": 9040 + }, + { + "epoch": 1.7, + "grad_norm": 17.172563552856445, + "learning_rate": 1.6593261810653115e-05, + "loss": 0.7703, + "step": 9050 + }, + { + "epoch": 1.71, + "grad_norm": 18.772756576538086, + "learning_rate": 1.6589497459062678e-05, + "loss": 0.8157, + "step": 9060 + }, + { + "epoch": 1.71, + "grad_norm": 18.383899688720703, + "learning_rate": 1.6585733107472238e-05, + "loss": 1.1417, + "step": 9070 + }, + { + "epoch": 1.71, + "grad_norm": 15.073466300964355, + "learning_rate": 1.65819687558818e-05, + "loss": 0.9564, + "step": 9080 + }, + { + "epoch": 1.71, + "grad_norm": 23.901710510253906, + "learning_rate": 1.657820440429136e-05, + "loss": 1.1246, + "step": 9090 + }, + { + "epoch": 1.71, + "grad_norm": 7.932938575744629, + "learning_rate": 1.6574440052700924e-05, + "loss": 0.949, + "step": 9100 + }, + { + "epoch": 1.71, + "grad_norm": 4.0341410636901855, + "learning_rate": 1.6570675701110487e-05, + "loss": 1.0209, + "step": 9110 + }, + { + "epoch": 1.72, + "grad_norm": 7.543979167938232, + "learning_rate": 1.6566911349520047e-05, + "loss": 1.0694, + "step": 9120 + }, + { + "epoch": 1.72, + "grad_norm": 18.913318634033203, + "learning_rate": 1.656314699792961e-05, + "loss": 0.9295, + "step": 9130 + }, + { + "epoch": 1.72, + "grad_norm": 7.830410003662109, + "learning_rate": 1.655938264633917e-05, + "loss": 0.7556, + "step": 9140 + }, + { + "epoch": 1.72, + "grad_norm": 7.6431379318237305, + "learning_rate": 1.655561829474873e-05, + "loss": 0.8671, + "step": 9150 + }, + { + "epoch": 1.72, + "grad_norm": 27.64036750793457, + "learning_rate": 1.6551853943158293e-05, + "loss": 0.6891, + "step": 9160 + }, + { + "epoch": 1.73, + "grad_norm": 15.285219192504883, + "learning_rate": 1.6548089591567853e-05, + "loss": 1.1812, + "step": 9170 + }, + { + "epoch": 1.73, + "grad_norm": 11.814650535583496, + "learning_rate": 1.6544325239977416e-05, + "loss": 0.8133, + "step": 9180 + }, + { + "epoch": 1.73, + "grad_norm": 3.3252789974212646, + "learning_rate": 1.6540560888386976e-05, + "loss": 1.0719, + "step": 9190 + }, + { + "epoch": 1.73, + "grad_norm": 2.2466318607330322, + "learning_rate": 1.653679653679654e-05, + "loss": 0.8765, + "step": 9200 + }, + { + "epoch": 1.73, + "grad_norm": 34.163726806640625, + "learning_rate": 1.65330321852061e-05, + "loss": 1.2877, + "step": 9210 + }, + { + "epoch": 1.74, + "grad_norm": 27.794078826904297, + "learning_rate": 1.652926783361566e-05, + "loss": 0.8177, + "step": 9220 + }, + { + "epoch": 1.74, + "grad_norm": 16.112585067749023, + "learning_rate": 1.652550348202522e-05, + "loss": 1.1625, + "step": 9230 + }, + { + "epoch": 1.74, + "grad_norm": 19.92578125, + "learning_rate": 1.6521739130434785e-05, + "loss": 1.1242, + "step": 9240 + }, + { + "epoch": 1.74, + "grad_norm": 17.408260345458984, + "learning_rate": 1.6517974778844344e-05, + "loss": 1.1734, + "step": 9250 + }, + { + "epoch": 1.74, + "grad_norm": 23.571901321411133, + "learning_rate": 1.6514210427253907e-05, + "loss": 0.9096, + "step": 9260 + }, + { + "epoch": 1.74, + "grad_norm": 10.09569263458252, + "learning_rate": 1.6510446075663467e-05, + "loss": 0.8421, + "step": 9270 + }, + { + "epoch": 1.75, + "grad_norm": 22.68370246887207, + "learning_rate": 1.650668172407303e-05, + "loss": 0.8, + "step": 9280 + }, + { + "epoch": 1.75, + "grad_norm": 26.997875213623047, + "learning_rate": 1.6502917372482594e-05, + "loss": 0.9072, + "step": 9290 + }, + { + "epoch": 1.75, + "grad_norm": 26.56907081604004, + "learning_rate": 1.6499153020892153e-05, + "loss": 1.1413, + "step": 9300 + }, + { + "epoch": 1.75, + "grad_norm": 13.025582313537598, + "learning_rate": 1.6495388669301716e-05, + "loss": 0.712, + "step": 9310 + }, + { + "epoch": 1.75, + "grad_norm": 2.8579206466674805, + "learning_rate": 1.6491624317711273e-05, + "loss": 1.0311, + "step": 9320 + }, + { + "epoch": 1.76, + "grad_norm": 7.947895526885986, + "learning_rate": 1.6487859966120836e-05, + "loss": 0.8033, + "step": 9330 + }, + { + "epoch": 1.76, + "grad_norm": 6.863089561462402, + "learning_rate": 1.64840956145304e-05, + "loss": 1.0714, + "step": 9340 + }, + { + "epoch": 1.76, + "grad_norm": 10.841645240783691, + "learning_rate": 1.648033126293996e-05, + "loss": 0.6842, + "step": 9350 + }, + { + "epoch": 1.76, + "grad_norm": 27.438404083251953, + "learning_rate": 1.6476566911349522e-05, + "loss": 0.851, + "step": 9360 + }, + { + "epoch": 1.76, + "grad_norm": 2.9211061000823975, + "learning_rate": 1.6472802559759082e-05, + "loss": 0.8752, + "step": 9370 + }, + { + "epoch": 1.77, + "grad_norm": 13.311325073242188, + "learning_rate": 1.6469038208168645e-05, + "loss": 1.0659, + "step": 9380 + }, + { + "epoch": 1.77, + "grad_norm": 9.285094261169434, + "learning_rate": 1.6465273856578205e-05, + "loss": 0.8645, + "step": 9390 + }, + { + "epoch": 1.77, + "grad_norm": 12.683711051940918, + "learning_rate": 1.6461509504987768e-05, + "loss": 0.8845, + "step": 9400 + }, + { + "epoch": 1.77, + "grad_norm": 16.096887588500977, + "learning_rate": 1.6457745153397328e-05, + "loss": 0.9758, + "step": 9410 + }, + { + "epoch": 1.77, + "grad_norm": 10.897958755493164, + "learning_rate": 1.645398080180689e-05, + "loss": 1.2507, + "step": 9420 + }, + { + "epoch": 1.77, + "grad_norm": 22.48199462890625, + "learning_rate": 1.645021645021645e-05, + "loss": 0.7385, + "step": 9430 + }, + { + "epoch": 1.78, + "grad_norm": 16.660545349121094, + "learning_rate": 1.6446452098626014e-05, + "loss": 0.8564, + "step": 9440 + }, + { + "epoch": 1.78, + "grad_norm": 8.656999588012695, + "learning_rate": 1.6442687747035574e-05, + "loss": 0.7063, + "step": 9450 + }, + { + "epoch": 1.78, + "grad_norm": 10.140769004821777, + "learning_rate": 1.6438923395445137e-05, + "loss": 1.0147, + "step": 9460 + }, + { + "epoch": 1.78, + "grad_norm": 4.824342727661133, + "learning_rate": 1.64351590438547e-05, + "loss": 0.7609, + "step": 9470 + }, + { + "epoch": 1.78, + "grad_norm": 9.159531593322754, + "learning_rate": 1.643139469226426e-05, + "loss": 1.0369, + "step": 9480 + }, + { + "epoch": 1.79, + "grad_norm": 14.373122215270996, + "learning_rate": 1.6427630340673823e-05, + "loss": 1.1801, + "step": 9490 + }, + { + "epoch": 1.79, + "grad_norm": 5.078024864196777, + "learning_rate": 1.642386598908338e-05, + "loss": 0.9217, + "step": 9500 + }, + { + "epoch": 1.79, + "grad_norm": 7.723122596740723, + "learning_rate": 1.6420101637492942e-05, + "loss": 0.9727, + "step": 9510 + }, + { + "epoch": 1.79, + "grad_norm": 9.917556762695312, + "learning_rate": 1.6416337285902505e-05, + "loss": 1.2133, + "step": 9520 + }, + { + "epoch": 1.79, + "grad_norm": 6.582098007202148, + "learning_rate": 1.6412572934312065e-05, + "loss": 0.9397, + "step": 9530 + }, + { + "epoch": 1.8, + "grad_norm": 10.489495277404785, + "learning_rate": 1.640880858272163e-05, + "loss": 1.093, + "step": 9540 + }, + { + "epoch": 1.8, + "grad_norm": 21.351308822631836, + "learning_rate": 1.6405044231131188e-05, + "loss": 0.9417, + "step": 9550 + }, + { + "epoch": 1.8, + "grad_norm": 14.546730041503906, + "learning_rate": 1.640127987954075e-05, + "loss": 1.0752, + "step": 9560 + }, + { + "epoch": 1.8, + "grad_norm": 14.62718391418457, + "learning_rate": 1.639751552795031e-05, + "loss": 0.85, + "step": 9570 + }, + { + "epoch": 1.8, + "grad_norm": 9.844496726989746, + "learning_rate": 1.6393751176359874e-05, + "loss": 1.3426, + "step": 9580 + }, + { + "epoch": 1.81, + "grad_norm": 3.517404317855835, + "learning_rate": 1.6389986824769434e-05, + "loss": 0.8968, + "step": 9590 + }, + { + "epoch": 1.81, + "grad_norm": 39.88374328613281, + "learning_rate": 1.6386222473178997e-05, + "loss": 0.6198, + "step": 9600 + }, + { + "epoch": 1.81, + "grad_norm": 23.119050979614258, + "learning_rate": 1.6382458121588557e-05, + "loss": 0.7901, + "step": 9610 + }, + { + "epoch": 1.81, + "grad_norm": 7.583620071411133, + "learning_rate": 1.637869376999812e-05, + "loss": 1.0529, + "step": 9620 + }, + { + "epoch": 1.81, + "grad_norm": 29.128244400024414, + "learning_rate": 1.637492941840768e-05, + "loss": 1.237, + "step": 9630 + }, + { + "epoch": 1.81, + "grad_norm": 3.6622776985168457, + "learning_rate": 1.6371165066817243e-05, + "loss": 0.9682, + "step": 9640 + }, + { + "epoch": 1.82, + "grad_norm": 28.509607315063477, + "learning_rate": 1.6367400715226803e-05, + "loss": 1.0373, + "step": 9650 + }, + { + "epoch": 1.82, + "grad_norm": 2.624483346939087, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.9508, + "step": 9660 + }, + { + "epoch": 1.82, + "grad_norm": 14.195404052734375, + "learning_rate": 1.6359872012045926e-05, + "loss": 1.0253, + "step": 9670 + }, + { + "epoch": 1.82, + "grad_norm": 10.364255905151367, + "learning_rate": 1.6356107660455485e-05, + "loss": 1.1393, + "step": 9680 + }, + { + "epoch": 1.82, + "grad_norm": 17.5921688079834, + "learning_rate": 1.635234330886505e-05, + "loss": 1.2439, + "step": 9690 + }, + { + "epoch": 1.83, + "grad_norm": 20.8980712890625, + "learning_rate": 1.6348578957274612e-05, + "loss": 1.1815, + "step": 9700 + }, + { + "epoch": 1.83, + "grad_norm": 5.478924751281738, + "learning_rate": 1.634481460568417e-05, + "loss": 1.0223, + "step": 9710 + }, + { + "epoch": 1.83, + "grad_norm": 20.507476806640625, + "learning_rate": 1.6341050254093735e-05, + "loss": 0.6629, + "step": 9720 + }, + { + "epoch": 1.83, + "grad_norm": 28.950838088989258, + "learning_rate": 1.6337285902503294e-05, + "loss": 0.858, + "step": 9730 + }, + { + "epoch": 1.83, + "grad_norm": 14.215471267700195, + "learning_rate": 1.6333521550912858e-05, + "loss": 1.0337, + "step": 9740 + }, + { + "epoch": 1.84, + "grad_norm": 9.36776065826416, + "learning_rate": 1.6329757199322417e-05, + "loss": 1.4074, + "step": 9750 + }, + { + "epoch": 1.84, + "grad_norm": 7.266942501068115, + "learning_rate": 1.632599284773198e-05, + "loss": 0.8383, + "step": 9760 + }, + { + "epoch": 1.84, + "grad_norm": 8.291203498840332, + "learning_rate": 1.632222849614154e-05, + "loss": 0.8997, + "step": 9770 + }, + { + "epoch": 1.84, + "grad_norm": 28.071760177612305, + "learning_rate": 1.6318464144551103e-05, + "loss": 0.8101, + "step": 9780 + }, + { + "epoch": 1.84, + "grad_norm": 8.149245262145996, + "learning_rate": 1.6314699792960663e-05, + "loss": 0.7052, + "step": 9790 + }, + { + "epoch": 1.84, + "grad_norm": 34.350502014160156, + "learning_rate": 1.6310935441370226e-05, + "loss": 0.9759, + "step": 9800 + }, + { + "epoch": 1.85, + "grad_norm": 4.613625526428223, + "learning_rate": 1.6307171089779786e-05, + "loss": 0.9999, + "step": 9810 + }, + { + "epoch": 1.85, + "grad_norm": 8.235222816467285, + "learning_rate": 1.630340673818935e-05, + "loss": 0.8468, + "step": 9820 + }, + { + "epoch": 1.85, + "grad_norm": 9.5585298538208, + "learning_rate": 1.629964238659891e-05, + "loss": 1.0039, + "step": 9830 + }, + { + "epoch": 1.85, + "grad_norm": 23.900362014770508, + "learning_rate": 1.6295878035008472e-05, + "loss": 1.15, + "step": 9840 + }, + { + "epoch": 1.85, + "grad_norm": 5.563354969024658, + "learning_rate": 1.6292113683418032e-05, + "loss": 0.723, + "step": 9850 + }, + { + "epoch": 1.86, + "grad_norm": 38.29470443725586, + "learning_rate": 1.6288349331827592e-05, + "loss": 0.9061, + "step": 9860 + }, + { + "epoch": 1.86, + "grad_norm": 17.7316837310791, + "learning_rate": 1.6284584980237155e-05, + "loss": 1.1306, + "step": 9870 + }, + { + "epoch": 1.86, + "grad_norm": 22.314489364624023, + "learning_rate": 1.6280820628646715e-05, + "loss": 0.8234, + "step": 9880 + }, + { + "epoch": 1.86, + "grad_norm": 10.5621337890625, + "learning_rate": 1.6277056277056278e-05, + "loss": 0.7809, + "step": 9890 + }, + { + "epoch": 1.86, + "grad_norm": 13.799981117248535, + "learning_rate": 1.627329192546584e-05, + "loss": 0.9947, + "step": 9900 + }, + { + "epoch": 1.87, + "grad_norm": 15.118837356567383, + "learning_rate": 1.62695275738754e-05, + "loss": 0.9937, + "step": 9910 + }, + { + "epoch": 1.87, + "grad_norm": 8.086974143981934, + "learning_rate": 1.6265763222284964e-05, + "loss": 0.8989, + "step": 9920 + }, + { + "epoch": 1.87, + "grad_norm": 24.501052856445312, + "learning_rate": 1.6261998870694524e-05, + "loss": 0.9246, + "step": 9930 + }, + { + "epoch": 1.87, + "grad_norm": 17.838918685913086, + "learning_rate": 1.6258234519104087e-05, + "loss": 0.6852, + "step": 9940 + }, + { + "epoch": 1.87, + "grad_norm": 14.205941200256348, + "learning_rate": 1.6254470167513647e-05, + "loss": 0.9937, + "step": 9950 + }, + { + "epoch": 1.87, + "grad_norm": 37.4649658203125, + "learning_rate": 1.625070581592321e-05, + "loss": 1.1566, + "step": 9960 + }, + { + "epoch": 1.88, + "grad_norm": 3.642307758331299, + "learning_rate": 1.624694146433277e-05, + "loss": 0.6511, + "step": 9970 + }, + { + "epoch": 1.88, + "grad_norm": 8.570235252380371, + "learning_rate": 1.6243177112742333e-05, + "loss": 1.0627, + "step": 9980 + }, + { + "epoch": 1.88, + "grad_norm": 49.44059753417969, + "learning_rate": 1.6239412761151892e-05, + "loss": 0.87, + "step": 9990 + }, + { + "epoch": 1.88, + "grad_norm": 4.772658824920654, + "learning_rate": 1.6235648409561456e-05, + "loss": 0.9856, + "step": 10000 + }, + { + "epoch": 1.88, + "grad_norm": 5.388487815856934, + "learning_rate": 1.6231884057971015e-05, + "loss": 0.8331, + "step": 10010 + }, + { + "epoch": 1.89, + "grad_norm": 28.38582992553711, + "learning_rate": 1.6228119706380575e-05, + "loss": 0.7234, + "step": 10020 + }, + { + "epoch": 1.89, + "grad_norm": 35.28871154785156, + "learning_rate": 1.6224355354790138e-05, + "loss": 1.2131, + "step": 10030 + }, + { + "epoch": 1.89, + "grad_norm": 11.386420249938965, + "learning_rate": 1.6220591003199698e-05, + "loss": 0.5296, + "step": 10040 + }, + { + "epoch": 1.89, + "grad_norm": 8.352224349975586, + "learning_rate": 1.621682665160926e-05, + "loss": 0.8543, + "step": 10050 + }, + { + "epoch": 1.89, + "grad_norm": 45.16379928588867, + "learning_rate": 1.621306230001882e-05, + "loss": 1.0114, + "step": 10060 + }, + { + "epoch": 1.9, + "grad_norm": 18.55881118774414, + "learning_rate": 1.6209297948428384e-05, + "loss": 0.8992, + "step": 10070 + }, + { + "epoch": 1.9, + "grad_norm": 35.752506256103516, + "learning_rate": 1.6205533596837947e-05, + "loss": 1.1698, + "step": 10080 + }, + { + "epoch": 1.9, + "grad_norm": 9.165136337280273, + "learning_rate": 1.6201769245247507e-05, + "loss": 0.809, + "step": 10090 + }, + { + "epoch": 1.9, + "grad_norm": 6.1471781730651855, + "learning_rate": 1.619800489365707e-05, + "loss": 0.9719, + "step": 10100 + }, + { + "epoch": 1.9, + "grad_norm": 6.279614448547363, + "learning_rate": 1.619424054206663e-05, + "loss": 0.8327, + "step": 10110 + }, + { + "epoch": 1.9, + "grad_norm": 4.469071865081787, + "learning_rate": 1.6190476190476193e-05, + "loss": 0.7403, + "step": 10120 + }, + { + "epoch": 1.91, + "grad_norm": 28.841690063476562, + "learning_rate": 1.6186711838885753e-05, + "loss": 0.9098, + "step": 10130 + }, + { + "epoch": 1.91, + "grad_norm": 13.314472198486328, + "learning_rate": 1.6182947487295316e-05, + "loss": 0.8678, + "step": 10140 + }, + { + "epoch": 1.91, + "grad_norm": 12.73503589630127, + "learning_rate": 1.6179183135704876e-05, + "loss": 0.9671, + "step": 10150 + }, + { + "epoch": 1.91, + "grad_norm": 5.612974643707275, + "learning_rate": 1.617541878411444e-05, + "loss": 0.8328, + "step": 10160 + }, + { + "epoch": 1.91, + "grad_norm": 16.328927993774414, + "learning_rate": 1.6171654432524e-05, + "loss": 0.9885, + "step": 10170 + }, + { + "epoch": 1.92, + "grad_norm": 22.157751083374023, + "learning_rate": 1.6167890080933562e-05, + "loss": 0.9805, + "step": 10180 + }, + { + "epoch": 1.92, + "grad_norm": 41.27034378051758, + "learning_rate": 1.616412572934312e-05, + "loss": 1.0366, + "step": 10190 + }, + { + "epoch": 1.92, + "grad_norm": 9.670302391052246, + "learning_rate": 1.616036137775268e-05, + "loss": 1.0358, + "step": 10200 + }, + { + "epoch": 1.92, + "grad_norm": 23.558927536010742, + "learning_rate": 1.6156597026162245e-05, + "loss": 1.0069, + "step": 10210 + }, + { + "epoch": 1.92, + "grad_norm": 9.577465057373047, + "learning_rate": 1.6152832674571804e-05, + "loss": 0.9131, + "step": 10220 + }, + { + "epoch": 1.93, + "grad_norm": 7.152612209320068, + "learning_rate": 1.6149068322981367e-05, + "loss": 0.9243, + "step": 10230 + }, + { + "epoch": 1.93, + "grad_norm": 14.56041431427002, + "learning_rate": 1.6145303971390927e-05, + "loss": 1.2452, + "step": 10240 + }, + { + "epoch": 1.93, + "grad_norm": 4.7469611167907715, + "learning_rate": 1.614153961980049e-05, + "loss": 0.8841, + "step": 10250 + }, + { + "epoch": 1.93, + "grad_norm": 0.6863610148429871, + "learning_rate": 1.6137775268210054e-05, + "loss": 0.6211, + "step": 10260 + }, + { + "epoch": 1.93, + "grad_norm": 18.57179832458496, + "learning_rate": 1.6134010916619613e-05, + "loss": 1.0319, + "step": 10270 + }, + { + "epoch": 1.93, + "grad_norm": 11.881142616271973, + "learning_rate": 1.6130246565029176e-05, + "loss": 1.0036, + "step": 10280 + }, + { + "epoch": 1.94, + "grad_norm": 12.16629695892334, + "learning_rate": 1.6126482213438736e-05, + "loss": 0.7445, + "step": 10290 + }, + { + "epoch": 1.94, + "grad_norm": 26.047819137573242, + "learning_rate": 1.61227178618483e-05, + "loss": 1.1173, + "step": 10300 + }, + { + "epoch": 1.94, + "grad_norm": 4.4914116859436035, + "learning_rate": 1.611895351025786e-05, + "loss": 0.9002, + "step": 10310 + }, + { + "epoch": 1.94, + "grad_norm": 12.42726993560791, + "learning_rate": 1.6115189158667422e-05, + "loss": 0.9133, + "step": 10320 + }, + { + "epoch": 1.94, + "grad_norm": 12.923616409301758, + "learning_rate": 1.6111424807076982e-05, + "loss": 0.4911, + "step": 10330 + }, + { + "epoch": 1.95, + "grad_norm": 22.348865509033203, + "learning_rate": 1.6107660455486545e-05, + "loss": 0.8791, + "step": 10340 + }, + { + "epoch": 1.95, + "grad_norm": 4.061618804931641, + "learning_rate": 1.6103896103896105e-05, + "loss": 0.9309, + "step": 10350 + }, + { + "epoch": 1.95, + "grad_norm": 25.17974090576172, + "learning_rate": 1.6100131752305668e-05, + "loss": 0.995, + "step": 10360 + }, + { + "epoch": 1.95, + "grad_norm": 25.17315673828125, + "learning_rate": 1.6096367400715228e-05, + "loss": 0.6279, + "step": 10370 + }, + { + "epoch": 1.95, + "grad_norm": 33.93449020385742, + "learning_rate": 1.6092603049124788e-05, + "loss": 0.9279, + "step": 10380 + }, + { + "epoch": 1.96, + "grad_norm": 8.33711051940918, + "learning_rate": 1.608883869753435e-05, + "loss": 1.0123, + "step": 10390 + }, + { + "epoch": 1.96, + "grad_norm": 4.49125862121582, + "learning_rate": 1.608507434594391e-05, + "loss": 0.7202, + "step": 10400 + }, + { + "epoch": 1.96, + "grad_norm": 21.362960815429688, + "learning_rate": 1.6081309994353474e-05, + "loss": 1.0845, + "step": 10410 + }, + { + "epoch": 1.96, + "grad_norm": 22.759014129638672, + "learning_rate": 1.6077545642763034e-05, + "loss": 0.6752, + "step": 10420 + }, + { + "epoch": 1.96, + "grad_norm": 20.3575382232666, + "learning_rate": 1.6073781291172597e-05, + "loss": 0.9548, + "step": 10430 + }, + { + "epoch": 1.96, + "grad_norm": 5.1825947761535645, + "learning_rate": 1.6070016939582156e-05, + "loss": 0.7794, + "step": 10440 + }, + { + "epoch": 1.97, + "grad_norm": 13.626837730407715, + "learning_rate": 1.606625258799172e-05, + "loss": 0.8118, + "step": 10450 + }, + { + "epoch": 1.97, + "grad_norm": 20.532129287719727, + "learning_rate": 1.6062488236401283e-05, + "loss": 1.0943, + "step": 10460 + }, + { + "epoch": 1.97, + "grad_norm": 23.677160263061523, + "learning_rate": 1.6058723884810843e-05, + "loss": 1.4279, + "step": 10470 + }, + { + "epoch": 1.97, + "grad_norm": 30.627092361450195, + "learning_rate": 1.6054959533220406e-05, + "loss": 0.798, + "step": 10480 + }, + { + "epoch": 1.97, + "grad_norm": 11.574738502502441, + "learning_rate": 1.6051195181629965e-05, + "loss": 0.7298, + "step": 10490 + }, + { + "epoch": 1.98, + "grad_norm": 20.059284210205078, + "learning_rate": 1.604743083003953e-05, + "loss": 1.1131, + "step": 10500 + }, + { + "epoch": 1.98, + "grad_norm": 14.673868179321289, + "learning_rate": 1.604366647844909e-05, + "loss": 0.8992, + "step": 10510 + }, + { + "epoch": 1.98, + "grad_norm": 6.70554256439209, + "learning_rate": 1.603990212685865e-05, + "loss": 0.9923, + "step": 10520 + }, + { + "epoch": 1.98, + "grad_norm": 22.268571853637695, + "learning_rate": 1.603613777526821e-05, + "loss": 0.9369, + "step": 10530 + }, + { + "epoch": 1.98, + "grad_norm": 7.241332530975342, + "learning_rate": 1.603237342367777e-05, + "loss": 0.918, + "step": 10540 + }, + { + "epoch": 1.99, + "grad_norm": 8.223257064819336, + "learning_rate": 1.6028609072087334e-05, + "loss": 0.7295, + "step": 10550 + }, + { + "epoch": 1.99, + "grad_norm": 14.106549263000488, + "learning_rate": 1.6024844720496894e-05, + "loss": 0.9356, + "step": 10560 + }, + { + "epoch": 1.99, + "grad_norm": 5.024641513824463, + "learning_rate": 1.6021080368906457e-05, + "loss": 1.0827, + "step": 10570 + }, + { + "epoch": 1.99, + "grad_norm": 9.588906288146973, + "learning_rate": 1.6017316017316017e-05, + "loss": 0.9986, + "step": 10580 + }, + { + "epoch": 1.99, + "grad_norm": 13.386589050292969, + "learning_rate": 1.601355166572558e-05, + "loss": 0.801, + "step": 10590 + }, + { + "epoch": 2.0, + "grad_norm": 14.955891609191895, + "learning_rate": 1.600978731413514e-05, + "loss": 0.611, + "step": 10600 + }, + { + "epoch": 2.0, + "grad_norm": 26.955785751342773, + "learning_rate": 1.6006022962544703e-05, + "loss": 0.8322, + "step": 10610 + }, + { + "epoch": 2.0, + "grad_norm": 4.895423889160156, + "learning_rate": 1.6002258610954263e-05, + "loss": 0.6479, + "step": 10620 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9004, + "eval_loss": 0.43774810433387756, + "eval_runtime": 33.2538, + "eval_samples_per_second": 225.538, + "eval_steps_per_second": 28.207, + "step": 10626 + }, + { + "epoch": 2.0, + "grad_norm": 8.223067283630371, + "learning_rate": 1.5998494259363826e-05, + "loss": 0.8513, + "step": 10630 + }, + { + "epoch": 2.0, + "grad_norm": 6.11199951171875, + "learning_rate": 1.599472990777339e-05, + "loss": 0.8083, + "step": 10640 + }, + { + "epoch": 2.0, + "grad_norm": 7.889083385467529, + "learning_rate": 1.599096555618295e-05, + "loss": 0.6345, + "step": 10650 + }, + { + "epoch": 2.01, + "grad_norm": 0.47759780287742615, + "learning_rate": 1.5987201204592512e-05, + "loss": 0.7421, + "step": 10660 + }, + { + "epoch": 2.01, + "grad_norm": 7.3583879470825195, + "learning_rate": 1.5983436853002072e-05, + "loss": 0.791, + "step": 10670 + }, + { + "epoch": 2.01, + "grad_norm": 14.362602233886719, + "learning_rate": 1.5979672501411635e-05, + "loss": 0.7772, + "step": 10680 + }, + { + "epoch": 2.01, + "grad_norm": 14.467147827148438, + "learning_rate": 1.5975908149821195e-05, + "loss": 0.8084, + "step": 10690 + }, + { + "epoch": 2.01, + "grad_norm": 27.702394485473633, + "learning_rate": 1.5972143798230758e-05, + "loss": 1.2076, + "step": 10700 + }, + { + "epoch": 2.02, + "grad_norm": 29.06777572631836, + "learning_rate": 1.5968379446640318e-05, + "loss": 0.9421, + "step": 10710 + }, + { + "epoch": 2.02, + "grad_norm": 17.13526725769043, + "learning_rate": 1.5964615095049877e-05, + "loss": 0.7547, + "step": 10720 + }, + { + "epoch": 2.02, + "grad_norm": 5.489052772521973, + "learning_rate": 1.596085074345944e-05, + "loss": 0.6932, + "step": 10730 + }, + { + "epoch": 2.02, + "grad_norm": 18.08340072631836, + "learning_rate": 1.5957086391869e-05, + "loss": 0.9143, + "step": 10740 + }, + { + "epoch": 2.02, + "grad_norm": 7.827996730804443, + "learning_rate": 1.5953322040278563e-05, + "loss": 0.7774, + "step": 10750 + }, + { + "epoch": 2.03, + "grad_norm": 29.571741104125977, + "learning_rate": 1.5949557688688123e-05, + "loss": 0.7913, + "step": 10760 + }, + { + "epoch": 2.03, + "grad_norm": 12.85148811340332, + "learning_rate": 1.5945793337097686e-05, + "loss": 0.5545, + "step": 10770 + }, + { + "epoch": 2.03, + "grad_norm": 4.041319847106934, + "learning_rate": 1.5942028985507246e-05, + "loss": 0.7221, + "step": 10780 + }, + { + "epoch": 2.03, + "grad_norm": 13.464383125305176, + "learning_rate": 1.593826463391681e-05, + "loss": 1.0252, + "step": 10790 + }, + { + "epoch": 2.03, + "grad_norm": 267.48870849609375, + "learning_rate": 1.593450028232637e-05, + "loss": 1.032, + "step": 10800 + }, + { + "epoch": 2.03, + "grad_norm": 5.915716648101807, + "learning_rate": 1.5930735930735932e-05, + "loss": 0.6537, + "step": 10810 + }, + { + "epoch": 2.04, + "grad_norm": 3.0029585361480713, + "learning_rate": 1.5926971579145495e-05, + "loss": 0.8683, + "step": 10820 + }, + { + "epoch": 2.04, + "grad_norm": 51.99099349975586, + "learning_rate": 1.5923207227555055e-05, + "loss": 0.714, + "step": 10830 + }, + { + "epoch": 2.04, + "grad_norm": 18.225906372070312, + "learning_rate": 1.5919442875964618e-05, + "loss": 0.6733, + "step": 10840 + }, + { + "epoch": 2.04, + "grad_norm": 14.735111236572266, + "learning_rate": 1.5915678524374178e-05, + "loss": 0.7899, + "step": 10850 + }, + { + "epoch": 2.04, + "grad_norm": 0.9460431337356567, + "learning_rate": 1.591191417278374e-05, + "loss": 0.697, + "step": 10860 + }, + { + "epoch": 2.05, + "grad_norm": 5.448774814605713, + "learning_rate": 1.59081498211933e-05, + "loss": 0.699, + "step": 10870 + }, + { + "epoch": 2.05, + "grad_norm": 12.799127578735352, + "learning_rate": 1.5904385469602864e-05, + "loss": 1.0552, + "step": 10880 + }, + { + "epoch": 2.05, + "grad_norm": 44.27261734008789, + "learning_rate": 1.5900621118012424e-05, + "loss": 0.6706, + "step": 10890 + }, + { + "epoch": 2.05, + "grad_norm": 23.952608108520508, + "learning_rate": 1.5896856766421984e-05, + "loss": 1.0494, + "step": 10900 + }, + { + "epoch": 2.05, + "grad_norm": 29.93094825744629, + "learning_rate": 1.5893092414831547e-05, + "loss": 0.7551, + "step": 10910 + }, + { + "epoch": 2.06, + "grad_norm": 12.698896408081055, + "learning_rate": 1.5889328063241107e-05, + "loss": 0.8114, + "step": 10920 + }, + { + "epoch": 2.06, + "grad_norm": 6.4680867195129395, + "learning_rate": 1.588556371165067e-05, + "loss": 0.6495, + "step": 10930 + }, + { + "epoch": 2.06, + "grad_norm": 20.92882537841797, + "learning_rate": 1.588179936006023e-05, + "loss": 0.5763, + "step": 10940 + }, + { + "epoch": 2.06, + "grad_norm": 16.703866958618164, + "learning_rate": 1.5878035008469793e-05, + "loss": 0.7678, + "step": 10950 + }, + { + "epoch": 2.06, + "grad_norm": 26.071792602539062, + "learning_rate": 1.5874270656879352e-05, + "loss": 0.7934, + "step": 10960 + }, + { + "epoch": 2.06, + "grad_norm": 8.398687362670898, + "learning_rate": 1.5870506305288916e-05, + "loss": 0.6339, + "step": 10970 + }, + { + "epoch": 2.07, + "grad_norm": 7.431175231933594, + "learning_rate": 1.5866741953698475e-05, + "loss": 0.4395, + "step": 10980 + }, + { + "epoch": 2.07, + "grad_norm": 14.827509880065918, + "learning_rate": 1.586297760210804e-05, + "loss": 0.6793, + "step": 10990 + }, + { + "epoch": 2.07, + "grad_norm": 3.574143409729004, + "learning_rate": 1.58592132505176e-05, + "loss": 1.0553, + "step": 11000 + }, + { + "epoch": 2.07, + "grad_norm": 11.981563568115234, + "learning_rate": 1.585544889892716e-05, + "loss": 1.0152, + "step": 11010 + }, + { + "epoch": 2.07, + "grad_norm": 33.9033088684082, + "learning_rate": 1.5851684547336725e-05, + "loss": 0.8337, + "step": 11020 + }, + { + "epoch": 2.08, + "grad_norm": 3.125753879547119, + "learning_rate": 1.5847920195746284e-05, + "loss": 0.6738, + "step": 11030 + }, + { + "epoch": 2.08, + "grad_norm": 16.212665557861328, + "learning_rate": 1.5844155844155847e-05, + "loss": 0.7257, + "step": 11040 + }, + { + "epoch": 2.08, + "grad_norm": 8.9789457321167, + "learning_rate": 1.5840391492565407e-05, + "loss": 0.8129, + "step": 11050 + }, + { + "epoch": 2.08, + "grad_norm": 27.197084426879883, + "learning_rate": 1.583662714097497e-05, + "loss": 0.8489, + "step": 11060 + }, + { + "epoch": 2.08, + "grad_norm": 9.654715538024902, + "learning_rate": 1.583286278938453e-05, + "loss": 1.1031, + "step": 11070 + }, + { + "epoch": 2.09, + "grad_norm": 7.9157514572143555, + "learning_rate": 1.582909843779409e-05, + "loss": 0.649, + "step": 11080 + }, + { + "epoch": 2.09, + "grad_norm": 26.433883666992188, + "learning_rate": 1.5825334086203653e-05, + "loss": 1.0084, + "step": 11090 + }, + { + "epoch": 2.09, + "grad_norm": 2.050842046737671, + "learning_rate": 1.5821569734613213e-05, + "loss": 0.7625, + "step": 11100 + }, + { + "epoch": 2.09, + "grad_norm": 22.36996078491211, + "learning_rate": 1.5817805383022776e-05, + "loss": 0.877, + "step": 11110 + }, + { + "epoch": 2.09, + "grad_norm": 1.3118711709976196, + "learning_rate": 1.5814041031432336e-05, + "loss": 0.5775, + "step": 11120 + }, + { + "epoch": 2.09, + "grad_norm": 17.231550216674805, + "learning_rate": 1.58102766798419e-05, + "loss": 1.0835, + "step": 11130 + }, + { + "epoch": 2.1, + "grad_norm": 8.116622924804688, + "learning_rate": 1.580651232825146e-05, + "loss": 0.7954, + "step": 11140 + }, + { + "epoch": 2.1, + "grad_norm": 10.011574745178223, + "learning_rate": 1.5802747976661022e-05, + "loss": 0.817, + "step": 11150 + }, + { + "epoch": 2.1, + "grad_norm": 17.308115005493164, + "learning_rate": 1.579898362507058e-05, + "loss": 0.4949, + "step": 11160 + }, + { + "epoch": 2.1, + "grad_norm": 15.126243591308594, + "learning_rate": 1.5795219273480145e-05, + "loss": 0.5666, + "step": 11170 + }, + { + "epoch": 2.1, + "grad_norm": 8.26230525970459, + "learning_rate": 1.5791454921889705e-05, + "loss": 0.5484, + "step": 11180 + }, + { + "epoch": 2.11, + "grad_norm": 2.2589051723480225, + "learning_rate": 1.5787690570299268e-05, + "loss": 0.8625, + "step": 11190 + }, + { + "epoch": 2.11, + "grad_norm": 18.615032196044922, + "learning_rate": 1.578392621870883e-05, + "loss": 1.1054, + "step": 11200 + }, + { + "epoch": 2.11, + "grad_norm": 14.837967872619629, + "learning_rate": 1.578016186711839e-05, + "loss": 0.9737, + "step": 11210 + }, + { + "epoch": 2.11, + "grad_norm": 11.36581802368164, + "learning_rate": 1.5776397515527954e-05, + "loss": 0.77, + "step": 11220 + }, + { + "epoch": 2.11, + "grad_norm": 7.043651103973389, + "learning_rate": 1.5772633163937514e-05, + "loss": 0.7536, + "step": 11230 + }, + { + "epoch": 2.12, + "grad_norm": 18.54761505126953, + "learning_rate": 1.5768868812347073e-05, + "loss": 1.0175, + "step": 11240 + }, + { + "epoch": 2.12, + "grad_norm": 7.881554126739502, + "learning_rate": 1.5765104460756636e-05, + "loss": 0.6701, + "step": 11250 + }, + { + "epoch": 2.12, + "grad_norm": 18.58554458618164, + "learning_rate": 1.5761340109166196e-05, + "loss": 1.0165, + "step": 11260 + }, + { + "epoch": 2.12, + "grad_norm": 6.169960021972656, + "learning_rate": 1.575757575757576e-05, + "loss": 0.7932, + "step": 11270 + }, + { + "epoch": 2.12, + "grad_norm": 13.55213737487793, + "learning_rate": 1.575381140598532e-05, + "loss": 0.6286, + "step": 11280 + }, + { + "epoch": 2.12, + "grad_norm": 26.40308952331543, + "learning_rate": 1.5750047054394882e-05, + "loss": 1.0074, + "step": 11290 + }, + { + "epoch": 2.13, + "grad_norm": 14.90792465209961, + "learning_rate": 1.5746282702804442e-05, + "loss": 0.7594, + "step": 11300 + }, + { + "epoch": 2.13, + "grad_norm": 25.512937545776367, + "learning_rate": 1.5742518351214005e-05, + "loss": 0.8933, + "step": 11310 + }, + { + "epoch": 2.13, + "grad_norm": 20.684497833251953, + "learning_rate": 1.5738753999623565e-05, + "loss": 0.8769, + "step": 11320 + }, + { + "epoch": 2.13, + "grad_norm": 26.62982940673828, + "learning_rate": 1.5734989648033128e-05, + "loss": 1.0551, + "step": 11330 + }, + { + "epoch": 2.13, + "grad_norm": 12.02119255065918, + "learning_rate": 1.5731225296442688e-05, + "loss": 0.8013, + "step": 11340 + }, + { + "epoch": 2.14, + "grad_norm": 4.007152080535889, + "learning_rate": 1.572746094485225e-05, + "loss": 0.7307, + "step": 11350 + }, + { + "epoch": 2.14, + "grad_norm": 11.850004196166992, + "learning_rate": 1.572369659326181e-05, + "loss": 0.7143, + "step": 11360 + }, + { + "epoch": 2.14, + "grad_norm": 11.220576286315918, + "learning_rate": 1.5719932241671374e-05, + "loss": 0.7637, + "step": 11370 + }, + { + "epoch": 2.14, + "grad_norm": 3.0567941665649414, + "learning_rate": 1.5716167890080937e-05, + "loss": 0.9187, + "step": 11380 + }, + { + "epoch": 2.14, + "grad_norm": 24.867557525634766, + "learning_rate": 1.5712403538490497e-05, + "loss": 0.86, + "step": 11390 + }, + { + "epoch": 2.15, + "grad_norm": 13.429787635803223, + "learning_rate": 1.570863918690006e-05, + "loss": 0.919, + "step": 11400 + }, + { + "epoch": 2.15, + "grad_norm": 1.4714537858963013, + "learning_rate": 1.5704874835309616e-05, + "loss": 0.8753, + "step": 11410 + }, + { + "epoch": 2.15, + "grad_norm": 26.924331665039062, + "learning_rate": 1.570111048371918e-05, + "loss": 0.61, + "step": 11420 + }, + { + "epoch": 2.15, + "grad_norm": 19.854873657226562, + "learning_rate": 1.5697346132128743e-05, + "loss": 0.765, + "step": 11430 + }, + { + "epoch": 2.15, + "grad_norm": 10.933598518371582, + "learning_rate": 1.5693581780538303e-05, + "loss": 0.9729, + "step": 11440 + }, + { + "epoch": 2.16, + "grad_norm": 12.186927795410156, + "learning_rate": 1.5689817428947866e-05, + "loss": 1.1194, + "step": 11450 + }, + { + "epoch": 2.16, + "grad_norm": 15.089810371398926, + "learning_rate": 1.5686053077357425e-05, + "loss": 0.979, + "step": 11460 + }, + { + "epoch": 2.16, + "grad_norm": 34.98917007446289, + "learning_rate": 1.568228872576699e-05, + "loss": 0.7631, + "step": 11470 + }, + { + "epoch": 2.16, + "grad_norm": 7.224375247955322, + "learning_rate": 1.567852437417655e-05, + "loss": 0.7461, + "step": 11480 + }, + { + "epoch": 2.16, + "grad_norm": 3.452284812927246, + "learning_rate": 1.567476002258611e-05, + "loss": 0.6214, + "step": 11490 + }, + { + "epoch": 2.16, + "grad_norm": 12.092110633850098, + "learning_rate": 1.567099567099567e-05, + "loss": 0.6558, + "step": 11500 + }, + { + "epoch": 2.17, + "grad_norm": 10.905387878417969, + "learning_rate": 1.5667231319405234e-05, + "loss": 0.6047, + "step": 11510 + }, + { + "epoch": 2.17, + "grad_norm": 10.892648696899414, + "learning_rate": 1.5663466967814794e-05, + "loss": 0.8369, + "step": 11520 + }, + { + "epoch": 2.17, + "grad_norm": 2.8297064304351807, + "learning_rate": 1.5659702616224357e-05, + "loss": 0.6804, + "step": 11530 + }, + { + "epoch": 2.17, + "grad_norm": 6.064371585845947, + "learning_rate": 1.5655938264633917e-05, + "loss": 0.6657, + "step": 11540 + }, + { + "epoch": 2.17, + "grad_norm": 8.448015213012695, + "learning_rate": 1.565217391304348e-05, + "loss": 0.6027, + "step": 11550 + }, + { + "epoch": 2.18, + "grad_norm": 39.2706413269043, + "learning_rate": 1.5648409561453043e-05, + "loss": 0.7446, + "step": 11560 + }, + { + "epoch": 2.18, + "grad_norm": 17.710491180419922, + "learning_rate": 1.5644645209862603e-05, + "loss": 0.9606, + "step": 11570 + }, + { + "epoch": 2.18, + "grad_norm": 8.026297569274902, + "learning_rate": 1.5640880858272166e-05, + "loss": 0.6021, + "step": 11580 + }, + { + "epoch": 2.18, + "grad_norm": 5.3070855140686035, + "learning_rate": 1.5637116506681723e-05, + "loss": 0.5271, + "step": 11590 + }, + { + "epoch": 2.18, + "grad_norm": 10.307164192199707, + "learning_rate": 1.5633352155091286e-05, + "loss": 0.6096, + "step": 11600 + }, + { + "epoch": 2.19, + "grad_norm": 5.217769145965576, + "learning_rate": 1.562958780350085e-05, + "loss": 0.4626, + "step": 11610 + }, + { + "epoch": 2.19, + "grad_norm": 33.302310943603516, + "learning_rate": 1.562582345191041e-05, + "loss": 0.5173, + "step": 11620 + }, + { + "epoch": 2.19, + "grad_norm": 5.040510654449463, + "learning_rate": 1.5622059100319972e-05, + "loss": 0.9346, + "step": 11630 + }, + { + "epoch": 2.19, + "grad_norm": 30.57442855834961, + "learning_rate": 1.5618294748729532e-05, + "loss": 0.8653, + "step": 11640 + }, + { + "epoch": 2.19, + "grad_norm": 4.821202754974365, + "learning_rate": 1.5614530397139095e-05, + "loss": 0.8027, + "step": 11650 + }, + { + "epoch": 2.19, + "grad_norm": 3.6210570335388184, + "learning_rate": 1.5610766045548655e-05, + "loss": 0.7766, + "step": 11660 + }, + { + "epoch": 2.2, + "grad_norm": 5.999725818634033, + "learning_rate": 1.5607001693958218e-05, + "loss": 0.9771, + "step": 11670 + }, + { + "epoch": 2.2, + "grad_norm": 9.624933242797852, + "learning_rate": 1.5603237342367778e-05, + "loss": 0.5936, + "step": 11680 + }, + { + "epoch": 2.2, + "grad_norm": 10.115823745727539, + "learning_rate": 1.559947299077734e-05, + "loss": 1.1893, + "step": 11690 + }, + { + "epoch": 2.2, + "grad_norm": 7.536423683166504, + "learning_rate": 1.55957086391869e-05, + "loss": 0.8263, + "step": 11700 + }, + { + "epoch": 2.2, + "grad_norm": 15.722282409667969, + "learning_rate": 1.5591944287596464e-05, + "loss": 0.5882, + "step": 11710 + }, + { + "epoch": 2.21, + "grad_norm": 22.392576217651367, + "learning_rate": 1.5588179936006023e-05, + "loss": 0.6934, + "step": 11720 + }, + { + "epoch": 2.21, + "grad_norm": 28.885343551635742, + "learning_rate": 1.5584415584415587e-05, + "loss": 1.0459, + "step": 11730 + }, + { + "epoch": 2.21, + "grad_norm": 20.01675796508789, + "learning_rate": 1.5580651232825146e-05, + "loss": 0.7343, + "step": 11740 + }, + { + "epoch": 2.21, + "grad_norm": 3.547024726867676, + "learning_rate": 1.557688688123471e-05, + "loss": 0.8088, + "step": 11750 + }, + { + "epoch": 2.21, + "grad_norm": 7.1423726081848145, + "learning_rate": 1.557312252964427e-05, + "loss": 0.5475, + "step": 11760 + }, + { + "epoch": 2.22, + "grad_norm": 29.471176147460938, + "learning_rate": 1.556935817805383e-05, + "loss": 0.8382, + "step": 11770 + }, + { + "epoch": 2.22, + "grad_norm": 16.62720489501953, + "learning_rate": 1.5565593826463392e-05, + "loss": 0.7917, + "step": 11780 + }, + { + "epoch": 2.22, + "grad_norm": 12.257611274719238, + "learning_rate": 1.5561829474872955e-05, + "loss": 0.7627, + "step": 11790 + }, + { + "epoch": 2.22, + "grad_norm": 16.232980728149414, + "learning_rate": 1.5558065123282515e-05, + "loss": 0.5723, + "step": 11800 + }, + { + "epoch": 2.22, + "grad_norm": 15.463409423828125, + "learning_rate": 1.5554300771692078e-05, + "loss": 0.6709, + "step": 11810 + }, + { + "epoch": 2.22, + "grad_norm": 11.793865203857422, + "learning_rate": 1.5550536420101638e-05, + "loss": 0.6177, + "step": 11820 + }, + { + "epoch": 2.23, + "grad_norm": 23.931665420532227, + "learning_rate": 1.55467720685112e-05, + "loss": 0.8577, + "step": 11830 + }, + { + "epoch": 2.23, + "grad_norm": 12.713443756103516, + "learning_rate": 1.554300771692076e-05, + "loss": 0.7765, + "step": 11840 + }, + { + "epoch": 2.23, + "grad_norm": 19.967254638671875, + "learning_rate": 1.5539243365330324e-05, + "loss": 1.0167, + "step": 11850 + }, + { + "epoch": 2.23, + "grad_norm": 15.038551330566406, + "learning_rate": 1.5535479013739884e-05, + "loss": 1.0065, + "step": 11860 + }, + { + "epoch": 2.23, + "grad_norm": 11.947111129760742, + "learning_rate": 1.5531714662149447e-05, + "loss": 0.7426, + "step": 11870 + }, + { + "epoch": 2.24, + "grad_norm": 23.100839614868164, + "learning_rate": 1.5527950310559007e-05, + "loss": 1.2186, + "step": 11880 + }, + { + "epoch": 2.24, + "grad_norm": 18.139780044555664, + "learning_rate": 1.552418595896857e-05, + "loss": 0.5974, + "step": 11890 + }, + { + "epoch": 2.24, + "grad_norm": 12.937055587768555, + "learning_rate": 1.552042160737813e-05, + "loss": 1.0542, + "step": 11900 + }, + { + "epoch": 2.24, + "grad_norm": 25.850690841674805, + "learning_rate": 1.5516657255787693e-05, + "loss": 0.6753, + "step": 11910 + }, + { + "epoch": 2.24, + "grad_norm": 11.147032737731934, + "learning_rate": 1.5512892904197253e-05, + "loss": 0.8321, + "step": 11920 + }, + { + "epoch": 2.25, + "grad_norm": 16.01250648498535, + "learning_rate": 1.5509128552606816e-05, + "loss": 0.6835, + "step": 11930 + }, + { + "epoch": 2.25, + "grad_norm": 14.47998332977295, + "learning_rate": 1.5505364201016376e-05, + "loss": 0.4995, + "step": 11940 + }, + { + "epoch": 2.25, + "grad_norm": 29.964704513549805, + "learning_rate": 1.5501599849425935e-05, + "loss": 0.6967, + "step": 11950 + }, + { + "epoch": 2.25, + "grad_norm": 17.843643188476562, + "learning_rate": 1.54978354978355e-05, + "loss": 0.8046, + "step": 11960 + }, + { + "epoch": 2.25, + "grad_norm": 10.963820457458496, + "learning_rate": 1.5494071146245058e-05, + "loss": 0.8361, + "step": 11970 + }, + { + "epoch": 2.25, + "grad_norm": 15.843602180480957, + "learning_rate": 1.549030679465462e-05, + "loss": 0.6498, + "step": 11980 + }, + { + "epoch": 2.26, + "grad_norm": 0.7258365154266357, + "learning_rate": 1.5486542443064185e-05, + "loss": 0.8831, + "step": 11990 + }, + { + "epoch": 2.26, + "grad_norm": 1.571528434753418, + "learning_rate": 1.5482778091473744e-05, + "loss": 0.8198, + "step": 12000 + }, + { + "epoch": 2.26, + "grad_norm": 8.469624519348145, + "learning_rate": 1.5479013739883307e-05, + "loss": 0.7245, + "step": 12010 + }, + { + "epoch": 2.26, + "grad_norm": 3.154909133911133, + "learning_rate": 1.5475249388292867e-05, + "loss": 0.7674, + "step": 12020 + }, + { + "epoch": 2.26, + "grad_norm": 21.369901657104492, + "learning_rate": 1.547148503670243e-05, + "loss": 0.9809, + "step": 12030 + }, + { + "epoch": 2.27, + "grad_norm": 31.945472717285156, + "learning_rate": 1.546772068511199e-05, + "loss": 0.7597, + "step": 12040 + }, + { + "epoch": 2.27, + "grad_norm": 25.05544662475586, + "learning_rate": 1.5463956333521553e-05, + "loss": 0.5625, + "step": 12050 + }, + { + "epoch": 2.27, + "grad_norm": 21.71693992614746, + "learning_rate": 1.5460191981931113e-05, + "loss": 0.5271, + "step": 12060 + }, + { + "epoch": 2.27, + "grad_norm": 24.646568298339844, + "learning_rate": 1.5456427630340676e-05, + "loss": 0.7604, + "step": 12070 + }, + { + "epoch": 2.27, + "grad_norm": 3.2769088745117188, + "learning_rate": 1.5452663278750236e-05, + "loss": 0.7491, + "step": 12080 + }, + { + "epoch": 2.28, + "grad_norm": 15.733813285827637, + "learning_rate": 1.54488989271598e-05, + "loss": 0.9759, + "step": 12090 + }, + { + "epoch": 2.28, + "grad_norm": 17.16128158569336, + "learning_rate": 1.544513457556936e-05, + "loss": 0.8524, + "step": 12100 + }, + { + "epoch": 2.28, + "grad_norm": 11.384461402893066, + "learning_rate": 1.544137022397892e-05, + "loss": 0.7966, + "step": 12110 + }, + { + "epoch": 2.28, + "grad_norm": 13.934250831604004, + "learning_rate": 1.5437605872388482e-05, + "loss": 0.8525, + "step": 12120 + }, + { + "epoch": 2.28, + "grad_norm": 11.015745162963867, + "learning_rate": 1.543384152079804e-05, + "loss": 0.617, + "step": 12130 + }, + { + "epoch": 2.28, + "grad_norm": 31.421207427978516, + "learning_rate": 1.5430077169207605e-05, + "loss": 0.4724, + "step": 12140 + }, + { + "epoch": 2.29, + "grad_norm": 27.373411178588867, + "learning_rate": 1.5426312817617165e-05, + "loss": 0.6447, + "step": 12150 + }, + { + "epoch": 2.29, + "grad_norm": 4.831554889678955, + "learning_rate": 1.5422548466026728e-05, + "loss": 0.7196, + "step": 12160 + }, + { + "epoch": 2.29, + "grad_norm": 17.245033264160156, + "learning_rate": 1.541878411443629e-05, + "loss": 0.9213, + "step": 12170 + }, + { + "epoch": 2.29, + "grad_norm": 12.337189674377441, + "learning_rate": 1.541501976284585e-05, + "loss": 0.8848, + "step": 12180 + }, + { + "epoch": 2.29, + "grad_norm": 14.200904846191406, + "learning_rate": 1.5411255411255414e-05, + "loss": 0.9418, + "step": 12190 + }, + { + "epoch": 2.3, + "grad_norm": 25.789546966552734, + "learning_rate": 1.5407491059664974e-05, + "loss": 0.9909, + "step": 12200 + }, + { + "epoch": 2.3, + "grad_norm": 67.17901611328125, + "learning_rate": 1.5403726708074537e-05, + "loss": 0.7414, + "step": 12210 + }, + { + "epoch": 2.3, + "grad_norm": 11.221137046813965, + "learning_rate": 1.5399962356484096e-05, + "loss": 1.1104, + "step": 12220 + }, + { + "epoch": 2.3, + "grad_norm": 13.688097953796387, + "learning_rate": 1.539619800489366e-05, + "loss": 0.6626, + "step": 12230 + }, + { + "epoch": 2.3, + "grad_norm": 16.81260871887207, + "learning_rate": 1.539243365330322e-05, + "loss": 0.6778, + "step": 12240 + }, + { + "epoch": 2.31, + "grad_norm": 7.933690071105957, + "learning_rate": 1.5388669301712783e-05, + "loss": 0.913, + "step": 12250 + }, + { + "epoch": 2.31, + "grad_norm": 30.486112594604492, + "learning_rate": 1.5384904950122342e-05, + "loss": 1.057, + "step": 12260 + }, + { + "epoch": 2.31, + "grad_norm": 19.049509048461914, + "learning_rate": 1.5381140598531905e-05, + "loss": 0.7562, + "step": 12270 + }, + { + "epoch": 2.31, + "grad_norm": 21.68576431274414, + "learning_rate": 1.5377376246941465e-05, + "loss": 0.696, + "step": 12280 + }, + { + "epoch": 2.31, + "grad_norm": 21.92389678955078, + "learning_rate": 1.5373611895351025e-05, + "loss": 0.845, + "step": 12290 + }, + { + "epoch": 2.32, + "grad_norm": 11.414402961730957, + "learning_rate": 1.5369847543760588e-05, + "loss": 0.7288, + "step": 12300 + }, + { + "epoch": 2.32, + "grad_norm": 12.205060005187988, + "learning_rate": 1.5366083192170148e-05, + "loss": 0.7567, + "step": 12310 + }, + { + "epoch": 2.32, + "grad_norm": 21.142671585083008, + "learning_rate": 1.536231884057971e-05, + "loss": 0.6949, + "step": 12320 + }, + { + "epoch": 2.32, + "grad_norm": 3.8780596256256104, + "learning_rate": 1.535855448898927e-05, + "loss": 0.5205, + "step": 12330 + }, + { + "epoch": 2.32, + "grad_norm": 2.610788106918335, + "learning_rate": 1.5354790137398834e-05, + "loss": 0.4817, + "step": 12340 + }, + { + "epoch": 2.32, + "grad_norm": 5.773406505584717, + "learning_rate": 1.5351025785808397e-05, + "loss": 0.9445, + "step": 12350 + }, + { + "epoch": 2.33, + "grad_norm": 4.580637454986572, + "learning_rate": 1.5347261434217957e-05, + "loss": 0.7737, + "step": 12360 + }, + { + "epoch": 2.33, + "grad_norm": 20.938676834106445, + "learning_rate": 1.534349708262752e-05, + "loss": 0.8886, + "step": 12370 + }, + { + "epoch": 2.33, + "grad_norm": 16.42961311340332, + "learning_rate": 1.533973273103708e-05, + "loss": 0.48, + "step": 12380 + }, + { + "epoch": 2.33, + "grad_norm": 4.3253493309021, + "learning_rate": 1.5335968379446643e-05, + "loss": 0.7892, + "step": 12390 + }, + { + "epoch": 2.33, + "grad_norm": 5.634984970092773, + "learning_rate": 1.5332204027856203e-05, + "loss": 0.805, + "step": 12400 + }, + { + "epoch": 2.34, + "grad_norm": 19.899145126342773, + "learning_rate": 1.5328439676265766e-05, + "loss": 0.7257, + "step": 12410 + }, + { + "epoch": 2.34, + "grad_norm": 11.208027839660645, + "learning_rate": 1.5324675324675326e-05, + "loss": 0.732, + "step": 12420 + }, + { + "epoch": 2.34, + "grad_norm": 33.87583541870117, + "learning_rate": 1.532091097308489e-05, + "loss": 0.6432, + "step": 12430 + }, + { + "epoch": 2.34, + "grad_norm": 8.736035346984863, + "learning_rate": 1.531714662149445e-05, + "loss": 0.7492, + "step": 12440 + }, + { + "epoch": 2.34, + "grad_norm": 13.03171443939209, + "learning_rate": 1.5313382269904012e-05, + "loss": 0.9966, + "step": 12450 + }, + { + "epoch": 2.35, + "grad_norm": 9.763032913208008, + "learning_rate": 1.530961791831357e-05, + "loss": 0.8488, + "step": 12460 + }, + { + "epoch": 2.35, + "grad_norm": 0.931615948677063, + "learning_rate": 1.530585356672313e-05, + "loss": 0.6715, + "step": 12470 + }, + { + "epoch": 2.35, + "grad_norm": 12.652198791503906, + "learning_rate": 1.5302089215132694e-05, + "loss": 1.1259, + "step": 12480 + }, + { + "epoch": 2.35, + "grad_norm": 6.345569133758545, + "learning_rate": 1.5298324863542254e-05, + "loss": 0.7731, + "step": 12490 + }, + { + "epoch": 2.35, + "grad_norm": 22.567358016967773, + "learning_rate": 1.5294560511951817e-05, + "loss": 0.6017, + "step": 12500 + }, + { + "epoch": 2.35, + "grad_norm": 3.477397918701172, + "learning_rate": 1.5290796160361377e-05, + "loss": 1.0937, + "step": 12510 + }, + { + "epoch": 2.36, + "grad_norm": 24.150218963623047, + "learning_rate": 1.528703180877094e-05, + "loss": 0.8269, + "step": 12520 + }, + { + "epoch": 2.36, + "grad_norm": 33.526763916015625, + "learning_rate": 1.5283267457180503e-05, + "loss": 0.7596, + "step": 12530 + }, + { + "epoch": 2.36, + "grad_norm": 11.938372611999512, + "learning_rate": 1.5279503105590063e-05, + "loss": 0.7197, + "step": 12540 + }, + { + "epoch": 2.36, + "grad_norm": 0.7802831530570984, + "learning_rate": 1.5275738753999626e-05, + "loss": 0.5031, + "step": 12550 + }, + { + "epoch": 2.36, + "grad_norm": 33.820281982421875, + "learning_rate": 1.5271974402409186e-05, + "loss": 0.7919, + "step": 12560 + }, + { + "epoch": 2.37, + "grad_norm": 16.24278450012207, + "learning_rate": 1.526821005081875e-05, + "loss": 0.5727, + "step": 12570 + }, + { + "epoch": 2.37, + "grad_norm": 8.651189804077148, + "learning_rate": 1.526444569922831e-05, + "loss": 0.6429, + "step": 12580 + }, + { + "epoch": 2.37, + "grad_norm": 42.32835388183594, + "learning_rate": 1.5260681347637872e-05, + "loss": 0.9006, + "step": 12590 + }, + { + "epoch": 2.37, + "grad_norm": 15.94575309753418, + "learning_rate": 1.5256916996047434e-05, + "loss": 0.7776, + "step": 12600 + }, + { + "epoch": 2.37, + "grad_norm": 34.81913375854492, + "learning_rate": 1.5253152644456995e-05, + "loss": 1.2596, + "step": 12610 + }, + { + "epoch": 2.38, + "grad_norm": 5.684762954711914, + "learning_rate": 1.5249388292866557e-05, + "loss": 0.6622, + "step": 12620 + }, + { + "epoch": 2.38, + "grad_norm": 14.016319274902344, + "learning_rate": 1.5245623941276115e-05, + "loss": 0.7527, + "step": 12630 + }, + { + "epoch": 2.38, + "grad_norm": 24.80596351623535, + "learning_rate": 1.5241859589685676e-05, + "loss": 1.1414, + "step": 12640 + }, + { + "epoch": 2.38, + "grad_norm": 11.010673522949219, + "learning_rate": 1.523809523809524e-05, + "loss": 0.6522, + "step": 12650 + }, + { + "epoch": 2.38, + "grad_norm": 11.180171966552734, + "learning_rate": 1.52343308865048e-05, + "loss": 0.4195, + "step": 12660 + }, + { + "epoch": 2.38, + "grad_norm": 18.080059051513672, + "learning_rate": 1.5230566534914362e-05, + "loss": 0.532, + "step": 12670 + }, + { + "epoch": 2.39, + "grad_norm": 16.197202682495117, + "learning_rate": 1.5226802183323924e-05, + "loss": 1.0089, + "step": 12680 + }, + { + "epoch": 2.39, + "grad_norm": 13.255231857299805, + "learning_rate": 1.5223037831733485e-05, + "loss": 0.9259, + "step": 12690 + }, + { + "epoch": 2.39, + "grad_norm": 6.038920879364014, + "learning_rate": 1.5219273480143047e-05, + "loss": 0.7511, + "step": 12700 + }, + { + "epoch": 2.39, + "grad_norm": 6.505653381347656, + "learning_rate": 1.5215509128552608e-05, + "loss": 0.8873, + "step": 12710 + }, + { + "epoch": 2.39, + "grad_norm": 10.185829162597656, + "learning_rate": 1.521174477696217e-05, + "loss": 0.9914, + "step": 12720 + }, + { + "epoch": 2.4, + "grad_norm": 22.103660583496094, + "learning_rate": 1.5207980425371731e-05, + "loss": 1.0945, + "step": 12730 + }, + { + "epoch": 2.4, + "grad_norm": 5.609415531158447, + "learning_rate": 1.5204216073781292e-05, + "loss": 0.7768, + "step": 12740 + }, + { + "epoch": 2.4, + "grad_norm": 21.126070022583008, + "learning_rate": 1.5200451722190854e-05, + "loss": 0.8569, + "step": 12750 + }, + { + "epoch": 2.4, + "grad_norm": 3.481779098510742, + "learning_rate": 1.5196687370600415e-05, + "loss": 0.6686, + "step": 12760 + }, + { + "epoch": 2.4, + "grad_norm": 5.01765775680542, + "learning_rate": 1.5192923019009977e-05, + "loss": 0.6434, + "step": 12770 + }, + { + "epoch": 2.41, + "grad_norm": 8.629416465759277, + "learning_rate": 1.5189158667419538e-05, + "loss": 0.6469, + "step": 12780 + }, + { + "epoch": 2.41, + "grad_norm": 14.828179359436035, + "learning_rate": 1.5185394315829101e-05, + "loss": 0.7293, + "step": 12790 + }, + { + "epoch": 2.41, + "grad_norm": 10.851454734802246, + "learning_rate": 1.5181629964238663e-05, + "loss": 0.7607, + "step": 12800 + }, + { + "epoch": 2.41, + "grad_norm": 22.300085067749023, + "learning_rate": 1.5177865612648221e-05, + "loss": 0.8268, + "step": 12810 + }, + { + "epoch": 2.41, + "grad_norm": 8.352980613708496, + "learning_rate": 1.5174101261057782e-05, + "loss": 0.6191, + "step": 12820 + }, + { + "epoch": 2.41, + "grad_norm": 10.153834342956543, + "learning_rate": 1.5170336909467346e-05, + "loss": 0.8384, + "step": 12830 + }, + { + "epoch": 2.42, + "grad_norm": 9.206123352050781, + "learning_rate": 1.5166572557876907e-05, + "loss": 0.7043, + "step": 12840 + }, + { + "epoch": 2.42, + "grad_norm": 6.315232276916504, + "learning_rate": 1.5162808206286468e-05, + "loss": 0.6646, + "step": 12850 + }, + { + "epoch": 2.42, + "grad_norm": 34.999691009521484, + "learning_rate": 1.515904385469603e-05, + "loss": 0.874, + "step": 12860 + }, + { + "epoch": 2.42, + "grad_norm": 9.736954689025879, + "learning_rate": 1.5155279503105591e-05, + "loss": 1.0045, + "step": 12870 + }, + { + "epoch": 2.42, + "grad_norm": 13.932580947875977, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.8253, + "step": 12880 + }, + { + "epoch": 2.43, + "grad_norm": 34.856239318847656, + "learning_rate": 1.5147750799924714e-05, + "loss": 0.9148, + "step": 12890 + }, + { + "epoch": 2.43, + "grad_norm": 8.915196418762207, + "learning_rate": 1.5143986448334276e-05, + "loss": 0.799, + "step": 12900 + }, + { + "epoch": 2.43, + "grad_norm": 23.154916763305664, + "learning_rate": 1.5140222096743837e-05, + "loss": 0.7314, + "step": 12910 + }, + { + "epoch": 2.43, + "grad_norm": 16.326953887939453, + "learning_rate": 1.5136457745153399e-05, + "loss": 0.7348, + "step": 12920 + }, + { + "epoch": 2.43, + "grad_norm": 25.118789672851562, + "learning_rate": 1.513269339356296e-05, + "loss": 0.6603, + "step": 12930 + }, + { + "epoch": 2.44, + "grad_norm": 24.18308448791504, + "learning_rate": 1.5128929041972522e-05, + "loss": 0.7526, + "step": 12940 + }, + { + "epoch": 2.44, + "grad_norm": 5.591280937194824, + "learning_rate": 1.5125164690382083e-05, + "loss": 0.8457, + "step": 12950 + }, + { + "epoch": 2.44, + "grad_norm": 13.854071617126465, + "learning_rate": 1.5121400338791645e-05, + "loss": 0.7127, + "step": 12960 + }, + { + "epoch": 2.44, + "grad_norm": 1.194824457168579, + "learning_rate": 1.5117635987201206e-05, + "loss": 0.9448, + "step": 12970 + }, + { + "epoch": 2.44, + "grad_norm": 10.191553115844727, + "learning_rate": 1.5113871635610766e-05, + "loss": 0.4985, + "step": 12980 + }, + { + "epoch": 2.44, + "grad_norm": 21.121976852416992, + "learning_rate": 1.5110107284020327e-05, + "loss": 0.5263, + "step": 12990 + }, + { + "epoch": 2.45, + "grad_norm": 15.498823165893555, + "learning_rate": 1.5106342932429889e-05, + "loss": 0.9089, + "step": 13000 + }, + { + "epoch": 2.45, + "grad_norm": 10.255945205688477, + "learning_rate": 1.510257858083945e-05, + "loss": 0.7219, + "step": 13010 + }, + { + "epoch": 2.45, + "grad_norm": 4.906744003295898, + "learning_rate": 1.5098814229249013e-05, + "loss": 0.8106, + "step": 13020 + }, + { + "epoch": 2.45, + "grad_norm": 4.751705169677734, + "learning_rate": 1.5095049877658575e-05, + "loss": 0.742, + "step": 13030 + }, + { + "epoch": 2.45, + "grad_norm": 10.848084449768066, + "learning_rate": 1.5091285526068136e-05, + "loss": 0.6807, + "step": 13040 + }, + { + "epoch": 2.46, + "grad_norm": 31.987140655517578, + "learning_rate": 1.5087521174477698e-05, + "loss": 0.4382, + "step": 13050 + }, + { + "epoch": 2.46, + "grad_norm": 23.37934112548828, + "learning_rate": 1.5083756822887259e-05, + "loss": 0.7205, + "step": 13060 + }, + { + "epoch": 2.46, + "grad_norm": 2.923102855682373, + "learning_rate": 1.507999247129682e-05, + "loss": 0.9366, + "step": 13070 + }, + { + "epoch": 2.46, + "grad_norm": 2.3516485691070557, + "learning_rate": 1.5076228119706382e-05, + "loss": 0.6969, + "step": 13080 + }, + { + "epoch": 2.46, + "grad_norm": 16.754817962646484, + "learning_rate": 1.5072463768115944e-05, + "loss": 0.6468, + "step": 13090 + }, + { + "epoch": 2.47, + "grad_norm": 0.9715334177017212, + "learning_rate": 1.5068699416525505e-05, + "loss": 0.9202, + "step": 13100 + }, + { + "epoch": 2.47, + "grad_norm": 16.75336456298828, + "learning_rate": 1.5064935064935066e-05, + "loss": 0.5497, + "step": 13110 + }, + { + "epoch": 2.47, + "grad_norm": 11.597676277160645, + "learning_rate": 1.5061170713344628e-05, + "loss": 1.031, + "step": 13120 + }, + { + "epoch": 2.47, + "grad_norm": 19.306570053100586, + "learning_rate": 1.505740636175419e-05, + "loss": 0.5285, + "step": 13130 + }, + { + "epoch": 2.47, + "grad_norm": 11.259316444396973, + "learning_rate": 1.505364201016375e-05, + "loss": 0.6111, + "step": 13140 + }, + { + "epoch": 2.48, + "grad_norm": 5.920182704925537, + "learning_rate": 1.5049877658573312e-05, + "loss": 0.9721, + "step": 13150 + }, + { + "epoch": 2.48, + "grad_norm": 4.26685905456543, + "learning_rate": 1.5046113306982872e-05, + "loss": 0.5111, + "step": 13160 + }, + { + "epoch": 2.48, + "grad_norm": 1.7324987649917603, + "learning_rate": 1.5042348955392434e-05, + "loss": 0.3585, + "step": 13170 + }, + { + "epoch": 2.48, + "grad_norm": 12.388806343078613, + "learning_rate": 1.5038584603801995e-05, + "loss": 0.6431, + "step": 13180 + }, + { + "epoch": 2.48, + "grad_norm": 26.238208770751953, + "learning_rate": 1.5034820252211556e-05, + "loss": 0.881, + "step": 13190 + }, + { + "epoch": 2.48, + "grad_norm": 17.09404945373535, + "learning_rate": 1.5031055900621118e-05, + "loss": 0.7259, + "step": 13200 + }, + { + "epoch": 2.49, + "grad_norm": 24.81346321105957, + "learning_rate": 1.5027291549030681e-05, + "loss": 0.8976, + "step": 13210 + }, + { + "epoch": 2.49, + "grad_norm": 18.42681121826172, + "learning_rate": 1.5023527197440243e-05, + "loss": 0.6566, + "step": 13220 + }, + { + "epoch": 2.49, + "grad_norm": 14.247604370117188, + "learning_rate": 1.5019762845849804e-05, + "loss": 0.5223, + "step": 13230 + }, + { + "epoch": 2.49, + "grad_norm": 10.424066543579102, + "learning_rate": 1.5015998494259365e-05, + "loss": 0.7308, + "step": 13240 + }, + { + "epoch": 2.49, + "grad_norm": 0.2744300067424774, + "learning_rate": 1.5012234142668927e-05, + "loss": 0.5581, + "step": 13250 + }, + { + "epoch": 2.5, + "grad_norm": 13.965598106384277, + "learning_rate": 1.5008469791078488e-05, + "loss": 0.6971, + "step": 13260 + }, + { + "epoch": 2.5, + "grad_norm": 15.680059432983398, + "learning_rate": 1.500470543948805e-05, + "loss": 0.823, + "step": 13270 + }, + { + "epoch": 2.5, + "grad_norm": 64.32691955566406, + "learning_rate": 1.5000941087897611e-05, + "loss": 0.7521, + "step": 13280 + }, + { + "epoch": 2.5, + "grad_norm": 17.173147201538086, + "learning_rate": 1.4997176736307173e-05, + "loss": 0.7143, + "step": 13290 + }, + { + "epoch": 2.5, + "grad_norm": 12.266512870788574, + "learning_rate": 1.4993412384716734e-05, + "loss": 0.5146, + "step": 13300 + }, + { + "epoch": 2.51, + "grad_norm": 27.415990829467773, + "learning_rate": 1.4989648033126296e-05, + "loss": 0.6267, + "step": 13310 + }, + { + "epoch": 2.51, + "grad_norm": 38.314414978027344, + "learning_rate": 1.4985883681535857e-05, + "loss": 0.8287, + "step": 13320 + }, + { + "epoch": 2.51, + "grad_norm": 14.848710060119629, + "learning_rate": 1.4982119329945417e-05, + "loss": 0.9447, + "step": 13330 + }, + { + "epoch": 2.51, + "grad_norm": 16.18121910095215, + "learning_rate": 1.4978354978354978e-05, + "loss": 0.8354, + "step": 13340 + }, + { + "epoch": 2.51, + "grad_norm": 18.633420944213867, + "learning_rate": 1.497459062676454e-05, + "loss": 0.5659, + "step": 13350 + }, + { + "epoch": 2.51, + "grad_norm": 8.951250076293945, + "learning_rate": 1.4970826275174101e-05, + "loss": 0.7976, + "step": 13360 + }, + { + "epoch": 2.52, + "grad_norm": 8.146002769470215, + "learning_rate": 1.4967061923583663e-05, + "loss": 0.5475, + "step": 13370 + }, + { + "epoch": 2.52, + "grad_norm": 10.315155982971191, + "learning_rate": 1.4963297571993224e-05, + "loss": 0.7532, + "step": 13380 + }, + { + "epoch": 2.52, + "grad_norm": 6.590341567993164, + "learning_rate": 1.4959533220402787e-05, + "loss": 0.7397, + "step": 13390 + }, + { + "epoch": 2.52, + "grad_norm": 7.777267932891846, + "learning_rate": 1.4955768868812349e-05, + "loss": 0.7557, + "step": 13400 + }, + { + "epoch": 2.52, + "grad_norm": 13.434835433959961, + "learning_rate": 1.495200451722191e-05, + "loss": 0.6464, + "step": 13410 + }, + { + "epoch": 2.53, + "grad_norm": 15.336316108703613, + "learning_rate": 1.4948240165631472e-05, + "loss": 1.0214, + "step": 13420 + }, + { + "epoch": 2.53, + "grad_norm": 9.77111530303955, + "learning_rate": 1.4944475814041033e-05, + "loss": 0.4507, + "step": 13430 + }, + { + "epoch": 2.53, + "grad_norm": 10.342262268066406, + "learning_rate": 1.4940711462450595e-05, + "loss": 0.993, + "step": 13440 + }, + { + "epoch": 2.53, + "grad_norm": 22.338380813598633, + "learning_rate": 1.4936947110860156e-05, + "loss": 1.1721, + "step": 13450 + }, + { + "epoch": 2.53, + "grad_norm": 12.006577491760254, + "learning_rate": 1.4933182759269718e-05, + "loss": 0.7968, + "step": 13460 + }, + { + "epoch": 2.54, + "grad_norm": 27.795074462890625, + "learning_rate": 1.4929418407679279e-05, + "loss": 0.9014, + "step": 13470 + }, + { + "epoch": 2.54, + "grad_norm": 38.55152893066406, + "learning_rate": 1.492565405608884e-05, + "loss": 0.8012, + "step": 13480 + }, + { + "epoch": 2.54, + "grad_norm": 4.785322666168213, + "learning_rate": 1.4921889704498402e-05, + "loss": 0.7915, + "step": 13490 + }, + { + "epoch": 2.54, + "grad_norm": 11.960060119628906, + "learning_rate": 1.4918125352907963e-05, + "loss": 0.949, + "step": 13500 + }, + { + "epoch": 2.54, + "grad_norm": 11.774707794189453, + "learning_rate": 1.4914361001317523e-05, + "loss": 0.5137, + "step": 13510 + }, + { + "epoch": 2.54, + "grad_norm": 10.362529754638672, + "learning_rate": 1.4910596649727085e-05, + "loss": 0.7564, + "step": 13520 + }, + { + "epoch": 2.55, + "grad_norm": 14.903637886047363, + "learning_rate": 1.4906832298136646e-05, + "loss": 1.0717, + "step": 13530 + }, + { + "epoch": 2.55, + "grad_norm": 24.547834396362305, + "learning_rate": 1.4903067946546208e-05, + "loss": 0.675, + "step": 13540 + }, + { + "epoch": 2.55, + "grad_norm": 15.174468040466309, + "learning_rate": 1.4899303594955769e-05, + "loss": 0.7592, + "step": 13550 + }, + { + "epoch": 2.55, + "grad_norm": 5.759738922119141, + "learning_rate": 1.489553924336533e-05, + "loss": 0.8173, + "step": 13560 + }, + { + "epoch": 2.55, + "grad_norm": 30.83299446105957, + "learning_rate": 1.4891774891774892e-05, + "loss": 0.6815, + "step": 13570 + }, + { + "epoch": 2.56, + "grad_norm": 18.987077713012695, + "learning_rate": 1.4888010540184455e-05, + "loss": 0.7051, + "step": 13580 + }, + { + "epoch": 2.56, + "grad_norm": 25.075693130493164, + "learning_rate": 1.4884246188594017e-05, + "loss": 0.9777, + "step": 13590 + }, + { + "epoch": 2.56, + "grad_norm": 12.43553352355957, + "learning_rate": 1.4880481837003578e-05, + "loss": 0.6671, + "step": 13600 + }, + { + "epoch": 2.56, + "grad_norm": 7.164165496826172, + "learning_rate": 1.487671748541314e-05, + "loss": 0.7385, + "step": 13610 + }, + { + "epoch": 2.56, + "grad_norm": 12.529688835144043, + "learning_rate": 1.4872953133822701e-05, + "loss": 0.9938, + "step": 13620 + }, + { + "epoch": 2.57, + "grad_norm": 11.181074142456055, + "learning_rate": 1.4869188782232262e-05, + "loss": 1.0485, + "step": 13630 + }, + { + "epoch": 2.57, + "grad_norm": 15.510049819946289, + "learning_rate": 1.4865424430641824e-05, + "loss": 0.6744, + "step": 13640 + }, + { + "epoch": 2.57, + "grad_norm": 14.156461715698242, + "learning_rate": 1.4861660079051385e-05, + "loss": 0.8552, + "step": 13650 + }, + { + "epoch": 2.57, + "grad_norm": 17.33963394165039, + "learning_rate": 1.4857895727460947e-05, + "loss": 0.9491, + "step": 13660 + }, + { + "epoch": 2.57, + "grad_norm": 0.6199512481689453, + "learning_rate": 1.4854131375870508e-05, + "loss": 0.7211, + "step": 13670 + }, + { + "epoch": 2.57, + "grad_norm": 6.893851280212402, + "learning_rate": 1.4850367024280068e-05, + "loss": 0.717, + "step": 13680 + }, + { + "epoch": 2.58, + "grad_norm": 11.729772567749023, + "learning_rate": 1.484660267268963e-05, + "loss": 0.4342, + "step": 13690 + }, + { + "epoch": 2.58, + "grad_norm": 0.8101961612701416, + "learning_rate": 1.4842838321099191e-05, + "loss": 0.8445, + "step": 13700 + }, + { + "epoch": 2.58, + "grad_norm": 28.97067642211914, + "learning_rate": 1.4839073969508752e-05, + "loss": 0.8899, + "step": 13710 + }, + { + "epoch": 2.58, + "grad_norm": 50.73062515258789, + "learning_rate": 1.4835309617918314e-05, + "loss": 0.7837, + "step": 13720 + }, + { + "epoch": 2.58, + "grad_norm": 15.3909330368042, + "learning_rate": 1.4831545266327875e-05, + "loss": 0.9568, + "step": 13730 + }, + { + "epoch": 2.59, + "grad_norm": 30.514009475708008, + "learning_rate": 1.4827780914737437e-05, + "loss": 0.9959, + "step": 13740 + }, + { + "epoch": 2.59, + "grad_norm": 8.172021865844727, + "learning_rate": 1.4824016563146998e-05, + "loss": 0.7495, + "step": 13750 + }, + { + "epoch": 2.59, + "grad_norm": 20.11758041381836, + "learning_rate": 1.4820252211556561e-05, + "loss": 0.549, + "step": 13760 + }, + { + "epoch": 2.59, + "grad_norm": 0.79815274477005, + "learning_rate": 1.4816487859966123e-05, + "loss": 0.5851, + "step": 13770 + }, + { + "epoch": 2.59, + "grad_norm": 60.54981231689453, + "learning_rate": 1.4812723508375684e-05, + "loss": 0.7835, + "step": 13780 + }, + { + "epoch": 2.6, + "grad_norm": 10.375343322753906, + "learning_rate": 1.4808959156785246e-05, + "loss": 0.9386, + "step": 13790 + }, + { + "epoch": 2.6, + "grad_norm": 11.545114517211914, + "learning_rate": 1.4805194805194807e-05, + "loss": 0.585, + "step": 13800 + }, + { + "epoch": 2.6, + "grad_norm": 18.797042846679688, + "learning_rate": 1.4801430453604369e-05, + "loss": 0.7882, + "step": 13810 + }, + { + "epoch": 2.6, + "grad_norm": 18.879865646362305, + "learning_rate": 1.479766610201393e-05, + "loss": 0.6514, + "step": 13820 + }, + { + "epoch": 2.6, + "grad_norm": 21.993356704711914, + "learning_rate": 1.4793901750423492e-05, + "loss": 0.7005, + "step": 13830 + }, + { + "epoch": 2.6, + "grad_norm": 27.9875431060791, + "learning_rate": 1.4790137398833053e-05, + "loss": 0.9488, + "step": 13840 + }, + { + "epoch": 2.61, + "grad_norm": 11.482915878295898, + "learning_rate": 1.4786373047242613e-05, + "loss": 0.7769, + "step": 13850 + }, + { + "epoch": 2.61, + "grad_norm": 7.644073486328125, + "learning_rate": 1.4782608695652174e-05, + "loss": 0.3767, + "step": 13860 + }, + { + "epoch": 2.61, + "grad_norm": 15.943230628967285, + "learning_rate": 1.4778844344061736e-05, + "loss": 0.7765, + "step": 13870 + }, + { + "epoch": 2.61, + "grad_norm": 23.832786560058594, + "learning_rate": 1.4775079992471297e-05, + "loss": 0.7166, + "step": 13880 + }, + { + "epoch": 2.61, + "grad_norm": 9.894569396972656, + "learning_rate": 1.4771315640880859e-05, + "loss": 0.7431, + "step": 13890 + }, + { + "epoch": 2.62, + "grad_norm": 18.379024505615234, + "learning_rate": 1.476755128929042e-05, + "loss": 0.8358, + "step": 13900 + }, + { + "epoch": 2.62, + "grad_norm": 1.7742643356323242, + "learning_rate": 1.4763786937699982e-05, + "loss": 0.5125, + "step": 13910 + }, + { + "epoch": 2.62, + "grad_norm": 10.236210823059082, + "learning_rate": 1.4760022586109543e-05, + "loss": 0.6742, + "step": 13920 + }, + { + "epoch": 2.62, + "grad_norm": 21.775062561035156, + "learning_rate": 1.4756258234519105e-05, + "loss": 0.5442, + "step": 13930 + }, + { + "epoch": 2.62, + "grad_norm": 12.80639362335205, + "learning_rate": 1.4752493882928666e-05, + "loss": 0.8378, + "step": 13940 + }, + { + "epoch": 2.63, + "grad_norm": 12.556268692016602, + "learning_rate": 1.4748729531338229e-05, + "loss": 0.7747, + "step": 13950 + }, + { + "epoch": 2.63, + "grad_norm": 9.039406776428223, + "learning_rate": 1.474496517974779e-05, + "loss": 0.7077, + "step": 13960 + }, + { + "epoch": 2.63, + "grad_norm": 15.263288497924805, + "learning_rate": 1.4741200828157352e-05, + "loss": 0.6179, + "step": 13970 + }, + { + "epoch": 2.63, + "grad_norm": 18.868026733398438, + "learning_rate": 1.4737436476566914e-05, + "loss": 0.8249, + "step": 13980 + }, + { + "epoch": 2.63, + "grad_norm": 35.89374923706055, + "learning_rate": 1.4733672124976475e-05, + "loss": 0.9226, + "step": 13990 + }, + { + "epoch": 2.64, + "grad_norm": 15.616324424743652, + "learning_rate": 1.4729907773386036e-05, + "loss": 1.0765, + "step": 14000 + }, + { + "epoch": 2.64, + "grad_norm": 1.1768288612365723, + "learning_rate": 1.4726143421795598e-05, + "loss": 0.7274, + "step": 14010 + }, + { + "epoch": 2.64, + "grad_norm": 24.081043243408203, + "learning_rate": 1.472237907020516e-05, + "loss": 0.8053, + "step": 14020 + }, + { + "epoch": 2.64, + "grad_norm": 4.48431396484375, + "learning_rate": 1.4718614718614719e-05, + "loss": 0.5182, + "step": 14030 + }, + { + "epoch": 2.64, + "grad_norm": 15.65145492553711, + "learning_rate": 1.471485036702428e-05, + "loss": 0.4912, + "step": 14040 + }, + { + "epoch": 2.64, + "grad_norm": 7.0879316329956055, + "learning_rate": 1.4711086015433842e-05, + "loss": 0.6764, + "step": 14050 + }, + { + "epoch": 2.65, + "grad_norm": 12.771845817565918, + "learning_rate": 1.4707321663843404e-05, + "loss": 0.9551, + "step": 14060 + }, + { + "epoch": 2.65, + "grad_norm": 14.33671760559082, + "learning_rate": 1.4703557312252965e-05, + "loss": 0.8178, + "step": 14070 + }, + { + "epoch": 2.65, + "grad_norm": 7.756180286407471, + "learning_rate": 1.4699792960662526e-05, + "loss": 0.8128, + "step": 14080 + }, + { + "epoch": 2.65, + "grad_norm": 3.8817169666290283, + "learning_rate": 1.4696028609072088e-05, + "loss": 0.6818, + "step": 14090 + }, + { + "epoch": 2.65, + "grad_norm": 25.685022354125977, + "learning_rate": 1.469226425748165e-05, + "loss": 1.0637, + "step": 14100 + }, + { + "epoch": 2.66, + "grad_norm": 7.975883483886719, + "learning_rate": 1.468849990589121e-05, + "loss": 0.6875, + "step": 14110 + }, + { + "epoch": 2.66, + "grad_norm": 11.329687118530273, + "learning_rate": 1.4684735554300772e-05, + "loss": 0.7874, + "step": 14120 + }, + { + "epoch": 2.66, + "grad_norm": 28.03009033203125, + "learning_rate": 1.4680971202710335e-05, + "loss": 0.3694, + "step": 14130 + }, + { + "epoch": 2.66, + "grad_norm": 15.965397834777832, + "learning_rate": 1.4677206851119897e-05, + "loss": 0.8764, + "step": 14140 + }, + { + "epoch": 2.66, + "grad_norm": 25.158184051513672, + "learning_rate": 1.4673442499529458e-05, + "loss": 0.7737, + "step": 14150 + }, + { + "epoch": 2.67, + "grad_norm": 6.6749725341796875, + "learning_rate": 1.466967814793902e-05, + "loss": 0.587, + "step": 14160 + }, + { + "epoch": 2.67, + "grad_norm": 11.725072860717773, + "learning_rate": 1.4665913796348581e-05, + "loss": 0.7353, + "step": 14170 + }, + { + "epoch": 2.67, + "grad_norm": 17.462539672851562, + "learning_rate": 1.4662149444758143e-05, + "loss": 0.9037, + "step": 14180 + }, + { + "epoch": 2.67, + "grad_norm": 7.446224212646484, + "learning_rate": 1.4658385093167704e-05, + "loss": 0.7478, + "step": 14190 + }, + { + "epoch": 2.67, + "grad_norm": 26.2742919921875, + "learning_rate": 1.4654620741577264e-05, + "loss": 0.8547, + "step": 14200 + }, + { + "epoch": 2.67, + "grad_norm": 26.45722770690918, + "learning_rate": 1.4650856389986825e-05, + "loss": 1.2598, + "step": 14210 + }, + { + "epoch": 2.68, + "grad_norm": 30.59162139892578, + "learning_rate": 1.4647092038396387e-05, + "loss": 0.9227, + "step": 14220 + }, + { + "epoch": 2.68, + "grad_norm": 2.223261594772339, + "learning_rate": 1.4643327686805948e-05, + "loss": 0.6894, + "step": 14230 + }, + { + "epoch": 2.68, + "grad_norm": 12.304634094238281, + "learning_rate": 1.463956333521551e-05, + "loss": 0.938, + "step": 14240 + }, + { + "epoch": 2.68, + "grad_norm": 18.860149383544922, + "learning_rate": 1.4635798983625071e-05, + "loss": 0.7099, + "step": 14250 + }, + { + "epoch": 2.68, + "grad_norm": 26.4725399017334, + "learning_rate": 1.4632034632034633e-05, + "loss": 0.6649, + "step": 14260 + }, + { + "epoch": 2.69, + "grad_norm": 16.648344039916992, + "learning_rate": 1.4628270280444194e-05, + "loss": 0.7711, + "step": 14270 + }, + { + "epoch": 2.69, + "grad_norm": 10.517045974731445, + "learning_rate": 1.4624505928853756e-05, + "loss": 0.9333, + "step": 14280 + }, + { + "epoch": 2.69, + "grad_norm": 11.410792350769043, + "learning_rate": 1.4620741577263317e-05, + "loss": 0.5923, + "step": 14290 + }, + { + "epoch": 2.69, + "grad_norm": 16.357027053833008, + "learning_rate": 1.4616977225672879e-05, + "loss": 0.9859, + "step": 14300 + }, + { + "epoch": 2.69, + "grad_norm": 16.898576736450195, + "learning_rate": 1.461321287408244e-05, + "loss": 0.5869, + "step": 14310 + }, + { + "epoch": 2.7, + "grad_norm": 24.384197235107422, + "learning_rate": 1.4609448522492003e-05, + "loss": 1.1913, + "step": 14320 + }, + { + "epoch": 2.7, + "grad_norm": 19.52752685546875, + "learning_rate": 1.4605684170901565e-05, + "loss": 0.4424, + "step": 14330 + }, + { + "epoch": 2.7, + "grad_norm": 24.645038604736328, + "learning_rate": 1.4601919819311126e-05, + "loss": 0.8484, + "step": 14340 + }, + { + "epoch": 2.7, + "grad_norm": 6.663630485534668, + "learning_rate": 1.4598155467720688e-05, + "loss": 0.8926, + "step": 14350 + }, + { + "epoch": 2.7, + "grad_norm": 36.7334098815918, + "learning_rate": 1.4594391116130249e-05, + "loss": 0.8592, + "step": 14360 + }, + { + "epoch": 2.7, + "grad_norm": 8.041152954101562, + "learning_rate": 1.459062676453981e-05, + "loss": 0.4931, + "step": 14370 + }, + { + "epoch": 2.71, + "grad_norm": 4.520535469055176, + "learning_rate": 1.458686241294937e-05, + "loss": 0.6731, + "step": 14380 + }, + { + "epoch": 2.71, + "grad_norm": 18.924875259399414, + "learning_rate": 1.4583098061358932e-05, + "loss": 0.6853, + "step": 14390 + }, + { + "epoch": 2.71, + "grad_norm": 8.557247161865234, + "learning_rate": 1.4579333709768493e-05, + "loss": 0.7192, + "step": 14400 + }, + { + "epoch": 2.71, + "grad_norm": 14.18856143951416, + "learning_rate": 1.4575569358178055e-05, + "loss": 0.6096, + "step": 14410 + }, + { + "epoch": 2.71, + "grad_norm": 8.3154296875, + "learning_rate": 1.4571805006587616e-05, + "loss": 0.6547, + "step": 14420 + }, + { + "epoch": 2.72, + "grad_norm": 22.346403121948242, + "learning_rate": 1.4568040654997178e-05, + "loss": 0.6901, + "step": 14430 + }, + { + "epoch": 2.72, + "grad_norm": 26.471521377563477, + "learning_rate": 1.4564276303406739e-05, + "loss": 0.8754, + "step": 14440 + }, + { + "epoch": 2.72, + "grad_norm": 18.310527801513672, + "learning_rate": 1.45605119518163e-05, + "loss": 0.6152, + "step": 14450 + }, + { + "epoch": 2.72, + "grad_norm": 9.209930419921875, + "learning_rate": 1.4556747600225862e-05, + "loss": 0.8826, + "step": 14460 + }, + { + "epoch": 2.72, + "grad_norm": 7.652730464935303, + "learning_rate": 1.4552983248635423e-05, + "loss": 0.4942, + "step": 14470 + }, + { + "epoch": 2.73, + "grad_norm": 0.7166558504104614, + "learning_rate": 1.4549218897044985e-05, + "loss": 0.6347, + "step": 14480 + }, + { + "epoch": 2.73, + "grad_norm": 23.243921279907227, + "learning_rate": 1.4545454545454546e-05, + "loss": 0.9109, + "step": 14490 + }, + { + "epoch": 2.73, + "grad_norm": 24.10265350341797, + "learning_rate": 1.4541690193864108e-05, + "loss": 0.5445, + "step": 14500 + }, + { + "epoch": 2.73, + "grad_norm": 7.200760364532471, + "learning_rate": 1.4537925842273671e-05, + "loss": 0.8351, + "step": 14510 + }, + { + "epoch": 2.73, + "grad_norm": 23.413759231567383, + "learning_rate": 1.4534161490683232e-05, + "loss": 0.9889, + "step": 14520 + }, + { + "epoch": 2.73, + "grad_norm": 11.395479202270508, + "learning_rate": 1.4530397139092794e-05, + "loss": 0.548, + "step": 14530 + }, + { + "epoch": 2.74, + "grad_norm": 18.534324645996094, + "learning_rate": 1.4526632787502355e-05, + "loss": 0.5068, + "step": 14540 + }, + { + "epoch": 2.74, + "grad_norm": 22.6841983795166, + "learning_rate": 1.4522868435911915e-05, + "loss": 0.933, + "step": 14550 + }, + { + "epoch": 2.74, + "grad_norm": 26.522497177124023, + "learning_rate": 1.4519104084321477e-05, + "loss": 0.9139, + "step": 14560 + }, + { + "epoch": 2.74, + "grad_norm": 0.6627997756004333, + "learning_rate": 1.4515339732731038e-05, + "loss": 0.7262, + "step": 14570 + }, + { + "epoch": 2.74, + "grad_norm": 8.96130084991455, + "learning_rate": 1.45115753811406e-05, + "loss": 0.8326, + "step": 14580 + }, + { + "epoch": 2.75, + "grad_norm": 8.547167778015137, + "learning_rate": 1.4507811029550161e-05, + "loss": 0.8872, + "step": 14590 + }, + { + "epoch": 2.75, + "grad_norm": 8.127227783203125, + "learning_rate": 1.4504046677959722e-05, + "loss": 0.7324, + "step": 14600 + }, + { + "epoch": 2.75, + "grad_norm": 33.58380126953125, + "learning_rate": 1.4500282326369284e-05, + "loss": 0.8574, + "step": 14610 + }, + { + "epoch": 2.75, + "grad_norm": 3.6204919815063477, + "learning_rate": 1.4496517974778845e-05, + "loss": 0.6359, + "step": 14620 + }, + { + "epoch": 2.75, + "grad_norm": 32.16131591796875, + "learning_rate": 1.4492753623188407e-05, + "loss": 0.9616, + "step": 14630 + }, + { + "epoch": 2.76, + "grad_norm": 12.819900512695312, + "learning_rate": 1.4488989271597968e-05, + "loss": 0.6547, + "step": 14640 + }, + { + "epoch": 2.76, + "grad_norm": 22.329343795776367, + "learning_rate": 1.448522492000753e-05, + "loss": 0.5535, + "step": 14650 + }, + { + "epoch": 2.76, + "grad_norm": 24.37099266052246, + "learning_rate": 1.4481460568417091e-05, + "loss": 0.935, + "step": 14660 + }, + { + "epoch": 2.76, + "grad_norm": 24.315109252929688, + "learning_rate": 1.4477696216826653e-05, + "loss": 0.8781, + "step": 14670 + }, + { + "epoch": 2.76, + "grad_norm": 2.761948823928833, + "learning_rate": 1.4473931865236214e-05, + "loss": 0.7095, + "step": 14680 + }, + { + "epoch": 2.76, + "grad_norm": 20.340417861938477, + "learning_rate": 1.4470167513645777e-05, + "loss": 0.9713, + "step": 14690 + }, + { + "epoch": 2.77, + "grad_norm": 14.535849571228027, + "learning_rate": 1.4466403162055339e-05, + "loss": 0.5641, + "step": 14700 + }, + { + "epoch": 2.77, + "grad_norm": 18.312532424926758, + "learning_rate": 1.44626388104649e-05, + "loss": 0.8118, + "step": 14710 + }, + { + "epoch": 2.77, + "grad_norm": 15.691104888916016, + "learning_rate": 1.4458874458874458e-05, + "loss": 0.6316, + "step": 14720 + }, + { + "epoch": 2.77, + "grad_norm": 10.579209327697754, + "learning_rate": 1.445511010728402e-05, + "loss": 0.4896, + "step": 14730 + }, + { + "epoch": 2.77, + "grad_norm": 13.167963981628418, + "learning_rate": 1.4451345755693583e-05, + "loss": 0.9028, + "step": 14740 + }, + { + "epoch": 2.78, + "grad_norm": 18.209197998046875, + "learning_rate": 1.4447581404103144e-05, + "loss": 0.7617, + "step": 14750 + }, + { + "epoch": 2.78, + "grad_norm": 7.098175525665283, + "learning_rate": 1.4443817052512706e-05, + "loss": 0.9601, + "step": 14760 + }, + { + "epoch": 2.78, + "grad_norm": 29.88236427307129, + "learning_rate": 1.4440052700922267e-05, + "loss": 0.9673, + "step": 14770 + }, + { + "epoch": 2.78, + "grad_norm": 14.183192253112793, + "learning_rate": 1.4436288349331829e-05, + "loss": 0.9759, + "step": 14780 + }, + { + "epoch": 2.78, + "grad_norm": 1.7883967161178589, + "learning_rate": 1.443252399774139e-05, + "loss": 0.4751, + "step": 14790 + }, + { + "epoch": 2.79, + "grad_norm": 25.07923698425293, + "learning_rate": 1.4428759646150952e-05, + "loss": 0.8102, + "step": 14800 + }, + { + "epoch": 2.79, + "grad_norm": 0.43187662959098816, + "learning_rate": 1.4424995294560513e-05, + "loss": 0.6616, + "step": 14810 + }, + { + "epoch": 2.79, + "grad_norm": 3.4543023109436035, + "learning_rate": 1.4421230942970075e-05, + "loss": 0.8503, + "step": 14820 + }, + { + "epoch": 2.79, + "grad_norm": 9.902512550354004, + "learning_rate": 1.4417466591379636e-05, + "loss": 0.7454, + "step": 14830 + }, + { + "epoch": 2.79, + "grad_norm": 14.120229721069336, + "learning_rate": 1.4413702239789197e-05, + "loss": 0.7271, + "step": 14840 + }, + { + "epoch": 2.8, + "grad_norm": 42.34109878540039, + "learning_rate": 1.4409937888198759e-05, + "loss": 0.6191, + "step": 14850 + }, + { + "epoch": 2.8, + "grad_norm": 8.808544158935547, + "learning_rate": 1.440617353660832e-05, + "loss": 0.7787, + "step": 14860 + }, + { + "epoch": 2.8, + "grad_norm": 18.57027244567871, + "learning_rate": 1.4402409185017882e-05, + "loss": 0.6487, + "step": 14870 + }, + { + "epoch": 2.8, + "grad_norm": 12.555221557617188, + "learning_rate": 1.4398644833427445e-05, + "loss": 0.5236, + "step": 14880 + }, + { + "epoch": 2.8, + "grad_norm": 4.785314083099365, + "learning_rate": 1.4394880481837006e-05, + "loss": 0.6572, + "step": 14890 + }, + { + "epoch": 2.8, + "grad_norm": 4.444519996643066, + "learning_rate": 1.4391116130246565e-05, + "loss": 0.4675, + "step": 14900 + }, + { + "epoch": 2.81, + "grad_norm": 16.588285446166992, + "learning_rate": 1.4387351778656126e-05, + "loss": 0.9234, + "step": 14910 + }, + { + "epoch": 2.81, + "grad_norm": 6.433644771575928, + "learning_rate": 1.4383587427065689e-05, + "loss": 0.7184, + "step": 14920 + }, + { + "epoch": 2.81, + "grad_norm": 27.862564086914062, + "learning_rate": 1.437982307547525e-05, + "loss": 0.8789, + "step": 14930 + }, + { + "epoch": 2.81, + "grad_norm": 8.804747581481934, + "learning_rate": 1.4376058723884812e-05, + "loss": 0.8987, + "step": 14940 + }, + { + "epoch": 2.81, + "grad_norm": 15.815366744995117, + "learning_rate": 1.4372294372294374e-05, + "loss": 0.601, + "step": 14950 + }, + { + "epoch": 2.82, + "grad_norm": 16.371736526489258, + "learning_rate": 1.4368530020703935e-05, + "loss": 0.9209, + "step": 14960 + }, + { + "epoch": 2.82, + "grad_norm": 20.936168670654297, + "learning_rate": 1.4364765669113496e-05, + "loss": 0.6826, + "step": 14970 + }, + { + "epoch": 2.82, + "grad_norm": 0.5109462141990662, + "learning_rate": 1.4361001317523058e-05, + "loss": 0.5467, + "step": 14980 + }, + { + "epoch": 2.82, + "grad_norm": 15.911187171936035, + "learning_rate": 1.435723696593262e-05, + "loss": 0.7498, + "step": 14990 + }, + { + "epoch": 2.82, + "grad_norm": 10.85964584350586, + "learning_rate": 1.435347261434218e-05, + "loss": 0.7946, + "step": 15000 + }, + { + "epoch": 2.83, + "grad_norm": 12.648409843444824, + "learning_rate": 1.4349708262751742e-05, + "loss": 0.8198, + "step": 15010 + }, + { + "epoch": 2.83, + "grad_norm": 25.404829025268555, + "learning_rate": 1.4345943911161304e-05, + "loss": 0.6896, + "step": 15020 + }, + { + "epoch": 2.83, + "grad_norm": 21.891569137573242, + "learning_rate": 1.4342179559570865e-05, + "loss": 0.5017, + "step": 15030 + }, + { + "epoch": 2.83, + "grad_norm": 8.692461967468262, + "learning_rate": 1.4338415207980427e-05, + "loss": 0.6032, + "step": 15040 + }, + { + "epoch": 2.83, + "grad_norm": 6.623054027557373, + "learning_rate": 1.4334650856389988e-05, + "loss": 0.5454, + "step": 15050 + }, + { + "epoch": 2.83, + "grad_norm": 14.266157150268555, + "learning_rate": 1.4330886504799551e-05, + "loss": 0.8083, + "step": 15060 + }, + { + "epoch": 2.84, + "grad_norm": 21.779327392578125, + "learning_rate": 1.432712215320911e-05, + "loss": 0.7511, + "step": 15070 + }, + { + "epoch": 2.84, + "grad_norm": 14.891936302185059, + "learning_rate": 1.432335780161867e-05, + "loss": 0.7927, + "step": 15080 + }, + { + "epoch": 2.84, + "grad_norm": 3.073430061340332, + "learning_rate": 1.4319593450028232e-05, + "loss": 0.6556, + "step": 15090 + }, + { + "epoch": 2.84, + "grad_norm": 13.47904109954834, + "learning_rate": 1.4315829098437794e-05, + "loss": 0.8525, + "step": 15100 + }, + { + "epoch": 2.84, + "grad_norm": 22.618297576904297, + "learning_rate": 1.4312064746847357e-05, + "loss": 0.7522, + "step": 15110 + }, + { + "epoch": 2.85, + "grad_norm": 21.820899963378906, + "learning_rate": 1.4308300395256918e-05, + "loss": 1.0761, + "step": 15120 + }, + { + "epoch": 2.85, + "grad_norm": 0.3761250078678131, + "learning_rate": 1.430453604366648e-05, + "loss": 0.5225, + "step": 15130 + }, + { + "epoch": 2.85, + "grad_norm": 4.551104545593262, + "learning_rate": 1.4300771692076041e-05, + "loss": 0.7417, + "step": 15140 + }, + { + "epoch": 2.85, + "grad_norm": 17.213136672973633, + "learning_rate": 1.4297007340485603e-05, + "loss": 0.7196, + "step": 15150 + }, + { + "epoch": 2.85, + "grad_norm": 29.140161514282227, + "learning_rate": 1.4293242988895164e-05, + "loss": 0.6101, + "step": 15160 + }, + { + "epoch": 2.86, + "grad_norm": 23.715343475341797, + "learning_rate": 1.4289478637304726e-05, + "loss": 0.9656, + "step": 15170 + }, + { + "epoch": 2.86, + "grad_norm": 16.146928787231445, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.5416, + "step": 15180 + }, + { + "epoch": 2.86, + "grad_norm": 23.387487411499023, + "learning_rate": 1.4281949934123849e-05, + "loss": 1.0218, + "step": 15190 + }, + { + "epoch": 2.86, + "grad_norm": 19.16897201538086, + "learning_rate": 1.427818558253341e-05, + "loss": 0.7901, + "step": 15200 + }, + { + "epoch": 2.86, + "grad_norm": 20.92551040649414, + "learning_rate": 1.4274421230942972e-05, + "loss": 1.0405, + "step": 15210 + }, + { + "epoch": 2.86, + "grad_norm": 4.3972930908203125, + "learning_rate": 1.4270656879352533e-05, + "loss": 1.0935, + "step": 15220 + }, + { + "epoch": 2.87, + "grad_norm": 9.615336418151855, + "learning_rate": 1.4266892527762094e-05, + "loss": 0.7947, + "step": 15230 + }, + { + "epoch": 2.87, + "grad_norm": 12.70883560180664, + "learning_rate": 1.4263128176171656e-05, + "loss": 0.7954, + "step": 15240 + }, + { + "epoch": 2.87, + "grad_norm": 19.39516830444336, + "learning_rate": 1.4259363824581216e-05, + "loss": 0.8201, + "step": 15250 + }, + { + "epoch": 2.87, + "grad_norm": 16.59307098388672, + "learning_rate": 1.4255599472990777e-05, + "loss": 0.766, + "step": 15260 + }, + { + "epoch": 2.87, + "grad_norm": 7.716126441955566, + "learning_rate": 1.4251835121400339e-05, + "loss": 0.7935, + "step": 15270 + }, + { + "epoch": 2.88, + "grad_norm": 3.0978472232818604, + "learning_rate": 1.42480707698099e-05, + "loss": 0.7058, + "step": 15280 + }, + { + "epoch": 2.88, + "grad_norm": 2.391120195388794, + "learning_rate": 1.4244306418219463e-05, + "loss": 0.88, + "step": 15290 + }, + { + "epoch": 2.88, + "grad_norm": 16.929407119750977, + "learning_rate": 1.4240542066629025e-05, + "loss": 0.6826, + "step": 15300 + }, + { + "epoch": 2.88, + "grad_norm": 15.712681770324707, + "learning_rate": 1.4236777715038586e-05, + "loss": 0.9275, + "step": 15310 + }, + { + "epoch": 2.88, + "grad_norm": 2.698317527770996, + "learning_rate": 1.4233013363448148e-05, + "loss": 0.7053, + "step": 15320 + }, + { + "epoch": 2.89, + "grad_norm": 4.682209014892578, + "learning_rate": 1.4229249011857709e-05, + "loss": 0.3361, + "step": 15330 + }, + { + "epoch": 2.89, + "grad_norm": 18.427043914794922, + "learning_rate": 1.422548466026727e-05, + "loss": 0.5728, + "step": 15340 + }, + { + "epoch": 2.89, + "grad_norm": 5.598134994506836, + "learning_rate": 1.4221720308676832e-05, + "loss": 0.6496, + "step": 15350 + }, + { + "epoch": 2.89, + "grad_norm": 13.333558082580566, + "learning_rate": 1.4217955957086393e-05, + "loss": 0.7265, + "step": 15360 + }, + { + "epoch": 2.89, + "grad_norm": 15.676292419433594, + "learning_rate": 1.4214191605495955e-05, + "loss": 0.6903, + "step": 15370 + }, + { + "epoch": 2.89, + "grad_norm": 15.788107872009277, + "learning_rate": 1.4210427253905516e-05, + "loss": 0.7214, + "step": 15380 + }, + { + "epoch": 2.9, + "grad_norm": 18.944698333740234, + "learning_rate": 1.4206662902315078e-05, + "loss": 0.6256, + "step": 15390 + }, + { + "epoch": 2.9, + "grad_norm": 17.18710708618164, + "learning_rate": 1.420289855072464e-05, + "loss": 0.9801, + "step": 15400 + }, + { + "epoch": 2.9, + "grad_norm": 24.525888442993164, + "learning_rate": 1.41991341991342e-05, + "loss": 0.8311, + "step": 15410 + }, + { + "epoch": 2.9, + "grad_norm": 15.199677467346191, + "learning_rate": 1.419536984754376e-05, + "loss": 0.6035, + "step": 15420 + }, + { + "epoch": 2.9, + "grad_norm": 1.7926441431045532, + "learning_rate": 1.4191605495953322e-05, + "loss": 0.7877, + "step": 15430 + }, + { + "epoch": 2.91, + "grad_norm": 10.499088287353516, + "learning_rate": 1.4187841144362883e-05, + "loss": 0.5538, + "step": 15440 + }, + { + "epoch": 2.91, + "grad_norm": 3.2541844844818115, + "learning_rate": 1.4184076792772445e-05, + "loss": 0.5928, + "step": 15450 + }, + { + "epoch": 2.91, + "grad_norm": 0.6842057108879089, + "learning_rate": 1.4180312441182006e-05, + "loss": 0.8708, + "step": 15460 + }, + { + "epoch": 2.91, + "grad_norm": 11.153387069702148, + "learning_rate": 1.4176548089591568e-05, + "loss": 0.55, + "step": 15470 + }, + { + "epoch": 2.91, + "grad_norm": 14.324100494384766, + "learning_rate": 1.4172783738001131e-05, + "loss": 0.9598, + "step": 15480 + }, + { + "epoch": 2.92, + "grad_norm": 9.769564628601074, + "learning_rate": 1.4169019386410692e-05, + "loss": 0.6915, + "step": 15490 + }, + { + "epoch": 2.92, + "grad_norm": 5.7416863441467285, + "learning_rate": 1.4165255034820254e-05, + "loss": 1.0917, + "step": 15500 + }, + { + "epoch": 2.92, + "grad_norm": 8.69757080078125, + "learning_rate": 1.4161490683229815e-05, + "loss": 0.5941, + "step": 15510 + }, + { + "epoch": 2.92, + "grad_norm": 5.992401123046875, + "learning_rate": 1.4157726331639377e-05, + "loss": 0.5866, + "step": 15520 + }, + { + "epoch": 2.92, + "grad_norm": 18.785144805908203, + "learning_rate": 1.4153961980048938e-05, + "loss": 0.721, + "step": 15530 + }, + { + "epoch": 2.92, + "grad_norm": 18.443172454833984, + "learning_rate": 1.41501976284585e-05, + "loss": 0.513, + "step": 15540 + }, + { + "epoch": 2.93, + "grad_norm": 8.198022842407227, + "learning_rate": 1.4146433276868061e-05, + "loss": 0.8079, + "step": 15550 + }, + { + "epoch": 2.93, + "grad_norm": 8.908957481384277, + "learning_rate": 1.4142668925277623e-05, + "loss": 0.656, + "step": 15560 + }, + { + "epoch": 2.93, + "grad_norm": 16.126298904418945, + "learning_rate": 1.4138904573687184e-05, + "loss": 0.9063, + "step": 15570 + }, + { + "epoch": 2.93, + "grad_norm": 20.169885635375977, + "learning_rate": 1.4135140222096746e-05, + "loss": 0.741, + "step": 15580 + }, + { + "epoch": 2.93, + "grad_norm": 0.3554701507091522, + "learning_rate": 1.4131375870506307e-05, + "loss": 0.614, + "step": 15590 + }, + { + "epoch": 2.94, + "grad_norm": 9.904111862182617, + "learning_rate": 1.4127611518915867e-05, + "loss": 0.8123, + "step": 15600 + }, + { + "epoch": 2.94, + "grad_norm": 24.91057586669922, + "learning_rate": 1.4123847167325428e-05, + "loss": 0.7551, + "step": 15610 + }, + { + "epoch": 2.94, + "grad_norm": 5.963438510894775, + "learning_rate": 1.412008281573499e-05, + "loss": 0.6714, + "step": 15620 + }, + { + "epoch": 2.94, + "grad_norm": 15.89360523223877, + "learning_rate": 1.4116318464144551e-05, + "loss": 0.8974, + "step": 15630 + }, + { + "epoch": 2.94, + "grad_norm": 12.206485748291016, + "learning_rate": 1.4112554112554113e-05, + "loss": 1.0773, + "step": 15640 + }, + { + "epoch": 2.95, + "grad_norm": 2.8578970432281494, + "learning_rate": 1.4108789760963674e-05, + "loss": 0.5777, + "step": 15650 + }, + { + "epoch": 2.95, + "grad_norm": 3.7192718982696533, + "learning_rate": 1.4105025409373236e-05, + "loss": 0.6443, + "step": 15660 + }, + { + "epoch": 2.95, + "grad_norm": 15.652264595031738, + "learning_rate": 1.4101261057782799e-05, + "loss": 0.7183, + "step": 15670 + }, + { + "epoch": 2.95, + "grad_norm": 4.038358688354492, + "learning_rate": 1.409749670619236e-05, + "loss": 0.3951, + "step": 15680 + }, + { + "epoch": 2.95, + "grad_norm": 23.8483829498291, + "learning_rate": 1.4093732354601922e-05, + "loss": 0.7791, + "step": 15690 + }, + { + "epoch": 2.96, + "grad_norm": 3.3910269737243652, + "learning_rate": 1.4089968003011483e-05, + "loss": 0.5433, + "step": 15700 + }, + { + "epoch": 2.96, + "grad_norm": 3.8397445678710938, + "learning_rate": 1.4086203651421045e-05, + "loss": 0.7548, + "step": 15710 + }, + { + "epoch": 2.96, + "grad_norm": 11.651528358459473, + "learning_rate": 1.4082439299830606e-05, + "loss": 0.4432, + "step": 15720 + }, + { + "epoch": 2.96, + "grad_norm": 11.280261039733887, + "learning_rate": 1.4078674948240167e-05, + "loss": 0.5902, + "step": 15730 + }, + { + "epoch": 2.96, + "grad_norm": 22.70057487487793, + "learning_rate": 1.4074910596649729e-05, + "loss": 0.5237, + "step": 15740 + }, + { + "epoch": 2.96, + "grad_norm": 12.416793823242188, + "learning_rate": 1.407114624505929e-05, + "loss": 0.5404, + "step": 15750 + }, + { + "epoch": 2.97, + "grad_norm": 23.81106948852539, + "learning_rate": 1.4067381893468852e-05, + "loss": 0.7493, + "step": 15760 + }, + { + "epoch": 2.97, + "grad_norm": 21.926897048950195, + "learning_rate": 1.4063617541878412e-05, + "loss": 0.8389, + "step": 15770 + }, + { + "epoch": 2.97, + "grad_norm": 4.98796272277832, + "learning_rate": 1.4059853190287973e-05, + "loss": 0.7776, + "step": 15780 + }, + { + "epoch": 2.97, + "grad_norm": 3.577707052230835, + "learning_rate": 1.4056088838697535e-05, + "loss": 0.3931, + "step": 15790 + }, + { + "epoch": 2.97, + "grad_norm": 6.863688945770264, + "learning_rate": 1.4052324487107096e-05, + "loss": 0.8432, + "step": 15800 + }, + { + "epoch": 2.98, + "grad_norm": 1.5216230154037476, + "learning_rate": 1.4048560135516657e-05, + "loss": 0.8054, + "step": 15810 + }, + { + "epoch": 2.98, + "grad_norm": 30.641324996948242, + "learning_rate": 1.4044795783926219e-05, + "loss": 0.7257, + "step": 15820 + }, + { + "epoch": 2.98, + "grad_norm": 10.028465270996094, + "learning_rate": 1.404103143233578e-05, + "loss": 0.9489, + "step": 15830 + }, + { + "epoch": 2.98, + "grad_norm": 27.563535690307617, + "learning_rate": 1.4037267080745342e-05, + "loss": 0.8339, + "step": 15840 + }, + { + "epoch": 2.98, + "grad_norm": 8.942625045776367, + "learning_rate": 1.4033502729154905e-05, + "loss": 0.7258, + "step": 15850 + }, + { + "epoch": 2.99, + "grad_norm": 16.081558227539062, + "learning_rate": 1.4029738377564466e-05, + "loss": 0.6341, + "step": 15860 + }, + { + "epoch": 2.99, + "grad_norm": 6.025300025939941, + "learning_rate": 1.4025974025974028e-05, + "loss": 0.6362, + "step": 15870 + }, + { + "epoch": 2.99, + "grad_norm": 15.980204582214355, + "learning_rate": 1.402220967438359e-05, + "loss": 0.8119, + "step": 15880 + }, + { + "epoch": 2.99, + "grad_norm": 34.32817840576172, + "learning_rate": 1.401844532279315e-05, + "loss": 0.8354, + "step": 15890 + }, + { + "epoch": 2.99, + "grad_norm": 9.960022926330566, + "learning_rate": 1.4014680971202712e-05, + "loss": 0.727, + "step": 15900 + }, + { + "epoch": 2.99, + "grad_norm": 17.459871292114258, + "learning_rate": 1.4010916619612274e-05, + "loss": 0.4824, + "step": 15910 + }, + { + "epoch": 3.0, + "grad_norm": 7.697139263153076, + "learning_rate": 1.4007152268021835e-05, + "loss": 0.4821, + "step": 15920 + }, + { + "epoch": 3.0, + "grad_norm": 21.81731605529785, + "learning_rate": 1.4003387916431397e-05, + "loss": 0.6092, + "step": 15930 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9081333333333333, + "eval_loss": 0.3438812494277954, + "eval_runtime": 51.1646, + "eval_samples_per_second": 146.586, + "eval_steps_per_second": 18.333, + "step": 15939 + }, + { + "epoch": 3.0, + "grad_norm": 30.34982681274414, + "learning_rate": 1.3999623564840956e-05, + "loss": 1.0665, + "step": 15940 + }, + { + "epoch": 3.0, + "grad_norm": 11.997414588928223, + "learning_rate": 1.3995859213250518e-05, + "loss": 0.8418, + "step": 15950 + }, + { + "epoch": 3.0, + "grad_norm": 12.47726058959961, + "learning_rate": 1.399209486166008e-05, + "loss": 0.6524, + "step": 15960 + }, + { + "epoch": 3.01, + "grad_norm": 5.827723026275635, + "learning_rate": 1.398833051006964e-05, + "loss": 0.6862, + "step": 15970 + }, + { + "epoch": 3.01, + "grad_norm": 11.972338676452637, + "learning_rate": 1.3984566158479202e-05, + "loss": 0.6641, + "step": 15980 + }, + { + "epoch": 3.01, + "grad_norm": 16.457120895385742, + "learning_rate": 1.3980801806888764e-05, + "loss": 0.9417, + "step": 15990 + }, + { + "epoch": 3.01, + "grad_norm": 11.473155975341797, + "learning_rate": 1.3977037455298325e-05, + "loss": 0.6272, + "step": 16000 + }, + { + "epoch": 3.01, + "grad_norm": 26.96430778503418, + "learning_rate": 1.3973273103707887e-05, + "loss": 0.3731, + "step": 16010 + }, + { + "epoch": 3.02, + "grad_norm": 12.995346069335938, + "learning_rate": 1.3969508752117448e-05, + "loss": 0.7276, + "step": 16020 + }, + { + "epoch": 3.02, + "grad_norm": 9.100690841674805, + "learning_rate": 1.396574440052701e-05, + "loss": 0.4041, + "step": 16030 + }, + { + "epoch": 3.02, + "grad_norm": 29.41590690612793, + "learning_rate": 1.3961980048936573e-05, + "loss": 0.5657, + "step": 16040 + }, + { + "epoch": 3.02, + "grad_norm": 1.4971169233322144, + "learning_rate": 1.3958215697346134e-05, + "loss": 0.8166, + "step": 16050 + }, + { + "epoch": 3.02, + "grad_norm": 27.66693687438965, + "learning_rate": 1.3954451345755696e-05, + "loss": 0.5622, + "step": 16060 + }, + { + "epoch": 3.02, + "grad_norm": 19.55706214904785, + "learning_rate": 1.3950686994165257e-05, + "loss": 0.4681, + "step": 16070 + }, + { + "epoch": 3.03, + "grad_norm": 8.878369331359863, + "learning_rate": 1.3946922642574819e-05, + "loss": 0.4866, + "step": 16080 + }, + { + "epoch": 3.03, + "grad_norm": 1.7334586381912231, + "learning_rate": 1.394315829098438e-05, + "loss": 0.52, + "step": 16090 + }, + { + "epoch": 3.03, + "grad_norm": 11.960335731506348, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.061, + "step": 16100 + }, + { + "epoch": 3.03, + "grad_norm": 9.888663291931152, + "learning_rate": 1.3935629587803503e-05, + "loss": 0.6879, + "step": 16110 + }, + { + "epoch": 3.03, + "grad_norm": 5.383222579956055, + "learning_rate": 1.3931865236213063e-05, + "loss": 0.8313, + "step": 16120 + }, + { + "epoch": 3.04, + "grad_norm": 0.5276015996932983, + "learning_rate": 1.3928100884622624e-05, + "loss": 0.6315, + "step": 16130 + }, + { + "epoch": 3.04, + "grad_norm": 12.056757926940918, + "learning_rate": 1.3924336533032186e-05, + "loss": 0.5598, + "step": 16140 + }, + { + "epoch": 3.04, + "grad_norm": 6.8872971534729, + "learning_rate": 1.3920572181441747e-05, + "loss": 0.3837, + "step": 16150 + }, + { + "epoch": 3.04, + "grad_norm": 3.6625053882598877, + "learning_rate": 1.3916807829851309e-05, + "loss": 0.6365, + "step": 16160 + }, + { + "epoch": 3.04, + "grad_norm": 0.40750113129615784, + "learning_rate": 1.391304347826087e-05, + "loss": 0.545, + "step": 16170 + }, + { + "epoch": 3.05, + "grad_norm": 5.1755170822143555, + "learning_rate": 1.3909279126670432e-05, + "loss": 0.5662, + "step": 16180 + }, + { + "epoch": 3.05, + "grad_norm": 27.666248321533203, + "learning_rate": 1.3905514775079993e-05, + "loss": 0.7077, + "step": 16190 + }, + { + "epoch": 3.05, + "grad_norm": 6.176600456237793, + "learning_rate": 1.3901750423489554e-05, + "loss": 0.5436, + "step": 16200 + }, + { + "epoch": 3.05, + "grad_norm": 40.43038558959961, + "learning_rate": 1.3897986071899116e-05, + "loss": 0.6344, + "step": 16210 + }, + { + "epoch": 3.05, + "grad_norm": 13.843911170959473, + "learning_rate": 1.3894221720308679e-05, + "loss": 0.7547, + "step": 16220 + }, + { + "epoch": 3.05, + "grad_norm": 17.818954467773438, + "learning_rate": 1.389045736871824e-05, + "loss": 0.8285, + "step": 16230 + }, + { + "epoch": 3.06, + "grad_norm": 23.41729164123535, + "learning_rate": 1.3886693017127802e-05, + "loss": 0.5006, + "step": 16240 + }, + { + "epoch": 3.06, + "grad_norm": 24.877290725708008, + "learning_rate": 1.3882928665537363e-05, + "loss": 1.136, + "step": 16250 + }, + { + "epoch": 3.06, + "grad_norm": 12.05526065826416, + "learning_rate": 1.3879164313946925e-05, + "loss": 0.703, + "step": 16260 + }, + { + "epoch": 3.06, + "grad_norm": 12.669425964355469, + "learning_rate": 1.3875399962356486e-05, + "loss": 0.7638, + "step": 16270 + }, + { + "epoch": 3.06, + "grad_norm": 4.3127899169921875, + "learning_rate": 1.3871635610766048e-05, + "loss": 0.8595, + "step": 16280 + }, + { + "epoch": 3.07, + "grad_norm": 27.62041473388672, + "learning_rate": 1.3867871259175608e-05, + "loss": 0.8843, + "step": 16290 + }, + { + "epoch": 3.07, + "grad_norm": 19.740711212158203, + "learning_rate": 1.3864106907585169e-05, + "loss": 0.6221, + "step": 16300 + }, + { + "epoch": 3.07, + "grad_norm": 4.2288312911987305, + "learning_rate": 1.386034255599473e-05, + "loss": 0.6524, + "step": 16310 + }, + { + "epoch": 3.07, + "grad_norm": 15.323369026184082, + "learning_rate": 1.3856578204404292e-05, + "loss": 0.9442, + "step": 16320 + }, + { + "epoch": 3.07, + "grad_norm": 7.15553617477417, + "learning_rate": 1.3852813852813853e-05, + "loss": 0.6764, + "step": 16330 + }, + { + "epoch": 3.08, + "grad_norm": 22.282184600830078, + "learning_rate": 1.3849049501223415e-05, + "loss": 0.6296, + "step": 16340 + }, + { + "epoch": 3.08, + "grad_norm": 20.944698333740234, + "learning_rate": 1.3845285149632976e-05, + "loss": 0.6328, + "step": 16350 + }, + { + "epoch": 3.08, + "grad_norm": 11.877099990844727, + "learning_rate": 1.3841520798042538e-05, + "loss": 0.668, + "step": 16360 + }, + { + "epoch": 3.08, + "grad_norm": 7.8392863273620605, + "learning_rate": 1.38377564464521e-05, + "loss": 0.4543, + "step": 16370 + }, + { + "epoch": 3.08, + "grad_norm": 19.019250869750977, + "learning_rate": 1.383399209486166e-05, + "loss": 0.8038, + "step": 16380 + }, + { + "epoch": 3.08, + "grad_norm": 0.9338988065719604, + "learning_rate": 1.3830227743271222e-05, + "loss": 0.4974, + "step": 16390 + }, + { + "epoch": 3.09, + "grad_norm": 0.640573263168335, + "learning_rate": 1.3826463391680784e-05, + "loss": 0.6984, + "step": 16400 + }, + { + "epoch": 3.09, + "grad_norm": 15.033812522888184, + "learning_rate": 1.3822699040090347e-05, + "loss": 0.9574, + "step": 16410 + }, + { + "epoch": 3.09, + "grad_norm": 12.06335163116455, + "learning_rate": 1.3818934688499908e-05, + "loss": 0.6254, + "step": 16420 + }, + { + "epoch": 3.09, + "grad_norm": 4.687410354614258, + "learning_rate": 1.381517033690947e-05, + "loss": 0.6764, + "step": 16430 + }, + { + "epoch": 3.09, + "grad_norm": 27.31790542602539, + "learning_rate": 1.3811405985319031e-05, + "loss": 0.5824, + "step": 16440 + }, + { + "epoch": 3.1, + "grad_norm": 9.797171592712402, + "learning_rate": 1.3807641633728593e-05, + "loss": 0.4788, + "step": 16450 + }, + { + "epoch": 3.1, + "grad_norm": 17.67564582824707, + "learning_rate": 1.3803877282138154e-05, + "loss": 1.209, + "step": 16460 + }, + { + "epoch": 3.1, + "grad_norm": 0.271230548620224, + "learning_rate": 1.3800112930547714e-05, + "loss": 0.4775, + "step": 16470 + }, + { + "epoch": 3.1, + "grad_norm": 33.21747589111328, + "learning_rate": 1.3796348578957275e-05, + "loss": 1.0736, + "step": 16480 + }, + { + "epoch": 3.1, + "grad_norm": 88.40038299560547, + "learning_rate": 1.3792584227366837e-05, + "loss": 0.8484, + "step": 16490 + }, + { + "epoch": 3.11, + "grad_norm": 17.692691802978516, + "learning_rate": 1.3788819875776398e-05, + "loss": 0.7804, + "step": 16500 + }, + { + "epoch": 3.11, + "grad_norm": 9.918412208557129, + "learning_rate": 1.378505552418596e-05, + "loss": 0.7316, + "step": 16510 + }, + { + "epoch": 3.11, + "grad_norm": 5.417355537414551, + "learning_rate": 1.3781291172595521e-05, + "loss": 0.6182, + "step": 16520 + }, + { + "epoch": 3.11, + "grad_norm": 11.936298370361328, + "learning_rate": 1.3777526821005083e-05, + "loss": 0.4077, + "step": 16530 + }, + { + "epoch": 3.11, + "grad_norm": 13.960946083068848, + "learning_rate": 1.3773762469414644e-05, + "loss": 0.526, + "step": 16540 + }, + { + "epoch": 3.12, + "grad_norm": 22.762529373168945, + "learning_rate": 1.3769998117824206e-05, + "loss": 0.6749, + "step": 16550 + }, + { + "epoch": 3.12, + "grad_norm": 14.109274864196777, + "learning_rate": 1.3766233766233767e-05, + "loss": 0.5186, + "step": 16560 + }, + { + "epoch": 3.12, + "grad_norm": 35.54779815673828, + "learning_rate": 1.3762469414643328e-05, + "loss": 0.7046, + "step": 16570 + }, + { + "epoch": 3.12, + "grad_norm": 12.966293334960938, + "learning_rate": 1.375870506305289e-05, + "loss": 0.7402, + "step": 16580 + }, + { + "epoch": 3.12, + "grad_norm": 8.81118392944336, + "learning_rate": 1.3754940711462453e-05, + "loss": 0.6679, + "step": 16590 + }, + { + "epoch": 3.12, + "grad_norm": 0.5349300503730774, + "learning_rate": 1.3751176359872015e-05, + "loss": 0.6342, + "step": 16600 + }, + { + "epoch": 3.13, + "grad_norm": 14.064860343933105, + "learning_rate": 1.3747412008281576e-05, + "loss": 0.8408, + "step": 16610 + }, + { + "epoch": 3.13, + "grad_norm": 13.386101722717285, + "learning_rate": 1.3743647656691137e-05, + "loss": 0.6574, + "step": 16620 + }, + { + "epoch": 3.13, + "grad_norm": 0.2948550879955292, + "learning_rate": 1.3739883305100699e-05, + "loss": 0.8882, + "step": 16630 + }, + { + "epoch": 3.13, + "grad_norm": 8.500995635986328, + "learning_rate": 1.3736118953510259e-05, + "loss": 0.6037, + "step": 16640 + }, + { + "epoch": 3.13, + "grad_norm": 18.741573333740234, + "learning_rate": 1.373235460191982e-05, + "loss": 0.629, + "step": 16650 + }, + { + "epoch": 3.14, + "grad_norm": 0.3394733667373657, + "learning_rate": 1.3728590250329382e-05, + "loss": 0.7729, + "step": 16660 + }, + { + "epoch": 3.14, + "grad_norm": 11.697025299072266, + "learning_rate": 1.3724825898738943e-05, + "loss": 0.512, + "step": 16670 + }, + { + "epoch": 3.14, + "grad_norm": 9.116700172424316, + "learning_rate": 1.3721061547148505e-05, + "loss": 0.6958, + "step": 16680 + }, + { + "epoch": 3.14, + "grad_norm": 59.24357223510742, + "learning_rate": 1.3717297195558066e-05, + "loss": 0.9625, + "step": 16690 + }, + { + "epoch": 3.14, + "grad_norm": 20.930126190185547, + "learning_rate": 1.3713532843967627e-05, + "loss": 0.7139, + "step": 16700 + }, + { + "epoch": 3.15, + "grad_norm": 46.4968147277832, + "learning_rate": 1.3709768492377189e-05, + "loss": 0.8418, + "step": 16710 + }, + { + "epoch": 3.15, + "grad_norm": 0.5635331273078918, + "learning_rate": 1.370600414078675e-05, + "loss": 0.7676, + "step": 16720 + }, + { + "epoch": 3.15, + "grad_norm": 0.2128181755542755, + "learning_rate": 1.3702239789196312e-05, + "loss": 0.9451, + "step": 16730 + }, + { + "epoch": 3.15, + "grad_norm": 21.21966552734375, + "learning_rate": 1.3698475437605873e-05, + "loss": 0.5834, + "step": 16740 + }, + { + "epoch": 3.15, + "grad_norm": 37.33884811401367, + "learning_rate": 1.3694711086015435e-05, + "loss": 0.8985, + "step": 16750 + }, + { + "epoch": 3.15, + "grad_norm": 7.059642314910889, + "learning_rate": 1.3690946734424996e-05, + "loss": 0.7002, + "step": 16760 + }, + { + "epoch": 3.16, + "grad_norm": 14.130393981933594, + "learning_rate": 1.3687182382834558e-05, + "loss": 0.8018, + "step": 16770 + }, + { + "epoch": 3.16, + "grad_norm": 15.686850547790527, + "learning_rate": 1.368341803124412e-05, + "loss": 0.7179, + "step": 16780 + }, + { + "epoch": 3.16, + "grad_norm": 7.120452404022217, + "learning_rate": 1.3679653679653682e-05, + "loss": 0.6912, + "step": 16790 + }, + { + "epoch": 3.16, + "grad_norm": 15.659395217895508, + "learning_rate": 1.3675889328063244e-05, + "loss": 0.8925, + "step": 16800 + }, + { + "epoch": 3.16, + "grad_norm": 27.15504264831543, + "learning_rate": 1.3672124976472802e-05, + "loss": 0.4575, + "step": 16810 + }, + { + "epoch": 3.17, + "grad_norm": 7.112409591674805, + "learning_rate": 1.3668360624882365e-05, + "loss": 0.7047, + "step": 16820 + }, + { + "epoch": 3.17, + "grad_norm": 21.725502014160156, + "learning_rate": 1.3664596273291926e-05, + "loss": 0.5739, + "step": 16830 + }, + { + "epoch": 3.17, + "grad_norm": 17.80892562866211, + "learning_rate": 1.3660831921701488e-05, + "loss": 0.8959, + "step": 16840 + }, + { + "epoch": 3.17, + "grad_norm": 1.4555948972702026, + "learning_rate": 1.365706757011105e-05, + "loss": 0.4714, + "step": 16850 + }, + { + "epoch": 3.17, + "grad_norm": 2.813807725906372, + "learning_rate": 1.365330321852061e-05, + "loss": 0.5298, + "step": 16860 + }, + { + "epoch": 3.18, + "grad_norm": 15.255325317382812, + "learning_rate": 1.3649538866930172e-05, + "loss": 0.7289, + "step": 16870 + }, + { + "epoch": 3.18, + "grad_norm": 10.276988983154297, + "learning_rate": 1.3645774515339734e-05, + "loss": 0.3811, + "step": 16880 + }, + { + "epoch": 3.18, + "grad_norm": 4.02056884765625, + "learning_rate": 1.3642010163749295e-05, + "loss": 0.7805, + "step": 16890 + }, + { + "epoch": 3.18, + "grad_norm": 20.98756217956543, + "learning_rate": 1.3638245812158857e-05, + "loss": 0.7219, + "step": 16900 + }, + { + "epoch": 3.18, + "grad_norm": 1.9777402877807617, + "learning_rate": 1.3634481460568418e-05, + "loss": 0.4574, + "step": 16910 + }, + { + "epoch": 3.18, + "grad_norm": 17.15428924560547, + "learning_rate": 1.363071710897798e-05, + "loss": 0.6216, + "step": 16920 + }, + { + "epoch": 3.19, + "grad_norm": 26.12958335876465, + "learning_rate": 1.3626952757387541e-05, + "loss": 0.5892, + "step": 16930 + }, + { + "epoch": 3.19, + "grad_norm": 2.257452964782715, + "learning_rate": 1.3623188405797103e-05, + "loss": 0.6007, + "step": 16940 + }, + { + "epoch": 3.19, + "grad_norm": 21.12044334411621, + "learning_rate": 1.3619424054206664e-05, + "loss": 0.6436, + "step": 16950 + }, + { + "epoch": 3.19, + "grad_norm": 27.5797061920166, + "learning_rate": 1.3615659702616225e-05, + "loss": 0.6577, + "step": 16960 + }, + { + "epoch": 3.19, + "grad_norm": 0.14470890164375305, + "learning_rate": 1.3611895351025789e-05, + "loss": 0.7087, + "step": 16970 + }, + { + "epoch": 3.2, + "grad_norm": 25.093280792236328, + "learning_rate": 1.360813099943535e-05, + "loss": 0.8523, + "step": 16980 + }, + { + "epoch": 3.2, + "grad_norm": 9.746846199035645, + "learning_rate": 1.3604366647844908e-05, + "loss": 0.6063, + "step": 16990 + }, + { + "epoch": 3.2, + "grad_norm": 26.40338897705078, + "learning_rate": 1.360060229625447e-05, + "loss": 0.7969, + "step": 17000 + }, + { + "epoch": 3.2, + "grad_norm": 16.754243850708008, + "learning_rate": 1.3596837944664033e-05, + "loss": 0.7463, + "step": 17010 + }, + { + "epoch": 3.2, + "grad_norm": 30.17478370666504, + "learning_rate": 1.3593073593073594e-05, + "loss": 0.8098, + "step": 17020 + }, + { + "epoch": 3.21, + "grad_norm": 10.640960693359375, + "learning_rate": 1.3589309241483156e-05, + "loss": 0.5837, + "step": 17030 + }, + { + "epoch": 3.21, + "grad_norm": 21.33949089050293, + "learning_rate": 1.3585544889892717e-05, + "loss": 0.627, + "step": 17040 + }, + { + "epoch": 3.21, + "grad_norm": 11.06455135345459, + "learning_rate": 1.3581780538302279e-05, + "loss": 0.6147, + "step": 17050 + }, + { + "epoch": 3.21, + "grad_norm": 25.937108993530273, + "learning_rate": 1.357801618671184e-05, + "loss": 1.1128, + "step": 17060 + }, + { + "epoch": 3.21, + "grad_norm": 6.419862270355225, + "learning_rate": 1.3574251835121402e-05, + "loss": 0.8132, + "step": 17070 + }, + { + "epoch": 3.21, + "grad_norm": 26.92257308959961, + "learning_rate": 1.3570487483530963e-05, + "loss": 0.4862, + "step": 17080 + }, + { + "epoch": 3.22, + "grad_norm": 10.812932968139648, + "learning_rate": 1.3566723131940524e-05, + "loss": 1.0484, + "step": 17090 + }, + { + "epoch": 3.22, + "grad_norm": 23.167081832885742, + "learning_rate": 1.3562958780350086e-05, + "loss": 0.815, + "step": 17100 + }, + { + "epoch": 3.22, + "grad_norm": 19.610149383544922, + "learning_rate": 1.3559194428759647e-05, + "loss": 0.788, + "step": 17110 + }, + { + "epoch": 3.22, + "grad_norm": 13.85695743560791, + "learning_rate": 1.3555430077169209e-05, + "loss": 0.65, + "step": 17120 + }, + { + "epoch": 3.22, + "grad_norm": 4.279960632324219, + "learning_rate": 1.355166572557877e-05, + "loss": 0.6214, + "step": 17130 + }, + { + "epoch": 3.23, + "grad_norm": 21.195491790771484, + "learning_rate": 1.3547901373988332e-05, + "loss": 0.5917, + "step": 17140 + }, + { + "epoch": 3.23, + "grad_norm": 43.27781295776367, + "learning_rate": 1.3544137022397895e-05, + "loss": 0.5481, + "step": 17150 + }, + { + "epoch": 3.23, + "grad_norm": 32.89248275756836, + "learning_rate": 1.3540372670807453e-05, + "loss": 0.7128, + "step": 17160 + }, + { + "epoch": 3.23, + "grad_norm": 1.7565685510635376, + "learning_rate": 1.3536608319217014e-05, + "loss": 0.5305, + "step": 17170 + }, + { + "epoch": 3.23, + "grad_norm": 0.9114441871643066, + "learning_rate": 1.3532843967626576e-05, + "loss": 0.4674, + "step": 17180 + }, + { + "epoch": 3.24, + "grad_norm": 14.061701774597168, + "learning_rate": 1.3529079616036137e-05, + "loss": 0.947, + "step": 17190 + }, + { + "epoch": 3.24, + "grad_norm": 14.889267921447754, + "learning_rate": 1.35253152644457e-05, + "loss": 0.8785, + "step": 17200 + }, + { + "epoch": 3.24, + "grad_norm": 10.14665412902832, + "learning_rate": 1.3521550912855262e-05, + "loss": 0.728, + "step": 17210 + }, + { + "epoch": 3.24, + "grad_norm": 16.317060470581055, + "learning_rate": 1.3517786561264823e-05, + "loss": 0.615, + "step": 17220 + }, + { + "epoch": 3.24, + "grad_norm": 14.949483871459961, + "learning_rate": 1.3514022209674385e-05, + "loss": 0.4112, + "step": 17230 + }, + { + "epoch": 3.24, + "grad_norm": 42.36807632446289, + "learning_rate": 1.3510257858083946e-05, + "loss": 0.5082, + "step": 17240 + }, + { + "epoch": 3.25, + "grad_norm": 10.184826850891113, + "learning_rate": 1.3506493506493508e-05, + "loss": 0.63, + "step": 17250 + }, + { + "epoch": 3.25, + "grad_norm": 7.961637496948242, + "learning_rate": 1.350272915490307e-05, + "loss": 0.373, + "step": 17260 + }, + { + "epoch": 3.25, + "grad_norm": 15.121786117553711, + "learning_rate": 1.349896480331263e-05, + "loss": 1.0031, + "step": 17270 + }, + { + "epoch": 3.25, + "grad_norm": 17.208942413330078, + "learning_rate": 1.3495200451722192e-05, + "loss": 0.7613, + "step": 17280 + }, + { + "epoch": 3.25, + "grad_norm": 3.7287890911102295, + "learning_rate": 1.3491436100131754e-05, + "loss": 0.4566, + "step": 17290 + }, + { + "epoch": 3.26, + "grad_norm": 34.4078254699707, + "learning_rate": 1.3487671748541315e-05, + "loss": 0.8477, + "step": 17300 + }, + { + "epoch": 3.26, + "grad_norm": 6.368019104003906, + "learning_rate": 1.3483907396950877e-05, + "loss": 0.6715, + "step": 17310 + }, + { + "epoch": 3.26, + "grad_norm": 6.000946044921875, + "learning_rate": 1.3480143045360438e-05, + "loss": 0.3316, + "step": 17320 + }, + { + "epoch": 3.26, + "grad_norm": 8.809701919555664, + "learning_rate": 1.347637869377e-05, + "loss": 0.9712, + "step": 17330 + }, + { + "epoch": 3.26, + "grad_norm": 2.8006772994995117, + "learning_rate": 1.347261434217956e-05, + "loss": 0.5571, + "step": 17340 + }, + { + "epoch": 3.27, + "grad_norm": 3.8963782787323, + "learning_rate": 1.346884999058912e-05, + "loss": 0.8123, + "step": 17350 + }, + { + "epoch": 3.27, + "grad_norm": 21.654504776000977, + "learning_rate": 1.3465085638998682e-05, + "loss": 0.46, + "step": 17360 + }, + { + "epoch": 3.27, + "grad_norm": 21.95523452758789, + "learning_rate": 1.3461321287408244e-05, + "loss": 0.8436, + "step": 17370 + }, + { + "epoch": 3.27, + "grad_norm": 21.384435653686523, + "learning_rate": 1.3457556935817807e-05, + "loss": 0.6596, + "step": 17380 + }, + { + "epoch": 3.27, + "grad_norm": 2.9826455116271973, + "learning_rate": 1.3453792584227368e-05, + "loss": 0.7622, + "step": 17390 + }, + { + "epoch": 3.27, + "grad_norm": 1.4264588356018066, + "learning_rate": 1.345002823263693e-05, + "loss": 0.739, + "step": 17400 + }, + { + "epoch": 3.28, + "grad_norm": 7.517381191253662, + "learning_rate": 1.3446263881046491e-05, + "loss": 0.6091, + "step": 17410 + }, + { + "epoch": 3.28, + "grad_norm": 32.621761322021484, + "learning_rate": 1.3442499529456053e-05, + "loss": 0.6992, + "step": 17420 + }, + { + "epoch": 3.28, + "grad_norm": 4.760457992553711, + "learning_rate": 1.3438735177865614e-05, + "loss": 1.0, + "step": 17430 + }, + { + "epoch": 3.28, + "grad_norm": 0.4995521008968353, + "learning_rate": 1.3434970826275176e-05, + "loss": 0.523, + "step": 17440 + }, + { + "epoch": 3.28, + "grad_norm": 8.695629119873047, + "learning_rate": 1.3431206474684737e-05, + "loss": 0.646, + "step": 17450 + }, + { + "epoch": 3.29, + "grad_norm": 0.28502389788627625, + "learning_rate": 1.3427442123094298e-05, + "loss": 0.5188, + "step": 17460 + }, + { + "epoch": 3.29, + "grad_norm": 0.6979949474334717, + "learning_rate": 1.342367777150386e-05, + "loss": 0.5536, + "step": 17470 + }, + { + "epoch": 3.29, + "grad_norm": 24.936214447021484, + "learning_rate": 1.3419913419913421e-05, + "loss": 0.647, + "step": 17480 + }, + { + "epoch": 3.29, + "grad_norm": 8.776481628417969, + "learning_rate": 1.3416149068322983e-05, + "loss": 0.8732, + "step": 17490 + }, + { + "epoch": 3.29, + "grad_norm": 19.038969039916992, + "learning_rate": 1.3412384716732544e-05, + "loss": 0.77, + "step": 17500 + }, + { + "epoch": 3.3, + "grad_norm": 4.335228443145752, + "learning_rate": 1.3408620365142104e-05, + "loss": 0.6023, + "step": 17510 + }, + { + "epoch": 3.3, + "grad_norm": 7.746741771697998, + "learning_rate": 1.3404856013551666e-05, + "loss": 0.6851, + "step": 17520 + }, + { + "epoch": 3.3, + "grad_norm": 24.021316528320312, + "learning_rate": 1.3401091661961227e-05, + "loss": 0.8683, + "step": 17530 + }, + { + "epoch": 3.3, + "grad_norm": 40.04768371582031, + "learning_rate": 1.3397327310370788e-05, + "loss": 0.7139, + "step": 17540 + }, + { + "epoch": 3.3, + "grad_norm": 28.369369506835938, + "learning_rate": 1.339356295878035e-05, + "loss": 0.7984, + "step": 17550 + }, + { + "epoch": 3.31, + "grad_norm": 12.895668983459473, + "learning_rate": 1.3389798607189911e-05, + "loss": 0.8768, + "step": 17560 + }, + { + "epoch": 3.31, + "grad_norm": 13.10739517211914, + "learning_rate": 1.3386034255599475e-05, + "loss": 0.9014, + "step": 17570 + }, + { + "epoch": 3.31, + "grad_norm": 6.7018537521362305, + "learning_rate": 1.3382269904009036e-05, + "loss": 0.6438, + "step": 17580 + }, + { + "epoch": 3.31, + "grad_norm": 20.47855567932129, + "learning_rate": 1.3378505552418597e-05, + "loss": 0.5956, + "step": 17590 + }, + { + "epoch": 3.31, + "grad_norm": 34.99126052856445, + "learning_rate": 1.3374741200828159e-05, + "loss": 0.6245, + "step": 17600 + }, + { + "epoch": 3.31, + "grad_norm": 31.813350677490234, + "learning_rate": 1.337097684923772e-05, + "loss": 0.9122, + "step": 17610 + }, + { + "epoch": 3.32, + "grad_norm": 15.438579559326172, + "learning_rate": 1.3367212497647282e-05, + "loss": 0.7581, + "step": 17620 + }, + { + "epoch": 3.32, + "grad_norm": 10.1848726272583, + "learning_rate": 1.3363448146056843e-05, + "loss": 0.3745, + "step": 17630 + }, + { + "epoch": 3.32, + "grad_norm": 0.6416832804679871, + "learning_rate": 1.3359683794466405e-05, + "loss": 0.5186, + "step": 17640 + }, + { + "epoch": 3.32, + "grad_norm": 17.22964859008789, + "learning_rate": 1.3355919442875966e-05, + "loss": 0.517, + "step": 17650 + }, + { + "epoch": 3.32, + "grad_norm": 23.54407501220703, + "learning_rate": 1.3352155091285528e-05, + "loss": 0.6123, + "step": 17660 + }, + { + "epoch": 3.33, + "grad_norm": 24.60756492614746, + "learning_rate": 1.3348390739695089e-05, + "loss": 0.6709, + "step": 17670 + }, + { + "epoch": 3.33, + "grad_norm": 18.41277313232422, + "learning_rate": 1.334462638810465e-05, + "loss": 0.724, + "step": 17680 + }, + { + "epoch": 3.33, + "grad_norm": 13.058307647705078, + "learning_rate": 1.334086203651421e-05, + "loss": 0.6962, + "step": 17690 + }, + { + "epoch": 3.33, + "grad_norm": 13.01867389678955, + "learning_rate": 1.3337097684923772e-05, + "loss": 0.5346, + "step": 17700 + }, + { + "epoch": 3.33, + "grad_norm": 13.48055362701416, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.5472, + "step": 17710 + }, + { + "epoch": 3.34, + "grad_norm": 18.029436111450195, + "learning_rate": 1.3329568981742895e-05, + "loss": 0.4834, + "step": 17720 + }, + { + "epoch": 3.34, + "grad_norm": 2.22222900390625, + "learning_rate": 1.3325804630152456e-05, + "loss": 0.6628, + "step": 17730 + }, + { + "epoch": 3.34, + "grad_norm": 7.793396472930908, + "learning_rate": 1.3322040278562018e-05, + "loss": 0.866, + "step": 17740 + }, + { + "epoch": 3.34, + "grad_norm": 42.029598236083984, + "learning_rate": 1.331827592697158e-05, + "loss": 0.5733, + "step": 17750 + }, + { + "epoch": 3.34, + "grad_norm": 8.039556503295898, + "learning_rate": 1.3314511575381142e-05, + "loss": 0.6282, + "step": 17760 + }, + { + "epoch": 3.34, + "grad_norm": 13.458028793334961, + "learning_rate": 1.3310747223790704e-05, + "loss": 0.911, + "step": 17770 + }, + { + "epoch": 3.35, + "grad_norm": 14.425552368164062, + "learning_rate": 1.3306982872200265e-05, + "loss": 1.0583, + "step": 17780 + }, + { + "epoch": 3.35, + "grad_norm": 3.239820718765259, + "learning_rate": 1.3303218520609827e-05, + "loss": 0.5288, + "step": 17790 + }, + { + "epoch": 3.35, + "grad_norm": 49.50566482543945, + "learning_rate": 1.3299454169019388e-05, + "loss": 0.9413, + "step": 17800 + }, + { + "epoch": 3.35, + "grad_norm": 15.337508201599121, + "learning_rate": 1.329568981742895e-05, + "loss": 0.6259, + "step": 17810 + }, + { + "epoch": 3.35, + "grad_norm": 22.253278732299805, + "learning_rate": 1.3291925465838511e-05, + "loss": 0.8045, + "step": 17820 + }, + { + "epoch": 3.36, + "grad_norm": 8.542914390563965, + "learning_rate": 1.3288161114248073e-05, + "loss": 0.5947, + "step": 17830 + }, + { + "epoch": 3.36, + "grad_norm": 9.067444801330566, + "learning_rate": 1.3284396762657634e-05, + "loss": 0.5303, + "step": 17840 + }, + { + "epoch": 3.36, + "grad_norm": 15.519441604614258, + "learning_rate": 1.3280632411067195e-05, + "loss": 1.0939, + "step": 17850 + }, + { + "epoch": 3.36, + "grad_norm": 8.470025062561035, + "learning_rate": 1.3276868059476755e-05, + "loss": 0.7933, + "step": 17860 + }, + { + "epoch": 3.36, + "grad_norm": 17.665695190429688, + "learning_rate": 1.3273103707886317e-05, + "loss": 0.7463, + "step": 17870 + }, + { + "epoch": 3.37, + "grad_norm": 17.912128448486328, + "learning_rate": 1.3269339356295878e-05, + "loss": 0.6041, + "step": 17880 + }, + { + "epoch": 3.37, + "grad_norm": 26.95820426940918, + "learning_rate": 1.326557500470544e-05, + "loss": 0.7265, + "step": 17890 + }, + { + "epoch": 3.37, + "grad_norm": 13.88226318359375, + "learning_rate": 1.3261810653115001e-05, + "loss": 0.7233, + "step": 17900 + }, + { + "epoch": 3.37, + "grad_norm": 9.494647979736328, + "learning_rate": 1.3258046301524563e-05, + "loss": 0.5738, + "step": 17910 + }, + { + "epoch": 3.37, + "grad_norm": 23.288188934326172, + "learning_rate": 1.3254281949934124e-05, + "loss": 0.9191, + "step": 17920 + }, + { + "epoch": 3.37, + "grad_norm": 12.423943519592285, + "learning_rate": 1.3250517598343685e-05, + "loss": 0.5722, + "step": 17930 + }, + { + "epoch": 3.38, + "grad_norm": 20.487674713134766, + "learning_rate": 1.3246753246753249e-05, + "loss": 0.5988, + "step": 17940 + }, + { + "epoch": 3.38, + "grad_norm": 8.707905769348145, + "learning_rate": 1.324298889516281e-05, + "loss": 0.3331, + "step": 17950 + }, + { + "epoch": 3.38, + "grad_norm": 0.4977474808692932, + "learning_rate": 1.3239224543572372e-05, + "loss": 0.5852, + "step": 17960 + }, + { + "epoch": 3.38, + "grad_norm": 6.157559871673584, + "learning_rate": 1.3235460191981933e-05, + "loss": 0.6341, + "step": 17970 + }, + { + "epoch": 3.38, + "grad_norm": 23.23993682861328, + "learning_rate": 1.3231695840391494e-05, + "loss": 0.9592, + "step": 17980 + }, + { + "epoch": 3.39, + "grad_norm": 10.312979698181152, + "learning_rate": 1.3227931488801056e-05, + "loss": 0.7303, + "step": 17990 + }, + { + "epoch": 3.39, + "grad_norm": 24.67930030822754, + "learning_rate": 1.3224167137210617e-05, + "loss": 0.5402, + "step": 18000 + }, + { + "epoch": 3.39, + "grad_norm": 22.200210571289062, + "learning_rate": 1.3220402785620179e-05, + "loss": 0.8094, + "step": 18010 + }, + { + "epoch": 3.39, + "grad_norm": 5.099223613739014, + "learning_rate": 1.321663843402974e-05, + "loss": 0.8314, + "step": 18020 + }, + { + "epoch": 3.39, + "grad_norm": 19.834396362304688, + "learning_rate": 1.32128740824393e-05, + "loss": 1.009, + "step": 18030 + }, + { + "epoch": 3.4, + "grad_norm": 11.00228214263916, + "learning_rate": 1.3209109730848861e-05, + "loss": 0.8001, + "step": 18040 + }, + { + "epoch": 3.4, + "grad_norm": 19.782005310058594, + "learning_rate": 1.3205345379258423e-05, + "loss": 0.5065, + "step": 18050 + }, + { + "epoch": 3.4, + "grad_norm": 9.906347274780273, + "learning_rate": 1.3201581027667984e-05, + "loss": 0.4765, + "step": 18060 + }, + { + "epoch": 3.4, + "grad_norm": 28.395952224731445, + "learning_rate": 1.3197816676077546e-05, + "loss": 0.9283, + "step": 18070 + }, + { + "epoch": 3.4, + "grad_norm": 4.160763740539551, + "learning_rate": 1.3194052324487107e-05, + "loss": 0.6412, + "step": 18080 + }, + { + "epoch": 3.4, + "grad_norm": 16.857194900512695, + "learning_rate": 1.3190287972896669e-05, + "loss": 0.888, + "step": 18090 + }, + { + "epoch": 3.41, + "grad_norm": 3.8556270599365234, + "learning_rate": 1.318652362130623e-05, + "loss": 0.8731, + "step": 18100 + }, + { + "epoch": 3.41, + "grad_norm": 29.509401321411133, + "learning_rate": 1.3182759269715792e-05, + "loss": 0.6451, + "step": 18110 + }, + { + "epoch": 3.41, + "grad_norm": 8.988554954528809, + "learning_rate": 1.3178994918125355e-05, + "loss": 0.9483, + "step": 18120 + }, + { + "epoch": 3.41, + "grad_norm": 19.900251388549805, + "learning_rate": 1.3175230566534916e-05, + "loss": 1.0578, + "step": 18130 + }, + { + "epoch": 3.41, + "grad_norm": 11.77758502960205, + "learning_rate": 1.3171466214944478e-05, + "loss": 0.6442, + "step": 18140 + }, + { + "epoch": 3.42, + "grad_norm": 6.527004718780518, + "learning_rate": 1.316770186335404e-05, + "loss": 0.7697, + "step": 18150 + }, + { + "epoch": 3.42, + "grad_norm": 5.472777843475342, + "learning_rate": 1.31639375117636e-05, + "loss": 0.9177, + "step": 18160 + }, + { + "epoch": 3.42, + "grad_norm": 11.783276557922363, + "learning_rate": 1.3160173160173162e-05, + "loss": 0.6803, + "step": 18170 + }, + { + "epoch": 3.42, + "grad_norm": 9.348353385925293, + "learning_rate": 1.3156408808582724e-05, + "loss": 0.9163, + "step": 18180 + }, + { + "epoch": 3.42, + "grad_norm": 8.337559700012207, + "learning_rate": 1.3152644456992285e-05, + "loss": 0.5074, + "step": 18190 + }, + { + "epoch": 3.43, + "grad_norm": 18.181156158447266, + "learning_rate": 1.3148880105401847e-05, + "loss": 0.6039, + "step": 18200 + }, + { + "epoch": 3.43, + "grad_norm": 24.175315856933594, + "learning_rate": 1.3145115753811406e-05, + "loss": 0.6791, + "step": 18210 + }, + { + "epoch": 3.43, + "grad_norm": 11.990982055664062, + "learning_rate": 1.3141351402220968e-05, + "loss": 0.5122, + "step": 18220 + }, + { + "epoch": 3.43, + "grad_norm": 12.18565845489502, + "learning_rate": 1.313758705063053e-05, + "loss": 0.8138, + "step": 18230 + }, + { + "epoch": 3.43, + "grad_norm": 20.374452590942383, + "learning_rate": 1.313382269904009e-05, + "loss": 0.4286, + "step": 18240 + }, + { + "epoch": 3.43, + "grad_norm": 11.977636337280273, + "learning_rate": 1.3130058347449652e-05, + "loss": 0.6633, + "step": 18250 + }, + { + "epoch": 3.44, + "grad_norm": 5.555273056030273, + "learning_rate": 1.3126293995859214e-05, + "loss": 0.3768, + "step": 18260 + }, + { + "epoch": 3.44, + "grad_norm": 12.093500137329102, + "learning_rate": 1.3122529644268775e-05, + "loss": 0.5081, + "step": 18270 + }, + { + "epoch": 3.44, + "grad_norm": 12.523153305053711, + "learning_rate": 1.3118765292678337e-05, + "loss": 0.7281, + "step": 18280 + }, + { + "epoch": 3.44, + "grad_norm": 34.439266204833984, + "learning_rate": 1.3115000941087898e-05, + "loss": 0.7471, + "step": 18290 + }, + { + "epoch": 3.44, + "grad_norm": 28.79136848449707, + "learning_rate": 1.311123658949746e-05, + "loss": 0.6478, + "step": 18300 + }, + { + "epoch": 3.45, + "grad_norm": 5.4620561599731445, + "learning_rate": 1.3107472237907023e-05, + "loss": 0.8856, + "step": 18310 + }, + { + "epoch": 3.45, + "grad_norm": 14.058579444885254, + "learning_rate": 1.3103707886316584e-05, + "loss": 0.4122, + "step": 18320 + }, + { + "epoch": 3.45, + "grad_norm": 17.870243072509766, + "learning_rate": 1.3099943534726146e-05, + "loss": 0.4202, + "step": 18330 + }, + { + "epoch": 3.45, + "grad_norm": 32.40336990356445, + "learning_rate": 1.3096179183135707e-05, + "loss": 0.9052, + "step": 18340 + }, + { + "epoch": 3.45, + "grad_norm": 10.193222999572754, + "learning_rate": 1.3092414831545268e-05, + "loss": 0.5433, + "step": 18350 + }, + { + "epoch": 3.46, + "grad_norm": 7.227964401245117, + "learning_rate": 1.308865047995483e-05, + "loss": 0.6235, + "step": 18360 + }, + { + "epoch": 3.46, + "grad_norm": 17.942949295043945, + "learning_rate": 1.3084886128364391e-05, + "loss": 0.9279, + "step": 18370 + }, + { + "epoch": 3.46, + "grad_norm": 2.4786839485168457, + "learning_rate": 1.3081121776773951e-05, + "loss": 0.5688, + "step": 18380 + }, + { + "epoch": 3.46, + "grad_norm": 13.7864408493042, + "learning_rate": 1.3077357425183513e-05, + "loss": 0.6254, + "step": 18390 + }, + { + "epoch": 3.46, + "grad_norm": 11.290862083435059, + "learning_rate": 1.3073593073593074e-05, + "loss": 0.5499, + "step": 18400 + }, + { + "epoch": 3.47, + "grad_norm": 27.211687088012695, + "learning_rate": 1.3069828722002636e-05, + "loss": 0.7081, + "step": 18410 + }, + { + "epoch": 3.47, + "grad_norm": 31.036930084228516, + "learning_rate": 1.3066064370412197e-05, + "loss": 0.692, + "step": 18420 + }, + { + "epoch": 3.47, + "grad_norm": 9.828187942504883, + "learning_rate": 1.3062300018821758e-05, + "loss": 0.848, + "step": 18430 + }, + { + "epoch": 3.47, + "grad_norm": 22.125829696655273, + "learning_rate": 1.305853566723132e-05, + "loss": 0.7898, + "step": 18440 + }, + { + "epoch": 3.47, + "grad_norm": 15.01488971710205, + "learning_rate": 1.3054771315640881e-05, + "loss": 0.4647, + "step": 18450 + }, + { + "epoch": 3.47, + "grad_norm": 14.60596752166748, + "learning_rate": 1.3051006964050443e-05, + "loss": 0.7542, + "step": 18460 + }, + { + "epoch": 3.48, + "grad_norm": 23.092918395996094, + "learning_rate": 1.3047242612460004e-05, + "loss": 0.8983, + "step": 18470 + }, + { + "epoch": 3.48, + "grad_norm": 12.916792869567871, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.6377, + "step": 18480 + }, + { + "epoch": 3.48, + "grad_norm": 30.594213485717773, + "learning_rate": 1.3039713909279127e-05, + "loss": 0.9676, + "step": 18490 + }, + { + "epoch": 3.48, + "grad_norm": 18.934587478637695, + "learning_rate": 1.303594955768869e-05, + "loss": 0.8782, + "step": 18500 + }, + { + "epoch": 3.48, + "grad_norm": 2.794344663619995, + "learning_rate": 1.3032185206098252e-05, + "loss": 0.3121, + "step": 18510 + }, + { + "epoch": 3.49, + "grad_norm": 2.3938238620758057, + "learning_rate": 1.3028420854507813e-05, + "loss": 0.9087, + "step": 18520 + }, + { + "epoch": 3.49, + "grad_norm": 32.515846252441406, + "learning_rate": 1.3024656502917375e-05, + "loss": 0.6834, + "step": 18530 + }, + { + "epoch": 3.49, + "grad_norm": 4.718508720397949, + "learning_rate": 1.3020892151326936e-05, + "loss": 0.6034, + "step": 18540 + }, + { + "epoch": 3.49, + "grad_norm": 16.135196685791016, + "learning_rate": 1.3017127799736498e-05, + "loss": 0.7444, + "step": 18550 + }, + { + "epoch": 3.49, + "grad_norm": 15.95462417602539, + "learning_rate": 1.3013363448146057e-05, + "loss": 0.482, + "step": 18560 + }, + { + "epoch": 3.5, + "grad_norm": 15.838091850280762, + "learning_rate": 1.3009599096555619e-05, + "loss": 0.7012, + "step": 18570 + }, + { + "epoch": 3.5, + "grad_norm": 9.506869316101074, + "learning_rate": 1.300583474496518e-05, + "loss": 0.5384, + "step": 18580 + }, + { + "epoch": 3.5, + "grad_norm": 11.965920448303223, + "learning_rate": 1.3002070393374742e-05, + "loss": 0.5091, + "step": 18590 + }, + { + "epoch": 3.5, + "grad_norm": 21.009666442871094, + "learning_rate": 1.2998306041784303e-05, + "loss": 0.9806, + "step": 18600 + }, + { + "epoch": 3.5, + "grad_norm": 19.624370574951172, + "learning_rate": 1.2994541690193865e-05, + "loss": 0.7226, + "step": 18610 + }, + { + "epoch": 3.5, + "grad_norm": 18.199827194213867, + "learning_rate": 1.2990777338603426e-05, + "loss": 0.7008, + "step": 18620 + }, + { + "epoch": 3.51, + "grad_norm": 16.78691864013672, + "learning_rate": 1.2987012987012988e-05, + "loss": 0.8391, + "step": 18630 + }, + { + "epoch": 3.51, + "grad_norm": 11.214224815368652, + "learning_rate": 1.2983248635422549e-05, + "loss": 0.7674, + "step": 18640 + }, + { + "epoch": 3.51, + "grad_norm": 12.816788673400879, + "learning_rate": 1.297948428383211e-05, + "loss": 0.6571, + "step": 18650 + }, + { + "epoch": 3.51, + "grad_norm": 26.343637466430664, + "learning_rate": 1.2975719932241672e-05, + "loss": 0.7407, + "step": 18660 + }, + { + "epoch": 3.51, + "grad_norm": 10.422934532165527, + "learning_rate": 1.2971955580651234e-05, + "loss": 0.571, + "step": 18670 + }, + { + "epoch": 3.52, + "grad_norm": 4.7122015953063965, + "learning_rate": 1.2968191229060797e-05, + "loss": 0.4944, + "step": 18680 + }, + { + "epoch": 3.52, + "grad_norm": 10.654007911682129, + "learning_rate": 1.2964426877470358e-05, + "loss": 0.4871, + "step": 18690 + }, + { + "epoch": 3.52, + "grad_norm": 16.5805606842041, + "learning_rate": 1.296066252587992e-05, + "loss": 0.7417, + "step": 18700 + }, + { + "epoch": 3.52, + "grad_norm": 12.002159118652344, + "learning_rate": 1.2956898174289481e-05, + "loss": 0.686, + "step": 18710 + }, + { + "epoch": 3.52, + "grad_norm": 22.57682991027832, + "learning_rate": 1.2953133822699043e-05, + "loss": 0.9398, + "step": 18720 + }, + { + "epoch": 3.53, + "grad_norm": 8.608962059020996, + "learning_rate": 1.2949369471108602e-05, + "loss": 0.5677, + "step": 18730 + }, + { + "epoch": 3.53, + "grad_norm": 14.006396293640137, + "learning_rate": 1.2945605119518164e-05, + "loss": 0.635, + "step": 18740 + }, + { + "epoch": 3.53, + "grad_norm": 10.348053932189941, + "learning_rate": 1.2941840767927725e-05, + "loss": 0.7357, + "step": 18750 + }, + { + "epoch": 3.53, + "grad_norm": 18.81407928466797, + "learning_rate": 1.2938076416337287e-05, + "loss": 0.9314, + "step": 18760 + }, + { + "epoch": 3.53, + "grad_norm": 11.178011894226074, + "learning_rate": 1.2934312064746848e-05, + "loss": 0.454, + "step": 18770 + }, + { + "epoch": 3.53, + "grad_norm": 23.306005477905273, + "learning_rate": 1.293054771315641e-05, + "loss": 0.5233, + "step": 18780 + }, + { + "epoch": 3.54, + "grad_norm": 5.833727836608887, + "learning_rate": 1.2926783361565971e-05, + "loss": 0.5184, + "step": 18790 + }, + { + "epoch": 3.54, + "grad_norm": 11.050354957580566, + "learning_rate": 1.2923019009975533e-05, + "loss": 0.6627, + "step": 18800 + }, + { + "epoch": 3.54, + "grad_norm": 17.933313369750977, + "learning_rate": 1.2919254658385094e-05, + "loss": 0.6955, + "step": 18810 + }, + { + "epoch": 3.54, + "grad_norm": 27.6488037109375, + "learning_rate": 1.2915490306794655e-05, + "loss": 0.7085, + "step": 18820 + }, + { + "epoch": 3.54, + "grad_norm": 19.928560256958008, + "learning_rate": 1.2911725955204217e-05, + "loss": 0.4012, + "step": 18830 + }, + { + "epoch": 3.55, + "grad_norm": 12.084012985229492, + "learning_rate": 1.2907961603613778e-05, + "loss": 0.679, + "step": 18840 + }, + { + "epoch": 3.55, + "grad_norm": 36.998104095458984, + "learning_rate": 1.290419725202334e-05, + "loss": 0.8037, + "step": 18850 + }, + { + "epoch": 3.55, + "grad_norm": 15.250351905822754, + "learning_rate": 1.2900432900432901e-05, + "loss": 0.5382, + "step": 18860 + }, + { + "epoch": 3.55, + "grad_norm": 30.874340057373047, + "learning_rate": 1.2896668548842464e-05, + "loss": 0.4945, + "step": 18870 + }, + { + "epoch": 3.55, + "grad_norm": 22.66957664489746, + "learning_rate": 1.2892904197252026e-05, + "loss": 0.5362, + "step": 18880 + }, + { + "epoch": 3.56, + "grad_norm": 29.47343635559082, + "learning_rate": 1.2889139845661587e-05, + "loss": 0.6591, + "step": 18890 + }, + { + "epoch": 3.56, + "grad_norm": 11.1913423538208, + "learning_rate": 1.2885375494071149e-05, + "loss": 0.6665, + "step": 18900 + }, + { + "epoch": 3.56, + "grad_norm": 7.629205703735352, + "learning_rate": 1.2881611142480709e-05, + "loss": 0.879, + "step": 18910 + }, + { + "epoch": 3.56, + "grad_norm": 9.654862403869629, + "learning_rate": 1.287784679089027e-05, + "loss": 0.5893, + "step": 18920 + }, + { + "epoch": 3.56, + "grad_norm": 0.7786961793899536, + "learning_rate": 1.2874082439299832e-05, + "loss": 0.7301, + "step": 18930 + }, + { + "epoch": 3.56, + "grad_norm": 18.76505470275879, + "learning_rate": 1.2870318087709393e-05, + "loss": 0.8441, + "step": 18940 + }, + { + "epoch": 3.57, + "grad_norm": 34.90388870239258, + "learning_rate": 1.2866553736118954e-05, + "loss": 0.7965, + "step": 18950 + }, + { + "epoch": 3.57, + "grad_norm": 14.218279838562012, + "learning_rate": 1.2862789384528516e-05, + "loss": 0.7871, + "step": 18960 + }, + { + "epoch": 3.57, + "grad_norm": 12.61451244354248, + "learning_rate": 1.2859025032938077e-05, + "loss": 0.3458, + "step": 18970 + }, + { + "epoch": 3.57, + "grad_norm": 7.381354331970215, + "learning_rate": 1.2855260681347639e-05, + "loss": 0.3608, + "step": 18980 + }, + { + "epoch": 3.57, + "grad_norm": 5.548383712768555, + "learning_rate": 1.28514963297572e-05, + "loss": 0.3053, + "step": 18990 + }, + { + "epoch": 3.58, + "grad_norm": 24.012208938598633, + "learning_rate": 1.2847731978166762e-05, + "loss": 0.73, + "step": 19000 + }, + { + "epoch": 3.58, + "grad_norm": 8.774457931518555, + "learning_rate": 1.2843967626576323e-05, + "loss": 0.6298, + "step": 19010 + }, + { + "epoch": 3.58, + "grad_norm": 30.910158157348633, + "learning_rate": 1.2840203274985885e-05, + "loss": 0.7751, + "step": 19020 + }, + { + "epoch": 3.58, + "grad_norm": 34.78523635864258, + "learning_rate": 1.2836438923395446e-05, + "loss": 0.7953, + "step": 19030 + }, + { + "epoch": 3.58, + "grad_norm": 19.161802291870117, + "learning_rate": 1.2832674571805008e-05, + "loss": 0.6131, + "step": 19040 + }, + { + "epoch": 3.59, + "grad_norm": 0.4744579792022705, + "learning_rate": 1.282891022021457e-05, + "loss": 0.5262, + "step": 19050 + }, + { + "epoch": 3.59, + "grad_norm": 12.062616348266602, + "learning_rate": 1.2825145868624132e-05, + "loss": 0.8151, + "step": 19060 + }, + { + "epoch": 3.59, + "grad_norm": 11.607182502746582, + "learning_rate": 1.2821381517033694e-05, + "loss": 0.5112, + "step": 19070 + }, + { + "epoch": 3.59, + "grad_norm": 1.1499428749084473, + "learning_rate": 1.2817617165443252e-05, + "loss": 0.9765, + "step": 19080 + }, + { + "epoch": 3.59, + "grad_norm": 4.771254062652588, + "learning_rate": 1.2813852813852813e-05, + "loss": 0.5044, + "step": 19090 + }, + { + "epoch": 3.59, + "grad_norm": 11.462512016296387, + "learning_rate": 1.2810088462262376e-05, + "loss": 0.7876, + "step": 19100 + }, + { + "epoch": 3.6, + "grad_norm": 29.608722686767578, + "learning_rate": 1.2806324110671938e-05, + "loss": 0.9717, + "step": 19110 + }, + { + "epoch": 3.6, + "grad_norm": 18.42778968811035, + "learning_rate": 1.28025597590815e-05, + "loss": 0.4388, + "step": 19120 + }, + { + "epoch": 3.6, + "grad_norm": 8.988313674926758, + "learning_rate": 1.279879540749106e-05, + "loss": 0.5285, + "step": 19130 + }, + { + "epoch": 3.6, + "grad_norm": 17.52735710144043, + "learning_rate": 1.2795031055900622e-05, + "loss": 0.5965, + "step": 19140 + }, + { + "epoch": 3.6, + "grad_norm": 24.257164001464844, + "learning_rate": 1.2791266704310184e-05, + "loss": 0.8865, + "step": 19150 + }, + { + "epoch": 3.61, + "grad_norm": 5.7428460121154785, + "learning_rate": 1.2787502352719745e-05, + "loss": 0.5267, + "step": 19160 + }, + { + "epoch": 3.61, + "grad_norm": 8.993688583374023, + "learning_rate": 1.2783738001129307e-05, + "loss": 0.8054, + "step": 19170 + }, + { + "epoch": 3.61, + "grad_norm": 6.0230631828308105, + "learning_rate": 1.2779973649538868e-05, + "loss": 0.4008, + "step": 19180 + }, + { + "epoch": 3.61, + "grad_norm": 7.1093220710754395, + "learning_rate": 1.277620929794843e-05, + "loss": 0.5139, + "step": 19190 + }, + { + "epoch": 3.61, + "grad_norm": 6.246382713317871, + "learning_rate": 1.2772444946357991e-05, + "loss": 0.7167, + "step": 19200 + }, + { + "epoch": 3.62, + "grad_norm": 9.685704231262207, + "learning_rate": 1.2768680594767552e-05, + "loss": 0.6331, + "step": 19210 + }, + { + "epoch": 3.62, + "grad_norm": 3.0230801105499268, + "learning_rate": 1.2764916243177114e-05, + "loss": 0.6397, + "step": 19220 + }, + { + "epoch": 3.62, + "grad_norm": 17.838150024414062, + "learning_rate": 1.2761151891586675e-05, + "loss": 0.5742, + "step": 19230 + }, + { + "epoch": 3.62, + "grad_norm": 21.397781372070312, + "learning_rate": 1.2757387539996238e-05, + "loss": 0.8132, + "step": 19240 + }, + { + "epoch": 3.62, + "grad_norm": 8.271471977233887, + "learning_rate": 1.2753623188405797e-05, + "loss": 0.6171, + "step": 19250 + }, + { + "epoch": 3.63, + "grad_norm": 2.5334246158599854, + "learning_rate": 1.2749858836815358e-05, + "loss": 0.8327, + "step": 19260 + }, + { + "epoch": 3.63, + "grad_norm": 4.004415988922119, + "learning_rate": 1.274609448522492e-05, + "loss": 0.5323, + "step": 19270 + }, + { + "epoch": 3.63, + "grad_norm": 22.41325569152832, + "learning_rate": 1.2742330133634483e-05, + "loss": 0.5696, + "step": 19280 + }, + { + "epoch": 3.63, + "grad_norm": 10.288199424743652, + "learning_rate": 1.2738565782044044e-05, + "loss": 0.6255, + "step": 19290 + }, + { + "epoch": 3.63, + "grad_norm": 7.55498743057251, + "learning_rate": 1.2734801430453606e-05, + "loss": 0.6258, + "step": 19300 + }, + { + "epoch": 3.63, + "grad_norm": 5.852587699890137, + "learning_rate": 1.2731037078863167e-05, + "loss": 0.6389, + "step": 19310 + }, + { + "epoch": 3.64, + "grad_norm": 27.851564407348633, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.8596, + "step": 19320 + }, + { + "epoch": 3.64, + "grad_norm": 9.760847091674805, + "learning_rate": 1.272350837568229e-05, + "loss": 0.5892, + "step": 19330 + }, + { + "epoch": 3.64, + "grad_norm": 14.357726097106934, + "learning_rate": 1.2719744024091851e-05, + "loss": 0.4372, + "step": 19340 + }, + { + "epoch": 3.64, + "grad_norm": 2.770073175430298, + "learning_rate": 1.2715979672501413e-05, + "loss": 0.6819, + "step": 19350 + }, + { + "epoch": 3.64, + "grad_norm": 32.54544448852539, + "learning_rate": 1.2712215320910974e-05, + "loss": 0.8689, + "step": 19360 + }, + { + "epoch": 3.65, + "grad_norm": 28.392282485961914, + "learning_rate": 1.2708450969320536e-05, + "loss": 0.6886, + "step": 19370 + }, + { + "epoch": 3.65, + "grad_norm": 29.35589599609375, + "learning_rate": 1.2704686617730097e-05, + "loss": 0.5073, + "step": 19380 + }, + { + "epoch": 3.65, + "grad_norm": 4.598623752593994, + "learning_rate": 1.2700922266139659e-05, + "loss": 0.3873, + "step": 19390 + }, + { + "epoch": 3.65, + "grad_norm": 31.010652542114258, + "learning_rate": 1.269715791454922e-05, + "loss": 0.6336, + "step": 19400 + }, + { + "epoch": 3.65, + "grad_norm": 17.593690872192383, + "learning_rate": 1.2693393562958782e-05, + "loss": 0.7767, + "step": 19410 + }, + { + "epoch": 3.66, + "grad_norm": 6.044388771057129, + "learning_rate": 1.2689629211368343e-05, + "loss": 0.683, + "step": 19420 + }, + { + "epoch": 3.66, + "grad_norm": 9.18665599822998, + "learning_rate": 1.2685864859777903e-05, + "loss": 0.9063, + "step": 19430 + }, + { + "epoch": 3.66, + "grad_norm": 12.083257675170898, + "learning_rate": 1.2682100508187464e-05, + "loss": 1.0481, + "step": 19440 + }, + { + "epoch": 3.66, + "grad_norm": 18.31049919128418, + "learning_rate": 1.2678336156597026e-05, + "loss": 0.8629, + "step": 19450 + }, + { + "epoch": 3.66, + "grad_norm": 5.21190881729126, + "learning_rate": 1.2674571805006587e-05, + "loss": 0.5297, + "step": 19460 + }, + { + "epoch": 3.66, + "grad_norm": 14.538198471069336, + "learning_rate": 1.267080745341615e-05, + "loss": 0.6776, + "step": 19470 + }, + { + "epoch": 3.67, + "grad_norm": 3.3181121349334717, + "learning_rate": 1.2667043101825712e-05, + "loss": 0.6773, + "step": 19480 + }, + { + "epoch": 3.67, + "grad_norm": 1.1416178941726685, + "learning_rate": 1.2663278750235273e-05, + "loss": 0.4913, + "step": 19490 + }, + { + "epoch": 3.67, + "grad_norm": 4.539183139801025, + "learning_rate": 1.2659514398644835e-05, + "loss": 0.8529, + "step": 19500 + }, + { + "epoch": 3.67, + "grad_norm": 11.008049011230469, + "learning_rate": 1.2655750047054396e-05, + "loss": 0.5791, + "step": 19510 + }, + { + "epoch": 3.67, + "grad_norm": 26.218887329101562, + "learning_rate": 1.2651985695463958e-05, + "loss": 0.6222, + "step": 19520 + }, + { + "epoch": 3.68, + "grad_norm": 9.582514762878418, + "learning_rate": 1.2648221343873519e-05, + "loss": 0.5567, + "step": 19530 + }, + { + "epoch": 3.68, + "grad_norm": 33.36174011230469, + "learning_rate": 1.264445699228308e-05, + "loss": 0.5457, + "step": 19540 + }, + { + "epoch": 3.68, + "grad_norm": 5.197388648986816, + "learning_rate": 1.2640692640692642e-05, + "loss": 1.0242, + "step": 19550 + }, + { + "epoch": 3.68, + "grad_norm": 29.88850212097168, + "learning_rate": 1.2636928289102204e-05, + "loss": 0.6045, + "step": 19560 + }, + { + "epoch": 3.68, + "grad_norm": 14.333205223083496, + "learning_rate": 1.2633163937511765e-05, + "loss": 0.8825, + "step": 19570 + }, + { + "epoch": 3.69, + "grad_norm": 16.74696159362793, + "learning_rate": 1.2629399585921326e-05, + "loss": 0.5199, + "step": 19580 + }, + { + "epoch": 3.69, + "grad_norm": 10.93972110748291, + "learning_rate": 1.2625635234330888e-05, + "loss": 0.5204, + "step": 19590 + }, + { + "epoch": 3.69, + "grad_norm": 26.37276268005371, + "learning_rate": 1.2621870882740448e-05, + "loss": 0.6918, + "step": 19600 + }, + { + "epoch": 3.69, + "grad_norm": 30.152820587158203, + "learning_rate": 1.2618106531150009e-05, + "loss": 0.6616, + "step": 19610 + }, + { + "epoch": 3.69, + "grad_norm": 4.047630310058594, + "learning_rate": 1.261434217955957e-05, + "loss": 0.3501, + "step": 19620 + }, + { + "epoch": 3.69, + "grad_norm": 23.427507400512695, + "learning_rate": 1.2610577827969132e-05, + "loss": 0.7386, + "step": 19630 + }, + { + "epoch": 3.7, + "grad_norm": 17.632530212402344, + "learning_rate": 1.2606813476378694e-05, + "loss": 0.3864, + "step": 19640 + }, + { + "epoch": 3.7, + "grad_norm": 13.861688613891602, + "learning_rate": 1.2603049124788255e-05, + "loss": 0.4736, + "step": 19650 + }, + { + "epoch": 3.7, + "grad_norm": 0.12637591361999512, + "learning_rate": 1.2599284773197818e-05, + "loss": 0.459, + "step": 19660 + }, + { + "epoch": 3.7, + "grad_norm": 12.556920051574707, + "learning_rate": 1.259552042160738e-05, + "loss": 0.5324, + "step": 19670 + }, + { + "epoch": 3.7, + "grad_norm": 15.041160583496094, + "learning_rate": 1.2591756070016941e-05, + "loss": 0.7857, + "step": 19680 + }, + { + "epoch": 3.71, + "grad_norm": 8.056427955627441, + "learning_rate": 1.2587991718426503e-05, + "loss": 0.5911, + "step": 19690 + }, + { + "epoch": 3.71, + "grad_norm": 6.120177268981934, + "learning_rate": 1.2584227366836064e-05, + "loss": 0.4515, + "step": 19700 + }, + { + "epoch": 3.71, + "grad_norm": 5.58326530456543, + "learning_rate": 1.2580463015245625e-05, + "loss": 0.8182, + "step": 19710 + }, + { + "epoch": 3.71, + "grad_norm": 13.465167045593262, + "learning_rate": 1.2576698663655187e-05, + "loss": 0.48, + "step": 19720 + }, + { + "epoch": 3.71, + "grad_norm": 2.778507709503174, + "learning_rate": 1.2572934312064748e-05, + "loss": 0.5068, + "step": 19730 + }, + { + "epoch": 3.72, + "grad_norm": 7.899831771850586, + "learning_rate": 1.256916996047431e-05, + "loss": 0.5613, + "step": 19740 + }, + { + "epoch": 3.72, + "grad_norm": 8.313310623168945, + "learning_rate": 1.2565405608883871e-05, + "loss": 0.583, + "step": 19750 + }, + { + "epoch": 3.72, + "grad_norm": 9.49411678314209, + "learning_rate": 1.2561641257293433e-05, + "loss": 1.0025, + "step": 19760 + }, + { + "epoch": 3.72, + "grad_norm": 24.08152198791504, + "learning_rate": 1.2557876905702994e-05, + "loss": 0.7991, + "step": 19770 + }, + { + "epoch": 3.72, + "grad_norm": 6.793327808380127, + "learning_rate": 1.2554112554112554e-05, + "loss": 0.8257, + "step": 19780 + }, + { + "epoch": 3.72, + "grad_norm": 17.370943069458008, + "learning_rate": 1.2550348202522115e-05, + "loss": 0.8301, + "step": 19790 + }, + { + "epoch": 3.73, + "grad_norm": 9.847330093383789, + "learning_rate": 1.2546583850931677e-05, + "loss": 0.6801, + "step": 19800 + }, + { + "epoch": 3.73, + "grad_norm": 7.884547710418701, + "learning_rate": 1.2542819499341238e-05, + "loss": 0.4344, + "step": 19810 + }, + { + "epoch": 3.73, + "grad_norm": 0.6122704744338989, + "learning_rate": 1.25390551477508e-05, + "loss": 0.649, + "step": 19820 + }, + { + "epoch": 3.73, + "grad_norm": 12.32701301574707, + "learning_rate": 1.2535290796160361e-05, + "loss": 0.4236, + "step": 19830 + }, + { + "epoch": 3.73, + "grad_norm": 11.68048095703125, + "learning_rate": 1.2531526444569924e-05, + "loss": 0.6277, + "step": 19840 + }, + { + "epoch": 3.74, + "grad_norm": 27.56481170654297, + "learning_rate": 1.2527762092979486e-05, + "loss": 0.6161, + "step": 19850 + }, + { + "epoch": 3.74, + "grad_norm": 10.342185020446777, + "learning_rate": 1.2523997741389047e-05, + "loss": 0.7714, + "step": 19860 + }, + { + "epoch": 3.74, + "grad_norm": 15.999653816223145, + "learning_rate": 1.2520233389798609e-05, + "loss": 0.4018, + "step": 19870 + }, + { + "epoch": 3.74, + "grad_norm": 14.883124351501465, + "learning_rate": 1.251646903820817e-05, + "loss": 0.9517, + "step": 19880 + }, + { + "epoch": 3.74, + "grad_norm": 4.749094009399414, + "learning_rate": 1.2512704686617732e-05, + "loss": 0.4089, + "step": 19890 + }, + { + "epoch": 3.75, + "grad_norm": 23.341344833374023, + "learning_rate": 1.2508940335027293e-05, + "loss": 0.5633, + "step": 19900 + }, + { + "epoch": 3.75, + "grad_norm": 15.799015045166016, + "learning_rate": 1.2505175983436855e-05, + "loss": 0.662, + "step": 19910 + }, + { + "epoch": 3.75, + "grad_norm": 25.26270294189453, + "learning_rate": 1.2501411631846416e-05, + "loss": 0.6803, + "step": 19920 + }, + { + "epoch": 3.75, + "grad_norm": 19.431108474731445, + "learning_rate": 1.2497647280255978e-05, + "loss": 0.6431, + "step": 19930 + }, + { + "epoch": 3.75, + "grad_norm": 6.340080261230469, + "learning_rate": 1.2493882928665539e-05, + "loss": 0.6813, + "step": 19940 + }, + { + "epoch": 3.75, + "grad_norm": 10.563408851623535, + "learning_rate": 1.2490118577075099e-05, + "loss": 0.5349, + "step": 19950 + }, + { + "epoch": 3.76, + "grad_norm": 18.89433479309082, + "learning_rate": 1.248635422548466e-05, + "loss": 0.4407, + "step": 19960 + }, + { + "epoch": 3.76, + "grad_norm": 5.784712791442871, + "learning_rate": 1.2482589873894222e-05, + "loss": 0.7272, + "step": 19970 + }, + { + "epoch": 3.76, + "grad_norm": 24.42642593383789, + "learning_rate": 1.2478825522303783e-05, + "loss": 0.6009, + "step": 19980 + }, + { + "epoch": 3.76, + "grad_norm": 21.03069496154785, + "learning_rate": 1.2475061170713345e-05, + "loss": 0.652, + "step": 19990 + }, + { + "epoch": 3.76, + "grad_norm": 0.11219408363103867, + "learning_rate": 1.2471296819122906e-05, + "loss": 0.5116, + "step": 20000 + }, + { + "epoch": 3.77, + "grad_norm": 10.447319030761719, + "learning_rate": 1.2467532467532468e-05, + "loss": 0.3867, + "step": 20010 + }, + { + "epoch": 3.77, + "grad_norm": 8.39681625366211, + "learning_rate": 1.2463768115942029e-05, + "loss": 0.7164, + "step": 20020 + }, + { + "epoch": 3.77, + "grad_norm": 13.545207977294922, + "learning_rate": 1.2460003764351592e-05, + "loss": 0.7469, + "step": 20030 + }, + { + "epoch": 3.77, + "grad_norm": 8.322916030883789, + "learning_rate": 1.2456239412761154e-05, + "loss": 0.6016, + "step": 20040 + }, + { + "epoch": 3.77, + "grad_norm": 10.224037170410156, + "learning_rate": 1.2452475061170715e-05, + "loss": 0.6915, + "step": 20050 + }, + { + "epoch": 3.78, + "grad_norm": 10.355185508728027, + "learning_rate": 1.2448710709580277e-05, + "loss": 0.5082, + "step": 20060 + }, + { + "epoch": 3.78, + "grad_norm": 11.303367614746094, + "learning_rate": 1.2444946357989838e-05, + "loss": 0.574, + "step": 20070 + }, + { + "epoch": 3.78, + "grad_norm": 15.811086654663086, + "learning_rate": 1.24411820063994e-05, + "loss": 0.8375, + "step": 20080 + }, + { + "epoch": 3.78, + "grad_norm": 19.942611694335938, + "learning_rate": 1.2437417654808961e-05, + "loss": 0.6268, + "step": 20090 + }, + { + "epoch": 3.78, + "grad_norm": 8.685022354125977, + "learning_rate": 1.2433653303218522e-05, + "loss": 0.5048, + "step": 20100 + }, + { + "epoch": 3.79, + "grad_norm": 30.918336868286133, + "learning_rate": 1.2429888951628084e-05, + "loss": 0.5821, + "step": 20110 + }, + { + "epoch": 3.79, + "grad_norm": 22.617633819580078, + "learning_rate": 1.2426124600037644e-05, + "loss": 0.6512, + "step": 20120 + }, + { + "epoch": 3.79, + "grad_norm": 7.3372483253479, + "learning_rate": 1.2422360248447205e-05, + "loss": 0.958, + "step": 20130 + }, + { + "epoch": 3.79, + "grad_norm": 9.846896171569824, + "learning_rate": 1.2418595896856767e-05, + "loss": 0.61, + "step": 20140 + }, + { + "epoch": 3.79, + "grad_norm": 27.524892807006836, + "learning_rate": 1.2414831545266328e-05, + "loss": 0.885, + "step": 20150 + }, + { + "epoch": 3.79, + "grad_norm": 5.012969017028809, + "learning_rate": 1.241106719367589e-05, + "loss": 0.5705, + "step": 20160 + }, + { + "epoch": 3.8, + "grad_norm": 14.11467456817627, + "learning_rate": 1.2407302842085451e-05, + "loss": 0.7462, + "step": 20170 + }, + { + "epoch": 3.8, + "grad_norm": 23.304222106933594, + "learning_rate": 1.2403538490495012e-05, + "loss": 0.8118, + "step": 20180 + }, + { + "epoch": 3.8, + "grad_norm": 5.021602630615234, + "learning_rate": 1.2399774138904574e-05, + "loss": 0.6483, + "step": 20190 + }, + { + "epoch": 3.8, + "grad_norm": 4.954014301300049, + "learning_rate": 1.2396009787314135e-05, + "loss": 0.5144, + "step": 20200 + }, + { + "epoch": 3.8, + "grad_norm": 15.003413200378418, + "learning_rate": 1.2392245435723698e-05, + "loss": 0.6549, + "step": 20210 + }, + { + "epoch": 3.81, + "grad_norm": 0.8703759908676147, + "learning_rate": 1.238848108413326e-05, + "loss": 0.477, + "step": 20220 + }, + { + "epoch": 3.81, + "grad_norm": 19.85989761352539, + "learning_rate": 1.2384716732542821e-05, + "loss": 0.6608, + "step": 20230 + }, + { + "epoch": 3.81, + "grad_norm": 10.10875129699707, + "learning_rate": 1.2380952380952383e-05, + "loss": 0.7041, + "step": 20240 + }, + { + "epoch": 3.81, + "grad_norm": 6.1762471199035645, + "learning_rate": 1.2377188029361944e-05, + "loss": 0.9546, + "step": 20250 + }, + { + "epoch": 3.81, + "grad_norm": 44.439823150634766, + "learning_rate": 1.2373423677771506e-05, + "loss": 1.0269, + "step": 20260 + }, + { + "epoch": 3.82, + "grad_norm": 17.293577194213867, + "learning_rate": 1.2369659326181067e-05, + "loss": 0.6694, + "step": 20270 + }, + { + "epoch": 3.82, + "grad_norm": 5.511439800262451, + "learning_rate": 1.2365894974590629e-05, + "loss": 0.502, + "step": 20280 + }, + { + "epoch": 3.82, + "grad_norm": 18.506452560424805, + "learning_rate": 1.236213062300019e-05, + "loss": 1.0257, + "step": 20290 + }, + { + "epoch": 3.82, + "grad_norm": 10.748970031738281, + "learning_rate": 1.235836627140975e-05, + "loss": 0.7194, + "step": 20300 + }, + { + "epoch": 3.82, + "grad_norm": 39.78062057495117, + "learning_rate": 1.2354601919819311e-05, + "loss": 0.5527, + "step": 20310 + }, + { + "epoch": 3.82, + "grad_norm": 16.632009506225586, + "learning_rate": 1.2350837568228873e-05, + "loss": 0.7186, + "step": 20320 + }, + { + "epoch": 3.83, + "grad_norm": 3.820341110229492, + "learning_rate": 1.2347073216638434e-05, + "loss": 0.6947, + "step": 20330 + }, + { + "epoch": 3.83, + "grad_norm": 1.6036685705184937, + "learning_rate": 1.2343308865047996e-05, + "loss": 0.4921, + "step": 20340 + }, + { + "epoch": 3.83, + "grad_norm": 1.2304280996322632, + "learning_rate": 1.2339544513457557e-05, + "loss": 0.6795, + "step": 20350 + }, + { + "epoch": 3.83, + "grad_norm": 18.970306396484375, + "learning_rate": 1.2335780161867119e-05, + "loss": 0.6016, + "step": 20360 + }, + { + "epoch": 3.83, + "grad_norm": 8.561296463012695, + "learning_rate": 1.233201581027668e-05, + "loss": 0.9288, + "step": 20370 + }, + { + "epoch": 3.84, + "grad_norm": 11.71761417388916, + "learning_rate": 1.2328251458686242e-05, + "loss": 0.7948, + "step": 20380 + }, + { + "epoch": 3.84, + "grad_norm": 38.39409637451172, + "learning_rate": 1.2324487107095803e-05, + "loss": 0.6868, + "step": 20390 + }, + { + "epoch": 3.84, + "grad_norm": 19.950315475463867, + "learning_rate": 1.2320722755505366e-05, + "loss": 0.4955, + "step": 20400 + }, + { + "epoch": 3.84, + "grad_norm": 6.911773204803467, + "learning_rate": 1.2316958403914928e-05, + "loss": 0.3525, + "step": 20410 + }, + { + "epoch": 3.84, + "grad_norm": 13.918087005615234, + "learning_rate": 1.2313194052324489e-05, + "loss": 0.3229, + "step": 20420 + }, + { + "epoch": 3.85, + "grad_norm": 14.926342964172363, + "learning_rate": 1.230942970073405e-05, + "loss": 0.6098, + "step": 20430 + }, + { + "epoch": 3.85, + "grad_norm": 16.889162063598633, + "learning_rate": 1.2305665349143612e-05, + "loss": 0.6025, + "step": 20440 + }, + { + "epoch": 3.85, + "grad_norm": 17.502153396606445, + "learning_rate": 1.2301900997553174e-05, + "loss": 0.4296, + "step": 20450 + }, + { + "epoch": 3.85, + "grad_norm": 17.864023208618164, + "learning_rate": 1.2298136645962735e-05, + "loss": 0.9841, + "step": 20460 + }, + { + "epoch": 3.85, + "grad_norm": 8.081547737121582, + "learning_rate": 1.2294372294372295e-05, + "loss": 0.5448, + "step": 20470 + }, + { + "epoch": 3.85, + "grad_norm": 10.248896598815918, + "learning_rate": 1.2290607942781856e-05, + "loss": 0.7463, + "step": 20480 + }, + { + "epoch": 3.86, + "grad_norm": 43.104915618896484, + "learning_rate": 1.2286843591191418e-05, + "loss": 0.868, + "step": 20490 + }, + { + "epoch": 3.86, + "grad_norm": 23.549795150756836, + "learning_rate": 1.2283079239600979e-05, + "loss": 0.551, + "step": 20500 + }, + { + "epoch": 3.86, + "grad_norm": 18.552968978881836, + "learning_rate": 1.227931488801054e-05, + "loss": 0.5959, + "step": 20510 + }, + { + "epoch": 3.86, + "grad_norm": 15.869778633117676, + "learning_rate": 1.2275550536420102e-05, + "loss": 0.7179, + "step": 20520 + }, + { + "epoch": 3.86, + "grad_norm": 14.710973739624023, + "learning_rate": 1.2271786184829664e-05, + "loss": 0.6882, + "step": 20530 + }, + { + "epoch": 3.87, + "grad_norm": 8.942631721496582, + "learning_rate": 1.2268021833239225e-05, + "loss": 0.8396, + "step": 20540 + }, + { + "epoch": 3.87, + "grad_norm": 6.654508113861084, + "learning_rate": 1.2264257481648786e-05, + "loss": 0.3994, + "step": 20550 + }, + { + "epoch": 3.87, + "grad_norm": 47.291954040527344, + "learning_rate": 1.2260493130058348e-05, + "loss": 0.7838, + "step": 20560 + }, + { + "epoch": 3.87, + "grad_norm": 13.667283058166504, + "learning_rate": 1.225672877846791e-05, + "loss": 0.4754, + "step": 20570 + }, + { + "epoch": 3.87, + "grad_norm": 41.05614471435547, + "learning_rate": 1.2252964426877473e-05, + "loss": 0.643, + "step": 20580 + }, + { + "epoch": 3.88, + "grad_norm": 16.537818908691406, + "learning_rate": 1.2249200075287034e-05, + "loss": 0.7026, + "step": 20590 + }, + { + "epoch": 3.88, + "grad_norm": 16.900470733642578, + "learning_rate": 1.2245435723696595e-05, + "loss": 0.4249, + "step": 20600 + }, + { + "epoch": 3.88, + "grad_norm": 11.43319034576416, + "learning_rate": 1.2241671372106157e-05, + "loss": 0.8047, + "step": 20610 + }, + { + "epoch": 3.88, + "grad_norm": 1.8628902435302734, + "learning_rate": 1.2237907020515718e-05, + "loss": 0.5949, + "step": 20620 + }, + { + "epoch": 3.88, + "grad_norm": 2.925947427749634, + "learning_rate": 1.223414266892528e-05, + "loss": 0.6901, + "step": 20630 + }, + { + "epoch": 3.88, + "grad_norm": 10.759583473205566, + "learning_rate": 1.2230378317334841e-05, + "loss": 0.4635, + "step": 20640 + }, + { + "epoch": 3.89, + "grad_norm": 13.309684753417969, + "learning_rate": 1.2226613965744401e-05, + "loss": 0.5326, + "step": 20650 + }, + { + "epoch": 3.89, + "grad_norm": 23.810779571533203, + "learning_rate": 1.2222849614153963e-05, + "loss": 0.603, + "step": 20660 + }, + { + "epoch": 3.89, + "grad_norm": 14.398448944091797, + "learning_rate": 1.2219085262563524e-05, + "loss": 0.9449, + "step": 20670 + }, + { + "epoch": 3.89, + "grad_norm": 10.191229820251465, + "learning_rate": 1.2215320910973085e-05, + "loss": 0.4746, + "step": 20680 + }, + { + "epoch": 3.89, + "grad_norm": 18.595684051513672, + "learning_rate": 1.2211556559382647e-05, + "loss": 1.0004, + "step": 20690 + }, + { + "epoch": 3.9, + "grad_norm": 32.1865119934082, + "learning_rate": 1.2207792207792208e-05, + "loss": 0.4889, + "step": 20700 + }, + { + "epoch": 3.9, + "grad_norm": 0.28670892119407654, + "learning_rate": 1.220402785620177e-05, + "loss": 0.6854, + "step": 20710 + }, + { + "epoch": 3.9, + "grad_norm": 5.30502986907959, + "learning_rate": 1.2200263504611331e-05, + "loss": 0.5467, + "step": 20720 + }, + { + "epoch": 3.9, + "grad_norm": 12.79511547088623, + "learning_rate": 1.2196499153020893e-05, + "loss": 0.4574, + "step": 20730 + }, + { + "epoch": 3.9, + "grad_norm": 17.650203704833984, + "learning_rate": 1.2192734801430454e-05, + "loss": 0.5639, + "step": 20740 + }, + { + "epoch": 3.91, + "grad_norm": 0.7680536508560181, + "learning_rate": 1.2188970449840016e-05, + "loss": 0.6494, + "step": 20750 + }, + { + "epoch": 3.91, + "grad_norm": 0.08082833886146545, + "learning_rate": 1.2185206098249577e-05, + "loss": 0.4382, + "step": 20760 + }, + { + "epoch": 3.91, + "grad_norm": 14.408023834228516, + "learning_rate": 1.218144174665914e-05, + "loss": 0.8581, + "step": 20770 + }, + { + "epoch": 3.91, + "grad_norm": 10.851282119750977, + "learning_rate": 1.2177677395068702e-05, + "loss": 0.7481, + "step": 20780 + }, + { + "epoch": 3.91, + "grad_norm": 29.99077796936035, + "learning_rate": 1.2173913043478263e-05, + "loss": 1.0, + "step": 20790 + }, + { + "epoch": 3.91, + "grad_norm": 0.19973182678222656, + "learning_rate": 1.2170148691887825e-05, + "loss": 0.6387, + "step": 20800 + }, + { + "epoch": 3.92, + "grad_norm": 7.756618499755859, + "learning_rate": 1.2166384340297386e-05, + "loss": 0.5351, + "step": 20810 + }, + { + "epoch": 3.92, + "grad_norm": 20.255647659301758, + "learning_rate": 1.2162619988706946e-05, + "loss": 0.7969, + "step": 20820 + }, + { + "epoch": 3.92, + "grad_norm": 0.8674776554107666, + "learning_rate": 1.2158855637116507e-05, + "loss": 0.7177, + "step": 20830 + }, + { + "epoch": 3.92, + "grad_norm": 8.670698165893555, + "learning_rate": 1.2155091285526069e-05, + "loss": 0.624, + "step": 20840 + }, + { + "epoch": 3.92, + "grad_norm": 0.19249022006988525, + "learning_rate": 1.215132693393563e-05, + "loss": 0.4953, + "step": 20850 + }, + { + "epoch": 3.93, + "grad_norm": 26.02955436706543, + "learning_rate": 1.2147562582345192e-05, + "loss": 0.5161, + "step": 20860 + }, + { + "epoch": 3.93, + "grad_norm": 29.887535095214844, + "learning_rate": 1.2143798230754753e-05, + "loss": 0.5798, + "step": 20870 + }, + { + "epoch": 3.93, + "grad_norm": 17.09107208251953, + "learning_rate": 1.2140033879164315e-05, + "loss": 0.5887, + "step": 20880 + }, + { + "epoch": 3.93, + "grad_norm": 21.360456466674805, + "learning_rate": 1.2136269527573876e-05, + "loss": 0.76, + "step": 20890 + }, + { + "epoch": 3.93, + "grad_norm": 8.325711250305176, + "learning_rate": 1.2132505175983438e-05, + "loss": 0.6259, + "step": 20900 + }, + { + "epoch": 3.94, + "grad_norm": 8.927617073059082, + "learning_rate": 1.2128740824392999e-05, + "loss": 0.5466, + "step": 20910 + }, + { + "epoch": 3.94, + "grad_norm": 9.248173713684082, + "learning_rate": 1.212497647280256e-05, + "loss": 0.4414, + "step": 20920 + }, + { + "epoch": 3.94, + "grad_norm": 9.122405052185059, + "learning_rate": 1.2121212121212122e-05, + "loss": 0.5872, + "step": 20930 + }, + { + "epoch": 3.94, + "grad_norm": 8.157663345336914, + "learning_rate": 1.2117447769621683e-05, + "loss": 0.577, + "step": 20940 + }, + { + "epoch": 3.94, + "grad_norm": 19.186445236206055, + "learning_rate": 1.2113683418031245e-05, + "loss": 0.7963, + "step": 20950 + }, + { + "epoch": 3.95, + "grad_norm": 8.585948944091797, + "learning_rate": 1.2109919066440808e-05, + "loss": 0.6799, + "step": 20960 + }, + { + "epoch": 3.95, + "grad_norm": 11.789979934692383, + "learning_rate": 1.210615471485037e-05, + "loss": 0.5179, + "step": 20970 + }, + { + "epoch": 3.95, + "grad_norm": 5.875610828399658, + "learning_rate": 1.2102390363259931e-05, + "loss": 0.437, + "step": 20980 + }, + { + "epoch": 3.95, + "grad_norm": 18.89061164855957, + "learning_rate": 1.2098626011669492e-05, + "loss": 0.577, + "step": 20990 + }, + { + "epoch": 3.95, + "grad_norm": 4.297598838806152, + "learning_rate": 1.2094861660079052e-05, + "loss": 0.7225, + "step": 21000 + }, + { + "epoch": 3.95, + "grad_norm": 0.36552512645721436, + "learning_rate": 1.2091097308488614e-05, + "loss": 0.2754, + "step": 21010 + }, + { + "epoch": 3.96, + "grad_norm": 13.013652801513672, + "learning_rate": 1.2087332956898175e-05, + "loss": 0.669, + "step": 21020 + }, + { + "epoch": 3.96, + "grad_norm": 6.945061683654785, + "learning_rate": 1.2083568605307737e-05, + "loss": 0.5892, + "step": 21030 + }, + { + "epoch": 3.96, + "grad_norm": 4.39546012878418, + "learning_rate": 1.2079804253717298e-05, + "loss": 0.6006, + "step": 21040 + }, + { + "epoch": 3.96, + "grad_norm": 24.449954986572266, + "learning_rate": 1.207603990212686e-05, + "loss": 0.503, + "step": 21050 + }, + { + "epoch": 3.96, + "grad_norm": 9.902664184570312, + "learning_rate": 1.2072275550536421e-05, + "loss": 0.6385, + "step": 21060 + }, + { + "epoch": 3.97, + "grad_norm": 44.686317443847656, + "learning_rate": 1.2068511198945982e-05, + "loss": 0.4779, + "step": 21070 + }, + { + "epoch": 3.97, + "grad_norm": 13.766404151916504, + "learning_rate": 1.2064746847355544e-05, + "loss": 0.6803, + "step": 21080 + }, + { + "epoch": 3.97, + "grad_norm": 41.97696304321289, + "learning_rate": 1.2060982495765105e-05, + "loss": 1.1873, + "step": 21090 + }, + { + "epoch": 3.97, + "grad_norm": 15.863570213317871, + "learning_rate": 1.2057218144174667e-05, + "loss": 0.5433, + "step": 21100 + }, + { + "epoch": 3.97, + "grad_norm": 10.411752700805664, + "learning_rate": 1.2053453792584228e-05, + "loss": 0.5845, + "step": 21110 + }, + { + "epoch": 3.98, + "grad_norm": 4.70853853225708, + "learning_rate": 1.204968944099379e-05, + "loss": 0.9748, + "step": 21120 + }, + { + "epoch": 3.98, + "grad_norm": 41.68354034423828, + "learning_rate": 1.2045925089403351e-05, + "loss": 0.7451, + "step": 21130 + }, + { + "epoch": 3.98, + "grad_norm": 12.279452323913574, + "learning_rate": 1.2042160737812914e-05, + "loss": 0.6239, + "step": 21140 + }, + { + "epoch": 3.98, + "grad_norm": 1.5319571495056152, + "learning_rate": 1.2038396386222476e-05, + "loss": 0.6755, + "step": 21150 + }, + { + "epoch": 3.98, + "grad_norm": 19.07404899597168, + "learning_rate": 1.2034632034632037e-05, + "loss": 0.9424, + "step": 21160 + }, + { + "epoch": 3.98, + "grad_norm": 23.491230010986328, + "learning_rate": 1.2030867683041595e-05, + "loss": 0.7338, + "step": 21170 + }, + { + "epoch": 3.99, + "grad_norm": 30.06789207458496, + "learning_rate": 1.2027103331451157e-05, + "loss": 0.705, + "step": 21180 + }, + { + "epoch": 3.99, + "grad_norm": 7.1863789558410645, + "learning_rate": 1.202333897986072e-05, + "loss": 0.964, + "step": 21190 + }, + { + "epoch": 3.99, + "grad_norm": 19.22167205810547, + "learning_rate": 1.2019574628270281e-05, + "loss": 0.5426, + "step": 21200 + }, + { + "epoch": 3.99, + "grad_norm": 18.454984664916992, + "learning_rate": 1.2015810276679843e-05, + "loss": 0.5774, + "step": 21210 + }, + { + "epoch": 3.99, + "grad_norm": 12.360136985778809, + "learning_rate": 1.2012045925089404e-05, + "loss": 0.9418, + "step": 21220 + }, + { + "epoch": 4.0, + "grad_norm": 11.939126968383789, + "learning_rate": 1.2008281573498966e-05, + "loss": 0.7091, + "step": 21230 + }, + { + "epoch": 4.0, + "grad_norm": 32.17779541015625, + "learning_rate": 1.2004517221908527e-05, + "loss": 0.7587, + "step": 21240 + }, + { + "epoch": 4.0, + "grad_norm": 7.594665050506592, + "learning_rate": 1.2000752870318089e-05, + "loss": 0.4173, + "step": 21250 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.9169333333333334, + "eval_loss": 0.3204882740974426, + "eval_runtime": 51.2733, + "eval_samples_per_second": 146.275, + "eval_steps_per_second": 18.294, + "step": 21252 + }, + { + "epoch": 4.0, + "grad_norm": 20.44147491455078, + "learning_rate": 1.199698851872765e-05, + "loss": 0.7655, + "step": 21260 + }, + { + "epoch": 4.0, + "grad_norm": 5.429800987243652, + "learning_rate": 1.1993224167137212e-05, + "loss": 0.507, + "step": 21270 + }, + { + "epoch": 4.01, + "grad_norm": 2.7339699268341064, + "learning_rate": 1.1989459815546773e-05, + "loss": 0.5326, + "step": 21280 + }, + { + "epoch": 4.01, + "grad_norm": 10.058930397033691, + "learning_rate": 1.1985695463956335e-05, + "loss": 0.923, + "step": 21290 + }, + { + "epoch": 4.01, + "grad_norm": 17.565692901611328, + "learning_rate": 1.1981931112365896e-05, + "loss": 0.5113, + "step": 21300 + }, + { + "epoch": 4.01, + "grad_norm": 6.418165683746338, + "learning_rate": 1.1978166760775457e-05, + "loss": 0.7591, + "step": 21310 + }, + { + "epoch": 4.01, + "grad_norm": 8.731622695922852, + "learning_rate": 1.1974402409185019e-05, + "loss": 0.4488, + "step": 21320 + }, + { + "epoch": 4.01, + "grad_norm": 6.82095193862915, + "learning_rate": 1.1970638057594582e-05, + "loss": 0.5737, + "step": 21330 + }, + { + "epoch": 4.02, + "grad_norm": 19.75332260131836, + "learning_rate": 1.196687370600414e-05, + "loss": 0.5355, + "step": 21340 + }, + { + "epoch": 4.02, + "grad_norm": 8.986883163452148, + "learning_rate": 1.1963109354413702e-05, + "loss": 0.749, + "step": 21350 + }, + { + "epoch": 4.02, + "grad_norm": 5.045528411865234, + "learning_rate": 1.1959345002823263e-05, + "loss": 0.7225, + "step": 21360 + }, + { + "epoch": 4.02, + "grad_norm": 24.6765079498291, + "learning_rate": 1.1955580651232826e-05, + "loss": 0.5938, + "step": 21370 + }, + { + "epoch": 4.02, + "grad_norm": 14.547659873962402, + "learning_rate": 1.1951816299642388e-05, + "loss": 0.554, + "step": 21380 + }, + { + "epoch": 4.03, + "grad_norm": 21.172176361083984, + "learning_rate": 1.1948051948051949e-05, + "loss": 0.3325, + "step": 21390 + }, + { + "epoch": 4.03, + "grad_norm": 35.6104736328125, + "learning_rate": 1.194428759646151e-05, + "loss": 0.8367, + "step": 21400 + }, + { + "epoch": 4.03, + "grad_norm": 4.693765163421631, + "learning_rate": 1.1940523244871072e-05, + "loss": 0.6465, + "step": 21410 + }, + { + "epoch": 4.03, + "grad_norm": 12.738019943237305, + "learning_rate": 1.1936758893280634e-05, + "loss": 0.5504, + "step": 21420 + }, + { + "epoch": 4.03, + "grad_norm": 26.901885986328125, + "learning_rate": 1.1932994541690195e-05, + "loss": 0.4753, + "step": 21430 + }, + { + "epoch": 4.04, + "grad_norm": 21.4942569732666, + "learning_rate": 1.1929230190099756e-05, + "loss": 0.8649, + "step": 21440 + }, + { + "epoch": 4.04, + "grad_norm": 14.137928009033203, + "learning_rate": 1.1925465838509318e-05, + "loss": 0.5552, + "step": 21450 + }, + { + "epoch": 4.04, + "grad_norm": 18.820674896240234, + "learning_rate": 1.192170148691888e-05, + "loss": 0.577, + "step": 21460 + }, + { + "epoch": 4.04, + "grad_norm": 3.413235664367676, + "learning_rate": 1.191793713532844e-05, + "loss": 0.5026, + "step": 21470 + }, + { + "epoch": 4.04, + "grad_norm": 1.3790395259857178, + "learning_rate": 1.1914172783738002e-05, + "loss": 0.4477, + "step": 21480 + }, + { + "epoch": 4.04, + "grad_norm": 16.56000328063965, + "learning_rate": 1.1910408432147564e-05, + "loss": 0.6547, + "step": 21490 + }, + { + "epoch": 4.05, + "grad_norm": 6.222777843475342, + "learning_rate": 1.1906644080557125e-05, + "loss": 0.6857, + "step": 21500 + }, + { + "epoch": 4.05, + "grad_norm": 9.765985488891602, + "learning_rate": 1.1902879728966688e-05, + "loss": 0.6043, + "step": 21510 + }, + { + "epoch": 4.05, + "grad_norm": 21.691728591918945, + "learning_rate": 1.1899115377376246e-05, + "loss": 0.3465, + "step": 21520 + }, + { + "epoch": 4.05, + "grad_norm": 0.24102507531642914, + "learning_rate": 1.1895351025785808e-05, + "loss": 0.4715, + "step": 21530 + }, + { + "epoch": 4.05, + "grad_norm": 6.748645305633545, + "learning_rate": 1.189158667419537e-05, + "loss": 0.4595, + "step": 21540 + }, + { + "epoch": 4.06, + "grad_norm": 6.551207065582275, + "learning_rate": 1.188782232260493e-05, + "loss": 0.8763, + "step": 21550 + }, + { + "epoch": 4.06, + "grad_norm": 9.870513916015625, + "learning_rate": 1.1884057971014494e-05, + "loss": 0.4955, + "step": 21560 + }, + { + "epoch": 4.06, + "grad_norm": 15.267050743103027, + "learning_rate": 1.1880293619424055e-05, + "loss": 0.5343, + "step": 21570 + }, + { + "epoch": 4.06, + "grad_norm": 16.510562896728516, + "learning_rate": 1.1876529267833617e-05, + "loss": 0.7292, + "step": 21580 + }, + { + "epoch": 4.06, + "grad_norm": 5.880744457244873, + "learning_rate": 1.1872764916243178e-05, + "loss": 0.8015, + "step": 21590 + }, + { + "epoch": 4.07, + "grad_norm": 12.049949645996094, + "learning_rate": 1.186900056465274e-05, + "loss": 0.5739, + "step": 21600 + }, + { + "epoch": 4.07, + "grad_norm": 13.833176612854004, + "learning_rate": 1.1865236213062301e-05, + "loss": 0.9261, + "step": 21610 + }, + { + "epoch": 4.07, + "grad_norm": 21.508180618286133, + "learning_rate": 1.1861471861471863e-05, + "loss": 0.6697, + "step": 21620 + }, + { + "epoch": 4.07, + "grad_norm": 0.2058361917734146, + "learning_rate": 1.1857707509881424e-05, + "loss": 0.7873, + "step": 21630 + }, + { + "epoch": 4.07, + "grad_norm": 31.943452835083008, + "learning_rate": 1.1853943158290986e-05, + "loss": 0.5298, + "step": 21640 + }, + { + "epoch": 4.07, + "grad_norm": 11.333353042602539, + "learning_rate": 1.1850178806700547e-05, + "loss": 0.3821, + "step": 21650 + }, + { + "epoch": 4.08, + "grad_norm": 8.284579277038574, + "learning_rate": 1.1846414455110109e-05, + "loss": 0.4522, + "step": 21660 + }, + { + "epoch": 4.08, + "grad_norm": 11.12885570526123, + "learning_rate": 1.184265010351967e-05, + "loss": 0.9707, + "step": 21670 + }, + { + "epoch": 4.08, + "grad_norm": 5.9140191078186035, + "learning_rate": 1.1838885751929231e-05, + "loss": 0.5, + "step": 21680 + }, + { + "epoch": 4.08, + "grad_norm": 6.177058696746826, + "learning_rate": 1.1835121400338791e-05, + "loss": 0.6024, + "step": 21690 + }, + { + "epoch": 4.08, + "grad_norm": 2.904001235961914, + "learning_rate": 1.1831357048748353e-05, + "loss": 0.8046, + "step": 21700 + }, + { + "epoch": 4.09, + "grad_norm": 16.1057186126709, + "learning_rate": 1.1827592697157914e-05, + "loss": 0.5458, + "step": 21710 + }, + { + "epoch": 4.09, + "grad_norm": 19.81020736694336, + "learning_rate": 1.1823828345567476e-05, + "loss": 0.3633, + "step": 21720 + }, + { + "epoch": 4.09, + "grad_norm": 16.992023468017578, + "learning_rate": 1.1820063993977037e-05, + "loss": 0.9944, + "step": 21730 + }, + { + "epoch": 4.09, + "grad_norm": 24.561140060424805, + "learning_rate": 1.18162996423866e-05, + "loss": 0.4752, + "step": 21740 + }, + { + "epoch": 4.09, + "grad_norm": 0.726672887802124, + "learning_rate": 1.1812535290796162e-05, + "loss": 0.4632, + "step": 21750 + }, + { + "epoch": 4.1, + "grad_norm": 0.690126359462738, + "learning_rate": 1.1808770939205723e-05, + "loss": 0.5868, + "step": 21760 + }, + { + "epoch": 4.1, + "grad_norm": 36.9278564453125, + "learning_rate": 1.1805006587615285e-05, + "loss": 0.772, + "step": 21770 + }, + { + "epoch": 4.1, + "grad_norm": 16.300630569458008, + "learning_rate": 1.1801242236024846e-05, + "loss": 0.4303, + "step": 21780 + }, + { + "epoch": 4.1, + "grad_norm": 0.3904343843460083, + "learning_rate": 1.1797477884434408e-05, + "loss": 0.5117, + "step": 21790 + }, + { + "epoch": 4.1, + "grad_norm": 26.473928451538086, + "learning_rate": 1.1793713532843969e-05, + "loss": 0.4777, + "step": 21800 + }, + { + "epoch": 4.11, + "grad_norm": 12.805113792419434, + "learning_rate": 1.178994918125353e-05, + "loss": 0.7928, + "step": 21810 + }, + { + "epoch": 4.11, + "grad_norm": 26.79267692565918, + "learning_rate": 1.1786184829663092e-05, + "loss": 0.6495, + "step": 21820 + }, + { + "epoch": 4.11, + "grad_norm": 18.316246032714844, + "learning_rate": 1.1782420478072653e-05, + "loss": 0.7989, + "step": 21830 + }, + { + "epoch": 4.11, + "grad_norm": 15.82396411895752, + "learning_rate": 1.1778656126482215e-05, + "loss": 0.6794, + "step": 21840 + }, + { + "epoch": 4.11, + "grad_norm": 15.773408889770508, + "learning_rate": 1.1774891774891776e-05, + "loss": 0.7149, + "step": 21850 + }, + { + "epoch": 4.11, + "grad_norm": 9.06588077545166, + "learning_rate": 1.1771127423301338e-05, + "loss": 0.7622, + "step": 21860 + }, + { + "epoch": 4.12, + "grad_norm": 0.6174959540367126, + "learning_rate": 1.1767363071710898e-05, + "loss": 0.7434, + "step": 21870 + }, + { + "epoch": 4.12, + "grad_norm": 4.278200149536133, + "learning_rate": 1.1763598720120459e-05, + "loss": 0.6226, + "step": 21880 + }, + { + "epoch": 4.12, + "grad_norm": 16.598983764648438, + "learning_rate": 1.175983436853002e-05, + "loss": 0.5363, + "step": 21890 + }, + { + "epoch": 4.12, + "grad_norm": 25.885684967041016, + "learning_rate": 1.1756070016939582e-05, + "loss": 0.5998, + "step": 21900 + }, + { + "epoch": 4.12, + "grad_norm": 7.993864059448242, + "learning_rate": 1.1752305665349143e-05, + "loss": 0.3801, + "step": 21910 + }, + { + "epoch": 4.13, + "grad_norm": 0.9169643521308899, + "learning_rate": 1.1748541313758705e-05, + "loss": 0.6745, + "step": 21920 + }, + { + "epoch": 4.13, + "grad_norm": 48.847991943359375, + "learning_rate": 1.1744776962168268e-05, + "loss": 0.5772, + "step": 21930 + }, + { + "epoch": 4.13, + "grad_norm": 34.63701629638672, + "learning_rate": 1.174101261057783e-05, + "loss": 0.6175, + "step": 21940 + }, + { + "epoch": 4.13, + "grad_norm": 55.32223129272461, + "learning_rate": 1.1737248258987391e-05, + "loss": 1.0425, + "step": 21950 + }, + { + "epoch": 4.13, + "grad_norm": 8.96568489074707, + "learning_rate": 1.1733483907396952e-05, + "loss": 0.5169, + "step": 21960 + }, + { + "epoch": 4.14, + "grad_norm": 13.289989471435547, + "learning_rate": 1.1729719555806514e-05, + "loss": 0.6216, + "step": 21970 + }, + { + "epoch": 4.14, + "grad_norm": 0.6537827253341675, + "learning_rate": 1.1725955204216075e-05, + "loss": 0.7422, + "step": 21980 + }, + { + "epoch": 4.14, + "grad_norm": 3.4758856296539307, + "learning_rate": 1.1722190852625637e-05, + "loss": 0.6289, + "step": 21990 + }, + { + "epoch": 4.14, + "grad_norm": 3.0647480487823486, + "learning_rate": 1.1718426501035198e-05, + "loss": 0.598, + "step": 22000 + }, + { + "epoch": 4.14, + "grad_norm": 9.152379989624023, + "learning_rate": 1.171466214944476e-05, + "loss": 0.4765, + "step": 22010 + }, + { + "epoch": 4.14, + "grad_norm": 8.172050476074219, + "learning_rate": 1.1710897797854321e-05, + "loss": 0.6093, + "step": 22020 + }, + { + "epoch": 4.15, + "grad_norm": 9.752901077270508, + "learning_rate": 1.1707133446263883e-05, + "loss": 0.6086, + "step": 22030 + }, + { + "epoch": 4.15, + "grad_norm": 12.649826049804688, + "learning_rate": 1.1703369094673442e-05, + "loss": 0.7863, + "step": 22040 + }, + { + "epoch": 4.15, + "grad_norm": 11.14371109008789, + "learning_rate": 1.1699604743083004e-05, + "loss": 0.6756, + "step": 22050 + }, + { + "epoch": 4.15, + "grad_norm": 0.7610763311386108, + "learning_rate": 1.1695840391492565e-05, + "loss": 0.6398, + "step": 22060 + }, + { + "epoch": 4.15, + "grad_norm": 1.436449408531189, + "learning_rate": 1.1692076039902127e-05, + "loss": 0.5272, + "step": 22070 + }, + { + "epoch": 4.16, + "grad_norm": 22.43182373046875, + "learning_rate": 1.1688311688311688e-05, + "loss": 0.5553, + "step": 22080 + }, + { + "epoch": 4.16, + "grad_norm": 0.680665910243988, + "learning_rate": 1.168454733672125e-05, + "loss": 0.3977, + "step": 22090 + }, + { + "epoch": 4.16, + "grad_norm": 12.118298530578613, + "learning_rate": 1.1680782985130811e-05, + "loss": 0.482, + "step": 22100 + }, + { + "epoch": 4.16, + "grad_norm": 19.042518615722656, + "learning_rate": 1.1677018633540373e-05, + "loss": 0.5045, + "step": 22110 + }, + { + "epoch": 4.16, + "grad_norm": 9.090872764587402, + "learning_rate": 1.1673254281949936e-05, + "loss": 0.4631, + "step": 22120 + }, + { + "epoch": 4.17, + "grad_norm": 2.18812894821167, + "learning_rate": 1.1669489930359497e-05, + "loss": 0.6592, + "step": 22130 + }, + { + "epoch": 4.17, + "grad_norm": 36.4178466796875, + "learning_rate": 1.1665725578769059e-05, + "loss": 0.6483, + "step": 22140 + }, + { + "epoch": 4.17, + "grad_norm": 28.83828353881836, + "learning_rate": 1.166196122717862e-05, + "loss": 0.4226, + "step": 22150 + }, + { + "epoch": 4.17, + "grad_norm": 31.52997398376465, + "learning_rate": 1.1658196875588182e-05, + "loss": 0.3637, + "step": 22160 + }, + { + "epoch": 4.17, + "grad_norm": 8.760178565979004, + "learning_rate": 1.1654432523997743e-05, + "loss": 0.7955, + "step": 22170 + }, + { + "epoch": 4.17, + "grad_norm": 0.6389203071594238, + "learning_rate": 1.1650668172407305e-05, + "loss": 0.4444, + "step": 22180 + }, + { + "epoch": 4.18, + "grad_norm": 29.18195915222168, + "learning_rate": 1.1646903820816866e-05, + "loss": 0.7053, + "step": 22190 + }, + { + "epoch": 4.18, + "grad_norm": 22.56359100341797, + "learning_rate": 1.1643139469226427e-05, + "loss": 0.4031, + "step": 22200 + }, + { + "epoch": 4.18, + "grad_norm": 23.801780700683594, + "learning_rate": 1.1639375117635989e-05, + "loss": 0.624, + "step": 22210 + }, + { + "epoch": 4.18, + "grad_norm": 13.610224723815918, + "learning_rate": 1.1635610766045549e-05, + "loss": 0.9822, + "step": 22220 + }, + { + "epoch": 4.18, + "grad_norm": 18.63134765625, + "learning_rate": 1.163184641445511e-05, + "loss": 0.6779, + "step": 22230 + }, + { + "epoch": 4.19, + "grad_norm": 16.268205642700195, + "learning_rate": 1.1628082062864672e-05, + "loss": 0.5979, + "step": 22240 + }, + { + "epoch": 4.19, + "grad_norm": 8.651780128479004, + "learning_rate": 1.1624317711274233e-05, + "loss": 0.7667, + "step": 22250 + }, + { + "epoch": 4.19, + "grad_norm": 42.6129035949707, + "learning_rate": 1.1620553359683795e-05, + "loss": 0.6944, + "step": 22260 + }, + { + "epoch": 4.19, + "grad_norm": 3.1881678104400635, + "learning_rate": 1.1616789008093356e-05, + "loss": 0.471, + "step": 22270 + }, + { + "epoch": 4.19, + "grad_norm": 0.7028786540031433, + "learning_rate": 1.1613024656502917e-05, + "loss": 0.5197, + "step": 22280 + }, + { + "epoch": 4.2, + "grad_norm": 47.69763946533203, + "learning_rate": 1.1609260304912479e-05, + "loss": 0.6065, + "step": 22290 + }, + { + "epoch": 4.2, + "grad_norm": 6.314722061157227, + "learning_rate": 1.1605495953322042e-05, + "loss": 0.5556, + "step": 22300 + }, + { + "epoch": 4.2, + "grad_norm": 8.165750503540039, + "learning_rate": 1.1601731601731604e-05, + "loss": 0.7189, + "step": 22310 + }, + { + "epoch": 4.2, + "grad_norm": 7.349857807159424, + "learning_rate": 1.1597967250141165e-05, + "loss": 0.5367, + "step": 22320 + }, + { + "epoch": 4.2, + "grad_norm": 10.950774192810059, + "learning_rate": 1.1594202898550726e-05, + "loss": 0.773, + "step": 22330 + }, + { + "epoch": 4.2, + "grad_norm": 16.20650291442871, + "learning_rate": 1.1590438546960288e-05, + "loss": 0.6576, + "step": 22340 + }, + { + "epoch": 4.21, + "grad_norm": 20.38640594482422, + "learning_rate": 1.158667419536985e-05, + "loss": 0.5126, + "step": 22350 + }, + { + "epoch": 4.21, + "grad_norm": 8.257250785827637, + "learning_rate": 1.158290984377941e-05, + "loss": 0.591, + "step": 22360 + }, + { + "epoch": 4.21, + "grad_norm": 5.747903823852539, + "learning_rate": 1.1579145492188972e-05, + "loss": 0.6314, + "step": 22370 + }, + { + "epoch": 4.21, + "grad_norm": 19.206281661987305, + "learning_rate": 1.1575381140598534e-05, + "loss": 0.6481, + "step": 22380 + }, + { + "epoch": 4.21, + "grad_norm": 22.839046478271484, + "learning_rate": 1.1571616789008094e-05, + "loss": 0.5929, + "step": 22390 + }, + { + "epoch": 4.22, + "grad_norm": 22.537134170532227, + "learning_rate": 1.1567852437417655e-05, + "loss": 0.4775, + "step": 22400 + }, + { + "epoch": 4.22, + "grad_norm": 0.13030314445495605, + "learning_rate": 1.1564088085827216e-05, + "loss": 0.4585, + "step": 22410 + }, + { + "epoch": 4.22, + "grad_norm": 11.866349220275879, + "learning_rate": 1.1560323734236778e-05, + "loss": 0.6309, + "step": 22420 + }, + { + "epoch": 4.22, + "grad_norm": 15.4808349609375, + "learning_rate": 1.155655938264634e-05, + "loss": 0.7321, + "step": 22430 + }, + { + "epoch": 4.22, + "grad_norm": 14.688508033752441, + "learning_rate": 1.15527950310559e-05, + "loss": 0.8127, + "step": 22440 + }, + { + "epoch": 4.23, + "grad_norm": 8.376343727111816, + "learning_rate": 1.1549030679465462e-05, + "loss": 0.6156, + "step": 22450 + }, + { + "epoch": 4.23, + "grad_norm": 22.739774703979492, + "learning_rate": 1.1545266327875024e-05, + "loss": 0.6053, + "step": 22460 + }, + { + "epoch": 4.23, + "grad_norm": 1.1516119241714478, + "learning_rate": 1.1541501976284585e-05, + "loss": 0.5797, + "step": 22470 + }, + { + "epoch": 4.23, + "grad_norm": 38.92045974731445, + "learning_rate": 1.1537737624694147e-05, + "loss": 0.7583, + "step": 22480 + }, + { + "epoch": 4.23, + "grad_norm": 11.236977577209473, + "learning_rate": 1.153397327310371e-05, + "loss": 0.7248, + "step": 22490 + }, + { + "epoch": 4.23, + "grad_norm": 1.5737559795379639, + "learning_rate": 1.1530208921513271e-05, + "loss": 0.7463, + "step": 22500 + }, + { + "epoch": 4.24, + "grad_norm": 1.3805655241012573, + "learning_rate": 1.1526444569922833e-05, + "loss": 0.4989, + "step": 22510 + }, + { + "epoch": 4.24, + "grad_norm": 115.92575073242188, + "learning_rate": 1.1522680218332394e-05, + "loss": 0.4902, + "step": 22520 + }, + { + "epoch": 4.24, + "grad_norm": 1.0640676021575928, + "learning_rate": 1.1518915866741956e-05, + "loss": 0.5065, + "step": 22530 + }, + { + "epoch": 4.24, + "grad_norm": 9.37736988067627, + "learning_rate": 1.1515151515151517e-05, + "loss": 0.493, + "step": 22540 + }, + { + "epoch": 4.24, + "grad_norm": 11.277339935302734, + "learning_rate": 1.1511387163561079e-05, + "loss": 0.5904, + "step": 22550 + }, + { + "epoch": 4.25, + "grad_norm": 33.85553741455078, + "learning_rate": 1.1507622811970638e-05, + "loss": 0.5691, + "step": 22560 + }, + { + "epoch": 4.25, + "grad_norm": 16.795196533203125, + "learning_rate": 1.15038584603802e-05, + "loss": 0.6498, + "step": 22570 + }, + { + "epoch": 4.25, + "grad_norm": 6.320994853973389, + "learning_rate": 1.1500094108789761e-05, + "loss": 0.5735, + "step": 22580 + }, + { + "epoch": 4.25, + "grad_norm": 0.7785674929618835, + "learning_rate": 1.1496329757199323e-05, + "loss": 0.5169, + "step": 22590 + }, + { + "epoch": 4.25, + "grad_norm": 1.8020360469818115, + "learning_rate": 1.1492565405608884e-05, + "loss": 0.3877, + "step": 22600 + }, + { + "epoch": 4.26, + "grad_norm": 28.67214584350586, + "learning_rate": 1.1488801054018446e-05, + "loss": 0.6102, + "step": 22610 + }, + { + "epoch": 4.26, + "grad_norm": 33.07846450805664, + "learning_rate": 1.1485036702428007e-05, + "loss": 0.6309, + "step": 22620 + }, + { + "epoch": 4.26, + "grad_norm": 41.755577087402344, + "learning_rate": 1.1481272350837569e-05, + "loss": 0.9355, + "step": 22630 + }, + { + "epoch": 4.26, + "grad_norm": 7.226475238800049, + "learning_rate": 1.147750799924713e-05, + "loss": 0.6734, + "step": 22640 + }, + { + "epoch": 4.26, + "grad_norm": 20.88840675354004, + "learning_rate": 1.1473743647656691e-05, + "loss": 0.4802, + "step": 22650 + }, + { + "epoch": 4.27, + "grad_norm": 18.926301956176758, + "learning_rate": 1.1469979296066253e-05, + "loss": 0.6868, + "step": 22660 + }, + { + "epoch": 4.27, + "grad_norm": 9.090205192565918, + "learning_rate": 1.1466214944475816e-05, + "loss": 0.7658, + "step": 22670 + }, + { + "epoch": 4.27, + "grad_norm": 55.62081527709961, + "learning_rate": 1.1462450592885378e-05, + "loss": 0.6087, + "step": 22680 + }, + { + "epoch": 4.27, + "grad_norm": 20.29065704345703, + "learning_rate": 1.1458686241294939e-05, + "loss": 0.4497, + "step": 22690 + }, + { + "epoch": 4.27, + "grad_norm": 6.561438083648682, + "learning_rate": 1.14549218897045e-05, + "loss": 0.5936, + "step": 22700 + }, + { + "epoch": 4.27, + "grad_norm": 12.46176815032959, + "learning_rate": 1.1451157538114062e-05, + "loss": 0.8636, + "step": 22710 + }, + { + "epoch": 4.28, + "grad_norm": 31.131755828857422, + "learning_rate": 1.1447393186523623e-05, + "loss": 0.7126, + "step": 22720 + }, + { + "epoch": 4.28, + "grad_norm": 20.14628028869629, + "learning_rate": 1.1443628834933185e-05, + "loss": 0.4201, + "step": 22730 + }, + { + "epoch": 4.28, + "grad_norm": 8.134221076965332, + "learning_rate": 1.1439864483342745e-05, + "loss": 0.595, + "step": 22740 + }, + { + "epoch": 4.28, + "grad_norm": 23.64768409729004, + "learning_rate": 1.1436100131752306e-05, + "loss": 0.8233, + "step": 22750 + }, + { + "epoch": 4.28, + "grad_norm": 3.0115549564361572, + "learning_rate": 1.1432335780161868e-05, + "loss": 0.3933, + "step": 22760 + }, + { + "epoch": 4.29, + "grad_norm": 0.10152573138475418, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.4964, + "step": 22770 + }, + { + "epoch": 4.29, + "grad_norm": 12.97737979888916, + "learning_rate": 1.142480707698099e-05, + "loss": 0.281, + "step": 22780 + }, + { + "epoch": 4.29, + "grad_norm": 10.339591979980469, + "learning_rate": 1.1421042725390552e-05, + "loss": 0.7338, + "step": 22790 + }, + { + "epoch": 4.29, + "grad_norm": 21.024656295776367, + "learning_rate": 1.1417278373800113e-05, + "loss": 0.4871, + "step": 22800 + }, + { + "epoch": 4.29, + "grad_norm": 0.6640139818191528, + "learning_rate": 1.1413514022209675e-05, + "loss": 0.5691, + "step": 22810 + }, + { + "epoch": 4.3, + "grad_norm": 19.123470306396484, + "learning_rate": 1.1409749670619236e-05, + "loss": 0.8221, + "step": 22820 + }, + { + "epoch": 4.3, + "grad_norm": 16.33519172668457, + "learning_rate": 1.1405985319028798e-05, + "loss": 0.6879, + "step": 22830 + }, + { + "epoch": 4.3, + "grad_norm": 19.19475746154785, + "learning_rate": 1.140222096743836e-05, + "loss": 0.5056, + "step": 22840 + }, + { + "epoch": 4.3, + "grad_norm": 4.6960883140563965, + "learning_rate": 1.139845661584792e-05, + "loss": 0.6242, + "step": 22850 + }, + { + "epoch": 4.3, + "grad_norm": 0.4895837604999542, + "learning_rate": 1.1394692264257484e-05, + "loss": 0.6206, + "step": 22860 + }, + { + "epoch": 4.3, + "grad_norm": 18.621294021606445, + "learning_rate": 1.1390927912667045e-05, + "loss": 0.6418, + "step": 22870 + }, + { + "epoch": 4.31, + "grad_norm": 12.699724197387695, + "learning_rate": 1.1387163561076607e-05, + "loss": 0.4555, + "step": 22880 + }, + { + "epoch": 4.31, + "grad_norm": 8.417510986328125, + "learning_rate": 1.1383399209486168e-05, + "loss": 0.5236, + "step": 22890 + }, + { + "epoch": 4.31, + "grad_norm": 9.543535232543945, + "learning_rate": 1.137963485789573e-05, + "loss": 0.5137, + "step": 22900 + }, + { + "epoch": 4.31, + "grad_norm": 21.985715866088867, + "learning_rate": 1.137587050630529e-05, + "loss": 0.4621, + "step": 22910 + }, + { + "epoch": 4.31, + "grad_norm": 2.261923313140869, + "learning_rate": 1.1372106154714851e-05, + "loss": 0.6878, + "step": 22920 + }, + { + "epoch": 4.32, + "grad_norm": 12.996893882751465, + "learning_rate": 1.1368341803124412e-05, + "loss": 0.7822, + "step": 22930 + }, + { + "epoch": 4.32, + "grad_norm": 1.7661455869674683, + "learning_rate": 1.1364577451533974e-05, + "loss": 0.627, + "step": 22940 + }, + { + "epoch": 4.32, + "grad_norm": 16.084827423095703, + "learning_rate": 1.1360813099943535e-05, + "loss": 0.4727, + "step": 22950 + }, + { + "epoch": 4.32, + "grad_norm": 10.136256217956543, + "learning_rate": 1.1357048748353097e-05, + "loss": 0.5857, + "step": 22960 + }, + { + "epoch": 4.32, + "grad_norm": 13.618847846984863, + "learning_rate": 1.1353284396762658e-05, + "loss": 0.5905, + "step": 22970 + }, + { + "epoch": 4.33, + "grad_norm": 1.0733072757720947, + "learning_rate": 1.134952004517222e-05, + "loss": 0.6132, + "step": 22980 + }, + { + "epoch": 4.33, + "grad_norm": 4.069360733032227, + "learning_rate": 1.1345755693581781e-05, + "loss": 0.3998, + "step": 22990 + }, + { + "epoch": 4.33, + "grad_norm": 27.945581436157227, + "learning_rate": 1.1341991341991343e-05, + "loss": 0.5658, + "step": 23000 + }, + { + "epoch": 4.33, + "grad_norm": 0.5599634647369385, + "learning_rate": 1.1338226990400904e-05, + "loss": 0.3289, + "step": 23010 + }, + { + "epoch": 4.33, + "grad_norm": 21.056047439575195, + "learning_rate": 1.1334462638810466e-05, + "loss": 0.4886, + "step": 23020 + }, + { + "epoch": 4.33, + "grad_norm": 19.576793670654297, + "learning_rate": 1.1330698287220027e-05, + "loss": 0.584, + "step": 23030 + }, + { + "epoch": 4.34, + "grad_norm": 8.226250648498535, + "learning_rate": 1.132693393562959e-05, + "loss": 0.7993, + "step": 23040 + }, + { + "epoch": 4.34, + "grad_norm": 7.203392028808594, + "learning_rate": 1.1323169584039152e-05, + "loss": 0.5483, + "step": 23050 + }, + { + "epoch": 4.34, + "grad_norm": 27.118764877319336, + "learning_rate": 1.1319405232448713e-05, + "loss": 0.6039, + "step": 23060 + }, + { + "epoch": 4.34, + "grad_norm": 8.184642791748047, + "learning_rate": 1.1315640880858275e-05, + "loss": 0.5206, + "step": 23070 + }, + { + "epoch": 4.34, + "grad_norm": 1.8194717168807983, + "learning_rate": 1.1311876529267836e-05, + "loss": 0.7129, + "step": 23080 + }, + { + "epoch": 4.35, + "grad_norm": 7.509031772613525, + "learning_rate": 1.1308112177677396e-05, + "loss": 0.7681, + "step": 23090 + }, + { + "epoch": 4.35, + "grad_norm": 19.41481590270996, + "learning_rate": 1.1304347826086957e-05, + "loss": 0.5878, + "step": 23100 + }, + { + "epoch": 4.35, + "grad_norm": 9.031009674072266, + "learning_rate": 1.1300583474496519e-05, + "loss": 0.4608, + "step": 23110 + }, + { + "epoch": 4.35, + "grad_norm": 3.5254902839660645, + "learning_rate": 1.129681912290608e-05, + "loss": 0.6231, + "step": 23120 + }, + { + "epoch": 4.35, + "grad_norm": 14.69057559967041, + "learning_rate": 1.1293054771315642e-05, + "loss": 0.7666, + "step": 23130 + }, + { + "epoch": 4.36, + "grad_norm": 26.575754165649414, + "learning_rate": 1.1289290419725203e-05, + "loss": 0.5632, + "step": 23140 + }, + { + "epoch": 4.36, + "grad_norm": 12.611248016357422, + "learning_rate": 1.1285526068134765e-05, + "loss": 0.6975, + "step": 23150 + }, + { + "epoch": 4.36, + "grad_norm": 19.788602828979492, + "learning_rate": 1.1281761716544326e-05, + "loss": 0.6446, + "step": 23160 + }, + { + "epoch": 4.36, + "grad_norm": 20.97464370727539, + "learning_rate": 1.1277997364953887e-05, + "loss": 0.7142, + "step": 23170 + }, + { + "epoch": 4.36, + "grad_norm": 11.11438274383545, + "learning_rate": 1.1274233013363449e-05, + "loss": 0.4329, + "step": 23180 + }, + { + "epoch": 4.36, + "grad_norm": 6.7174296379089355, + "learning_rate": 1.127046866177301e-05, + "loss": 0.642, + "step": 23190 + }, + { + "epoch": 4.37, + "grad_norm": 12.077559471130371, + "learning_rate": 1.1266704310182572e-05, + "loss": 0.6088, + "step": 23200 + }, + { + "epoch": 4.37, + "grad_norm": 0.6835032105445862, + "learning_rate": 1.1262939958592133e-05, + "loss": 0.7096, + "step": 23210 + }, + { + "epoch": 4.37, + "grad_norm": 13.644143104553223, + "learning_rate": 1.1259175607001695e-05, + "loss": 0.5963, + "step": 23220 + }, + { + "epoch": 4.37, + "grad_norm": 17.367053985595703, + "learning_rate": 1.1255411255411258e-05, + "loss": 0.5982, + "step": 23230 + }, + { + "epoch": 4.37, + "grad_norm": 19.097530364990234, + "learning_rate": 1.125164690382082e-05, + "loss": 0.7949, + "step": 23240 + }, + { + "epoch": 4.38, + "grad_norm": 11.06638240814209, + "learning_rate": 1.124788255223038e-05, + "loss": 0.5742, + "step": 23250 + }, + { + "epoch": 4.38, + "grad_norm": 6.701301097869873, + "learning_rate": 1.1244118200639939e-05, + "loss": 0.3653, + "step": 23260 + }, + { + "epoch": 4.38, + "grad_norm": 7.139585018157959, + "learning_rate": 1.1240353849049502e-05, + "loss": 0.2213, + "step": 23270 + }, + { + "epoch": 4.38, + "grad_norm": 15.305197715759277, + "learning_rate": 1.1236589497459064e-05, + "loss": 0.4505, + "step": 23280 + }, + { + "epoch": 4.38, + "grad_norm": 9.9895601272583, + "learning_rate": 1.1232825145868625e-05, + "loss": 0.7347, + "step": 23290 + }, + { + "epoch": 4.39, + "grad_norm": 30.742244720458984, + "learning_rate": 1.1229060794278186e-05, + "loss": 0.7636, + "step": 23300 + }, + { + "epoch": 4.39, + "grad_norm": 24.35445785522461, + "learning_rate": 1.1225296442687748e-05, + "loss": 0.7456, + "step": 23310 + }, + { + "epoch": 4.39, + "grad_norm": 24.246612548828125, + "learning_rate": 1.122153209109731e-05, + "loss": 0.6049, + "step": 23320 + }, + { + "epoch": 4.39, + "grad_norm": 33.06283950805664, + "learning_rate": 1.121776773950687e-05, + "loss": 0.5307, + "step": 23330 + }, + { + "epoch": 4.39, + "grad_norm": 25.850997924804688, + "learning_rate": 1.1214003387916432e-05, + "loss": 0.7956, + "step": 23340 + }, + { + "epoch": 4.39, + "grad_norm": 11.392745018005371, + "learning_rate": 1.1210239036325994e-05, + "loss": 0.6775, + "step": 23350 + }, + { + "epoch": 4.4, + "grad_norm": 10.8051118850708, + "learning_rate": 1.1206474684735555e-05, + "loss": 0.4612, + "step": 23360 + }, + { + "epoch": 4.4, + "grad_norm": 32.45104217529297, + "learning_rate": 1.1202710333145117e-05, + "loss": 0.6303, + "step": 23370 + }, + { + "epoch": 4.4, + "grad_norm": 5.765473365783691, + "learning_rate": 1.1198945981554678e-05, + "loss": 0.7854, + "step": 23380 + }, + { + "epoch": 4.4, + "grad_norm": 0.4003254771232605, + "learning_rate": 1.119518162996424e-05, + "loss": 0.5774, + "step": 23390 + }, + { + "epoch": 4.4, + "grad_norm": 8.617371559143066, + "learning_rate": 1.1191417278373801e-05, + "loss": 1.0461, + "step": 23400 + }, + { + "epoch": 4.41, + "grad_norm": 17.002267837524414, + "learning_rate": 1.1187652926783363e-05, + "loss": 0.7655, + "step": 23410 + }, + { + "epoch": 4.41, + "grad_norm": 20.015209197998047, + "learning_rate": 1.1183888575192926e-05, + "loss": 0.6856, + "step": 23420 + }, + { + "epoch": 4.41, + "grad_norm": 5.477625846862793, + "learning_rate": 1.1180124223602484e-05, + "loss": 0.7369, + "step": 23430 + }, + { + "epoch": 4.41, + "grad_norm": 9.53620719909668, + "learning_rate": 1.1176359872012045e-05, + "loss": 0.4399, + "step": 23440 + }, + { + "epoch": 4.41, + "grad_norm": 3.9531502723693848, + "learning_rate": 1.1172595520421607e-05, + "loss": 0.9006, + "step": 23450 + }, + { + "epoch": 4.42, + "grad_norm": 5.668755054473877, + "learning_rate": 1.116883116883117e-05, + "loss": 0.432, + "step": 23460 + }, + { + "epoch": 4.42, + "grad_norm": 36.14069366455078, + "learning_rate": 1.1165066817240731e-05, + "loss": 0.7544, + "step": 23470 + }, + { + "epoch": 4.42, + "grad_norm": 9.506393432617188, + "learning_rate": 1.1161302465650293e-05, + "loss": 0.7204, + "step": 23480 + }, + { + "epoch": 4.42, + "grad_norm": 22.080291748046875, + "learning_rate": 1.1157538114059854e-05, + "loss": 0.5264, + "step": 23490 + }, + { + "epoch": 4.42, + "grad_norm": 6.617203712463379, + "learning_rate": 1.1153773762469416e-05, + "loss": 0.5839, + "step": 23500 + }, + { + "epoch": 4.42, + "grad_norm": 13.228835105895996, + "learning_rate": 1.1150009410878977e-05, + "loss": 0.7058, + "step": 23510 + }, + { + "epoch": 4.43, + "grad_norm": 0.6176514625549316, + "learning_rate": 1.1146245059288539e-05, + "loss": 0.4557, + "step": 23520 + }, + { + "epoch": 4.43, + "grad_norm": 0.5672368407249451, + "learning_rate": 1.11424807076981e-05, + "loss": 0.4541, + "step": 23530 + }, + { + "epoch": 4.43, + "grad_norm": 2.1428656578063965, + "learning_rate": 1.1138716356107661e-05, + "loss": 0.7391, + "step": 23540 + }, + { + "epoch": 4.43, + "grad_norm": 10.963018417358398, + "learning_rate": 1.1134952004517223e-05, + "loss": 0.5223, + "step": 23550 + }, + { + "epoch": 4.43, + "grad_norm": 9.472241401672363, + "learning_rate": 1.1131187652926784e-05, + "loss": 0.5409, + "step": 23560 + }, + { + "epoch": 4.44, + "grad_norm": 18.68202781677246, + "learning_rate": 1.1127423301336346e-05, + "loss": 0.6596, + "step": 23570 + }, + { + "epoch": 4.44, + "grad_norm": 4.82119083404541, + "learning_rate": 1.1123658949745907e-05, + "loss": 0.6226, + "step": 23580 + }, + { + "epoch": 4.44, + "grad_norm": 5.042635917663574, + "learning_rate": 1.1119894598155469e-05, + "loss": 0.6738, + "step": 23590 + }, + { + "epoch": 4.44, + "grad_norm": 12.62334156036377, + "learning_rate": 1.1116130246565032e-05, + "loss": 0.7358, + "step": 23600 + }, + { + "epoch": 4.44, + "grad_norm": 15.830984115600586, + "learning_rate": 1.111236589497459e-05, + "loss": 0.7408, + "step": 23610 + }, + { + "epoch": 4.45, + "grad_norm": 9.996474266052246, + "learning_rate": 1.1108601543384151e-05, + "loss": 0.3432, + "step": 23620 + }, + { + "epoch": 4.45, + "grad_norm": 42.45836639404297, + "learning_rate": 1.1104837191793713e-05, + "loss": 0.7615, + "step": 23630 + }, + { + "epoch": 4.45, + "grad_norm": 32.250850677490234, + "learning_rate": 1.1101072840203274e-05, + "loss": 0.4661, + "step": 23640 + }, + { + "epoch": 4.45, + "grad_norm": 24.006662368774414, + "learning_rate": 1.1097308488612838e-05, + "loss": 0.7126, + "step": 23650 + }, + { + "epoch": 4.45, + "grad_norm": 7.966846942901611, + "learning_rate": 1.1093544137022399e-05, + "loss": 0.7873, + "step": 23660 + }, + { + "epoch": 4.46, + "grad_norm": 16.041297912597656, + "learning_rate": 1.108977978543196e-05, + "loss": 0.5328, + "step": 23670 + }, + { + "epoch": 4.46, + "grad_norm": 12.257124900817871, + "learning_rate": 1.1086015433841522e-05, + "loss": 0.549, + "step": 23680 + }, + { + "epoch": 4.46, + "grad_norm": 9.333507537841797, + "learning_rate": 1.1082251082251083e-05, + "loss": 0.4787, + "step": 23690 + }, + { + "epoch": 4.46, + "grad_norm": 15.94845199584961, + "learning_rate": 1.1078486730660645e-05, + "loss": 0.5384, + "step": 23700 + }, + { + "epoch": 4.46, + "grad_norm": 15.848910331726074, + "learning_rate": 1.1074722379070206e-05, + "loss": 0.6707, + "step": 23710 + }, + { + "epoch": 4.46, + "grad_norm": 12.358846664428711, + "learning_rate": 1.1070958027479768e-05, + "loss": 0.6222, + "step": 23720 + }, + { + "epoch": 4.47, + "grad_norm": 16.612165451049805, + "learning_rate": 1.106719367588933e-05, + "loss": 0.4296, + "step": 23730 + }, + { + "epoch": 4.47, + "grad_norm": 7.621057987213135, + "learning_rate": 1.106342932429889e-05, + "loss": 0.4142, + "step": 23740 + }, + { + "epoch": 4.47, + "grad_norm": 26.88300895690918, + "learning_rate": 1.1059664972708452e-05, + "loss": 0.3773, + "step": 23750 + }, + { + "epoch": 4.47, + "grad_norm": 14.327881813049316, + "learning_rate": 1.1055900621118014e-05, + "loss": 0.5598, + "step": 23760 + }, + { + "epoch": 4.47, + "grad_norm": 7.642137050628662, + "learning_rate": 1.1052136269527575e-05, + "loss": 0.5747, + "step": 23770 + }, + { + "epoch": 4.48, + "grad_norm": 11.110989570617676, + "learning_rate": 1.1048371917937135e-05, + "loss": 0.4744, + "step": 23780 + }, + { + "epoch": 4.48, + "grad_norm": 11.560811042785645, + "learning_rate": 1.1044607566346696e-05, + "loss": 0.7039, + "step": 23790 + }, + { + "epoch": 4.48, + "grad_norm": 17.159273147583008, + "learning_rate": 1.1040843214756258e-05, + "loss": 0.7643, + "step": 23800 + }, + { + "epoch": 4.48, + "grad_norm": 15.362770080566406, + "learning_rate": 1.103707886316582e-05, + "loss": 0.6629, + "step": 23810 + }, + { + "epoch": 4.48, + "grad_norm": 0.8261994123458862, + "learning_rate": 1.103331451157538e-05, + "loss": 0.5071, + "step": 23820 + }, + { + "epoch": 4.49, + "grad_norm": 6.509442329406738, + "learning_rate": 1.1029550159984944e-05, + "loss": 0.4378, + "step": 23830 + }, + { + "epoch": 4.49, + "grad_norm": 15.329565048217773, + "learning_rate": 1.1025785808394505e-05, + "loss": 0.5858, + "step": 23840 + }, + { + "epoch": 4.49, + "grad_norm": 5.65818977355957, + "learning_rate": 1.1022021456804067e-05, + "loss": 0.4484, + "step": 23850 + }, + { + "epoch": 4.49, + "grad_norm": 4.112398624420166, + "learning_rate": 1.1018257105213628e-05, + "loss": 0.4658, + "step": 23860 + }, + { + "epoch": 4.49, + "grad_norm": 12.523710250854492, + "learning_rate": 1.101449275362319e-05, + "loss": 0.5546, + "step": 23870 + }, + { + "epoch": 4.49, + "grad_norm": 0.8319380879402161, + "learning_rate": 1.1010728402032751e-05, + "loss": 0.4997, + "step": 23880 + }, + { + "epoch": 4.5, + "grad_norm": 17.31648063659668, + "learning_rate": 1.1006964050442313e-05, + "loss": 0.4323, + "step": 23890 + }, + { + "epoch": 4.5, + "grad_norm": 34.52608871459961, + "learning_rate": 1.1003199698851874e-05, + "loss": 0.544, + "step": 23900 + }, + { + "epoch": 4.5, + "grad_norm": 9.72608757019043, + "learning_rate": 1.0999435347261436e-05, + "loss": 0.6388, + "step": 23910 + }, + { + "epoch": 4.5, + "grad_norm": 17.97516441345215, + "learning_rate": 1.0995670995670997e-05, + "loss": 0.8557, + "step": 23920 + }, + { + "epoch": 4.5, + "grad_norm": 0.5164489150047302, + "learning_rate": 1.0991906644080558e-05, + "loss": 0.7708, + "step": 23930 + }, + { + "epoch": 4.51, + "grad_norm": 4.0153117179870605, + "learning_rate": 1.098814229249012e-05, + "loss": 0.6138, + "step": 23940 + }, + { + "epoch": 4.51, + "grad_norm": 1.1666315793991089, + "learning_rate": 1.0984377940899681e-05, + "loss": 0.7278, + "step": 23950 + }, + { + "epoch": 4.51, + "grad_norm": 9.93372917175293, + "learning_rate": 1.0980613589309241e-05, + "loss": 0.4048, + "step": 23960 + }, + { + "epoch": 4.51, + "grad_norm": 21.520671844482422, + "learning_rate": 1.0976849237718803e-05, + "loss": 0.6345, + "step": 23970 + }, + { + "epoch": 4.51, + "grad_norm": 3.4470133781433105, + "learning_rate": 1.0973084886128364e-05, + "loss": 1.0712, + "step": 23980 + }, + { + "epoch": 4.52, + "grad_norm": 29.81308364868164, + "learning_rate": 1.0969320534537926e-05, + "loss": 0.4262, + "step": 23990 + }, + { + "epoch": 4.52, + "grad_norm": 4.830698490142822, + "learning_rate": 1.0965556182947487e-05, + "loss": 0.7396, + "step": 24000 + }, + { + "epoch": 4.52, + "grad_norm": 9.33254623413086, + "learning_rate": 1.0961791831357048e-05, + "loss": 0.8639, + "step": 24010 + }, + { + "epoch": 4.52, + "grad_norm": 14.065550804138184, + "learning_rate": 1.0958027479766612e-05, + "loss": 0.623, + "step": 24020 + }, + { + "epoch": 4.52, + "grad_norm": 11.378796577453613, + "learning_rate": 1.0954263128176173e-05, + "loss": 0.581, + "step": 24030 + }, + { + "epoch": 4.52, + "grad_norm": 13.996289253234863, + "learning_rate": 1.0950498776585735e-05, + "loss": 0.5295, + "step": 24040 + }, + { + "epoch": 4.53, + "grad_norm": 8.590402603149414, + "learning_rate": 1.0946734424995296e-05, + "loss": 0.7987, + "step": 24050 + }, + { + "epoch": 4.53, + "grad_norm": 11.498170852661133, + "learning_rate": 1.0942970073404857e-05, + "loss": 0.6092, + "step": 24060 + }, + { + "epoch": 4.53, + "grad_norm": 10.157093048095703, + "learning_rate": 1.0939205721814419e-05, + "loss": 0.3375, + "step": 24070 + }, + { + "epoch": 4.53, + "grad_norm": 10.8042573928833, + "learning_rate": 1.093544137022398e-05, + "loss": 0.7559, + "step": 24080 + }, + { + "epoch": 4.53, + "grad_norm": 0.13831891119480133, + "learning_rate": 1.0931677018633542e-05, + "loss": 0.655, + "step": 24090 + }, + { + "epoch": 4.54, + "grad_norm": 21.07738494873047, + "learning_rate": 1.0927912667043103e-05, + "loss": 0.8755, + "step": 24100 + }, + { + "epoch": 4.54, + "grad_norm": 25.695125579833984, + "learning_rate": 1.0924148315452665e-05, + "loss": 0.4189, + "step": 24110 + }, + { + "epoch": 4.54, + "grad_norm": 23.678091049194336, + "learning_rate": 1.0920383963862226e-05, + "loss": 0.6966, + "step": 24120 + }, + { + "epoch": 4.54, + "grad_norm": 17.928577423095703, + "learning_rate": 1.0916619612271786e-05, + "loss": 0.4332, + "step": 24130 + }, + { + "epoch": 4.54, + "grad_norm": 40.44157409667969, + "learning_rate": 1.0912855260681347e-05, + "loss": 0.4427, + "step": 24140 + }, + { + "epoch": 4.55, + "grad_norm": 4.393035411834717, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.5062, + "step": 24150 + }, + { + "epoch": 4.55, + "grad_norm": 46.50747299194336, + "learning_rate": 1.090532655750047e-05, + "loss": 0.7385, + "step": 24160 + }, + { + "epoch": 4.55, + "grad_norm": 0.6289504170417786, + "learning_rate": 1.0901562205910032e-05, + "loss": 0.3068, + "step": 24170 + }, + { + "epoch": 4.55, + "grad_norm": 26.921764373779297, + "learning_rate": 1.0897797854319593e-05, + "loss": 0.6155, + "step": 24180 + }, + { + "epoch": 4.55, + "grad_norm": 38.250587463378906, + "learning_rate": 1.0894033502729155e-05, + "loss": 0.576, + "step": 24190 + }, + { + "epoch": 4.55, + "grad_norm": 2.182755470275879, + "learning_rate": 1.0890269151138718e-05, + "loss": 0.5502, + "step": 24200 + }, + { + "epoch": 4.56, + "grad_norm": 8.932013511657715, + "learning_rate": 1.088650479954828e-05, + "loss": 0.4656, + "step": 24210 + }, + { + "epoch": 4.56, + "grad_norm": 8.519400596618652, + "learning_rate": 1.088274044795784e-05, + "loss": 0.3739, + "step": 24220 + }, + { + "epoch": 4.56, + "grad_norm": 21.846633911132812, + "learning_rate": 1.0878976096367402e-05, + "loss": 0.5867, + "step": 24230 + }, + { + "epoch": 4.56, + "grad_norm": 19.62921142578125, + "learning_rate": 1.0875211744776964e-05, + "loss": 0.5858, + "step": 24240 + }, + { + "epoch": 4.56, + "grad_norm": 60.666866302490234, + "learning_rate": 1.0871447393186525e-05, + "loss": 0.6691, + "step": 24250 + }, + { + "epoch": 4.57, + "grad_norm": 12.690692901611328, + "learning_rate": 1.0867683041596087e-05, + "loss": 0.5031, + "step": 24260 + }, + { + "epoch": 4.57, + "grad_norm": 22.332693099975586, + "learning_rate": 1.0863918690005648e-05, + "loss": 0.6551, + "step": 24270 + }, + { + "epoch": 4.57, + "grad_norm": 8.846254348754883, + "learning_rate": 1.086015433841521e-05, + "loss": 0.5339, + "step": 24280 + }, + { + "epoch": 4.57, + "grad_norm": 30.030580520629883, + "learning_rate": 1.0856389986824771e-05, + "loss": 0.8483, + "step": 24290 + }, + { + "epoch": 4.57, + "grad_norm": 30.104228973388672, + "learning_rate": 1.0852625635234333e-05, + "loss": 0.4223, + "step": 24300 + }, + { + "epoch": 4.58, + "grad_norm": 26.719762802124023, + "learning_rate": 1.0848861283643892e-05, + "loss": 0.5728, + "step": 24310 + }, + { + "epoch": 4.58, + "grad_norm": 8.01773452758789, + "learning_rate": 1.0845096932053454e-05, + "loss": 0.7319, + "step": 24320 + }, + { + "epoch": 4.58, + "grad_norm": 17.984601974487305, + "learning_rate": 1.0841332580463015e-05, + "loss": 0.6263, + "step": 24330 + }, + { + "epoch": 4.58, + "grad_norm": 1.179288625717163, + "learning_rate": 1.0837568228872577e-05, + "loss": 0.2777, + "step": 24340 + }, + { + "epoch": 4.58, + "grad_norm": 17.420080184936523, + "learning_rate": 1.0833803877282138e-05, + "loss": 0.3534, + "step": 24350 + }, + { + "epoch": 4.58, + "grad_norm": 20.7567138671875, + "learning_rate": 1.08300395256917e-05, + "loss": 1.0827, + "step": 24360 + }, + { + "epoch": 4.59, + "grad_norm": 28.857418060302734, + "learning_rate": 1.0826275174101261e-05, + "loss": 0.6296, + "step": 24370 + }, + { + "epoch": 4.59, + "grad_norm": 20.736568450927734, + "learning_rate": 1.0822510822510823e-05, + "loss": 0.9884, + "step": 24380 + }, + { + "epoch": 4.59, + "grad_norm": 22.843528747558594, + "learning_rate": 1.0818746470920386e-05, + "loss": 0.4609, + "step": 24390 + }, + { + "epoch": 4.59, + "grad_norm": 14.99477767944336, + "learning_rate": 1.0814982119329947e-05, + "loss": 0.7755, + "step": 24400 + }, + { + "epoch": 4.59, + "grad_norm": 13.172017097473145, + "learning_rate": 1.0811217767739509e-05, + "loss": 0.4842, + "step": 24410 + }, + { + "epoch": 4.6, + "grad_norm": 15.066924095153809, + "learning_rate": 1.080745341614907e-05, + "loss": 0.4145, + "step": 24420 + }, + { + "epoch": 4.6, + "grad_norm": 0.11462697386741638, + "learning_rate": 1.0803689064558631e-05, + "loss": 0.5677, + "step": 24430 + }, + { + "epoch": 4.6, + "grad_norm": 0.14985047280788422, + "learning_rate": 1.0799924712968193e-05, + "loss": 0.4298, + "step": 24440 + }, + { + "epoch": 4.6, + "grad_norm": 13.569608688354492, + "learning_rate": 1.0796160361377754e-05, + "loss": 0.5217, + "step": 24450 + }, + { + "epoch": 4.6, + "grad_norm": 9.481203079223633, + "learning_rate": 1.0792396009787316e-05, + "loss": 0.8526, + "step": 24460 + }, + { + "epoch": 4.61, + "grad_norm": 4.702198028564453, + "learning_rate": 1.0788631658196877e-05, + "loss": 0.5461, + "step": 24470 + }, + { + "epoch": 4.61, + "grad_norm": 10.113847732543945, + "learning_rate": 1.0784867306606437e-05, + "loss": 0.7229, + "step": 24480 + }, + { + "epoch": 4.61, + "grad_norm": 9.467721939086914, + "learning_rate": 1.0781102955015999e-05, + "loss": 0.6535, + "step": 24490 + }, + { + "epoch": 4.61, + "grad_norm": 18.867504119873047, + "learning_rate": 1.077733860342556e-05, + "loss": 0.7685, + "step": 24500 + }, + { + "epoch": 4.61, + "grad_norm": 8.213113784790039, + "learning_rate": 1.0773574251835121e-05, + "loss": 0.7004, + "step": 24510 + }, + { + "epoch": 4.62, + "grad_norm": 2.475691795349121, + "learning_rate": 1.0769809900244683e-05, + "loss": 0.6569, + "step": 24520 + }, + { + "epoch": 4.62, + "grad_norm": 17.931053161621094, + "learning_rate": 1.0766045548654244e-05, + "loss": 0.6393, + "step": 24530 + }, + { + "epoch": 4.62, + "grad_norm": 8.332486152648926, + "learning_rate": 1.0762281197063806e-05, + "loss": 0.6192, + "step": 24540 + }, + { + "epoch": 4.62, + "grad_norm": 0.999433159828186, + "learning_rate": 1.0758516845473367e-05, + "loss": 0.5856, + "step": 24550 + }, + { + "epoch": 4.62, + "grad_norm": 5.174107074737549, + "learning_rate": 1.0754752493882929e-05, + "loss": 0.6075, + "step": 24560 + }, + { + "epoch": 4.62, + "grad_norm": 9.829113006591797, + "learning_rate": 1.0750988142292492e-05, + "loss": 0.5431, + "step": 24570 + }, + { + "epoch": 4.63, + "grad_norm": 41.149696350097656, + "learning_rate": 1.0747223790702053e-05, + "loss": 0.4506, + "step": 24580 + }, + { + "epoch": 4.63, + "grad_norm": 15.11819839477539, + "learning_rate": 1.0743459439111615e-05, + "loss": 0.7719, + "step": 24590 + }, + { + "epoch": 4.63, + "grad_norm": 7.7300124168396, + "learning_rate": 1.0739695087521176e-05, + "loss": 0.5971, + "step": 24600 + }, + { + "epoch": 4.63, + "grad_norm": 20.490882873535156, + "learning_rate": 1.0735930735930738e-05, + "loss": 0.4271, + "step": 24610 + }, + { + "epoch": 4.63, + "grad_norm": 7.432632923126221, + "learning_rate": 1.07321663843403e-05, + "loss": 0.7363, + "step": 24620 + }, + { + "epoch": 4.64, + "grad_norm": 15.051981925964355, + "learning_rate": 1.072840203274986e-05, + "loss": 0.5787, + "step": 24630 + }, + { + "epoch": 4.64, + "grad_norm": 7.251710414886475, + "learning_rate": 1.0724637681159422e-05, + "loss": 0.7642, + "step": 24640 + }, + { + "epoch": 4.64, + "grad_norm": 17.7536678314209, + "learning_rate": 1.0720873329568982e-05, + "loss": 0.6854, + "step": 24650 + }, + { + "epoch": 4.64, + "grad_norm": 16.003955841064453, + "learning_rate": 1.0717108977978543e-05, + "loss": 0.409, + "step": 24660 + }, + { + "epoch": 4.64, + "grad_norm": 4.870722770690918, + "learning_rate": 1.0713344626388105e-05, + "loss": 0.7751, + "step": 24670 + }, + { + "epoch": 4.65, + "grad_norm": 17.760967254638672, + "learning_rate": 1.0709580274797666e-05, + "loss": 0.3988, + "step": 24680 + }, + { + "epoch": 4.65, + "grad_norm": 1.264788031578064, + "learning_rate": 1.0705815923207228e-05, + "loss": 0.5068, + "step": 24690 + }, + { + "epoch": 4.65, + "grad_norm": 18.01626205444336, + "learning_rate": 1.070205157161679e-05, + "loss": 0.4463, + "step": 24700 + }, + { + "epoch": 4.65, + "grad_norm": 6.05813455581665, + "learning_rate": 1.069828722002635e-05, + "loss": 0.6254, + "step": 24710 + }, + { + "epoch": 4.65, + "grad_norm": 25.01399803161621, + "learning_rate": 1.0694522868435912e-05, + "loss": 0.5808, + "step": 24720 + }, + { + "epoch": 4.65, + "grad_norm": 5.349250316619873, + "learning_rate": 1.0690758516845474e-05, + "loss": 0.5773, + "step": 24730 + }, + { + "epoch": 4.66, + "grad_norm": 0.2522965967655182, + "learning_rate": 1.0686994165255035e-05, + "loss": 0.3343, + "step": 24740 + }, + { + "epoch": 4.66, + "grad_norm": 18.957963943481445, + "learning_rate": 1.0683229813664597e-05, + "loss": 0.6543, + "step": 24750 + }, + { + "epoch": 4.66, + "grad_norm": 40.8657112121582, + "learning_rate": 1.067946546207416e-05, + "loss": 0.4904, + "step": 24760 + }, + { + "epoch": 4.66, + "grad_norm": 20.800670623779297, + "learning_rate": 1.0675701110483721e-05, + "loss": 0.4968, + "step": 24770 + }, + { + "epoch": 4.66, + "grad_norm": 7.221461296081543, + "learning_rate": 1.0671936758893283e-05, + "loss": 0.4546, + "step": 24780 + }, + { + "epoch": 4.67, + "grad_norm": 2.7894575595855713, + "learning_rate": 1.0668172407302844e-05, + "loss": 0.4324, + "step": 24790 + }, + { + "epoch": 4.67, + "grad_norm": 2.2078590393066406, + "learning_rate": 1.0664408055712406e-05, + "loss": 0.3639, + "step": 24800 + }, + { + "epoch": 4.67, + "grad_norm": 13.342238426208496, + "learning_rate": 1.0660643704121967e-05, + "loss": 0.3975, + "step": 24810 + }, + { + "epoch": 4.67, + "grad_norm": 15.394166946411133, + "learning_rate": 1.0656879352531528e-05, + "loss": 0.696, + "step": 24820 + }, + { + "epoch": 4.67, + "grad_norm": 13.436577796936035, + "learning_rate": 1.0653115000941088e-05, + "loss": 0.8646, + "step": 24830 + }, + { + "epoch": 4.68, + "grad_norm": 10.768505096435547, + "learning_rate": 1.064935064935065e-05, + "loss": 0.6416, + "step": 24840 + }, + { + "epoch": 4.68, + "grad_norm": 21.5819034576416, + "learning_rate": 1.0645586297760211e-05, + "loss": 0.4017, + "step": 24850 + }, + { + "epoch": 4.68, + "grad_norm": 4.942729473114014, + "learning_rate": 1.0641821946169773e-05, + "loss": 0.5493, + "step": 24860 + }, + { + "epoch": 4.68, + "grad_norm": 28.3702392578125, + "learning_rate": 1.0638057594579334e-05, + "loss": 0.4549, + "step": 24870 + }, + { + "epoch": 4.68, + "grad_norm": 18.81791877746582, + "learning_rate": 1.0634293242988896e-05, + "loss": 0.4436, + "step": 24880 + }, + { + "epoch": 4.68, + "grad_norm": 14.989437103271484, + "learning_rate": 1.0630528891398457e-05, + "loss": 0.5157, + "step": 24890 + }, + { + "epoch": 4.69, + "grad_norm": 10.192904472351074, + "learning_rate": 1.0626764539808018e-05, + "loss": 0.6498, + "step": 24900 + }, + { + "epoch": 4.69, + "grad_norm": 11.987055778503418, + "learning_rate": 1.062300018821758e-05, + "loss": 0.5607, + "step": 24910 + }, + { + "epoch": 4.69, + "grad_norm": 8.632203102111816, + "learning_rate": 1.0619235836627141e-05, + "loss": 0.7684, + "step": 24920 + }, + { + "epoch": 4.69, + "grad_norm": 18.148733139038086, + "learning_rate": 1.0615471485036703e-05, + "loss": 0.4516, + "step": 24930 + }, + { + "epoch": 4.69, + "grad_norm": 13.001415252685547, + "learning_rate": 1.0611707133446264e-05, + "loss": 0.6813, + "step": 24940 + }, + { + "epoch": 4.7, + "grad_norm": 9.432195663452148, + "learning_rate": 1.0607942781855827e-05, + "loss": 0.7401, + "step": 24950 + }, + { + "epoch": 4.7, + "grad_norm": 14.515077590942383, + "learning_rate": 1.0604178430265389e-05, + "loss": 0.411, + "step": 24960 + }, + { + "epoch": 4.7, + "grad_norm": 7.635622024536133, + "learning_rate": 1.060041407867495e-05, + "loss": 0.55, + "step": 24970 + }, + { + "epoch": 4.7, + "grad_norm": 21.15432357788086, + "learning_rate": 1.0596649727084512e-05, + "loss": 0.9953, + "step": 24980 + }, + { + "epoch": 4.7, + "grad_norm": 12.600613594055176, + "learning_rate": 1.0592885375494073e-05, + "loss": 0.5475, + "step": 24990 + }, + { + "epoch": 4.71, + "grad_norm": 15.378485679626465, + "learning_rate": 1.0589121023903633e-05, + "loss": 0.6201, + "step": 25000 + }, + { + "epoch": 4.71, + "grad_norm": 16.676759719848633, + "learning_rate": 1.0585356672313195e-05, + "loss": 0.6947, + "step": 25010 + }, + { + "epoch": 4.71, + "grad_norm": 9.91386604309082, + "learning_rate": 1.0581592320722756e-05, + "loss": 0.7228, + "step": 25020 + }, + { + "epoch": 4.71, + "grad_norm": 21.984731674194336, + "learning_rate": 1.0577827969132317e-05, + "loss": 0.6805, + "step": 25030 + }, + { + "epoch": 4.71, + "grad_norm": 0.7576162219047546, + "learning_rate": 1.0574063617541879e-05, + "loss": 0.4934, + "step": 25040 + }, + { + "epoch": 4.71, + "grad_norm": 4.221212387084961, + "learning_rate": 1.057029926595144e-05, + "loss": 0.5943, + "step": 25050 + }, + { + "epoch": 4.72, + "grad_norm": 11.483154296875, + "learning_rate": 1.0566534914361002e-05, + "loss": 0.3277, + "step": 25060 + }, + { + "epoch": 4.72, + "grad_norm": 6.1331024169921875, + "learning_rate": 1.0562770562770563e-05, + "loss": 0.9332, + "step": 25070 + }, + { + "epoch": 4.72, + "grad_norm": 1.2644256353378296, + "learning_rate": 1.0559006211180125e-05, + "loss": 0.894, + "step": 25080 + }, + { + "epoch": 4.72, + "grad_norm": 4.269558906555176, + "learning_rate": 1.0555241859589686e-05, + "loss": 0.6306, + "step": 25090 + }, + { + "epoch": 4.72, + "grad_norm": 17.586618423461914, + "learning_rate": 1.0551477507999248e-05, + "loss": 0.5255, + "step": 25100 + }, + { + "epoch": 4.73, + "grad_norm": 0.7638131976127625, + "learning_rate": 1.0547713156408809e-05, + "loss": 0.6211, + "step": 25110 + }, + { + "epoch": 4.73, + "grad_norm": 6.860489368438721, + "learning_rate": 1.054394880481837e-05, + "loss": 0.5254, + "step": 25120 + }, + { + "epoch": 4.73, + "grad_norm": 3.1576449871063232, + "learning_rate": 1.0540184453227934e-05, + "loss": 0.7581, + "step": 25130 + }, + { + "epoch": 4.73, + "grad_norm": 23.722333908081055, + "learning_rate": 1.0536420101637495e-05, + "loss": 0.3437, + "step": 25140 + }, + { + "epoch": 4.73, + "grad_norm": 2.296964168548584, + "learning_rate": 1.0532655750047057e-05, + "loss": 0.4786, + "step": 25150 + }, + { + "epoch": 4.74, + "grad_norm": 18.260684967041016, + "learning_rate": 1.0528891398456618e-05, + "loss": 0.8271, + "step": 25160 + }, + { + "epoch": 4.74, + "grad_norm": 11.189855575561523, + "learning_rate": 1.052512704686618e-05, + "loss": 0.4455, + "step": 25170 + }, + { + "epoch": 4.74, + "grad_norm": 25.472238540649414, + "learning_rate": 1.052136269527574e-05, + "loss": 0.5042, + "step": 25180 + }, + { + "epoch": 4.74, + "grad_norm": 3.388648271560669, + "learning_rate": 1.05175983436853e-05, + "loss": 0.6337, + "step": 25190 + }, + { + "epoch": 4.74, + "grad_norm": 6.780930042266846, + "learning_rate": 1.0513833992094862e-05, + "loss": 0.7147, + "step": 25200 + }, + { + "epoch": 4.74, + "grad_norm": 28.922266006469727, + "learning_rate": 1.0510069640504424e-05, + "loss": 0.4112, + "step": 25210 + }, + { + "epoch": 4.75, + "grad_norm": 14.36142349243164, + "learning_rate": 1.0506305288913985e-05, + "loss": 0.7538, + "step": 25220 + }, + { + "epoch": 4.75, + "grad_norm": 16.995763778686523, + "learning_rate": 1.0502540937323547e-05, + "loss": 0.7889, + "step": 25230 + }, + { + "epoch": 4.75, + "grad_norm": 0.24860745668411255, + "learning_rate": 1.0498776585733108e-05, + "loss": 0.4844, + "step": 25240 + }, + { + "epoch": 4.75, + "grad_norm": 7.232578754425049, + "learning_rate": 1.049501223414267e-05, + "loss": 0.9642, + "step": 25250 + }, + { + "epoch": 4.75, + "grad_norm": 6.162290573120117, + "learning_rate": 1.0491247882552231e-05, + "loss": 0.4363, + "step": 25260 + }, + { + "epoch": 4.76, + "grad_norm": 4.756533145904541, + "learning_rate": 1.0487483530961793e-05, + "loss": 0.7687, + "step": 25270 + }, + { + "epoch": 4.76, + "grad_norm": 5.880683422088623, + "learning_rate": 1.0483719179371354e-05, + "loss": 0.3593, + "step": 25280 + }, + { + "epoch": 4.76, + "grad_norm": 12.597949981689453, + "learning_rate": 1.0479954827780915e-05, + "loss": 0.5685, + "step": 25290 + }, + { + "epoch": 4.76, + "grad_norm": 4.098395824432373, + "learning_rate": 1.0476190476190477e-05, + "loss": 0.4729, + "step": 25300 + }, + { + "epoch": 4.76, + "grad_norm": 30.62727928161621, + "learning_rate": 1.0472426124600038e-05, + "loss": 0.8119, + "step": 25310 + }, + { + "epoch": 4.77, + "grad_norm": 18.5235595703125, + "learning_rate": 1.0468661773009602e-05, + "loss": 0.4133, + "step": 25320 + }, + { + "epoch": 4.77, + "grad_norm": 26.45856285095215, + "learning_rate": 1.0464897421419163e-05, + "loss": 0.5182, + "step": 25330 + }, + { + "epoch": 4.77, + "grad_norm": 16.52037811279297, + "learning_rate": 1.0461133069828724e-05, + "loss": 0.5143, + "step": 25340 + }, + { + "epoch": 4.77, + "grad_norm": 0.0581025630235672, + "learning_rate": 1.0457368718238282e-05, + "loss": 0.6238, + "step": 25350 + }, + { + "epoch": 4.77, + "grad_norm": 2.4997506141662598, + "learning_rate": 1.0453604366647846e-05, + "loss": 0.4463, + "step": 25360 + }, + { + "epoch": 4.78, + "grad_norm": 11.980353355407715, + "learning_rate": 1.0449840015057407e-05, + "loss": 0.4376, + "step": 25370 + }, + { + "epoch": 4.78, + "grad_norm": 14.3879976272583, + "learning_rate": 1.0446075663466969e-05, + "loss": 1.0135, + "step": 25380 + }, + { + "epoch": 4.78, + "grad_norm": 20.742895126342773, + "learning_rate": 1.044231131187653e-05, + "loss": 0.5509, + "step": 25390 + }, + { + "epoch": 4.78, + "grad_norm": 7.918363094329834, + "learning_rate": 1.0438546960286091e-05, + "loss": 0.6053, + "step": 25400 + }, + { + "epoch": 4.78, + "grad_norm": 15.34104061126709, + "learning_rate": 1.0434782608695653e-05, + "loss": 0.7682, + "step": 25410 + }, + { + "epoch": 4.78, + "grad_norm": 10.88797378540039, + "learning_rate": 1.0431018257105214e-05, + "loss": 0.6793, + "step": 25420 + }, + { + "epoch": 4.79, + "grad_norm": 21.449735641479492, + "learning_rate": 1.0427253905514776e-05, + "loss": 0.7585, + "step": 25430 + }, + { + "epoch": 4.79, + "grad_norm": 13.024362564086914, + "learning_rate": 1.0423489553924337e-05, + "loss": 0.6198, + "step": 25440 + }, + { + "epoch": 4.79, + "grad_norm": 1.5722072124481201, + "learning_rate": 1.0419725202333899e-05, + "loss": 0.466, + "step": 25450 + }, + { + "epoch": 4.79, + "grad_norm": 4.080865383148193, + "learning_rate": 1.041596085074346e-05, + "loss": 0.5095, + "step": 25460 + }, + { + "epoch": 4.79, + "grad_norm": 23.28003692626953, + "learning_rate": 1.0412196499153022e-05, + "loss": 0.8175, + "step": 25470 + }, + { + "epoch": 4.8, + "grad_norm": 8.450470924377441, + "learning_rate": 1.0408432147562583e-05, + "loss": 0.5053, + "step": 25480 + }, + { + "epoch": 4.8, + "grad_norm": 18.8877010345459, + "learning_rate": 1.0404667795972145e-05, + "loss": 0.6964, + "step": 25490 + }, + { + "epoch": 4.8, + "grad_norm": 4.335949420928955, + "learning_rate": 1.0400903444381708e-05, + "loss": 0.4675, + "step": 25500 + }, + { + "epoch": 4.8, + "grad_norm": 6.426523685455322, + "learning_rate": 1.039713909279127e-05, + "loss": 0.6782, + "step": 25510 + }, + { + "epoch": 4.8, + "grad_norm": 2.119450330734253, + "learning_rate": 1.0393374741200827e-05, + "loss": 0.4604, + "step": 25520 + }, + { + "epoch": 4.81, + "grad_norm": 59.50654983520508, + "learning_rate": 1.0389610389610389e-05, + "loss": 0.6908, + "step": 25530 + }, + { + "epoch": 4.81, + "grad_norm": 1.951970100402832, + "learning_rate": 1.038584603801995e-05, + "loss": 0.6409, + "step": 25540 + }, + { + "epoch": 4.81, + "grad_norm": 1.0901389122009277, + "learning_rate": 1.0382081686429513e-05, + "loss": 0.5373, + "step": 25550 + }, + { + "epoch": 4.81, + "grad_norm": 7.754991054534912, + "learning_rate": 1.0378317334839075e-05, + "loss": 0.4142, + "step": 25560 + }, + { + "epoch": 4.81, + "grad_norm": 12.59786605834961, + "learning_rate": 1.0374552983248636e-05, + "loss": 0.544, + "step": 25570 + }, + { + "epoch": 4.81, + "grad_norm": 32.1705436706543, + "learning_rate": 1.0370788631658198e-05, + "loss": 0.4405, + "step": 25580 + }, + { + "epoch": 4.82, + "grad_norm": 12.26833438873291, + "learning_rate": 1.036702428006776e-05, + "loss": 0.9665, + "step": 25590 + }, + { + "epoch": 4.82, + "grad_norm": 7.950512886047363, + "learning_rate": 1.036325992847732e-05, + "loss": 0.7382, + "step": 25600 + }, + { + "epoch": 4.82, + "grad_norm": 14.049162864685059, + "learning_rate": 1.0359495576886882e-05, + "loss": 0.4283, + "step": 25610 + }, + { + "epoch": 4.82, + "grad_norm": 19.716205596923828, + "learning_rate": 1.0355731225296444e-05, + "loss": 0.3962, + "step": 25620 + }, + { + "epoch": 4.82, + "grad_norm": 10.237818717956543, + "learning_rate": 1.0351966873706005e-05, + "loss": 0.5062, + "step": 25630 + }, + { + "epoch": 4.83, + "grad_norm": 27.731966018676758, + "learning_rate": 1.0348202522115567e-05, + "loss": 0.4489, + "step": 25640 + }, + { + "epoch": 4.83, + "grad_norm": 2.626387596130371, + "learning_rate": 1.0344438170525128e-05, + "loss": 0.3363, + "step": 25650 + }, + { + "epoch": 4.83, + "grad_norm": 23.135208129882812, + "learning_rate": 1.034067381893469e-05, + "loss": 0.7282, + "step": 25660 + }, + { + "epoch": 4.83, + "grad_norm": 17.033761978149414, + "learning_rate": 1.0336909467344251e-05, + "loss": 0.5404, + "step": 25670 + }, + { + "epoch": 4.83, + "grad_norm": 5.097890377044678, + "learning_rate": 1.0333145115753812e-05, + "loss": 1.1042, + "step": 25680 + }, + { + "epoch": 4.84, + "grad_norm": 10.225035667419434, + "learning_rate": 1.0329380764163376e-05, + "loss": 0.4477, + "step": 25690 + }, + { + "epoch": 4.84, + "grad_norm": 7.208095550537109, + "learning_rate": 1.0325616412572934e-05, + "loss": 0.699, + "step": 25700 + }, + { + "epoch": 4.84, + "grad_norm": 9.896465301513672, + "learning_rate": 1.0321852060982495e-05, + "loss": 0.4655, + "step": 25710 + }, + { + "epoch": 4.84, + "grad_norm": 12.159602165222168, + "learning_rate": 1.0318087709392057e-05, + "loss": 0.7253, + "step": 25720 + }, + { + "epoch": 4.84, + "grad_norm": 0.5531998872756958, + "learning_rate": 1.031432335780162e-05, + "loss": 0.3742, + "step": 25730 + }, + { + "epoch": 4.84, + "grad_norm": 8.790497779846191, + "learning_rate": 1.0310559006211181e-05, + "loss": 0.5695, + "step": 25740 + }, + { + "epoch": 4.85, + "grad_norm": 3.4259729385375977, + "learning_rate": 1.0306794654620743e-05, + "loss": 0.7616, + "step": 25750 + }, + { + "epoch": 4.85, + "grad_norm": 21.889638900756836, + "learning_rate": 1.0303030303030304e-05, + "loss": 0.6106, + "step": 25760 + }, + { + "epoch": 4.85, + "grad_norm": 13.903435707092285, + "learning_rate": 1.0299265951439866e-05, + "loss": 0.6131, + "step": 25770 + }, + { + "epoch": 4.85, + "grad_norm": 19.042236328125, + "learning_rate": 1.0295501599849427e-05, + "loss": 0.6028, + "step": 25780 + }, + { + "epoch": 4.85, + "grad_norm": 25.542591094970703, + "learning_rate": 1.0291737248258988e-05, + "loss": 0.5859, + "step": 25790 + }, + { + "epoch": 4.86, + "grad_norm": 12.444345474243164, + "learning_rate": 1.028797289666855e-05, + "loss": 0.6576, + "step": 25800 + }, + { + "epoch": 4.86, + "grad_norm": 0.6395838260650635, + "learning_rate": 1.0284208545078111e-05, + "loss": 0.7174, + "step": 25810 + }, + { + "epoch": 4.86, + "grad_norm": 13.468929290771484, + "learning_rate": 1.0280444193487673e-05, + "loss": 0.6934, + "step": 25820 + }, + { + "epoch": 4.86, + "grad_norm": 16.97991180419922, + "learning_rate": 1.0276679841897234e-05, + "loss": 0.9544, + "step": 25830 + }, + { + "epoch": 4.86, + "grad_norm": 22.151426315307617, + "learning_rate": 1.0272915490306796e-05, + "loss": 0.5222, + "step": 25840 + }, + { + "epoch": 4.87, + "grad_norm": 9.480589866638184, + "learning_rate": 1.0269151138716357e-05, + "loss": 0.6157, + "step": 25850 + }, + { + "epoch": 4.87, + "grad_norm": 15.794319152832031, + "learning_rate": 1.0265386787125919e-05, + "loss": 0.8519, + "step": 25860 + }, + { + "epoch": 4.87, + "grad_norm": 9.91077709197998, + "learning_rate": 1.0261622435535478e-05, + "loss": 0.3128, + "step": 25870 + }, + { + "epoch": 4.87, + "grad_norm": 10.460318565368652, + "learning_rate": 1.025785808394504e-05, + "loss": 0.7739, + "step": 25880 + }, + { + "epoch": 4.87, + "grad_norm": 29.492027282714844, + "learning_rate": 1.0254093732354601e-05, + "loss": 0.6284, + "step": 25890 + }, + { + "epoch": 4.87, + "grad_norm": 7.003443241119385, + "learning_rate": 1.0250329380764163e-05, + "loss": 0.4957, + "step": 25900 + }, + { + "epoch": 4.88, + "grad_norm": 36.19068145751953, + "learning_rate": 1.0246565029173724e-05, + "loss": 0.5009, + "step": 25910 + }, + { + "epoch": 4.88, + "grad_norm": 17.596288681030273, + "learning_rate": 1.0242800677583287e-05, + "loss": 0.5891, + "step": 25920 + }, + { + "epoch": 4.88, + "grad_norm": 34.26020812988281, + "learning_rate": 1.0239036325992849e-05, + "loss": 0.767, + "step": 25930 + }, + { + "epoch": 4.88, + "grad_norm": 0.572287380695343, + "learning_rate": 1.023527197440241e-05, + "loss": 0.4655, + "step": 25940 + }, + { + "epoch": 4.88, + "grad_norm": 11.611014366149902, + "learning_rate": 1.0231507622811972e-05, + "loss": 0.6033, + "step": 25950 + }, + { + "epoch": 4.89, + "grad_norm": 5.741128444671631, + "learning_rate": 1.0227743271221533e-05, + "loss": 0.2666, + "step": 25960 + }, + { + "epoch": 4.89, + "grad_norm": 27.423297882080078, + "learning_rate": 1.0223978919631095e-05, + "loss": 0.5137, + "step": 25970 + }, + { + "epoch": 4.89, + "grad_norm": 7.721019744873047, + "learning_rate": 1.0220214568040656e-05, + "loss": 0.8591, + "step": 25980 + }, + { + "epoch": 4.89, + "grad_norm": 5.386199474334717, + "learning_rate": 1.0216450216450218e-05, + "loss": 0.5424, + "step": 25990 + }, + { + "epoch": 4.89, + "grad_norm": 16.02313804626465, + "learning_rate": 1.0212685864859779e-05, + "loss": 0.9359, + "step": 26000 + }, + { + "epoch": 4.9, + "grad_norm": 28.932086944580078, + "learning_rate": 1.020892151326934e-05, + "loss": 0.4697, + "step": 26010 + }, + { + "epoch": 4.9, + "grad_norm": 7.247927665710449, + "learning_rate": 1.0205157161678902e-05, + "loss": 0.5748, + "step": 26020 + }, + { + "epoch": 4.9, + "grad_norm": 0.293780118227005, + "learning_rate": 1.0201392810088464e-05, + "loss": 0.4207, + "step": 26030 + }, + { + "epoch": 4.9, + "grad_norm": 9.770951271057129, + "learning_rate": 1.0197628458498025e-05, + "loss": 0.6643, + "step": 26040 + }, + { + "epoch": 4.9, + "grad_norm": 17.128610610961914, + "learning_rate": 1.0193864106907585e-05, + "loss": 0.4047, + "step": 26050 + }, + { + "epoch": 4.9, + "grad_norm": 20.82026481628418, + "learning_rate": 1.0190099755317146e-05, + "loss": 0.7626, + "step": 26060 + }, + { + "epoch": 4.91, + "grad_norm": 9.654380798339844, + "learning_rate": 1.0186335403726708e-05, + "loss": 0.4447, + "step": 26070 + }, + { + "epoch": 4.91, + "grad_norm": 19.486652374267578, + "learning_rate": 1.0182571052136269e-05, + "loss": 0.4849, + "step": 26080 + }, + { + "epoch": 4.91, + "grad_norm": 15.955955505371094, + "learning_rate": 1.017880670054583e-05, + "loss": 0.7018, + "step": 26090 + }, + { + "epoch": 4.91, + "grad_norm": 6.0235915184021, + "learning_rate": 1.0175042348955392e-05, + "loss": 0.5864, + "step": 26100 + }, + { + "epoch": 4.91, + "grad_norm": 41.60502624511719, + "learning_rate": 1.0171277997364955e-05, + "loss": 0.9113, + "step": 26110 + }, + { + "epoch": 4.92, + "grad_norm": 10.696873664855957, + "learning_rate": 1.0167513645774517e-05, + "loss": 0.4178, + "step": 26120 + }, + { + "epoch": 4.92, + "grad_norm": 27.66557502746582, + "learning_rate": 1.0163749294184078e-05, + "loss": 0.5074, + "step": 26130 + }, + { + "epoch": 4.92, + "grad_norm": 4.632658958435059, + "learning_rate": 1.015998494259364e-05, + "loss": 0.3351, + "step": 26140 + }, + { + "epoch": 4.92, + "grad_norm": 26.457387924194336, + "learning_rate": 1.0156220591003201e-05, + "loss": 1.3971, + "step": 26150 + }, + { + "epoch": 4.92, + "grad_norm": 3.868096351623535, + "learning_rate": 1.0152456239412763e-05, + "loss": 0.5699, + "step": 26160 + }, + { + "epoch": 4.93, + "grad_norm": 1.3630057573318481, + "learning_rate": 1.0148691887822324e-05, + "loss": 0.4219, + "step": 26170 + }, + { + "epoch": 4.93, + "grad_norm": 34.00837326049805, + "learning_rate": 1.0144927536231885e-05, + "loss": 0.8625, + "step": 26180 + }, + { + "epoch": 4.93, + "grad_norm": 2.9621551036834717, + "learning_rate": 1.0141163184641447e-05, + "loss": 0.6725, + "step": 26190 + }, + { + "epoch": 4.93, + "grad_norm": 0.44564133882522583, + "learning_rate": 1.0137398833051008e-05, + "loss": 0.6897, + "step": 26200 + }, + { + "epoch": 4.93, + "grad_norm": 10.090829849243164, + "learning_rate": 1.013363448146057e-05, + "loss": 0.7626, + "step": 26210 + }, + { + "epoch": 4.94, + "grad_norm": 10.721175193786621, + "learning_rate": 1.012987012987013e-05, + "loss": 0.7807, + "step": 26220 + }, + { + "epoch": 4.94, + "grad_norm": 8.438685417175293, + "learning_rate": 1.0126105778279691e-05, + "loss": 0.6646, + "step": 26230 + }, + { + "epoch": 4.94, + "grad_norm": 20.982872009277344, + "learning_rate": 1.0122341426689252e-05, + "loss": 0.7103, + "step": 26240 + }, + { + "epoch": 4.94, + "grad_norm": 24.98394012451172, + "learning_rate": 1.0118577075098814e-05, + "loss": 0.658, + "step": 26250 + }, + { + "epoch": 4.94, + "grad_norm": 10.647464752197266, + "learning_rate": 1.0114812723508375e-05, + "loss": 0.5797, + "step": 26260 + }, + { + "epoch": 4.94, + "grad_norm": 5.082032203674316, + "learning_rate": 1.0111048371917937e-05, + "loss": 0.3246, + "step": 26270 + }, + { + "epoch": 4.95, + "grad_norm": 34.345130920410156, + "learning_rate": 1.0107284020327498e-05, + "loss": 0.7128, + "step": 26280 + }, + { + "epoch": 4.95, + "grad_norm": 1.867757797241211, + "learning_rate": 1.0103519668737061e-05, + "loss": 0.4022, + "step": 26290 + }, + { + "epoch": 4.95, + "grad_norm": 17.364198684692383, + "learning_rate": 1.0099755317146623e-05, + "loss": 0.4057, + "step": 26300 + }, + { + "epoch": 4.95, + "grad_norm": 0.3039032518863678, + "learning_rate": 1.0095990965556184e-05, + "loss": 0.6772, + "step": 26310 + }, + { + "epoch": 4.95, + "grad_norm": 7.0659050941467285, + "learning_rate": 1.0092226613965746e-05, + "loss": 0.4634, + "step": 26320 + }, + { + "epoch": 4.96, + "grad_norm": 6.5546135902404785, + "learning_rate": 1.0088462262375307e-05, + "loss": 0.6775, + "step": 26330 + }, + { + "epoch": 4.96, + "grad_norm": 19.550182342529297, + "learning_rate": 1.0084697910784869e-05, + "loss": 0.3757, + "step": 26340 + }, + { + "epoch": 4.96, + "grad_norm": 10.655951499938965, + "learning_rate": 1.008093355919443e-05, + "loss": 0.5657, + "step": 26350 + }, + { + "epoch": 4.96, + "grad_norm": 7.180771350860596, + "learning_rate": 1.0077169207603992e-05, + "loss": 0.7091, + "step": 26360 + }, + { + "epoch": 4.96, + "grad_norm": 10.868316650390625, + "learning_rate": 1.0073404856013553e-05, + "loss": 0.6567, + "step": 26370 + }, + { + "epoch": 4.97, + "grad_norm": 43.5999641418457, + "learning_rate": 1.0069640504423115e-05, + "loss": 0.4698, + "step": 26380 + }, + { + "epoch": 4.97, + "grad_norm": 12.528432846069336, + "learning_rate": 1.0065876152832676e-05, + "loss": 0.7588, + "step": 26390 + }, + { + "epoch": 4.97, + "grad_norm": 13.703747749328613, + "learning_rate": 1.0062111801242236e-05, + "loss": 0.7794, + "step": 26400 + }, + { + "epoch": 4.97, + "grad_norm": 0.5737854838371277, + "learning_rate": 1.0058347449651797e-05, + "loss": 0.6142, + "step": 26410 + }, + { + "epoch": 4.97, + "grad_norm": 15.537364959716797, + "learning_rate": 1.0054583098061359e-05, + "loss": 0.585, + "step": 26420 + }, + { + "epoch": 4.97, + "grad_norm": 14.01577377319336, + "learning_rate": 1.005081874647092e-05, + "loss": 0.7088, + "step": 26430 + }, + { + "epoch": 4.98, + "grad_norm": 5.803639888763428, + "learning_rate": 1.0047054394880482e-05, + "loss": 0.7963, + "step": 26440 + }, + { + "epoch": 4.98, + "grad_norm": 16.50397300720215, + "learning_rate": 1.0043290043290043e-05, + "loss": 0.5466, + "step": 26450 + }, + { + "epoch": 4.98, + "grad_norm": 10.755285263061523, + "learning_rate": 1.0039525691699605e-05, + "loss": 0.4802, + "step": 26460 + }, + { + "epoch": 4.98, + "grad_norm": 4.0640869140625, + "learning_rate": 1.0035761340109166e-05, + "loss": 0.8407, + "step": 26470 + }, + { + "epoch": 4.98, + "grad_norm": 10.815974235534668, + "learning_rate": 1.003199698851873e-05, + "loss": 0.5498, + "step": 26480 + }, + { + "epoch": 4.99, + "grad_norm": 21.308177947998047, + "learning_rate": 1.002823263692829e-05, + "loss": 0.6421, + "step": 26490 + }, + { + "epoch": 4.99, + "grad_norm": 20.949392318725586, + "learning_rate": 1.0024468285337852e-05, + "loss": 0.4232, + "step": 26500 + }, + { + "epoch": 4.99, + "grad_norm": 6.295169830322266, + "learning_rate": 1.0020703933747414e-05, + "loss": 0.3888, + "step": 26510 + }, + { + "epoch": 4.99, + "grad_norm": 22.155059814453125, + "learning_rate": 1.0016939582156975e-05, + "loss": 0.4725, + "step": 26520 + }, + { + "epoch": 4.99, + "grad_norm": 11.023420333862305, + "learning_rate": 1.0013175230566537e-05, + "loss": 0.9657, + "step": 26530 + }, + { + "epoch": 5.0, + "grad_norm": 7.431711673736572, + "learning_rate": 1.0009410878976098e-05, + "loss": 0.8442, + "step": 26540 + }, + { + "epoch": 5.0, + "grad_norm": 7.200937271118164, + "learning_rate": 1.000564652738566e-05, + "loss": 0.4408, + "step": 26550 + }, + { + "epoch": 5.0, + "grad_norm": 16.981725692749023, + "learning_rate": 1.0001882175795221e-05, + "loss": 0.4665, + "step": 26560 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.9174666666666667, + "eval_loss": 0.3038625717163086, + "eval_runtime": 51.1218, + "eval_samples_per_second": 146.708, + "eval_steps_per_second": 18.348, + "step": 26565 + }, + { + "epoch": 5.0, + "grad_norm": 11.93405532836914, + "learning_rate": 9.998117824204782e-06, + "loss": 0.5697, + "step": 26570 + }, + { + "epoch": 5.0, + "grad_norm": 5.752691745758057, + "learning_rate": 9.994353472614344e-06, + "loss": 0.407, + "step": 26580 + }, + { + "epoch": 5.0, + "grad_norm": 0.38019922375679016, + "learning_rate": 9.990589121023905e-06, + "loss": 0.5588, + "step": 26590 + }, + { + "epoch": 5.01, + "grad_norm": 23.82176399230957, + "learning_rate": 9.986824769433467e-06, + "loss": 0.7845, + "step": 26600 + }, + { + "epoch": 5.01, + "grad_norm": 18.493980407714844, + "learning_rate": 9.983060417843027e-06, + "loss": 1.0395, + "step": 26610 + }, + { + "epoch": 5.01, + "grad_norm": 0.250558078289032, + "learning_rate": 9.979296066252588e-06, + "loss": 0.2666, + "step": 26620 + }, + { + "epoch": 5.01, + "grad_norm": 12.60075569152832, + "learning_rate": 9.97553171466215e-06, + "loss": 0.4569, + "step": 26630 + }, + { + "epoch": 5.01, + "grad_norm": 12.024946212768555, + "learning_rate": 9.971767363071711e-06, + "loss": 0.4128, + "step": 26640 + }, + { + "epoch": 5.02, + "grad_norm": 0.44587206840515137, + "learning_rate": 9.968003011481272e-06, + "loss": 0.5314, + "step": 26650 + }, + { + "epoch": 5.02, + "grad_norm": 3.9360930919647217, + "learning_rate": 9.964238659890836e-06, + "loss": 0.5811, + "step": 26660 + }, + { + "epoch": 5.02, + "grad_norm": 5.4166035652160645, + "learning_rate": 9.960474308300397e-06, + "loss": 0.5869, + "step": 26670 + }, + { + "epoch": 5.02, + "grad_norm": 13.620014190673828, + "learning_rate": 9.956709956709958e-06, + "loss": 0.5772, + "step": 26680 + }, + { + "epoch": 5.02, + "grad_norm": 18.73905372619629, + "learning_rate": 9.95294560511952e-06, + "loss": 0.4458, + "step": 26690 + }, + { + "epoch": 5.03, + "grad_norm": 9.41979694366455, + "learning_rate": 9.94918125352908e-06, + "loss": 0.6342, + "step": 26700 + }, + { + "epoch": 5.03, + "grad_norm": 6.560514450073242, + "learning_rate": 9.945416901938641e-06, + "loss": 0.8343, + "step": 26710 + }, + { + "epoch": 5.03, + "grad_norm": 24.214908599853516, + "learning_rate": 9.941652550348203e-06, + "loss": 0.3422, + "step": 26720 + }, + { + "epoch": 5.03, + "grad_norm": 13.99776554107666, + "learning_rate": 9.937888198757764e-06, + "loss": 0.6253, + "step": 26730 + }, + { + "epoch": 5.03, + "grad_norm": 13.242351531982422, + "learning_rate": 9.934123847167326e-06, + "loss": 0.543, + "step": 26740 + }, + { + "epoch": 5.03, + "grad_norm": 33.52753448486328, + "learning_rate": 9.930359495576887e-06, + "loss": 0.4902, + "step": 26750 + }, + { + "epoch": 5.04, + "grad_norm": 23.321645736694336, + "learning_rate": 9.92659514398645e-06, + "loss": 0.656, + "step": 26760 + }, + { + "epoch": 5.04, + "grad_norm": 16.174564361572266, + "learning_rate": 9.922830792396012e-06, + "loss": 0.5337, + "step": 26770 + }, + { + "epoch": 5.04, + "grad_norm": 4.621311664581299, + "learning_rate": 9.919066440805573e-06, + "loss": 0.7531, + "step": 26780 + }, + { + "epoch": 5.04, + "grad_norm": 29.9796085357666, + "learning_rate": 9.915302089215133e-06, + "loss": 0.6874, + "step": 26790 + }, + { + "epoch": 5.04, + "grad_norm": 26.092121124267578, + "learning_rate": 9.911537737624694e-06, + "loss": 0.7789, + "step": 26800 + }, + { + "epoch": 5.05, + "grad_norm": 7.301915645599365, + "learning_rate": 9.907773386034256e-06, + "loss": 0.385, + "step": 26810 + }, + { + "epoch": 5.05, + "grad_norm": 19.605470657348633, + "learning_rate": 9.904009034443817e-06, + "loss": 0.702, + "step": 26820 + }, + { + "epoch": 5.05, + "grad_norm": 13.46226692199707, + "learning_rate": 9.900244682853379e-06, + "loss": 0.24, + "step": 26830 + }, + { + "epoch": 5.05, + "grad_norm": 5.802454948425293, + "learning_rate": 9.89648033126294e-06, + "loss": 0.2608, + "step": 26840 + }, + { + "epoch": 5.05, + "grad_norm": 8.033814430236816, + "learning_rate": 9.892715979672503e-06, + "loss": 0.6069, + "step": 26850 + }, + { + "epoch": 5.06, + "grad_norm": 0.2305469959974289, + "learning_rate": 9.888951628082065e-06, + "loss": 0.4146, + "step": 26860 + }, + { + "epoch": 5.06, + "grad_norm": 7.512425422668457, + "learning_rate": 9.885187276491625e-06, + "loss": 0.6316, + "step": 26870 + }, + { + "epoch": 5.06, + "grad_norm": 3.023247480392456, + "learning_rate": 9.881422924901186e-06, + "loss": 0.7018, + "step": 26880 + }, + { + "epoch": 5.06, + "grad_norm": 20.460777282714844, + "learning_rate": 9.877658573310747e-06, + "loss": 0.5622, + "step": 26890 + }, + { + "epoch": 5.06, + "grad_norm": 5.8153839111328125, + "learning_rate": 9.873894221720309e-06, + "loss": 0.5251, + "step": 26900 + }, + { + "epoch": 5.06, + "grad_norm": 27.608034133911133, + "learning_rate": 9.87012987012987e-06, + "loss": 0.6028, + "step": 26910 + }, + { + "epoch": 5.07, + "grad_norm": 33.49705123901367, + "learning_rate": 9.866365518539432e-06, + "loss": 0.5239, + "step": 26920 + }, + { + "epoch": 5.07, + "grad_norm": 1.4094599485397339, + "learning_rate": 9.862601166948993e-06, + "loss": 0.7576, + "step": 26930 + }, + { + "epoch": 5.07, + "grad_norm": 8.930601119995117, + "learning_rate": 9.858836815358556e-06, + "loss": 0.4088, + "step": 26940 + }, + { + "epoch": 5.07, + "grad_norm": 0.6943548321723938, + "learning_rate": 9.855072463768118e-06, + "loss": 0.633, + "step": 26950 + }, + { + "epoch": 5.07, + "grad_norm": 33.55091094970703, + "learning_rate": 9.851308112177678e-06, + "loss": 0.7098, + "step": 26960 + }, + { + "epoch": 5.08, + "grad_norm": 18.824798583984375, + "learning_rate": 9.847543760587239e-06, + "loss": 0.5024, + "step": 26970 + }, + { + "epoch": 5.08, + "grad_norm": 19.024484634399414, + "learning_rate": 9.8437794089968e-06, + "loss": 0.5722, + "step": 26980 + }, + { + "epoch": 5.08, + "grad_norm": 4.507168292999268, + "learning_rate": 9.840015057406362e-06, + "loss": 0.2062, + "step": 26990 + }, + { + "epoch": 5.08, + "grad_norm": 31.550439834594727, + "learning_rate": 9.836250705815924e-06, + "loss": 0.4145, + "step": 27000 + }, + { + "epoch": 5.08, + "grad_norm": 11.541763305664062, + "learning_rate": 9.832486354225485e-06, + "loss": 0.522, + "step": 27010 + }, + { + "epoch": 5.09, + "grad_norm": 14.351038932800293, + "learning_rate": 9.828722002635046e-06, + "loss": 0.5481, + "step": 27020 + }, + { + "epoch": 5.09, + "grad_norm": 4.900013446807861, + "learning_rate": 9.82495765104461e-06, + "loss": 0.4472, + "step": 27030 + }, + { + "epoch": 5.09, + "grad_norm": 14.692139625549316, + "learning_rate": 9.821193299454171e-06, + "loss": 0.6838, + "step": 27040 + }, + { + "epoch": 5.09, + "grad_norm": 0.5007912516593933, + "learning_rate": 9.81742894786373e-06, + "loss": 0.7123, + "step": 27050 + }, + { + "epoch": 5.09, + "grad_norm": 29.906484603881836, + "learning_rate": 9.813664596273292e-06, + "loss": 0.3299, + "step": 27060 + }, + { + "epoch": 5.1, + "grad_norm": 21.051406860351562, + "learning_rate": 9.809900244682854e-06, + "loss": 0.471, + "step": 27070 + }, + { + "epoch": 5.1, + "grad_norm": 8.312321662902832, + "learning_rate": 9.806135893092415e-06, + "loss": 0.5687, + "step": 27080 + }, + { + "epoch": 5.1, + "grad_norm": 0.8224940896034241, + "learning_rate": 9.802371541501977e-06, + "loss": 0.3544, + "step": 27090 + }, + { + "epoch": 5.1, + "grad_norm": 18.394412994384766, + "learning_rate": 9.798607189911538e-06, + "loss": 0.4271, + "step": 27100 + }, + { + "epoch": 5.1, + "grad_norm": 14.561628341674805, + "learning_rate": 9.7948428383211e-06, + "loss": 0.7428, + "step": 27110 + }, + { + "epoch": 5.1, + "grad_norm": 17.597429275512695, + "learning_rate": 9.791078486730661e-06, + "loss": 0.322, + "step": 27120 + }, + { + "epoch": 5.11, + "grad_norm": 0.4523058831691742, + "learning_rate": 9.787314135140224e-06, + "loss": 0.4471, + "step": 27130 + }, + { + "epoch": 5.11, + "grad_norm": 57.31825637817383, + "learning_rate": 9.783549783549784e-06, + "loss": 1.022, + "step": 27140 + }, + { + "epoch": 5.11, + "grad_norm": 15.83234977722168, + "learning_rate": 9.779785431959345e-06, + "loss": 0.5429, + "step": 27150 + }, + { + "epoch": 5.11, + "grad_norm": 17.558889389038086, + "learning_rate": 9.776021080368907e-06, + "loss": 0.4771, + "step": 27160 + }, + { + "epoch": 5.11, + "grad_norm": 6.830644607543945, + "learning_rate": 9.772256728778468e-06, + "loss": 0.3859, + "step": 27170 + }, + { + "epoch": 5.12, + "grad_norm": 23.56614875793457, + "learning_rate": 9.76849237718803e-06, + "loss": 0.2237, + "step": 27180 + }, + { + "epoch": 5.12, + "grad_norm": 17.846912384033203, + "learning_rate": 9.764728025597591e-06, + "loss": 0.5612, + "step": 27190 + }, + { + "epoch": 5.12, + "grad_norm": 22.106182098388672, + "learning_rate": 9.760963674007153e-06, + "loss": 0.4336, + "step": 27200 + }, + { + "epoch": 5.12, + "grad_norm": 34.99475860595703, + "learning_rate": 9.757199322416714e-06, + "loss": 0.5471, + "step": 27210 + }, + { + "epoch": 5.12, + "grad_norm": 22.88408660888672, + "learning_rate": 9.753434970826276e-06, + "loss": 0.5702, + "step": 27220 + }, + { + "epoch": 5.13, + "grad_norm": 0.2636334300041199, + "learning_rate": 9.749670619235837e-06, + "loss": 0.6926, + "step": 27230 + }, + { + "epoch": 5.13, + "grad_norm": 0.4821383059024811, + "learning_rate": 9.745906267645399e-06, + "loss": 0.4507, + "step": 27240 + }, + { + "epoch": 5.13, + "grad_norm": 0.9522204399108887, + "learning_rate": 9.74214191605496e-06, + "loss": 0.7467, + "step": 27250 + }, + { + "epoch": 5.13, + "grad_norm": 0.07715526968240738, + "learning_rate": 9.738377564464521e-06, + "loss": 0.5735, + "step": 27260 + }, + { + "epoch": 5.13, + "grad_norm": 2.0689947605133057, + "learning_rate": 9.734613212874083e-06, + "loss": 0.621, + "step": 27270 + }, + { + "epoch": 5.13, + "grad_norm": 4.146627902984619, + "learning_rate": 9.730848861283644e-06, + "loss": 0.3251, + "step": 27280 + }, + { + "epoch": 5.14, + "grad_norm": 25.737661361694336, + "learning_rate": 9.727084509693206e-06, + "loss": 0.34, + "step": 27290 + }, + { + "epoch": 5.14, + "grad_norm": 10.852874755859375, + "learning_rate": 9.723320158102767e-06, + "loss": 0.6041, + "step": 27300 + }, + { + "epoch": 5.14, + "grad_norm": 18.879383087158203, + "learning_rate": 9.719555806512329e-06, + "loss": 0.6012, + "step": 27310 + }, + { + "epoch": 5.14, + "grad_norm": 0.9953970909118652, + "learning_rate": 9.71579145492189e-06, + "loss": 0.62, + "step": 27320 + }, + { + "epoch": 5.14, + "grad_norm": 1.2048535346984863, + "learning_rate": 9.712027103331452e-06, + "loss": 0.5219, + "step": 27330 + }, + { + "epoch": 5.15, + "grad_norm": 13.980968475341797, + "learning_rate": 9.708262751741013e-06, + "loss": 0.7282, + "step": 27340 + }, + { + "epoch": 5.15, + "grad_norm": 9.173256874084473, + "learning_rate": 9.704498400150575e-06, + "loss": 0.432, + "step": 27350 + }, + { + "epoch": 5.15, + "grad_norm": 11.977608680725098, + "learning_rate": 9.700734048560136e-06, + "loss": 0.5185, + "step": 27360 + }, + { + "epoch": 5.15, + "grad_norm": 9.067453384399414, + "learning_rate": 9.696969696969698e-06, + "loss": 0.4714, + "step": 27370 + }, + { + "epoch": 5.15, + "grad_norm": 4.337044715881348, + "learning_rate": 9.693205345379259e-06, + "loss": 0.3863, + "step": 27380 + }, + { + "epoch": 5.16, + "grad_norm": 6.323875427246094, + "learning_rate": 9.68944099378882e-06, + "loss": 0.7175, + "step": 27390 + }, + { + "epoch": 5.16, + "grad_norm": 2.4866652488708496, + "learning_rate": 9.685676642198382e-06, + "loss": 0.3135, + "step": 27400 + }, + { + "epoch": 5.16, + "grad_norm": 2.433683395385742, + "learning_rate": 9.681912290607943e-06, + "loss": 0.576, + "step": 27410 + }, + { + "epoch": 5.16, + "grad_norm": 10.03541374206543, + "learning_rate": 9.678147939017505e-06, + "loss": 0.4971, + "step": 27420 + }, + { + "epoch": 5.16, + "grad_norm": 22.778186798095703, + "learning_rate": 9.674383587427066e-06, + "loss": 0.5237, + "step": 27430 + }, + { + "epoch": 5.16, + "grad_norm": 4.677865028381348, + "learning_rate": 9.670619235836628e-06, + "loss": 0.7147, + "step": 27440 + }, + { + "epoch": 5.17, + "grad_norm": 21.216205596923828, + "learning_rate": 9.66685488424619e-06, + "loss": 0.5606, + "step": 27450 + }, + { + "epoch": 5.17, + "grad_norm": 10.490550994873047, + "learning_rate": 9.66309053265575e-06, + "loss": 0.7098, + "step": 27460 + }, + { + "epoch": 5.17, + "grad_norm": 7.556525230407715, + "learning_rate": 9.659326181065312e-06, + "loss": 0.6452, + "step": 27470 + }, + { + "epoch": 5.17, + "grad_norm": 0.8397094011306763, + "learning_rate": 9.655561829474874e-06, + "loss": 0.1972, + "step": 27480 + }, + { + "epoch": 5.17, + "grad_norm": 9.92435073852539, + "learning_rate": 9.651797477884435e-06, + "loss": 0.4522, + "step": 27490 + }, + { + "epoch": 5.18, + "grad_norm": 1.3592374324798584, + "learning_rate": 9.648033126293997e-06, + "loss": 0.4904, + "step": 27500 + }, + { + "epoch": 5.18, + "grad_norm": 10.651955604553223, + "learning_rate": 9.644268774703558e-06, + "loss": 0.5157, + "step": 27510 + }, + { + "epoch": 5.18, + "grad_norm": 18.821256637573242, + "learning_rate": 9.64050442311312e-06, + "loss": 0.5196, + "step": 27520 + }, + { + "epoch": 5.18, + "grad_norm": 3.651150703430176, + "learning_rate": 9.636740071522681e-06, + "loss": 0.5902, + "step": 27530 + }, + { + "epoch": 5.18, + "grad_norm": 13.892976760864258, + "learning_rate": 9.632975719932242e-06, + "loss": 0.8091, + "step": 27540 + }, + { + "epoch": 5.19, + "grad_norm": 0.7897984981536865, + "learning_rate": 9.629211368341804e-06, + "loss": 0.4273, + "step": 27550 + }, + { + "epoch": 5.19, + "grad_norm": 16.25598907470703, + "learning_rate": 9.625447016751365e-06, + "loss": 0.3544, + "step": 27560 + }, + { + "epoch": 5.19, + "grad_norm": 15.311908721923828, + "learning_rate": 9.621682665160927e-06, + "loss": 0.3885, + "step": 27570 + }, + { + "epoch": 5.19, + "grad_norm": 13.913825988769531, + "learning_rate": 9.617918313570488e-06, + "loss": 0.3768, + "step": 27580 + }, + { + "epoch": 5.19, + "grad_norm": 0.829640805721283, + "learning_rate": 9.61415396198005e-06, + "loss": 0.3235, + "step": 27590 + }, + { + "epoch": 5.19, + "grad_norm": 11.236851692199707, + "learning_rate": 9.610389610389611e-06, + "loss": 0.73, + "step": 27600 + }, + { + "epoch": 5.2, + "grad_norm": 9.920482635498047, + "learning_rate": 9.606625258799173e-06, + "loss": 0.379, + "step": 27610 + }, + { + "epoch": 5.2, + "grad_norm": 5.420136451721191, + "learning_rate": 9.602860907208734e-06, + "loss": 0.7466, + "step": 27620 + }, + { + "epoch": 5.2, + "grad_norm": 32.80744934082031, + "learning_rate": 9.599096555618296e-06, + "loss": 0.906, + "step": 27630 + }, + { + "epoch": 5.2, + "grad_norm": 21.18132209777832, + "learning_rate": 9.595332204027857e-06, + "loss": 0.8318, + "step": 27640 + }, + { + "epoch": 5.2, + "grad_norm": 7.528763771057129, + "learning_rate": 9.591567852437418e-06, + "loss": 0.6328, + "step": 27650 + }, + { + "epoch": 5.21, + "grad_norm": 24.74061393737793, + "learning_rate": 9.58780350084698e-06, + "loss": 0.5783, + "step": 27660 + }, + { + "epoch": 5.21, + "grad_norm": 15.430523872375488, + "learning_rate": 9.584039149256541e-06, + "loss": 0.554, + "step": 27670 + }, + { + "epoch": 5.21, + "grad_norm": 23.323516845703125, + "learning_rate": 9.580274797666103e-06, + "loss": 0.246, + "step": 27680 + }, + { + "epoch": 5.21, + "grad_norm": 29.96780776977539, + "learning_rate": 9.576510446075664e-06, + "loss": 0.7304, + "step": 27690 + }, + { + "epoch": 5.21, + "grad_norm": 19.847814559936523, + "learning_rate": 9.572746094485226e-06, + "loss": 0.4962, + "step": 27700 + }, + { + "epoch": 5.22, + "grad_norm": 11.185002326965332, + "learning_rate": 9.568981742894787e-06, + "loss": 0.4532, + "step": 27710 + }, + { + "epoch": 5.22, + "grad_norm": 12.05192756652832, + "learning_rate": 9.565217391304349e-06, + "loss": 0.6854, + "step": 27720 + }, + { + "epoch": 5.22, + "grad_norm": 18.75062370300293, + "learning_rate": 9.56145303971391e-06, + "loss": 0.7951, + "step": 27730 + }, + { + "epoch": 5.22, + "grad_norm": 5.429780960083008, + "learning_rate": 9.557688688123472e-06, + "loss": 0.5807, + "step": 27740 + }, + { + "epoch": 5.22, + "grad_norm": 6.862247943878174, + "learning_rate": 9.553924336533033e-06, + "loss": 0.4013, + "step": 27750 + }, + { + "epoch": 5.22, + "grad_norm": 2.499727487564087, + "learning_rate": 9.550159984942595e-06, + "loss": 0.5839, + "step": 27760 + }, + { + "epoch": 5.23, + "grad_norm": 20.708763122558594, + "learning_rate": 9.546395633352156e-06, + "loss": 0.5312, + "step": 27770 + }, + { + "epoch": 5.23, + "grad_norm": 35.32229995727539, + "learning_rate": 9.542631281761717e-06, + "loss": 0.5386, + "step": 27780 + }, + { + "epoch": 5.23, + "grad_norm": 4.731060028076172, + "learning_rate": 9.538866930171279e-06, + "loss": 0.3741, + "step": 27790 + }, + { + "epoch": 5.23, + "grad_norm": 15.941679000854492, + "learning_rate": 9.53510257858084e-06, + "loss": 0.5086, + "step": 27800 + }, + { + "epoch": 5.23, + "grad_norm": 33.2602653503418, + "learning_rate": 9.531338226990402e-06, + "loss": 0.6115, + "step": 27810 + }, + { + "epoch": 5.24, + "grad_norm": 10.409682273864746, + "learning_rate": 9.527573875399963e-06, + "loss": 0.3425, + "step": 27820 + }, + { + "epoch": 5.24, + "grad_norm": 18.776081085205078, + "learning_rate": 9.523809523809525e-06, + "loss": 0.4186, + "step": 27830 + }, + { + "epoch": 5.24, + "grad_norm": 45.04935073852539, + "learning_rate": 9.520045172219086e-06, + "loss": 0.5253, + "step": 27840 + }, + { + "epoch": 5.24, + "grad_norm": 8.127985954284668, + "learning_rate": 9.516280820628648e-06, + "loss": 0.5575, + "step": 27850 + }, + { + "epoch": 5.24, + "grad_norm": 3.796123504638672, + "learning_rate": 9.512516469038209e-06, + "loss": 0.3674, + "step": 27860 + }, + { + "epoch": 5.25, + "grad_norm": 3.369060754776001, + "learning_rate": 9.50875211744777e-06, + "loss": 0.539, + "step": 27870 + }, + { + "epoch": 5.25, + "grad_norm": 1.8295410871505737, + "learning_rate": 9.504987765857332e-06, + "loss": 0.3522, + "step": 27880 + }, + { + "epoch": 5.25, + "grad_norm": 21.568384170532227, + "learning_rate": 9.501223414266894e-06, + "loss": 0.4866, + "step": 27890 + }, + { + "epoch": 5.25, + "grad_norm": 6.660335540771484, + "learning_rate": 9.497459062676455e-06, + "loss": 0.6279, + "step": 27900 + }, + { + "epoch": 5.25, + "grad_norm": 0.142924964427948, + "learning_rate": 9.493694711086016e-06, + "loss": 0.7068, + "step": 27910 + }, + { + "epoch": 5.26, + "grad_norm": 0.3119409680366516, + "learning_rate": 9.489930359495578e-06, + "loss": 0.5291, + "step": 27920 + }, + { + "epoch": 5.26, + "grad_norm": 41.5748405456543, + "learning_rate": 9.48616600790514e-06, + "loss": 0.6962, + "step": 27930 + }, + { + "epoch": 5.26, + "grad_norm": 21.246931076049805, + "learning_rate": 9.4824016563147e-06, + "loss": 0.5247, + "step": 27940 + }, + { + "epoch": 5.26, + "grad_norm": 9.162127494812012, + "learning_rate": 9.478637304724262e-06, + "loss": 0.6259, + "step": 27950 + }, + { + "epoch": 5.26, + "grad_norm": 14.135149955749512, + "learning_rate": 9.474872953133824e-06, + "loss": 0.5191, + "step": 27960 + }, + { + "epoch": 5.26, + "grad_norm": 14.310225486755371, + "learning_rate": 9.471108601543385e-06, + "loss": 0.6222, + "step": 27970 + }, + { + "epoch": 5.27, + "grad_norm": 23.264753341674805, + "learning_rate": 9.467344249952947e-06, + "loss": 0.8251, + "step": 27980 + }, + { + "epoch": 5.27, + "grad_norm": 18.3626766204834, + "learning_rate": 9.463579898362508e-06, + "loss": 0.6605, + "step": 27990 + }, + { + "epoch": 5.27, + "grad_norm": 1.1121855974197388, + "learning_rate": 9.45981554677207e-06, + "loss": 0.396, + "step": 28000 + }, + { + "epoch": 5.27, + "grad_norm": 0.5820768475532532, + "learning_rate": 9.456051195181631e-06, + "loss": 0.3876, + "step": 28010 + }, + { + "epoch": 5.27, + "grad_norm": 19.84071159362793, + "learning_rate": 9.452286843591193e-06, + "loss": 0.2798, + "step": 28020 + }, + { + "epoch": 5.28, + "grad_norm": 14.78951644897461, + "learning_rate": 9.448522492000754e-06, + "loss": 0.7496, + "step": 28030 + }, + { + "epoch": 5.28, + "grad_norm": 1.7860188484191895, + "learning_rate": 9.444758140410315e-06, + "loss": 0.3007, + "step": 28040 + }, + { + "epoch": 5.28, + "grad_norm": 23.34420394897461, + "learning_rate": 9.440993788819877e-06, + "loss": 0.6097, + "step": 28050 + }, + { + "epoch": 5.28, + "grad_norm": 11.595169067382812, + "learning_rate": 9.437229437229438e-06, + "loss": 0.7458, + "step": 28060 + }, + { + "epoch": 5.28, + "grad_norm": 13.248278617858887, + "learning_rate": 9.433465085639e-06, + "loss": 0.4254, + "step": 28070 + }, + { + "epoch": 5.29, + "grad_norm": 9.730267524719238, + "learning_rate": 9.429700734048561e-06, + "loss": 0.2484, + "step": 28080 + }, + { + "epoch": 5.29, + "grad_norm": 10.137773513793945, + "learning_rate": 9.425936382458121e-06, + "loss": 0.5563, + "step": 28090 + }, + { + "epoch": 5.29, + "grad_norm": 9.176275253295898, + "learning_rate": 9.422172030867684e-06, + "loss": 0.7219, + "step": 28100 + }, + { + "epoch": 5.29, + "grad_norm": 18.387922286987305, + "learning_rate": 9.418407679277246e-06, + "loss": 0.4597, + "step": 28110 + }, + { + "epoch": 5.29, + "grad_norm": 5.12867546081543, + "learning_rate": 9.414643327686807e-06, + "loss": 0.6968, + "step": 28120 + }, + { + "epoch": 5.29, + "grad_norm": 5.446651935577393, + "learning_rate": 9.410878976096369e-06, + "loss": 0.5488, + "step": 28130 + }, + { + "epoch": 5.3, + "grad_norm": 11.691588401794434, + "learning_rate": 9.40711462450593e-06, + "loss": 0.3528, + "step": 28140 + }, + { + "epoch": 5.3, + "grad_norm": 5.106471538543701, + "learning_rate": 9.403350272915491e-06, + "loss": 0.5703, + "step": 28150 + }, + { + "epoch": 5.3, + "grad_norm": 22.144344329833984, + "learning_rate": 9.399585921325053e-06, + "loss": 0.5555, + "step": 28160 + }, + { + "epoch": 5.3, + "grad_norm": 11.630510330200195, + "learning_rate": 9.395821569734614e-06, + "loss": 0.6737, + "step": 28170 + }, + { + "epoch": 5.3, + "grad_norm": 0.5806583166122437, + "learning_rate": 9.392057218144174e-06, + "loss": 0.5265, + "step": 28180 + }, + { + "epoch": 5.31, + "grad_norm": 4.3490214347839355, + "learning_rate": 9.388292866553737e-06, + "loss": 0.7698, + "step": 28190 + }, + { + "epoch": 5.31, + "grad_norm": 19.73464584350586, + "learning_rate": 9.384528514963299e-06, + "loss": 0.3166, + "step": 28200 + }, + { + "epoch": 5.31, + "grad_norm": 8.744988441467285, + "learning_rate": 9.38076416337286e-06, + "loss": 0.5361, + "step": 28210 + }, + { + "epoch": 5.31, + "grad_norm": 13.170635223388672, + "learning_rate": 9.376999811782422e-06, + "loss": 0.5091, + "step": 28220 + }, + { + "epoch": 5.31, + "grad_norm": 3.8502869606018066, + "learning_rate": 9.373235460191983e-06, + "loss": 0.6684, + "step": 28230 + }, + { + "epoch": 5.32, + "grad_norm": 4.719302177429199, + "learning_rate": 9.369471108601545e-06, + "loss": 0.3989, + "step": 28240 + }, + { + "epoch": 5.32, + "grad_norm": 32.28798294067383, + "learning_rate": 9.365706757011106e-06, + "loss": 0.509, + "step": 28250 + }, + { + "epoch": 5.32, + "grad_norm": 17.933258056640625, + "learning_rate": 9.361942405420668e-06, + "loss": 0.5806, + "step": 28260 + }, + { + "epoch": 5.32, + "grad_norm": 4.121888637542725, + "learning_rate": 9.358178053830227e-06, + "loss": 0.4916, + "step": 28270 + }, + { + "epoch": 5.32, + "grad_norm": 7.130378723144531, + "learning_rate": 9.354413702239789e-06, + "loss": 0.4107, + "step": 28280 + }, + { + "epoch": 5.32, + "grad_norm": 14.70533561706543, + "learning_rate": 9.350649350649352e-06, + "loss": 0.581, + "step": 28290 + }, + { + "epoch": 5.33, + "grad_norm": 11.548562049865723, + "learning_rate": 9.346884999058913e-06, + "loss": 0.3762, + "step": 28300 + }, + { + "epoch": 5.33, + "grad_norm": 15.485182762145996, + "learning_rate": 9.343120647468475e-06, + "loss": 0.7006, + "step": 28310 + }, + { + "epoch": 5.33, + "grad_norm": 26.412918090820312, + "learning_rate": 9.339356295878036e-06, + "loss": 0.2112, + "step": 28320 + }, + { + "epoch": 5.33, + "grad_norm": 13.118706703186035, + "learning_rate": 9.335591944287598e-06, + "loss": 0.3918, + "step": 28330 + }, + { + "epoch": 5.33, + "grad_norm": 6.297987937927246, + "learning_rate": 9.33182759269716e-06, + "loss": 0.4208, + "step": 28340 + }, + { + "epoch": 5.34, + "grad_norm": 5.268683433532715, + "learning_rate": 9.32806324110672e-06, + "loss": 0.5968, + "step": 28350 + }, + { + "epoch": 5.34, + "grad_norm": 6.9022955894470215, + "learning_rate": 9.32429888951628e-06, + "loss": 0.5602, + "step": 28360 + }, + { + "epoch": 5.34, + "grad_norm": 1.084697961807251, + "learning_rate": 9.320534537925842e-06, + "loss": 0.9877, + "step": 28370 + }, + { + "epoch": 5.34, + "grad_norm": 18.268627166748047, + "learning_rate": 9.316770186335405e-06, + "loss": 0.5616, + "step": 28380 + }, + { + "epoch": 5.34, + "grad_norm": 11.595245361328125, + "learning_rate": 9.313005834744967e-06, + "loss": 0.3982, + "step": 28390 + }, + { + "epoch": 5.35, + "grad_norm": 15.181175231933594, + "learning_rate": 9.309241483154528e-06, + "loss": 0.4555, + "step": 28400 + }, + { + "epoch": 5.35, + "grad_norm": 20.354490280151367, + "learning_rate": 9.30547713156409e-06, + "loss": 0.4182, + "step": 28410 + }, + { + "epoch": 5.35, + "grad_norm": 2.795022487640381, + "learning_rate": 9.301712779973651e-06, + "loss": 0.629, + "step": 28420 + }, + { + "epoch": 5.35, + "grad_norm": 3.4192261695861816, + "learning_rate": 9.297948428383212e-06, + "loss": 0.6771, + "step": 28430 + }, + { + "epoch": 5.35, + "grad_norm": 18.82028579711914, + "learning_rate": 9.294184076792772e-06, + "loss": 0.6471, + "step": 28440 + }, + { + "epoch": 5.35, + "grad_norm": 10.711726188659668, + "learning_rate": 9.290419725202334e-06, + "loss": 0.707, + "step": 28450 + }, + { + "epoch": 5.36, + "grad_norm": 18.926769256591797, + "learning_rate": 9.286655373611895e-06, + "loss": 0.499, + "step": 28460 + }, + { + "epoch": 5.36, + "grad_norm": 6.421844482421875, + "learning_rate": 9.282891022021458e-06, + "loss": 0.506, + "step": 28470 + }, + { + "epoch": 5.36, + "grad_norm": 30.580154418945312, + "learning_rate": 9.27912667043102e-06, + "loss": 0.4126, + "step": 28480 + }, + { + "epoch": 5.36, + "grad_norm": 1.4444148540496826, + "learning_rate": 9.275362318840581e-06, + "loss": 0.321, + "step": 28490 + }, + { + "epoch": 5.36, + "grad_norm": 0.1973607838153839, + "learning_rate": 9.271597967250143e-06, + "loss": 0.6311, + "step": 28500 + }, + { + "epoch": 5.37, + "grad_norm": 4.095947265625, + "learning_rate": 9.267833615659704e-06, + "loss": 0.3348, + "step": 28510 + }, + { + "epoch": 5.37, + "grad_norm": 3.3935463428497314, + "learning_rate": 9.264069264069266e-06, + "loss": 0.7266, + "step": 28520 + }, + { + "epoch": 5.37, + "grad_norm": 1.5833213329315186, + "learning_rate": 9.260304912478825e-06, + "loss": 0.2602, + "step": 28530 + }, + { + "epoch": 5.37, + "grad_norm": 32.14379119873047, + "learning_rate": 9.256540560888387e-06, + "loss": 0.6631, + "step": 28540 + }, + { + "epoch": 5.37, + "grad_norm": 6.494132041931152, + "learning_rate": 9.252776209297948e-06, + "loss": 0.686, + "step": 28550 + }, + { + "epoch": 5.38, + "grad_norm": 7.422737121582031, + "learning_rate": 9.24901185770751e-06, + "loss": 0.7162, + "step": 28560 + }, + { + "epoch": 5.38, + "grad_norm": 21.754234313964844, + "learning_rate": 9.245247506117073e-06, + "loss": 0.5544, + "step": 28570 + }, + { + "epoch": 5.38, + "grad_norm": 11.023734092712402, + "learning_rate": 9.241483154526634e-06, + "loss": 0.7434, + "step": 28580 + }, + { + "epoch": 5.38, + "grad_norm": 14.888821601867676, + "learning_rate": 9.237718802936196e-06, + "loss": 0.4337, + "step": 28590 + }, + { + "epoch": 5.38, + "grad_norm": 6.61778450012207, + "learning_rate": 9.233954451345757e-06, + "loss": 0.7, + "step": 28600 + }, + { + "epoch": 5.38, + "grad_norm": 10.373224258422852, + "learning_rate": 9.230190099755319e-06, + "loss": 0.5232, + "step": 28610 + }, + { + "epoch": 5.39, + "grad_norm": 6.890915870666504, + "learning_rate": 9.226425748164878e-06, + "loss": 0.4981, + "step": 28620 + }, + { + "epoch": 5.39, + "grad_norm": 7.358579158782959, + "learning_rate": 9.22266139657444e-06, + "loss": 0.7168, + "step": 28630 + }, + { + "epoch": 5.39, + "grad_norm": 26.474361419677734, + "learning_rate": 9.218897044984001e-06, + "loss": 0.4434, + "step": 28640 + }, + { + "epoch": 5.39, + "grad_norm": 0.49774327874183655, + "learning_rate": 9.215132693393563e-06, + "loss": 0.4762, + "step": 28650 + }, + { + "epoch": 5.39, + "grad_norm": 0.7938456535339355, + "learning_rate": 9.211368341803126e-06, + "loss": 0.4004, + "step": 28660 + }, + { + "epoch": 5.4, + "grad_norm": 16.776710510253906, + "learning_rate": 9.207603990212687e-06, + "loss": 0.6284, + "step": 28670 + }, + { + "epoch": 5.4, + "grad_norm": 28.070096969604492, + "learning_rate": 9.203839638622249e-06, + "loss": 0.6981, + "step": 28680 + }, + { + "epoch": 5.4, + "grad_norm": 10.797050476074219, + "learning_rate": 9.20007528703181e-06, + "loss": 0.4466, + "step": 28690 + }, + { + "epoch": 5.4, + "grad_norm": 25.648643493652344, + "learning_rate": 9.19631093544137e-06, + "loss": 0.7307, + "step": 28700 + }, + { + "epoch": 5.4, + "grad_norm": 20.66429901123047, + "learning_rate": 9.192546583850932e-06, + "loss": 0.5789, + "step": 28710 + }, + { + "epoch": 5.41, + "grad_norm": 21.309707641601562, + "learning_rate": 9.188782232260493e-06, + "loss": 0.3682, + "step": 28720 + }, + { + "epoch": 5.41, + "grad_norm": 7.1550703048706055, + "learning_rate": 9.185017880670055e-06, + "loss": 0.3696, + "step": 28730 + }, + { + "epoch": 5.41, + "grad_norm": 19.179853439331055, + "learning_rate": 9.181253529079616e-06, + "loss": 0.4966, + "step": 28740 + }, + { + "epoch": 5.41, + "grad_norm": 12.53691291809082, + "learning_rate": 9.177489177489179e-06, + "loss": 0.4346, + "step": 28750 + }, + { + "epoch": 5.41, + "grad_norm": 17.41402244567871, + "learning_rate": 9.17372482589874e-06, + "loss": 0.469, + "step": 28760 + }, + { + "epoch": 5.42, + "grad_norm": 9.987686157226562, + "learning_rate": 9.169960474308302e-06, + "loss": 0.4145, + "step": 28770 + }, + { + "epoch": 5.42, + "grad_norm": 9.735391616821289, + "learning_rate": 9.166196122717864e-06, + "loss": 0.4125, + "step": 28780 + }, + { + "epoch": 5.42, + "grad_norm": 40.975223541259766, + "learning_rate": 9.162431771127423e-06, + "loss": 0.6056, + "step": 28790 + }, + { + "epoch": 5.42, + "grad_norm": 22.280542373657227, + "learning_rate": 9.158667419536985e-06, + "loss": 0.6244, + "step": 28800 + }, + { + "epoch": 5.42, + "grad_norm": 8.667016983032227, + "learning_rate": 9.154903067946546e-06, + "loss": 0.3564, + "step": 28810 + }, + { + "epoch": 5.42, + "grad_norm": 0.26233333349227905, + "learning_rate": 9.151138716356108e-06, + "loss": 0.5011, + "step": 28820 + }, + { + "epoch": 5.43, + "grad_norm": 8.251745223999023, + "learning_rate": 9.147374364765669e-06, + "loss": 0.5049, + "step": 28830 + }, + { + "epoch": 5.43, + "grad_norm": 18.818889617919922, + "learning_rate": 9.143610013175232e-06, + "loss": 0.3172, + "step": 28840 + }, + { + "epoch": 5.43, + "grad_norm": 26.497833251953125, + "learning_rate": 9.139845661584794e-06, + "loss": 0.5095, + "step": 28850 + }, + { + "epoch": 5.43, + "grad_norm": 8.361761093139648, + "learning_rate": 9.136081309994355e-06, + "loss": 0.5354, + "step": 28860 + }, + { + "epoch": 5.43, + "grad_norm": 17.610157012939453, + "learning_rate": 9.132316958403917e-06, + "loss": 0.6836, + "step": 28870 + }, + { + "epoch": 5.44, + "grad_norm": 7.032655715942383, + "learning_rate": 9.128552606813476e-06, + "loss": 0.5749, + "step": 28880 + }, + { + "epoch": 5.44, + "grad_norm": 18.669261932373047, + "learning_rate": 9.124788255223038e-06, + "loss": 0.5949, + "step": 28890 + }, + { + "epoch": 5.44, + "grad_norm": 12.058159828186035, + "learning_rate": 9.1210239036326e-06, + "loss": 0.5494, + "step": 28900 + }, + { + "epoch": 5.44, + "grad_norm": 0.7247989773750305, + "learning_rate": 9.11725955204216e-06, + "loss": 0.4002, + "step": 28910 + }, + { + "epoch": 5.44, + "grad_norm": 6.094273090362549, + "learning_rate": 9.113495200451722e-06, + "loss": 0.5376, + "step": 28920 + }, + { + "epoch": 5.45, + "grad_norm": 17.795515060424805, + "learning_rate": 9.109730848861284e-06, + "loss": 0.5065, + "step": 28930 + }, + { + "epoch": 5.45, + "grad_norm": 13.030588150024414, + "learning_rate": 9.105966497270847e-06, + "loss": 0.5984, + "step": 28940 + }, + { + "epoch": 5.45, + "grad_norm": 0.7247048020362854, + "learning_rate": 9.102202145680408e-06, + "loss": 0.6524, + "step": 28950 + }, + { + "epoch": 5.45, + "grad_norm": 0.13299456238746643, + "learning_rate": 9.09843779408997e-06, + "loss": 0.616, + "step": 28960 + }, + { + "epoch": 5.45, + "grad_norm": 5.874953746795654, + "learning_rate": 9.09467344249953e-06, + "loss": 0.6393, + "step": 28970 + }, + { + "epoch": 5.45, + "grad_norm": 14.227991104125977, + "learning_rate": 9.090909090909091e-06, + "loss": 0.3868, + "step": 28980 + }, + { + "epoch": 5.46, + "grad_norm": 8.069024085998535, + "learning_rate": 9.087144739318652e-06, + "loss": 0.6467, + "step": 28990 + }, + { + "epoch": 5.46, + "grad_norm": 12.728198051452637, + "learning_rate": 9.083380387728214e-06, + "loss": 0.8101, + "step": 29000 + }, + { + "epoch": 5.46, + "grad_norm": 31.68919563293457, + "learning_rate": 9.079616036137775e-06, + "loss": 0.4018, + "step": 29010 + }, + { + "epoch": 5.46, + "grad_norm": 37.730751037597656, + "learning_rate": 9.075851684547337e-06, + "loss": 0.3918, + "step": 29020 + }, + { + "epoch": 5.46, + "grad_norm": 32.44590759277344, + "learning_rate": 9.0720873329569e-06, + "loss": 0.5716, + "step": 29030 + }, + { + "epoch": 5.47, + "grad_norm": 3.4160537719726562, + "learning_rate": 9.068322981366461e-06, + "loss": 0.6425, + "step": 29040 + }, + { + "epoch": 5.47, + "grad_norm": 15.794535636901855, + "learning_rate": 9.064558629776021e-06, + "loss": 0.4578, + "step": 29050 + }, + { + "epoch": 5.47, + "grad_norm": 14.576927185058594, + "learning_rate": 9.060794278185583e-06, + "loss": 0.2114, + "step": 29060 + }, + { + "epoch": 5.47, + "grad_norm": 12.429288864135742, + "learning_rate": 9.057029926595144e-06, + "loss": 0.571, + "step": 29070 + }, + { + "epoch": 5.47, + "grad_norm": 12.511153221130371, + "learning_rate": 9.053265575004706e-06, + "loss": 0.5288, + "step": 29080 + }, + { + "epoch": 5.48, + "grad_norm": 11.03250789642334, + "learning_rate": 9.049501223414267e-06, + "loss": 0.6037, + "step": 29090 + }, + { + "epoch": 5.48, + "grad_norm": 14.617107391357422, + "learning_rate": 9.045736871823829e-06, + "loss": 0.7927, + "step": 29100 + }, + { + "epoch": 5.48, + "grad_norm": 10.177559852600098, + "learning_rate": 9.04197252023339e-06, + "loss": 0.5179, + "step": 29110 + }, + { + "epoch": 5.48, + "grad_norm": 1.5865446329116821, + "learning_rate": 9.038208168642953e-06, + "loss": 0.6091, + "step": 29120 + }, + { + "epoch": 5.48, + "grad_norm": 18.313032150268555, + "learning_rate": 9.034443817052515e-06, + "loss": 0.4665, + "step": 29130 + }, + { + "epoch": 5.48, + "grad_norm": 23.01014518737793, + "learning_rate": 9.030679465462074e-06, + "loss": 0.6638, + "step": 29140 + }, + { + "epoch": 5.49, + "grad_norm": 18.016878128051758, + "learning_rate": 9.026915113871636e-06, + "loss": 0.437, + "step": 29150 + }, + { + "epoch": 5.49, + "grad_norm": 19.193099975585938, + "learning_rate": 9.023150762281197e-06, + "loss": 0.864, + "step": 29160 + }, + { + "epoch": 5.49, + "grad_norm": 22.776002883911133, + "learning_rate": 9.019386410690759e-06, + "loss": 0.3799, + "step": 29170 + }, + { + "epoch": 5.49, + "grad_norm": 19.30347442626953, + "learning_rate": 9.01562205910032e-06, + "loss": 0.4335, + "step": 29180 + }, + { + "epoch": 5.49, + "grad_norm": 8.978057861328125, + "learning_rate": 9.011857707509882e-06, + "loss": 0.6437, + "step": 29190 + }, + { + "epoch": 5.5, + "grad_norm": 24.524974822998047, + "learning_rate": 9.008093355919443e-06, + "loss": 0.6654, + "step": 29200 + }, + { + "epoch": 5.5, + "grad_norm": 29.421937942504883, + "learning_rate": 9.004329004329005e-06, + "loss": 1.0022, + "step": 29210 + }, + { + "epoch": 5.5, + "grad_norm": 9.427165985107422, + "learning_rate": 9.000564652738568e-06, + "loss": 0.618, + "step": 29220 + }, + { + "epoch": 5.5, + "grad_norm": 3.5719804763793945, + "learning_rate": 8.996800301148128e-06, + "loss": 0.4899, + "step": 29230 + }, + { + "epoch": 5.5, + "grad_norm": 4.902514934539795, + "learning_rate": 8.993035949557689e-06, + "loss": 0.6078, + "step": 29240 + }, + { + "epoch": 5.51, + "grad_norm": 7.195416450500488, + "learning_rate": 8.98927159796725e-06, + "loss": 0.5264, + "step": 29250 + }, + { + "epoch": 5.51, + "grad_norm": 5.761614799499512, + "learning_rate": 8.985507246376812e-06, + "loss": 0.5512, + "step": 29260 + }, + { + "epoch": 5.51, + "grad_norm": 20.03443145751953, + "learning_rate": 8.981742894786373e-06, + "loss": 0.3887, + "step": 29270 + }, + { + "epoch": 5.51, + "grad_norm": 20.721759796142578, + "learning_rate": 8.977978543195935e-06, + "loss": 0.4945, + "step": 29280 + }, + { + "epoch": 5.51, + "grad_norm": 3.0783581733703613, + "learning_rate": 8.974214191605496e-06, + "loss": 0.5722, + "step": 29290 + }, + { + "epoch": 5.51, + "grad_norm": 5.727044105529785, + "learning_rate": 8.970449840015058e-06, + "loss": 0.9616, + "step": 29300 + }, + { + "epoch": 5.52, + "grad_norm": 12.99439811706543, + "learning_rate": 8.96668548842462e-06, + "loss": 0.282, + "step": 29310 + }, + { + "epoch": 5.52, + "grad_norm": 22.195281982421875, + "learning_rate": 8.96292113683418e-06, + "loss": 0.8487, + "step": 29320 + }, + { + "epoch": 5.52, + "grad_norm": 16.877878189086914, + "learning_rate": 8.959156785243742e-06, + "loss": 0.4369, + "step": 29330 + }, + { + "epoch": 5.52, + "grad_norm": 3.1448419094085693, + "learning_rate": 8.955392433653304e-06, + "loss": 0.4618, + "step": 29340 + }, + { + "epoch": 5.52, + "grad_norm": 11.687005996704102, + "learning_rate": 8.951628082062865e-06, + "loss": 0.5836, + "step": 29350 + }, + { + "epoch": 5.53, + "grad_norm": 52.1478385925293, + "learning_rate": 8.947863730472427e-06, + "loss": 0.8731, + "step": 29360 + }, + { + "epoch": 5.53, + "grad_norm": 8.80933952331543, + "learning_rate": 8.944099378881988e-06, + "loss": 0.6717, + "step": 29370 + }, + { + "epoch": 5.53, + "grad_norm": 18.5314884185791, + "learning_rate": 8.94033502729155e-06, + "loss": 0.4894, + "step": 29380 + }, + { + "epoch": 5.53, + "grad_norm": 11.329646110534668, + "learning_rate": 8.936570675701111e-06, + "loss": 0.4815, + "step": 29390 + }, + { + "epoch": 5.53, + "grad_norm": 21.230852127075195, + "learning_rate": 8.932806324110672e-06, + "loss": 0.4756, + "step": 29400 + }, + { + "epoch": 5.54, + "grad_norm": 8.987369537353516, + "learning_rate": 8.929041972520234e-06, + "loss": 0.553, + "step": 29410 + }, + { + "epoch": 5.54, + "grad_norm": 9.444402694702148, + "learning_rate": 8.925277620929795e-06, + "loss": 0.4838, + "step": 29420 + }, + { + "epoch": 5.54, + "grad_norm": 0.40830114483833313, + "learning_rate": 8.921513269339357e-06, + "loss": 0.7624, + "step": 29430 + }, + { + "epoch": 5.54, + "grad_norm": 0.5162389874458313, + "learning_rate": 8.917748917748918e-06, + "loss": 0.2015, + "step": 29440 + }, + { + "epoch": 5.54, + "grad_norm": 37.219818115234375, + "learning_rate": 8.91398456615848e-06, + "loss": 0.5263, + "step": 29450 + }, + { + "epoch": 5.54, + "grad_norm": 8.195032119750977, + "learning_rate": 8.910220214568041e-06, + "loss": 0.6158, + "step": 29460 + }, + { + "epoch": 5.55, + "grad_norm": 5.147323131561279, + "learning_rate": 8.906455862977603e-06, + "loss": 0.5521, + "step": 29470 + }, + { + "epoch": 5.55, + "grad_norm": 3.7888503074645996, + "learning_rate": 8.902691511387164e-06, + "loss": 0.357, + "step": 29480 + }, + { + "epoch": 5.55, + "grad_norm": 6.756317138671875, + "learning_rate": 8.898927159796726e-06, + "loss": 0.4016, + "step": 29490 + }, + { + "epoch": 5.55, + "grad_norm": 26.7462158203125, + "learning_rate": 8.895162808206287e-06, + "loss": 0.6225, + "step": 29500 + }, + { + "epoch": 5.55, + "grad_norm": 3.3406825065612793, + "learning_rate": 8.891398456615848e-06, + "loss": 0.5504, + "step": 29510 + }, + { + "epoch": 5.56, + "grad_norm": 7.413740634918213, + "learning_rate": 8.88763410502541e-06, + "loss": 0.4144, + "step": 29520 + }, + { + "epoch": 5.56, + "grad_norm": 0.5586540699005127, + "learning_rate": 8.883869753434971e-06, + "loss": 0.394, + "step": 29530 + }, + { + "epoch": 5.56, + "grad_norm": 28.930585861206055, + "learning_rate": 8.880105401844533e-06, + "loss": 0.436, + "step": 29540 + }, + { + "epoch": 5.56, + "grad_norm": 4.1949849128723145, + "learning_rate": 8.876341050254094e-06, + "loss": 0.7621, + "step": 29550 + }, + { + "epoch": 5.56, + "grad_norm": 15.377513885498047, + "learning_rate": 8.872576698663656e-06, + "loss": 0.3309, + "step": 29560 + }, + { + "epoch": 5.57, + "grad_norm": 10.37800121307373, + "learning_rate": 8.868812347073217e-06, + "loss": 0.5609, + "step": 29570 + }, + { + "epoch": 5.57, + "grad_norm": 16.737653732299805, + "learning_rate": 8.865047995482779e-06, + "loss": 0.5195, + "step": 29580 + }, + { + "epoch": 5.57, + "grad_norm": 10.057021141052246, + "learning_rate": 8.86128364389234e-06, + "loss": 0.5856, + "step": 29590 + }, + { + "epoch": 5.57, + "grad_norm": 18.19561195373535, + "learning_rate": 8.857519292301902e-06, + "loss": 0.6032, + "step": 29600 + }, + { + "epoch": 5.57, + "grad_norm": 8.191587448120117, + "learning_rate": 8.853754940711463e-06, + "loss": 0.3631, + "step": 29610 + }, + { + "epoch": 5.58, + "grad_norm": 14.722908973693848, + "learning_rate": 8.849990589121025e-06, + "loss": 0.9588, + "step": 29620 + }, + { + "epoch": 5.58, + "grad_norm": 25.523221969604492, + "learning_rate": 8.846226237530586e-06, + "loss": 0.7066, + "step": 29630 + }, + { + "epoch": 5.58, + "grad_norm": 9.714911460876465, + "learning_rate": 8.842461885940147e-06, + "loss": 0.4797, + "step": 29640 + }, + { + "epoch": 5.58, + "grad_norm": 5.88856315612793, + "learning_rate": 8.838697534349709e-06, + "loss": 0.4359, + "step": 29650 + }, + { + "epoch": 5.58, + "grad_norm": 12.061199188232422, + "learning_rate": 8.83493318275927e-06, + "loss": 0.5571, + "step": 29660 + }, + { + "epoch": 5.58, + "grad_norm": 18.20134162902832, + "learning_rate": 8.831168831168832e-06, + "loss": 0.3449, + "step": 29670 + }, + { + "epoch": 5.59, + "grad_norm": 12.034881591796875, + "learning_rate": 8.827404479578393e-06, + "loss": 0.7451, + "step": 29680 + }, + { + "epoch": 5.59, + "grad_norm": 4.537769317626953, + "learning_rate": 8.823640127987955e-06, + "loss": 0.7454, + "step": 29690 + }, + { + "epoch": 5.59, + "grad_norm": 39.31631088256836, + "learning_rate": 8.819875776397516e-06, + "loss": 0.3897, + "step": 29700 + }, + { + "epoch": 5.59, + "grad_norm": 20.764333724975586, + "learning_rate": 8.816111424807078e-06, + "loss": 0.4239, + "step": 29710 + }, + { + "epoch": 5.59, + "grad_norm": 4.3475823402404785, + "learning_rate": 8.812347073216639e-06, + "loss": 0.4626, + "step": 29720 + }, + { + "epoch": 5.6, + "grad_norm": 26.93927001953125, + "learning_rate": 8.8085827216262e-06, + "loss": 0.3852, + "step": 29730 + }, + { + "epoch": 5.6, + "grad_norm": 23.00481414794922, + "learning_rate": 8.804818370035762e-06, + "loss": 0.5371, + "step": 29740 + }, + { + "epoch": 5.6, + "grad_norm": 1.6085634231567383, + "learning_rate": 8.801054018445324e-06, + "loss": 0.3698, + "step": 29750 + }, + { + "epoch": 5.6, + "grad_norm": 13.103279113769531, + "learning_rate": 8.797289666854885e-06, + "loss": 0.6233, + "step": 29760 + }, + { + "epoch": 5.6, + "grad_norm": 14.591599464416504, + "learning_rate": 8.793525315264446e-06, + "loss": 0.4826, + "step": 29770 + }, + { + "epoch": 5.61, + "grad_norm": 23.947324752807617, + "learning_rate": 8.789760963674008e-06, + "loss": 0.5319, + "step": 29780 + }, + { + "epoch": 5.61, + "grad_norm": 11.421167373657227, + "learning_rate": 8.78599661208357e-06, + "loss": 0.5641, + "step": 29790 + }, + { + "epoch": 5.61, + "grad_norm": 4.573395252227783, + "learning_rate": 8.78223226049313e-06, + "loss": 0.6296, + "step": 29800 + }, + { + "epoch": 5.61, + "grad_norm": 30.208024978637695, + "learning_rate": 8.778467908902692e-06, + "loss": 0.4905, + "step": 29810 + }, + { + "epoch": 5.61, + "grad_norm": 33.18610763549805, + "learning_rate": 8.774703557312254e-06, + "loss": 0.7144, + "step": 29820 + }, + { + "epoch": 5.61, + "grad_norm": 34.4425048828125, + "learning_rate": 8.770939205721815e-06, + "loss": 0.8056, + "step": 29830 + }, + { + "epoch": 5.62, + "grad_norm": 9.860750198364258, + "learning_rate": 8.767174854131377e-06, + "loss": 0.4445, + "step": 29840 + }, + { + "epoch": 5.62, + "grad_norm": 10.247167587280273, + "learning_rate": 8.763410502540938e-06, + "loss": 0.472, + "step": 29850 + }, + { + "epoch": 5.62, + "grad_norm": 30.189830780029297, + "learning_rate": 8.7596461509505e-06, + "loss": 0.4546, + "step": 29860 + }, + { + "epoch": 5.62, + "grad_norm": 4.302022933959961, + "learning_rate": 8.755881799360061e-06, + "loss": 0.6976, + "step": 29870 + }, + { + "epoch": 5.62, + "grad_norm": 0.30874672532081604, + "learning_rate": 8.752117447769623e-06, + "loss": 0.711, + "step": 29880 + }, + { + "epoch": 5.63, + "grad_norm": 5.02794075012207, + "learning_rate": 8.748353096179184e-06, + "loss": 0.6327, + "step": 29890 + }, + { + "epoch": 5.63, + "grad_norm": 1.8799618482589722, + "learning_rate": 8.744588744588745e-06, + "loss": 0.3782, + "step": 29900 + }, + { + "epoch": 5.63, + "grad_norm": 12.888080596923828, + "learning_rate": 8.740824392998307e-06, + "loss": 0.4036, + "step": 29910 + }, + { + "epoch": 5.63, + "grad_norm": 24.87213897705078, + "learning_rate": 8.737060041407868e-06, + "loss": 0.51, + "step": 29920 + }, + { + "epoch": 5.63, + "grad_norm": 20.8959903717041, + "learning_rate": 8.73329568981743e-06, + "loss": 0.434, + "step": 29930 + }, + { + "epoch": 5.64, + "grad_norm": 15.75831413269043, + "learning_rate": 8.729531338226991e-06, + "loss": 0.5039, + "step": 29940 + }, + { + "epoch": 5.64, + "grad_norm": 6.654936790466309, + "learning_rate": 8.725766986636553e-06, + "loss": 0.6428, + "step": 29950 + }, + { + "epoch": 5.64, + "grad_norm": 12.773100852966309, + "learning_rate": 8.722002635046114e-06, + "loss": 0.5853, + "step": 29960 + }, + { + "epoch": 5.64, + "grad_norm": 19.850297927856445, + "learning_rate": 8.718238283455676e-06, + "loss": 0.7984, + "step": 29970 + }, + { + "epoch": 5.64, + "grad_norm": 11.2599458694458, + "learning_rate": 8.714473931865237e-06, + "loss": 0.7433, + "step": 29980 + }, + { + "epoch": 5.64, + "grad_norm": 14.740336418151855, + "learning_rate": 8.710709580274799e-06, + "loss": 0.448, + "step": 29990 + }, + { + "epoch": 5.65, + "grad_norm": 11.278069496154785, + "learning_rate": 8.70694522868436e-06, + "loss": 0.5628, + "step": 30000 + }, + { + "epoch": 5.65, + "grad_norm": 17.312114715576172, + "learning_rate": 8.703180877093921e-06, + "loss": 0.8434, + "step": 30010 + }, + { + "epoch": 5.65, + "grad_norm": 10.707963943481445, + "learning_rate": 8.699416525503483e-06, + "loss": 0.5479, + "step": 30020 + }, + { + "epoch": 5.65, + "grad_norm": 33.91600036621094, + "learning_rate": 8.695652173913044e-06, + "loss": 1.1438, + "step": 30030 + }, + { + "epoch": 5.65, + "grad_norm": 26.10776138305664, + "learning_rate": 8.691887822322606e-06, + "loss": 0.6822, + "step": 30040 + }, + { + "epoch": 5.66, + "grad_norm": 4.60282564163208, + "learning_rate": 8.688123470732167e-06, + "loss": 0.5197, + "step": 30050 + }, + { + "epoch": 5.66, + "grad_norm": 1.051890254020691, + "learning_rate": 8.684359119141729e-06, + "loss": 0.6023, + "step": 30060 + }, + { + "epoch": 5.66, + "grad_norm": 20.802509307861328, + "learning_rate": 8.68059476755129e-06, + "loss": 0.3724, + "step": 30070 + }, + { + "epoch": 5.66, + "grad_norm": 11.538596153259277, + "learning_rate": 8.676830415960852e-06, + "loss": 0.5301, + "step": 30080 + }, + { + "epoch": 5.66, + "grad_norm": 31.12389373779297, + "learning_rate": 8.673066064370413e-06, + "loss": 0.6386, + "step": 30090 + }, + { + "epoch": 5.67, + "grad_norm": 17.99003028869629, + "learning_rate": 8.669301712779975e-06, + "loss": 0.4503, + "step": 30100 + }, + { + "epoch": 5.67, + "grad_norm": 25.43760108947754, + "learning_rate": 8.665537361189536e-06, + "loss": 0.6649, + "step": 30110 + }, + { + "epoch": 5.67, + "grad_norm": 0.21140538156032562, + "learning_rate": 8.661773009599098e-06, + "loss": 0.3958, + "step": 30120 + }, + { + "epoch": 5.67, + "grad_norm": 6.105637550354004, + "learning_rate": 8.658008658008659e-06, + "loss": 0.7797, + "step": 30130 + }, + { + "epoch": 5.67, + "grad_norm": 3.348720073699951, + "learning_rate": 8.65424430641822e-06, + "loss": 0.3967, + "step": 30140 + }, + { + "epoch": 5.67, + "grad_norm": 17.45372772216797, + "learning_rate": 8.650479954827782e-06, + "loss": 0.4381, + "step": 30150 + }, + { + "epoch": 5.68, + "grad_norm": 5.134333610534668, + "learning_rate": 8.646715603237343e-06, + "loss": 0.6194, + "step": 30160 + }, + { + "epoch": 5.68, + "grad_norm": 28.561365127563477, + "learning_rate": 8.642951251646905e-06, + "loss": 0.8043, + "step": 30170 + }, + { + "epoch": 5.68, + "grad_norm": 9.468137741088867, + "learning_rate": 8.639186900056465e-06, + "loss": 0.5759, + "step": 30180 + }, + { + "epoch": 5.68, + "grad_norm": 5.760190010070801, + "learning_rate": 8.635422548466028e-06, + "loss": 0.6533, + "step": 30190 + }, + { + "epoch": 5.68, + "grad_norm": 22.225126266479492, + "learning_rate": 8.63165819687559e-06, + "loss": 0.7, + "step": 30200 + }, + { + "epoch": 5.69, + "grad_norm": 14.245889663696289, + "learning_rate": 8.62789384528515e-06, + "loss": 0.6829, + "step": 30210 + }, + { + "epoch": 5.69, + "grad_norm": 30.38525390625, + "learning_rate": 8.624129493694712e-06, + "loss": 0.5493, + "step": 30220 + }, + { + "epoch": 5.69, + "grad_norm": 16.88604164123535, + "learning_rate": 8.620365142104274e-06, + "loss": 0.6734, + "step": 30230 + }, + { + "epoch": 5.69, + "grad_norm": 23.054962158203125, + "learning_rate": 8.616600790513835e-06, + "loss": 0.6553, + "step": 30240 + }, + { + "epoch": 5.69, + "grad_norm": 16.953598022460938, + "learning_rate": 8.612836438923397e-06, + "loss": 0.6512, + "step": 30250 + }, + { + "epoch": 5.7, + "grad_norm": 22.24566078186035, + "learning_rate": 8.609072087332958e-06, + "loss": 0.5785, + "step": 30260 + }, + { + "epoch": 5.7, + "grad_norm": 0.3672504723072052, + "learning_rate": 8.605307735742518e-06, + "loss": 0.6382, + "step": 30270 + }, + { + "epoch": 5.7, + "grad_norm": 0.8988174796104431, + "learning_rate": 8.601543384152081e-06, + "loss": 0.4421, + "step": 30280 + }, + { + "epoch": 5.7, + "grad_norm": 15.05488395690918, + "learning_rate": 8.597779032561642e-06, + "loss": 0.4611, + "step": 30290 + }, + { + "epoch": 5.7, + "grad_norm": 24.66227912902832, + "learning_rate": 8.594014680971204e-06, + "loss": 0.5693, + "step": 30300 + }, + { + "epoch": 5.7, + "grad_norm": 24.09734344482422, + "learning_rate": 8.590250329380765e-06, + "loss": 0.4477, + "step": 30310 + }, + { + "epoch": 5.71, + "grad_norm": 9.480891227722168, + "learning_rate": 8.586485977790327e-06, + "loss": 0.5086, + "step": 30320 + }, + { + "epoch": 5.71, + "grad_norm": 41.71274185180664, + "learning_rate": 8.582721626199888e-06, + "loss": 0.8377, + "step": 30330 + }, + { + "epoch": 5.71, + "grad_norm": 30.149093627929688, + "learning_rate": 8.57895727460945e-06, + "loss": 0.6889, + "step": 30340 + }, + { + "epoch": 5.71, + "grad_norm": 14.241212844848633, + "learning_rate": 8.575192923019011e-06, + "loss": 0.6828, + "step": 30350 + }, + { + "epoch": 5.71, + "grad_norm": 8.023870468139648, + "learning_rate": 8.571428571428571e-06, + "loss": 0.5508, + "step": 30360 + }, + { + "epoch": 5.72, + "grad_norm": 1.6237149238586426, + "learning_rate": 8.567664219838134e-06, + "loss": 0.3401, + "step": 30370 + }, + { + "epoch": 5.72, + "grad_norm": 7.518566608428955, + "learning_rate": 8.563899868247696e-06, + "loss": 0.4684, + "step": 30380 + }, + { + "epoch": 5.72, + "grad_norm": 15.331060409545898, + "learning_rate": 8.560135516657257e-06, + "loss": 0.5097, + "step": 30390 + }, + { + "epoch": 5.72, + "grad_norm": 6.36647891998291, + "learning_rate": 8.556371165066818e-06, + "loss": 0.5214, + "step": 30400 + }, + { + "epoch": 5.72, + "grad_norm": 17.332448959350586, + "learning_rate": 8.55260681347638e-06, + "loss": 0.4498, + "step": 30410 + }, + { + "epoch": 5.73, + "grad_norm": 63.40143966674805, + "learning_rate": 8.548842461885941e-06, + "loss": 0.6639, + "step": 30420 + }, + { + "epoch": 5.73, + "grad_norm": 9.465449333190918, + "learning_rate": 8.545078110295503e-06, + "loss": 0.5251, + "step": 30430 + }, + { + "epoch": 5.73, + "grad_norm": 11.202866554260254, + "learning_rate": 8.541313758705064e-06, + "loss": 0.4721, + "step": 30440 + }, + { + "epoch": 5.73, + "grad_norm": 66.58721923828125, + "learning_rate": 8.537549407114624e-06, + "loss": 0.6034, + "step": 30450 + }, + { + "epoch": 5.73, + "grad_norm": 36.35587692260742, + "learning_rate": 8.533785055524186e-06, + "loss": 0.5666, + "step": 30460 + }, + { + "epoch": 5.73, + "grad_norm": 0.15961378812789917, + "learning_rate": 8.530020703933749e-06, + "loss": 0.7364, + "step": 30470 + }, + { + "epoch": 5.74, + "grad_norm": 6.215275287628174, + "learning_rate": 8.52625635234331e-06, + "loss": 0.4973, + "step": 30480 + }, + { + "epoch": 5.74, + "grad_norm": 29.853361129760742, + "learning_rate": 8.522492000752872e-06, + "loss": 0.6356, + "step": 30490 + }, + { + "epoch": 5.74, + "grad_norm": 18.131460189819336, + "learning_rate": 8.518727649162433e-06, + "loss": 0.7122, + "step": 30500 + }, + { + "epoch": 5.74, + "grad_norm": 17.9971866607666, + "learning_rate": 8.514963297571995e-06, + "loss": 0.784, + "step": 30510 + }, + { + "epoch": 5.74, + "grad_norm": 21.481882095336914, + "learning_rate": 8.511198945981556e-06, + "loss": 0.6354, + "step": 30520 + }, + { + "epoch": 5.75, + "grad_norm": 7.704098224639893, + "learning_rate": 8.507434594391116e-06, + "loss": 0.5242, + "step": 30530 + }, + { + "epoch": 5.75, + "grad_norm": 0.10782869905233383, + "learning_rate": 8.503670242800677e-06, + "loss": 0.3119, + "step": 30540 + }, + { + "epoch": 5.75, + "grad_norm": 9.13038158416748, + "learning_rate": 8.499905891210239e-06, + "loss": 0.5088, + "step": 30550 + }, + { + "epoch": 5.75, + "grad_norm": 1.3244730234146118, + "learning_rate": 8.496141539619802e-06, + "loss": 0.3711, + "step": 30560 + }, + { + "epoch": 5.75, + "grad_norm": 12.541553497314453, + "learning_rate": 8.492377188029363e-06, + "loss": 0.4415, + "step": 30570 + }, + { + "epoch": 5.76, + "grad_norm": 6.784567832946777, + "learning_rate": 8.488612836438925e-06, + "loss": 0.3377, + "step": 30580 + }, + { + "epoch": 5.76, + "grad_norm": 29.57703971862793, + "learning_rate": 8.484848484848486e-06, + "loss": 0.5329, + "step": 30590 + }, + { + "epoch": 5.76, + "grad_norm": 16.579954147338867, + "learning_rate": 8.481084133258048e-06, + "loss": 0.4068, + "step": 30600 + }, + { + "epoch": 5.76, + "grad_norm": 13.801152229309082, + "learning_rate": 8.477319781667609e-06, + "loss": 0.5752, + "step": 30610 + }, + { + "epoch": 5.76, + "grad_norm": 4.930255889892578, + "learning_rate": 8.473555430077169e-06, + "loss": 0.6852, + "step": 30620 + }, + { + "epoch": 5.77, + "grad_norm": 6.195820331573486, + "learning_rate": 8.46979107848673e-06, + "loss": 0.4997, + "step": 30630 + }, + { + "epoch": 5.77, + "grad_norm": 14.934310913085938, + "learning_rate": 8.466026726896292e-06, + "loss": 0.4644, + "step": 30640 + }, + { + "epoch": 5.77, + "grad_norm": 13.503046989440918, + "learning_rate": 8.462262375305855e-06, + "loss": 0.6239, + "step": 30650 + }, + { + "epoch": 5.77, + "grad_norm": 7.415577411651611, + "learning_rate": 8.458498023715416e-06, + "loss": 0.4476, + "step": 30660 + }, + { + "epoch": 5.77, + "grad_norm": 11.47219181060791, + "learning_rate": 8.454733672124978e-06, + "loss": 0.4326, + "step": 30670 + }, + { + "epoch": 5.77, + "grad_norm": 31.040847778320312, + "learning_rate": 8.45096932053454e-06, + "loss": 0.5231, + "step": 30680 + }, + { + "epoch": 5.78, + "grad_norm": 14.181180953979492, + "learning_rate": 8.4472049689441e-06, + "loss": 0.6028, + "step": 30690 + }, + { + "epoch": 5.78, + "grad_norm": 14.366339683532715, + "learning_rate": 8.443440617353662e-06, + "loss": 0.5395, + "step": 30700 + }, + { + "epoch": 5.78, + "grad_norm": 1.154944896697998, + "learning_rate": 8.439676265763222e-06, + "loss": 0.6325, + "step": 30710 + }, + { + "epoch": 5.78, + "grad_norm": 5.166085243225098, + "learning_rate": 8.435911914172784e-06, + "loss": 0.5345, + "step": 30720 + }, + { + "epoch": 5.78, + "grad_norm": 2.16483211517334, + "learning_rate": 8.432147562582345e-06, + "loss": 0.4621, + "step": 30730 + }, + { + "epoch": 5.79, + "grad_norm": 24.309677124023438, + "learning_rate": 8.428383210991906e-06, + "loss": 0.4354, + "step": 30740 + }, + { + "epoch": 5.79, + "grad_norm": 30.56849479675293, + "learning_rate": 8.42461885940147e-06, + "loss": 0.6885, + "step": 30750 + }, + { + "epoch": 5.79, + "grad_norm": 30.05738067626953, + "learning_rate": 8.420854507811031e-06, + "loss": 0.3176, + "step": 30760 + }, + { + "epoch": 5.79, + "grad_norm": 1.265407919883728, + "learning_rate": 8.417090156220593e-06, + "loss": 0.415, + "step": 30770 + }, + { + "epoch": 5.79, + "grad_norm": 1.8372256755828857, + "learning_rate": 8.413325804630154e-06, + "loss": 0.3897, + "step": 30780 + }, + { + "epoch": 5.8, + "grad_norm": 5.470216274261475, + "learning_rate": 8.409561453039714e-06, + "loss": 0.7885, + "step": 30790 + }, + { + "epoch": 5.8, + "grad_norm": 28.494230270385742, + "learning_rate": 8.405797101449275e-06, + "loss": 0.5577, + "step": 30800 + }, + { + "epoch": 5.8, + "grad_norm": 3.192377805709839, + "learning_rate": 8.402032749858837e-06, + "loss": 0.6976, + "step": 30810 + }, + { + "epoch": 5.8, + "grad_norm": 16.5463924407959, + "learning_rate": 8.398268398268398e-06, + "loss": 0.4397, + "step": 30820 + }, + { + "epoch": 5.8, + "grad_norm": 28.271276473999023, + "learning_rate": 8.39450404667796e-06, + "loss": 0.8356, + "step": 30830 + }, + { + "epoch": 5.8, + "grad_norm": 1.6185463666915894, + "learning_rate": 8.390739695087523e-06, + "loss": 0.5331, + "step": 30840 + }, + { + "epoch": 5.81, + "grad_norm": 27.62196159362793, + "learning_rate": 8.386975343497084e-06, + "loss": 0.4344, + "step": 30850 + }, + { + "epoch": 5.81, + "grad_norm": 47.22684097290039, + "learning_rate": 8.383210991906646e-06, + "loss": 0.7706, + "step": 30860 + }, + { + "epoch": 5.81, + "grad_norm": 9.242682456970215, + "learning_rate": 8.379446640316207e-06, + "loss": 0.6522, + "step": 30870 + }, + { + "epoch": 5.81, + "grad_norm": 9.148713111877441, + "learning_rate": 8.375682288725767e-06, + "loss": 0.4746, + "step": 30880 + }, + { + "epoch": 5.81, + "grad_norm": 24.679914474487305, + "learning_rate": 8.371917937135328e-06, + "loss": 0.6837, + "step": 30890 + }, + { + "epoch": 5.82, + "grad_norm": 14.974303245544434, + "learning_rate": 8.36815358554489e-06, + "loss": 0.6234, + "step": 30900 + }, + { + "epoch": 5.82, + "grad_norm": 17.962249755859375, + "learning_rate": 8.364389233954451e-06, + "loss": 0.507, + "step": 30910 + }, + { + "epoch": 5.82, + "grad_norm": 10.956331253051758, + "learning_rate": 8.360624882364013e-06, + "loss": 0.5623, + "step": 30920 + }, + { + "epoch": 5.82, + "grad_norm": 0.729061484336853, + "learning_rate": 8.356860530773576e-06, + "loss": 0.6084, + "step": 30930 + }, + { + "epoch": 5.82, + "grad_norm": 12.048168182373047, + "learning_rate": 8.353096179183137e-06, + "loss": 0.3989, + "step": 30940 + }, + { + "epoch": 5.83, + "grad_norm": 7.992116928100586, + "learning_rate": 8.349331827592699e-06, + "loss": 0.4077, + "step": 30950 + }, + { + "epoch": 5.83, + "grad_norm": 19.040918350219727, + "learning_rate": 8.34556747600226e-06, + "loss": 0.9477, + "step": 30960 + }, + { + "epoch": 5.83, + "grad_norm": 12.26267147064209, + "learning_rate": 8.34180312441182e-06, + "loss": 0.5282, + "step": 30970 + }, + { + "epoch": 5.83, + "grad_norm": 7.242198467254639, + "learning_rate": 8.338038772821381e-06, + "loss": 0.4611, + "step": 30980 + }, + { + "epoch": 5.83, + "grad_norm": 27.804534912109375, + "learning_rate": 8.334274421230943e-06, + "loss": 0.5615, + "step": 30990 + }, + { + "epoch": 5.83, + "grad_norm": 0.4633367955684662, + "learning_rate": 8.330510069640504e-06, + "loss": 0.533, + "step": 31000 + }, + { + "epoch": 5.84, + "grad_norm": 7.059324741363525, + "learning_rate": 8.326745718050066e-06, + "loss": 0.1811, + "step": 31010 + }, + { + "epoch": 5.84, + "grad_norm": 13.364731788635254, + "learning_rate": 8.322981366459629e-06, + "loss": 0.4192, + "step": 31020 + }, + { + "epoch": 5.84, + "grad_norm": 10.441876411437988, + "learning_rate": 8.31921701486919e-06, + "loss": 0.4543, + "step": 31030 + }, + { + "epoch": 5.84, + "grad_norm": 1.0615036487579346, + "learning_rate": 8.315452663278752e-06, + "loss": 0.4342, + "step": 31040 + }, + { + "epoch": 5.84, + "grad_norm": 0.598838210105896, + "learning_rate": 8.311688311688313e-06, + "loss": 0.6021, + "step": 31050 + }, + { + "epoch": 5.85, + "grad_norm": 32.29551315307617, + "learning_rate": 8.307923960097873e-06, + "loss": 0.5029, + "step": 31060 + }, + { + "epoch": 5.85, + "grad_norm": 42.38307571411133, + "learning_rate": 8.304159608507435e-06, + "loss": 0.4835, + "step": 31070 + }, + { + "epoch": 5.85, + "grad_norm": 0.41502639651298523, + "learning_rate": 8.300395256916996e-06, + "loss": 0.2942, + "step": 31080 + }, + { + "epoch": 5.85, + "grad_norm": 1.5998517274856567, + "learning_rate": 8.296630905326558e-06, + "loss": 0.4603, + "step": 31090 + }, + { + "epoch": 5.85, + "grad_norm": 18.18465232849121, + "learning_rate": 8.292866553736119e-06, + "loss": 0.5552, + "step": 31100 + }, + { + "epoch": 5.86, + "grad_norm": 19.617231369018555, + "learning_rate": 8.28910220214568e-06, + "loss": 0.5419, + "step": 31110 + }, + { + "epoch": 5.86, + "grad_norm": 33.43571472167969, + "learning_rate": 8.285337850555244e-06, + "loss": 0.3924, + "step": 31120 + }, + { + "epoch": 5.86, + "grad_norm": 14.464301109313965, + "learning_rate": 8.281573498964805e-06, + "loss": 0.5375, + "step": 31130 + }, + { + "epoch": 5.86, + "grad_norm": 15.446751594543457, + "learning_rate": 8.277809147374365e-06, + "loss": 0.2812, + "step": 31140 + }, + { + "epoch": 5.86, + "grad_norm": 24.76715087890625, + "learning_rate": 8.274044795783926e-06, + "loss": 0.4512, + "step": 31150 + }, + { + "epoch": 5.86, + "grad_norm": 19.724828720092773, + "learning_rate": 8.270280444193488e-06, + "loss": 0.5171, + "step": 31160 + }, + { + "epoch": 5.87, + "grad_norm": 6.281129837036133, + "learning_rate": 8.26651609260305e-06, + "loss": 0.3646, + "step": 31170 + }, + { + "epoch": 5.87, + "grad_norm": 0.5755620002746582, + "learning_rate": 8.26275174101261e-06, + "loss": 0.6023, + "step": 31180 + }, + { + "epoch": 5.87, + "grad_norm": 24.553237915039062, + "learning_rate": 8.258987389422172e-06, + "loss": 0.4937, + "step": 31190 + }, + { + "epoch": 5.87, + "grad_norm": 25.367462158203125, + "learning_rate": 8.255223037831734e-06, + "loss": 0.4216, + "step": 31200 + }, + { + "epoch": 5.87, + "grad_norm": 5.50634765625, + "learning_rate": 8.251458686241297e-06, + "loss": 0.3745, + "step": 31210 + }, + { + "epoch": 5.88, + "grad_norm": 31.35790252685547, + "learning_rate": 8.247694334650858e-06, + "loss": 0.6542, + "step": 31220 + }, + { + "epoch": 5.88, + "grad_norm": 18.49880027770996, + "learning_rate": 8.243929983060418e-06, + "loss": 1.0202, + "step": 31230 + }, + { + "epoch": 5.88, + "grad_norm": 0.294521301984787, + "learning_rate": 8.24016563146998e-06, + "loss": 0.2461, + "step": 31240 + }, + { + "epoch": 5.88, + "grad_norm": 2.930119276046753, + "learning_rate": 8.236401279879541e-06, + "loss": 0.4741, + "step": 31250 + }, + { + "epoch": 5.88, + "grad_norm": 14.062408447265625, + "learning_rate": 8.232636928289102e-06, + "loss": 0.5901, + "step": 31260 + }, + { + "epoch": 5.89, + "grad_norm": 1.9135679006576538, + "learning_rate": 8.228872576698664e-06, + "loss": 0.3728, + "step": 31270 + }, + { + "epoch": 5.89, + "grad_norm": 12.453121185302734, + "learning_rate": 8.225108225108225e-06, + "loss": 0.612, + "step": 31280 + }, + { + "epoch": 5.89, + "grad_norm": 18.037458419799805, + "learning_rate": 8.221343873517787e-06, + "loss": 0.6464, + "step": 31290 + }, + { + "epoch": 5.89, + "grad_norm": 14.36330795288086, + "learning_rate": 8.21757952192735e-06, + "loss": 0.6451, + "step": 31300 + }, + { + "epoch": 5.89, + "grad_norm": 17.178071975708008, + "learning_rate": 8.213815170336911e-06, + "loss": 0.4489, + "step": 31310 + }, + { + "epoch": 5.89, + "grad_norm": 3.543149948120117, + "learning_rate": 8.210050818746471e-06, + "loss": 0.8048, + "step": 31320 + }, + { + "epoch": 5.9, + "grad_norm": 8.747088432312012, + "learning_rate": 8.206286467156033e-06, + "loss": 0.5323, + "step": 31330 + }, + { + "epoch": 5.9, + "grad_norm": 10.16124439239502, + "learning_rate": 8.202522115565594e-06, + "loss": 0.2721, + "step": 31340 + }, + { + "epoch": 5.9, + "grad_norm": 25.606826782226562, + "learning_rate": 8.198757763975156e-06, + "loss": 0.3302, + "step": 31350 + }, + { + "epoch": 5.9, + "grad_norm": 10.258896827697754, + "learning_rate": 8.194993412384717e-06, + "loss": 0.4167, + "step": 31360 + }, + { + "epoch": 5.9, + "grad_norm": 11.452646255493164, + "learning_rate": 8.191229060794278e-06, + "loss": 0.5729, + "step": 31370 + }, + { + "epoch": 5.91, + "grad_norm": 20.775026321411133, + "learning_rate": 8.18746470920384e-06, + "loss": 0.6972, + "step": 31380 + }, + { + "epoch": 5.91, + "grad_norm": 0.9261711239814758, + "learning_rate": 8.183700357613401e-06, + "loss": 0.4341, + "step": 31390 + }, + { + "epoch": 5.91, + "grad_norm": 23.06140899658203, + "learning_rate": 8.179936006022963e-06, + "loss": 0.6072, + "step": 31400 + }, + { + "epoch": 5.91, + "grad_norm": 0.12934058904647827, + "learning_rate": 8.176171654432524e-06, + "loss": 0.5984, + "step": 31410 + }, + { + "epoch": 5.91, + "grad_norm": 11.607454299926758, + "learning_rate": 8.172407302842086e-06, + "loss": 0.4704, + "step": 31420 + }, + { + "epoch": 5.92, + "grad_norm": 17.176578521728516, + "learning_rate": 8.168642951251647e-06, + "loss": 0.6911, + "step": 31430 + }, + { + "epoch": 5.92, + "grad_norm": 21.022741317749023, + "learning_rate": 8.164878599661209e-06, + "loss": 0.5338, + "step": 31440 + }, + { + "epoch": 5.92, + "grad_norm": 1.4555150270462036, + "learning_rate": 8.16111424807077e-06, + "loss": 0.5478, + "step": 31450 + }, + { + "epoch": 5.92, + "grad_norm": 0.8523018956184387, + "learning_rate": 8.157349896480332e-06, + "loss": 0.3781, + "step": 31460 + }, + { + "epoch": 5.92, + "grad_norm": 0.12009706348180771, + "learning_rate": 8.153585544889893e-06, + "loss": 0.5797, + "step": 31470 + }, + { + "epoch": 5.93, + "grad_norm": 3.4244158267974854, + "learning_rate": 8.149821193299455e-06, + "loss": 0.2886, + "step": 31480 + }, + { + "epoch": 5.93, + "grad_norm": 5.182470798492432, + "learning_rate": 8.146056841709016e-06, + "loss": 0.4928, + "step": 31490 + }, + { + "epoch": 5.93, + "grad_norm": 37.03120040893555, + "learning_rate": 8.142292490118577e-06, + "loss": 0.5429, + "step": 31500 + }, + { + "epoch": 5.93, + "grad_norm": 10.818685531616211, + "learning_rate": 8.138528138528139e-06, + "loss": 0.7055, + "step": 31510 + }, + { + "epoch": 5.93, + "grad_norm": 19.673473358154297, + "learning_rate": 8.1347637869377e-06, + "loss": 0.7041, + "step": 31520 + }, + { + "epoch": 5.93, + "grad_norm": 19.44447135925293, + "learning_rate": 8.130999435347262e-06, + "loss": 0.4307, + "step": 31530 + }, + { + "epoch": 5.94, + "grad_norm": 15.414891242980957, + "learning_rate": 8.127235083756823e-06, + "loss": 0.433, + "step": 31540 + }, + { + "epoch": 5.94, + "grad_norm": 1.3869212865829468, + "learning_rate": 8.123470732166385e-06, + "loss": 0.4351, + "step": 31550 + }, + { + "epoch": 5.94, + "grad_norm": 6.919072151184082, + "learning_rate": 8.119706380575946e-06, + "loss": 0.4183, + "step": 31560 + }, + { + "epoch": 5.94, + "grad_norm": 18.920053482055664, + "learning_rate": 8.115942028985508e-06, + "loss": 0.6036, + "step": 31570 + }, + { + "epoch": 5.94, + "grad_norm": 23.368011474609375, + "learning_rate": 8.112177677395069e-06, + "loss": 0.4688, + "step": 31580 + }, + { + "epoch": 5.95, + "grad_norm": 43.60259246826172, + "learning_rate": 8.10841332580463e-06, + "loss": 0.2959, + "step": 31590 + }, + { + "epoch": 5.95, + "grad_norm": 3.7617850303649902, + "learning_rate": 8.104648974214192e-06, + "loss": 0.3229, + "step": 31600 + }, + { + "epoch": 5.95, + "grad_norm": 28.480266571044922, + "learning_rate": 8.100884622623754e-06, + "loss": 0.5828, + "step": 31610 + }, + { + "epoch": 5.95, + "grad_norm": 6.059977054595947, + "learning_rate": 8.097120271033315e-06, + "loss": 0.6493, + "step": 31620 + }, + { + "epoch": 5.95, + "grad_norm": 21.183731079101562, + "learning_rate": 8.093355919442876e-06, + "loss": 0.4919, + "step": 31630 + }, + { + "epoch": 5.96, + "grad_norm": 0.6658665537834167, + "learning_rate": 8.089591567852438e-06, + "loss": 0.3223, + "step": 31640 + }, + { + "epoch": 5.96, + "grad_norm": 6.641610622406006, + "learning_rate": 8.085827216262e-06, + "loss": 0.452, + "step": 31650 + }, + { + "epoch": 5.96, + "grad_norm": 2.345966339111328, + "learning_rate": 8.08206286467156e-06, + "loss": 0.4293, + "step": 31660 + }, + { + "epoch": 5.96, + "grad_norm": 21.061580657958984, + "learning_rate": 8.078298513081122e-06, + "loss": 0.5237, + "step": 31670 + }, + { + "epoch": 5.96, + "grad_norm": 20.903779983520508, + "learning_rate": 8.074534161490684e-06, + "loss": 0.4802, + "step": 31680 + }, + { + "epoch": 5.96, + "grad_norm": 13.212108612060547, + "learning_rate": 8.070769809900245e-06, + "loss": 0.674, + "step": 31690 + }, + { + "epoch": 5.97, + "grad_norm": 12.111379623413086, + "learning_rate": 8.067005458309807e-06, + "loss": 0.519, + "step": 31700 + }, + { + "epoch": 5.97, + "grad_norm": 21.36750030517578, + "learning_rate": 8.063241106719368e-06, + "loss": 0.6218, + "step": 31710 + }, + { + "epoch": 5.97, + "grad_norm": 35.358192443847656, + "learning_rate": 8.05947675512893e-06, + "loss": 0.5728, + "step": 31720 + }, + { + "epoch": 5.97, + "grad_norm": 16.899436950683594, + "learning_rate": 8.055712403538491e-06, + "loss": 0.3583, + "step": 31730 + }, + { + "epoch": 5.97, + "grad_norm": 27.319355010986328, + "learning_rate": 8.051948051948052e-06, + "loss": 0.5901, + "step": 31740 + }, + { + "epoch": 5.98, + "grad_norm": 14.794363021850586, + "learning_rate": 8.048183700357614e-06, + "loss": 0.5036, + "step": 31750 + }, + { + "epoch": 5.98, + "grad_norm": 0.04856634885072708, + "learning_rate": 8.044419348767175e-06, + "loss": 0.4173, + "step": 31760 + }, + { + "epoch": 5.98, + "grad_norm": 16.557096481323242, + "learning_rate": 8.040654997176737e-06, + "loss": 0.6479, + "step": 31770 + }, + { + "epoch": 5.98, + "grad_norm": 12.90949821472168, + "learning_rate": 8.036890645586298e-06, + "loss": 0.5475, + "step": 31780 + }, + { + "epoch": 5.98, + "grad_norm": 23.995361328125, + "learning_rate": 8.03312629399586e-06, + "loss": 0.4879, + "step": 31790 + }, + { + "epoch": 5.99, + "grad_norm": 15.855318069458008, + "learning_rate": 8.029361942405421e-06, + "loss": 0.5636, + "step": 31800 + }, + { + "epoch": 5.99, + "grad_norm": 17.926624298095703, + "learning_rate": 8.025597590814983e-06, + "loss": 0.6181, + "step": 31810 + }, + { + "epoch": 5.99, + "grad_norm": 25.674468994140625, + "learning_rate": 8.021833239224544e-06, + "loss": 0.5092, + "step": 31820 + }, + { + "epoch": 5.99, + "grad_norm": 15.86432933807373, + "learning_rate": 8.018068887634106e-06, + "loss": 0.5039, + "step": 31830 + }, + { + "epoch": 5.99, + "grad_norm": 16.778894424438477, + "learning_rate": 8.014304536043667e-06, + "loss": 0.53, + "step": 31840 + }, + { + "epoch": 5.99, + "grad_norm": 20.491600036621094, + "learning_rate": 8.010540184453229e-06, + "loss": 0.5336, + "step": 31850 + }, + { + "epoch": 6.0, + "grad_norm": 22.663936614990234, + "learning_rate": 8.00677583286279e-06, + "loss": 0.2657, + "step": 31860 + }, + { + "epoch": 6.0, + "grad_norm": 30.6973876953125, + "learning_rate": 8.003011481272351e-06, + "loss": 0.3944, + "step": 31870 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.9201333333333334, + "eval_loss": 0.3082124590873718, + "eval_runtime": 51.3947, + "eval_samples_per_second": 145.929, + "eval_steps_per_second": 18.251, + "step": 31878 + }, + { + "epoch": 6.0, + "grad_norm": 16.97609519958496, + "learning_rate": 7.999247129681913e-06, + "loss": 0.5414, + "step": 31880 + }, + { + "epoch": 6.0, + "grad_norm": 34.15602111816406, + "learning_rate": 7.995482778091474e-06, + "loss": 0.7876, + "step": 31890 + }, + { + "epoch": 6.0, + "grad_norm": 18.55365753173828, + "learning_rate": 7.991718426501036e-06, + "loss": 0.5426, + "step": 31900 + }, + { + "epoch": 6.01, + "grad_norm": 23.530202865600586, + "learning_rate": 7.987954074910597e-06, + "loss": 0.5966, + "step": 31910 + }, + { + "epoch": 6.01, + "grad_norm": 22.947160720825195, + "learning_rate": 7.984189723320159e-06, + "loss": 0.5899, + "step": 31920 + }, + { + "epoch": 6.01, + "grad_norm": 21.2280216217041, + "learning_rate": 7.98042537172972e-06, + "loss": 0.2858, + "step": 31930 + }, + { + "epoch": 6.01, + "grad_norm": 13.020065307617188, + "learning_rate": 7.976661020139282e-06, + "loss": 0.6148, + "step": 31940 + }, + { + "epoch": 6.01, + "grad_norm": 25.60552978515625, + "learning_rate": 7.972896668548843e-06, + "loss": 0.4152, + "step": 31950 + }, + { + "epoch": 6.02, + "grad_norm": 0.053030744194984436, + "learning_rate": 7.969132316958405e-06, + "loss": 0.4397, + "step": 31960 + }, + { + "epoch": 6.02, + "grad_norm": 18.835439682006836, + "learning_rate": 7.965367965367966e-06, + "loss": 0.574, + "step": 31970 + }, + { + "epoch": 6.02, + "grad_norm": 13.809188842773438, + "learning_rate": 7.961603613777528e-06, + "loss": 0.3238, + "step": 31980 + }, + { + "epoch": 6.02, + "grad_norm": 17.327247619628906, + "learning_rate": 7.957839262187089e-06, + "loss": 0.7307, + "step": 31990 + }, + { + "epoch": 6.02, + "grad_norm": 9.818562507629395, + "learning_rate": 7.95407491059665e-06, + "loss": 0.4293, + "step": 32000 + }, + { + "epoch": 6.02, + "grad_norm": 0.10741060227155685, + "learning_rate": 7.950310559006212e-06, + "loss": 0.3728, + "step": 32010 + }, + { + "epoch": 6.03, + "grad_norm": 12.27418041229248, + "learning_rate": 7.946546207415773e-06, + "loss": 0.5624, + "step": 32020 + }, + { + "epoch": 6.03, + "grad_norm": 24.412097930908203, + "learning_rate": 7.942781855825335e-06, + "loss": 0.3476, + "step": 32030 + }, + { + "epoch": 6.03, + "grad_norm": 11.939266204833984, + "learning_rate": 7.939017504234896e-06, + "loss": 0.6026, + "step": 32040 + }, + { + "epoch": 6.03, + "grad_norm": 1.6670972108840942, + "learning_rate": 7.935253152644458e-06, + "loss": 0.461, + "step": 32050 + }, + { + "epoch": 6.03, + "grad_norm": 18.398563385009766, + "learning_rate": 7.93148880105402e-06, + "loss": 0.8113, + "step": 32060 + }, + { + "epoch": 6.04, + "grad_norm": 7.880075454711914, + "learning_rate": 7.92772444946358e-06, + "loss": 0.3885, + "step": 32070 + }, + { + "epoch": 6.04, + "grad_norm": 12.342472076416016, + "learning_rate": 7.923960097873142e-06, + "loss": 0.251, + "step": 32080 + }, + { + "epoch": 6.04, + "grad_norm": 49.20840072631836, + "learning_rate": 7.920195746282704e-06, + "loss": 0.5007, + "step": 32090 + }, + { + "epoch": 6.04, + "grad_norm": 17.772216796875, + "learning_rate": 7.916431394692265e-06, + "loss": 0.3225, + "step": 32100 + }, + { + "epoch": 6.04, + "grad_norm": 53.0165901184082, + "learning_rate": 7.912667043101827e-06, + "loss": 0.597, + "step": 32110 + }, + { + "epoch": 6.05, + "grad_norm": 16.379194259643555, + "learning_rate": 7.908902691511388e-06, + "loss": 0.4052, + "step": 32120 + }, + { + "epoch": 6.05, + "grad_norm": 0.09049464017152786, + "learning_rate": 7.90513833992095e-06, + "loss": 0.777, + "step": 32130 + }, + { + "epoch": 6.05, + "grad_norm": 8.671985626220703, + "learning_rate": 7.901373988330511e-06, + "loss": 0.8194, + "step": 32140 + }, + { + "epoch": 6.05, + "grad_norm": 34.3663215637207, + "learning_rate": 7.897609636740072e-06, + "loss": 0.6592, + "step": 32150 + }, + { + "epoch": 6.05, + "grad_norm": 1.3141288757324219, + "learning_rate": 7.893845285149634e-06, + "loss": 0.3544, + "step": 32160 + }, + { + "epoch": 6.05, + "grad_norm": 17.053375244140625, + "learning_rate": 7.890080933559195e-06, + "loss": 0.7546, + "step": 32170 + }, + { + "epoch": 6.06, + "grad_norm": 40.75886917114258, + "learning_rate": 7.886316581968757e-06, + "loss": 0.5418, + "step": 32180 + }, + { + "epoch": 6.06, + "grad_norm": 9.297530174255371, + "learning_rate": 7.882552230378318e-06, + "loss": 0.6633, + "step": 32190 + }, + { + "epoch": 6.06, + "grad_norm": 13.147290229797363, + "learning_rate": 7.87878787878788e-06, + "loss": 0.4542, + "step": 32200 + }, + { + "epoch": 6.06, + "grad_norm": 31.284879684448242, + "learning_rate": 7.875023527197441e-06, + "loss": 0.4669, + "step": 32210 + }, + { + "epoch": 6.06, + "grad_norm": 15.669341087341309, + "learning_rate": 7.871259175607003e-06, + "loss": 0.1938, + "step": 32220 + }, + { + "epoch": 6.07, + "grad_norm": 44.40217590332031, + "learning_rate": 7.867494824016564e-06, + "loss": 0.3318, + "step": 32230 + }, + { + "epoch": 6.07, + "grad_norm": 6.310189247131348, + "learning_rate": 7.863730472426126e-06, + "loss": 0.6378, + "step": 32240 + }, + { + "epoch": 6.07, + "grad_norm": 13.212310791015625, + "learning_rate": 7.859966120835687e-06, + "loss": 0.6015, + "step": 32250 + }, + { + "epoch": 6.07, + "grad_norm": 15.912529945373535, + "learning_rate": 7.856201769245248e-06, + "loss": 0.5288, + "step": 32260 + }, + { + "epoch": 6.07, + "grad_norm": 1.8978145122528076, + "learning_rate": 7.852437417654808e-06, + "loss": 0.4826, + "step": 32270 + }, + { + "epoch": 6.08, + "grad_norm": 39.089073181152344, + "learning_rate": 7.848673066064371e-06, + "loss": 0.5948, + "step": 32280 + }, + { + "epoch": 6.08, + "grad_norm": 8.845064163208008, + "learning_rate": 7.844908714473933e-06, + "loss": 0.4591, + "step": 32290 + }, + { + "epoch": 6.08, + "grad_norm": 20.97511100769043, + "learning_rate": 7.841144362883494e-06, + "loss": 0.6119, + "step": 32300 + }, + { + "epoch": 6.08, + "grad_norm": 8.488319396972656, + "learning_rate": 7.837380011293056e-06, + "loss": 0.6892, + "step": 32310 + }, + { + "epoch": 6.08, + "grad_norm": 38.96818542480469, + "learning_rate": 7.833615659702617e-06, + "loss": 0.3597, + "step": 32320 + }, + { + "epoch": 6.09, + "grad_norm": 1.3680989742279053, + "learning_rate": 7.829851308112179e-06, + "loss": 0.5009, + "step": 32330 + }, + { + "epoch": 6.09, + "grad_norm": 26.558027267456055, + "learning_rate": 7.82608695652174e-06, + "loss": 0.6395, + "step": 32340 + }, + { + "epoch": 6.09, + "grad_norm": 17.90494155883789, + "learning_rate": 7.822322604931302e-06, + "loss": 0.6256, + "step": 32350 + }, + { + "epoch": 6.09, + "grad_norm": 11.778217315673828, + "learning_rate": 7.818558253340861e-06, + "loss": 0.3306, + "step": 32360 + }, + { + "epoch": 6.09, + "grad_norm": 18.40743637084961, + "learning_rate": 7.814793901750425e-06, + "loss": 0.4534, + "step": 32370 + }, + { + "epoch": 6.09, + "grad_norm": 16.67788314819336, + "learning_rate": 7.811029550159986e-06, + "loss": 0.6644, + "step": 32380 + }, + { + "epoch": 6.1, + "grad_norm": 0.07633399218320847, + "learning_rate": 7.807265198569547e-06, + "loss": 0.5102, + "step": 32390 + }, + { + "epoch": 6.1, + "grad_norm": 15.977582931518555, + "learning_rate": 7.803500846979109e-06, + "loss": 0.5423, + "step": 32400 + }, + { + "epoch": 6.1, + "grad_norm": 53.08889389038086, + "learning_rate": 7.79973649538867e-06, + "loss": 0.596, + "step": 32410 + }, + { + "epoch": 6.1, + "grad_norm": 7.981726169586182, + "learning_rate": 7.795972143798232e-06, + "loss": 0.6868, + "step": 32420 + }, + { + "epoch": 6.1, + "grad_norm": 22.594266891479492, + "learning_rate": 7.792207792207793e-06, + "loss": 0.6394, + "step": 32430 + }, + { + "epoch": 6.11, + "grad_norm": 18.69639778137207, + "learning_rate": 7.788443440617355e-06, + "loss": 0.4663, + "step": 32440 + }, + { + "epoch": 6.11, + "grad_norm": 11.259078025817871, + "learning_rate": 7.784679089026915e-06, + "loss": 0.4948, + "step": 32450 + }, + { + "epoch": 6.11, + "grad_norm": 6.536806583404541, + "learning_rate": 7.780914737436478e-06, + "loss": 0.5363, + "step": 32460 + }, + { + "epoch": 6.11, + "grad_norm": 24.416154861450195, + "learning_rate": 7.777150385846039e-06, + "loss": 0.6788, + "step": 32470 + }, + { + "epoch": 6.11, + "grad_norm": 7.047547340393066, + "learning_rate": 7.7733860342556e-06, + "loss": 0.6391, + "step": 32480 + }, + { + "epoch": 6.12, + "grad_norm": 32.992122650146484, + "learning_rate": 7.769621682665162e-06, + "loss": 0.4505, + "step": 32490 + }, + { + "epoch": 6.12, + "grad_norm": 21.74233055114746, + "learning_rate": 7.765857331074724e-06, + "loss": 0.441, + "step": 32500 + }, + { + "epoch": 6.12, + "grad_norm": 2.9683542251586914, + "learning_rate": 7.762092979484285e-06, + "loss": 0.4137, + "step": 32510 + }, + { + "epoch": 6.12, + "grad_norm": 6.940702438354492, + "learning_rate": 7.758328627893846e-06, + "loss": 0.3561, + "step": 32520 + }, + { + "epoch": 6.12, + "grad_norm": 13.663592338562012, + "learning_rate": 7.754564276303408e-06, + "loss": 0.4385, + "step": 32530 + }, + { + "epoch": 6.12, + "grad_norm": 4.1214752197265625, + "learning_rate": 7.750799924712968e-06, + "loss": 0.5538, + "step": 32540 + }, + { + "epoch": 6.13, + "grad_norm": 41.12890625, + "learning_rate": 7.747035573122529e-06, + "loss": 0.4315, + "step": 32550 + }, + { + "epoch": 6.13, + "grad_norm": 6.651817798614502, + "learning_rate": 7.743271221532092e-06, + "loss": 0.5182, + "step": 32560 + }, + { + "epoch": 6.13, + "grad_norm": 22.12759780883789, + "learning_rate": 7.739506869941654e-06, + "loss": 0.4464, + "step": 32570 + }, + { + "epoch": 6.13, + "grad_norm": 9.155705451965332, + "learning_rate": 7.735742518351215e-06, + "loss": 0.5285, + "step": 32580 + }, + { + "epoch": 6.13, + "grad_norm": 22.846324920654297, + "learning_rate": 7.731978166760777e-06, + "loss": 0.5287, + "step": 32590 + }, + { + "epoch": 6.14, + "grad_norm": 2.597435474395752, + "learning_rate": 7.728213815170338e-06, + "loss": 0.064, + "step": 32600 + }, + { + "epoch": 6.14, + "grad_norm": 10.483798027038574, + "learning_rate": 7.7244494635799e-06, + "loss": 0.3093, + "step": 32610 + }, + { + "epoch": 6.14, + "grad_norm": 9.434378623962402, + "learning_rate": 7.72068511198946e-06, + "loss": 0.6346, + "step": 32620 + }, + { + "epoch": 6.14, + "grad_norm": 0.27151790261268616, + "learning_rate": 7.71692076039902e-06, + "loss": 0.3794, + "step": 32630 + }, + { + "epoch": 6.14, + "grad_norm": 7.006466388702393, + "learning_rate": 7.713156408808582e-06, + "loss": 0.5533, + "step": 32640 + }, + { + "epoch": 6.15, + "grad_norm": 6.906270980834961, + "learning_rate": 7.709392057218145e-06, + "loss": 0.4613, + "step": 32650 + }, + { + "epoch": 6.15, + "grad_norm": 1.0680428743362427, + "learning_rate": 7.705627705627707e-06, + "loss": 0.474, + "step": 32660 + }, + { + "epoch": 6.15, + "grad_norm": 0.06817318499088287, + "learning_rate": 7.701863354037268e-06, + "loss": 0.4843, + "step": 32670 + }, + { + "epoch": 6.15, + "grad_norm": 9.567086219787598, + "learning_rate": 7.69809900244683e-06, + "loss": 0.6909, + "step": 32680 + }, + { + "epoch": 6.15, + "grad_norm": 4.2329301834106445, + "learning_rate": 7.694334650856391e-06, + "loss": 0.4636, + "step": 32690 + }, + { + "epoch": 6.15, + "grad_norm": 0.9662070870399475, + "learning_rate": 7.690570299265953e-06, + "loss": 0.4107, + "step": 32700 + }, + { + "epoch": 6.16, + "grad_norm": 10.737784385681152, + "learning_rate": 7.686805947675512e-06, + "loss": 0.3815, + "step": 32710 + }, + { + "epoch": 6.16, + "grad_norm": 16.375762939453125, + "learning_rate": 7.683041596085074e-06, + "loss": 0.4073, + "step": 32720 + }, + { + "epoch": 6.16, + "grad_norm": 3.9611103534698486, + "learning_rate": 7.679277244494635e-06, + "loss": 0.5408, + "step": 32730 + }, + { + "epoch": 6.16, + "grad_norm": 3.900883674621582, + "learning_rate": 7.675512892904199e-06, + "loss": 0.5193, + "step": 32740 + }, + { + "epoch": 6.16, + "grad_norm": 13.367103576660156, + "learning_rate": 7.67174854131376e-06, + "loss": 0.7197, + "step": 32750 + }, + { + "epoch": 6.17, + "grad_norm": 0.168531134724617, + "learning_rate": 7.667984189723321e-06, + "loss": 0.3934, + "step": 32760 + }, + { + "epoch": 6.17, + "grad_norm": 2.455349922180176, + "learning_rate": 7.664219838132883e-06, + "loss": 0.5187, + "step": 32770 + }, + { + "epoch": 6.17, + "grad_norm": 17.506908416748047, + "learning_rate": 7.660455486542444e-06, + "loss": 0.5227, + "step": 32780 + }, + { + "epoch": 6.17, + "grad_norm": 7.857856750488281, + "learning_rate": 7.656691134952006e-06, + "loss": 0.6049, + "step": 32790 + }, + { + "epoch": 6.17, + "grad_norm": 10.179645538330078, + "learning_rate": 7.652926783361566e-06, + "loss": 0.5264, + "step": 32800 + }, + { + "epoch": 6.18, + "grad_norm": 15.510612487792969, + "learning_rate": 7.649162431771127e-06, + "loss": 0.3707, + "step": 32810 + }, + { + "epoch": 6.18, + "grad_norm": 4.712235927581787, + "learning_rate": 7.645398080180689e-06, + "loss": 0.5458, + "step": 32820 + }, + { + "epoch": 6.18, + "grad_norm": 7.241546630859375, + "learning_rate": 7.641633728590252e-06, + "loss": 0.5487, + "step": 32830 + }, + { + "epoch": 6.18, + "grad_norm": 1.5072888135910034, + "learning_rate": 7.637869376999813e-06, + "loss": 0.6038, + "step": 32840 + }, + { + "epoch": 6.18, + "grad_norm": 0.652369499206543, + "learning_rate": 7.634105025409375e-06, + "loss": 0.5724, + "step": 32850 + }, + { + "epoch": 6.18, + "grad_norm": 4.111026287078857, + "learning_rate": 7.630340673818936e-06, + "loss": 0.2281, + "step": 32860 + }, + { + "epoch": 6.19, + "grad_norm": 0.2701318860054016, + "learning_rate": 7.6265763222284976e-06, + "loss": 0.6504, + "step": 32870 + }, + { + "epoch": 6.19, + "grad_norm": 13.367432594299316, + "learning_rate": 7.622811970638057e-06, + "loss": 0.2885, + "step": 32880 + }, + { + "epoch": 6.19, + "grad_norm": 5.478610515594482, + "learning_rate": 7.61904761904762e-06, + "loss": 0.5181, + "step": 32890 + }, + { + "epoch": 6.19, + "grad_norm": 8.030369758605957, + "learning_rate": 7.615283267457181e-06, + "loss": 0.5897, + "step": 32900 + }, + { + "epoch": 6.19, + "grad_norm": 13.834616661071777, + "learning_rate": 7.6115189158667426e-06, + "loss": 0.5119, + "step": 32910 + }, + { + "epoch": 6.2, + "grad_norm": 5.764062881469727, + "learning_rate": 7.607754564276304e-06, + "loss": 0.5504, + "step": 32920 + }, + { + "epoch": 6.2, + "grad_norm": 51.650718688964844, + "learning_rate": 7.6039902126858655e-06, + "loss": 0.4928, + "step": 32930 + }, + { + "epoch": 6.2, + "grad_norm": 43.39817428588867, + "learning_rate": 7.600225861095427e-06, + "loss": 0.3655, + "step": 32940 + }, + { + "epoch": 6.2, + "grad_norm": 1.6382009983062744, + "learning_rate": 7.596461509504988e-06, + "loss": 0.4539, + "step": 32950 + }, + { + "epoch": 6.2, + "grad_norm": 0.764748215675354, + "learning_rate": 7.592697157914551e-06, + "loss": 0.6964, + "step": 32960 + }, + { + "epoch": 6.21, + "grad_norm": 46.74039077758789, + "learning_rate": 7.5889328063241105e-06, + "loss": 0.4344, + "step": 32970 + }, + { + "epoch": 6.21, + "grad_norm": 24.714452743530273, + "learning_rate": 7.585168454733673e-06, + "loss": 0.7972, + "step": 32980 + }, + { + "epoch": 6.21, + "grad_norm": 15.897273063659668, + "learning_rate": 7.581404103143234e-06, + "loss": 0.443, + "step": 32990 + }, + { + "epoch": 6.21, + "grad_norm": 5.593212604522705, + "learning_rate": 7.577639751552796e-06, + "loss": 0.5032, + "step": 33000 + }, + { + "epoch": 6.21, + "grad_norm": 10.625405311584473, + "learning_rate": 7.573875399962357e-06, + "loss": 0.4239, + "step": 33010 + }, + { + "epoch": 6.21, + "grad_norm": 28.671268463134766, + "learning_rate": 7.570111048371919e-06, + "loss": 0.3406, + "step": 33020 + }, + { + "epoch": 6.22, + "grad_norm": 1.3727935552597046, + "learning_rate": 7.56634669678148e-06, + "loss": 0.4927, + "step": 33030 + }, + { + "epoch": 6.22, + "grad_norm": 8.277888298034668, + "learning_rate": 7.5625823451910415e-06, + "loss": 0.5651, + "step": 33040 + }, + { + "epoch": 6.22, + "grad_norm": 23.557905197143555, + "learning_rate": 7.558817993600603e-06, + "loss": 0.4431, + "step": 33050 + }, + { + "epoch": 6.22, + "grad_norm": 27.223262786865234, + "learning_rate": 7.555053642010164e-06, + "loss": 0.8412, + "step": 33060 + }, + { + "epoch": 6.22, + "grad_norm": 14.346407890319824, + "learning_rate": 7.551289290419725e-06, + "loss": 0.5623, + "step": 33070 + }, + { + "epoch": 6.23, + "grad_norm": 0.6994918584823608, + "learning_rate": 7.547524938829287e-06, + "loss": 0.6004, + "step": 33080 + }, + { + "epoch": 6.23, + "grad_norm": 3.5773377418518066, + "learning_rate": 7.543760587238849e-06, + "loss": 0.4514, + "step": 33090 + }, + { + "epoch": 6.23, + "grad_norm": 15.664129257202148, + "learning_rate": 7.53999623564841e-06, + "loss": 0.437, + "step": 33100 + }, + { + "epoch": 6.23, + "grad_norm": 14.454998016357422, + "learning_rate": 7.536231884057972e-06, + "loss": 0.3146, + "step": 33110 + }, + { + "epoch": 6.23, + "grad_norm": 14.287149429321289, + "learning_rate": 7.532467532467533e-06, + "loss": 0.3651, + "step": 33120 + }, + { + "epoch": 6.24, + "grad_norm": 35.363502502441406, + "learning_rate": 7.528703180877095e-06, + "loss": 0.3985, + "step": 33130 + }, + { + "epoch": 6.24, + "grad_norm": 11.074187278747559, + "learning_rate": 7.524938829286656e-06, + "loss": 0.7611, + "step": 33140 + }, + { + "epoch": 6.24, + "grad_norm": 2.34920597076416, + "learning_rate": 7.521174477696217e-06, + "loss": 0.6438, + "step": 33150 + }, + { + "epoch": 6.24, + "grad_norm": 3.194157361984253, + "learning_rate": 7.517410126105778e-06, + "loss": 0.9574, + "step": 33160 + }, + { + "epoch": 6.24, + "grad_norm": 15.871007919311523, + "learning_rate": 7.5136457745153405e-06, + "loss": 0.8127, + "step": 33170 + }, + { + "epoch": 6.25, + "grad_norm": 0.2083834558725357, + "learning_rate": 7.509881422924902e-06, + "loss": 0.438, + "step": 33180 + }, + { + "epoch": 6.25, + "grad_norm": 9.858641624450684, + "learning_rate": 7.5061170713344635e-06, + "loss": 0.4504, + "step": 33190 + }, + { + "epoch": 6.25, + "grad_norm": 9.694896697998047, + "learning_rate": 7.502352719744025e-06, + "loss": 0.6191, + "step": 33200 + }, + { + "epoch": 6.25, + "grad_norm": 18.69541358947754, + "learning_rate": 7.498588368153586e-06, + "loss": 0.9686, + "step": 33210 + }, + { + "epoch": 6.25, + "grad_norm": 18.836008071899414, + "learning_rate": 7.494824016563148e-06, + "loss": 0.4195, + "step": 33220 + }, + { + "epoch": 6.25, + "grad_norm": 8.28152084350586, + "learning_rate": 7.4910596649727084e-06, + "loss": 0.4661, + "step": 33230 + }, + { + "epoch": 6.26, + "grad_norm": 0.8841540813446045, + "learning_rate": 7.48729531338227e-06, + "loss": 1.1192, + "step": 33240 + }, + { + "epoch": 6.26, + "grad_norm": 21.22022247314453, + "learning_rate": 7.483530961791831e-06, + "loss": 0.5756, + "step": 33250 + }, + { + "epoch": 6.26, + "grad_norm": 5.069668769836426, + "learning_rate": 7.479766610201394e-06, + "loss": 0.5685, + "step": 33260 + }, + { + "epoch": 6.26, + "grad_norm": 1.734766960144043, + "learning_rate": 7.476002258610955e-06, + "loss": 0.2923, + "step": 33270 + }, + { + "epoch": 6.26, + "grad_norm": 15.850728988647461, + "learning_rate": 7.472237907020517e-06, + "loss": 0.6755, + "step": 33280 + }, + { + "epoch": 6.27, + "grad_norm": 0.8242527842521667, + "learning_rate": 7.468473555430078e-06, + "loss": 0.5514, + "step": 33290 + }, + { + "epoch": 6.27, + "grad_norm": 22.578418731689453, + "learning_rate": 7.4647092038396395e-06, + "loss": 0.5968, + "step": 33300 + }, + { + "epoch": 6.27, + "grad_norm": 17.16265869140625, + "learning_rate": 7.460944852249201e-06, + "loss": 0.4647, + "step": 33310 + }, + { + "epoch": 6.27, + "grad_norm": 32.857601165771484, + "learning_rate": 7.457180500658762e-06, + "loss": 0.511, + "step": 33320 + }, + { + "epoch": 6.27, + "grad_norm": 10.078173637390137, + "learning_rate": 7.453416149068323e-06, + "loss": 0.3435, + "step": 33330 + }, + { + "epoch": 6.28, + "grad_norm": 18.10398292541504, + "learning_rate": 7.4496517974778845e-06, + "loss": 0.348, + "step": 33340 + }, + { + "epoch": 6.28, + "grad_norm": 9.601995468139648, + "learning_rate": 7.445887445887446e-06, + "loss": 0.4952, + "step": 33350 + }, + { + "epoch": 6.28, + "grad_norm": 13.778106689453125, + "learning_rate": 7.442123094297008e-06, + "loss": 0.4809, + "step": 33360 + }, + { + "epoch": 6.28, + "grad_norm": 19.068946838378906, + "learning_rate": 7.43835874270657e-06, + "loss": 0.3458, + "step": 33370 + }, + { + "epoch": 6.28, + "grad_norm": 1.7670868635177612, + "learning_rate": 7.434594391116131e-06, + "loss": 0.6565, + "step": 33380 + }, + { + "epoch": 6.28, + "grad_norm": 25.797090530395508, + "learning_rate": 7.430830039525693e-06, + "loss": 0.7225, + "step": 33390 + }, + { + "epoch": 6.29, + "grad_norm": 30.813005447387695, + "learning_rate": 7.427065687935254e-06, + "loss": 0.8055, + "step": 33400 + }, + { + "epoch": 6.29, + "grad_norm": 7.973330497741699, + "learning_rate": 7.423301336344815e-06, + "loss": 0.4666, + "step": 33410 + }, + { + "epoch": 6.29, + "grad_norm": 9.906311988830566, + "learning_rate": 7.419536984754376e-06, + "loss": 0.3954, + "step": 33420 + }, + { + "epoch": 6.29, + "grad_norm": 43.313724517822266, + "learning_rate": 7.415772633163938e-06, + "loss": 0.6149, + "step": 33430 + }, + { + "epoch": 6.29, + "grad_norm": 7.175143241882324, + "learning_rate": 7.412008281573499e-06, + "loss": 0.6234, + "step": 33440 + }, + { + "epoch": 6.3, + "grad_norm": 28.30812644958496, + "learning_rate": 7.4082439299830614e-06, + "loss": 0.3256, + "step": 33450 + }, + { + "epoch": 6.3, + "grad_norm": 12.158953666687012, + "learning_rate": 7.404479578392623e-06, + "loss": 0.3962, + "step": 33460 + }, + { + "epoch": 6.3, + "grad_norm": 17.756189346313477, + "learning_rate": 7.400715226802184e-06, + "loss": 0.5152, + "step": 33470 + }, + { + "epoch": 6.3, + "grad_norm": 18.10654067993164, + "learning_rate": 7.396950875211746e-06, + "loss": 0.6907, + "step": 33480 + }, + { + "epoch": 6.3, + "grad_norm": 18.467599868774414, + "learning_rate": 7.3931865236213064e-06, + "loss": 0.4874, + "step": 33490 + }, + { + "epoch": 6.31, + "grad_norm": 1.991639256477356, + "learning_rate": 7.389422172030868e-06, + "loss": 0.3471, + "step": 33500 + }, + { + "epoch": 6.31, + "grad_norm": 20.875844955444336, + "learning_rate": 7.385657820440429e-06, + "loss": 0.8077, + "step": 33510 + }, + { + "epoch": 6.31, + "grad_norm": 4.168043613433838, + "learning_rate": 7.381893468849991e-06, + "loss": 0.3689, + "step": 33520 + }, + { + "epoch": 6.31, + "grad_norm": 0.10714370012283325, + "learning_rate": 7.378129117259552e-06, + "loss": 0.4633, + "step": 33530 + }, + { + "epoch": 6.31, + "grad_norm": 14.68402099609375, + "learning_rate": 7.3743647656691146e-06, + "loss": 0.5519, + "step": 33540 + }, + { + "epoch": 6.31, + "grad_norm": 1.1338722705841064, + "learning_rate": 7.370600414078676e-06, + "loss": 0.6535, + "step": 33550 + }, + { + "epoch": 6.32, + "grad_norm": 20.80348014831543, + "learning_rate": 7.3668360624882375e-06, + "loss": 0.6073, + "step": 33560 + }, + { + "epoch": 6.32, + "grad_norm": 17.18090057373047, + "learning_rate": 7.363071710897799e-06, + "loss": 0.5032, + "step": 33570 + }, + { + "epoch": 6.32, + "grad_norm": 0.5403892397880554, + "learning_rate": 7.3593073593073596e-06, + "loss": 0.4669, + "step": 33580 + }, + { + "epoch": 6.32, + "grad_norm": 10.763050079345703, + "learning_rate": 7.355543007716921e-06, + "loss": 0.5767, + "step": 33590 + }, + { + "epoch": 6.32, + "grad_norm": 46.35898971557617, + "learning_rate": 7.3517786561264825e-06, + "loss": 0.6032, + "step": 33600 + }, + { + "epoch": 6.33, + "grad_norm": 12.066878318786621, + "learning_rate": 7.348014304536044e-06, + "loss": 0.3331, + "step": 33610 + }, + { + "epoch": 6.33, + "grad_norm": 7.480466365814209, + "learning_rate": 7.344249952945605e-06, + "loss": 0.6212, + "step": 33620 + }, + { + "epoch": 6.33, + "grad_norm": 18.744464874267578, + "learning_rate": 7.340485601355168e-06, + "loss": 0.7442, + "step": 33630 + }, + { + "epoch": 6.33, + "grad_norm": 7.432648658752441, + "learning_rate": 7.336721249764729e-06, + "loss": 0.358, + "step": 33640 + }, + { + "epoch": 6.33, + "grad_norm": 8.940939903259277, + "learning_rate": 7.332956898174291e-06, + "loss": 0.7015, + "step": 33650 + }, + { + "epoch": 6.34, + "grad_norm": 0.28902769088745117, + "learning_rate": 7.329192546583852e-06, + "loss": 0.5588, + "step": 33660 + }, + { + "epoch": 6.34, + "grad_norm": 0.46211880445480347, + "learning_rate": 7.325428194993413e-06, + "loss": 0.6197, + "step": 33670 + }, + { + "epoch": 6.34, + "grad_norm": 25.7103328704834, + "learning_rate": 7.321663843402974e-06, + "loss": 0.4847, + "step": 33680 + }, + { + "epoch": 6.34, + "grad_norm": 8.652647018432617, + "learning_rate": 7.317899491812536e-06, + "loss": 0.3872, + "step": 33690 + }, + { + "epoch": 6.34, + "grad_norm": 10.216687202453613, + "learning_rate": 7.314135140222097e-06, + "loss": 0.4059, + "step": 33700 + }, + { + "epoch": 6.34, + "grad_norm": 2.1355504989624023, + "learning_rate": 7.3103707886316586e-06, + "loss": 0.5013, + "step": 33710 + }, + { + "epoch": 6.35, + "grad_norm": 18.312965393066406, + "learning_rate": 7.30660643704122e-06, + "loss": 0.536, + "step": 33720 + }, + { + "epoch": 6.35, + "grad_norm": 11.597451210021973, + "learning_rate": 7.302842085450782e-06, + "loss": 0.4923, + "step": 33730 + }, + { + "epoch": 6.35, + "grad_norm": 21.869190216064453, + "learning_rate": 7.299077733860344e-06, + "loss": 0.6294, + "step": 33740 + }, + { + "epoch": 6.35, + "grad_norm": 9.284852981567383, + "learning_rate": 7.295313382269905e-06, + "loss": 0.4809, + "step": 33750 + }, + { + "epoch": 6.35, + "grad_norm": 26.03097152709961, + "learning_rate": 7.291549030679466e-06, + "loss": 0.6448, + "step": 33760 + }, + { + "epoch": 6.36, + "grad_norm": 22.911418914794922, + "learning_rate": 7.287784679089027e-06, + "loss": 0.5017, + "step": 33770 + }, + { + "epoch": 6.36, + "grad_norm": 50.75450897216797, + "learning_rate": 7.284020327498589e-06, + "loss": 0.6046, + "step": 33780 + }, + { + "epoch": 6.36, + "grad_norm": 22.943689346313477, + "learning_rate": 7.28025597590815e-06, + "loss": 0.4832, + "step": 33790 + }, + { + "epoch": 6.36, + "grad_norm": 7.2119460105896, + "learning_rate": 7.276491624317712e-06, + "loss": 0.852, + "step": 33800 + }, + { + "epoch": 6.36, + "grad_norm": 9.327651023864746, + "learning_rate": 7.272727272727273e-06, + "loss": 0.5135, + "step": 33810 + }, + { + "epoch": 6.37, + "grad_norm": 20.863204956054688, + "learning_rate": 7.2689629211368355e-06, + "loss": 0.5866, + "step": 33820 + }, + { + "epoch": 6.37, + "grad_norm": 6.156462669372559, + "learning_rate": 7.265198569546397e-06, + "loss": 0.62, + "step": 33830 + }, + { + "epoch": 6.37, + "grad_norm": 24.723724365234375, + "learning_rate": 7.2614342179559576e-06, + "loss": 0.6837, + "step": 33840 + }, + { + "epoch": 6.37, + "grad_norm": 0.5178024172782898, + "learning_rate": 7.257669866365519e-06, + "loss": 0.2405, + "step": 33850 + }, + { + "epoch": 6.37, + "grad_norm": 7.65985107421875, + "learning_rate": 7.2539055147750805e-06, + "loss": 0.3353, + "step": 33860 + }, + { + "epoch": 6.37, + "grad_norm": 21.34691619873047, + "learning_rate": 7.250141163184642e-06, + "loss": 0.6229, + "step": 33870 + }, + { + "epoch": 6.38, + "grad_norm": 20.477985382080078, + "learning_rate": 7.246376811594203e-06, + "loss": 0.4679, + "step": 33880 + }, + { + "epoch": 6.38, + "grad_norm": 22.93824005126953, + "learning_rate": 7.242612460003765e-06, + "loss": 0.5659, + "step": 33890 + }, + { + "epoch": 6.38, + "grad_norm": 0.7325233221054077, + "learning_rate": 7.238848108413326e-06, + "loss": 0.5032, + "step": 33900 + }, + { + "epoch": 6.38, + "grad_norm": 14.224431037902832, + "learning_rate": 7.235083756822889e-06, + "loss": 0.674, + "step": 33910 + }, + { + "epoch": 6.38, + "grad_norm": 19.057111740112305, + "learning_rate": 7.23131940523245e-06, + "loss": 0.4854, + "step": 33920 + }, + { + "epoch": 6.39, + "grad_norm": 16.500669479370117, + "learning_rate": 7.22755505364201e-06, + "loss": 0.4265, + "step": 33930 + }, + { + "epoch": 6.39, + "grad_norm": 18.075031280517578, + "learning_rate": 7.223790702051572e-06, + "loss": 0.8634, + "step": 33940 + }, + { + "epoch": 6.39, + "grad_norm": 1.375807523727417, + "learning_rate": 7.220026350461134e-06, + "loss": 0.5841, + "step": 33950 + }, + { + "epoch": 6.39, + "grad_norm": 2.372751474380493, + "learning_rate": 7.216261998870695e-06, + "loss": 0.3938, + "step": 33960 + }, + { + "epoch": 6.39, + "grad_norm": 0.7880887985229492, + "learning_rate": 7.2124976472802565e-06, + "loss": 0.5251, + "step": 33970 + }, + { + "epoch": 6.4, + "grad_norm": 10.042675018310547, + "learning_rate": 7.208733295689818e-06, + "loss": 0.3012, + "step": 33980 + }, + { + "epoch": 6.4, + "grad_norm": 41.02911376953125, + "learning_rate": 7.2049689440993795e-06, + "loss": 0.6843, + "step": 33990 + }, + { + "epoch": 6.4, + "grad_norm": 0.09337029606103897, + "learning_rate": 7.201204592508941e-06, + "loss": 0.6831, + "step": 34000 + }, + { + "epoch": 6.4, + "grad_norm": 0.04473143443465233, + "learning_rate": 7.197440240918503e-06, + "loss": 0.5183, + "step": 34010 + }, + { + "epoch": 6.4, + "grad_norm": 30.65087127685547, + "learning_rate": 7.193675889328063e-06, + "loss": 0.5636, + "step": 34020 + }, + { + "epoch": 6.41, + "grad_norm": 22.237075805664062, + "learning_rate": 7.189911537737625e-06, + "loss": 0.6722, + "step": 34030 + }, + { + "epoch": 6.41, + "grad_norm": 23.21628761291504, + "learning_rate": 7.186147186147187e-06, + "loss": 0.4067, + "step": 34040 + }, + { + "epoch": 6.41, + "grad_norm": 6.441122055053711, + "learning_rate": 7.182382834556748e-06, + "loss": 0.373, + "step": 34050 + }, + { + "epoch": 6.41, + "grad_norm": 17.526185989379883, + "learning_rate": 7.17861848296631e-06, + "loss": 0.4955, + "step": 34060 + }, + { + "epoch": 6.41, + "grad_norm": 5.90300178527832, + "learning_rate": 7.174854131375871e-06, + "loss": 0.6357, + "step": 34070 + }, + { + "epoch": 6.41, + "grad_norm": 14.964542388916016, + "learning_rate": 7.171089779785433e-06, + "loss": 0.6288, + "step": 34080 + }, + { + "epoch": 6.42, + "grad_norm": 14.822508811950684, + "learning_rate": 7.167325428194994e-06, + "loss": 0.6348, + "step": 34090 + }, + { + "epoch": 6.42, + "grad_norm": 40.991546630859375, + "learning_rate": 7.163561076604555e-06, + "loss": 0.3295, + "step": 34100 + }, + { + "epoch": 6.42, + "grad_norm": 13.501119613647461, + "learning_rate": 7.159796725014116e-06, + "loss": 0.5754, + "step": 34110 + }, + { + "epoch": 6.42, + "grad_norm": 18.85485076904297, + "learning_rate": 7.1560323734236784e-06, + "loss": 0.5432, + "step": 34120 + }, + { + "epoch": 6.42, + "grad_norm": 8.995746612548828, + "learning_rate": 7.15226802183324e-06, + "loss": 0.9828, + "step": 34130 + }, + { + "epoch": 6.43, + "grad_norm": 13.933249473571777, + "learning_rate": 7.148503670242801e-06, + "loss": 0.5424, + "step": 34140 + }, + { + "epoch": 6.43, + "grad_norm": 13.510233879089355, + "learning_rate": 7.144739318652363e-06, + "loss": 0.6617, + "step": 34150 + }, + { + "epoch": 6.43, + "grad_norm": 13.763504981994629, + "learning_rate": 7.140974967061924e-06, + "loss": 0.4808, + "step": 34160 + }, + { + "epoch": 6.43, + "grad_norm": 18.854583740234375, + "learning_rate": 7.137210615471486e-06, + "loss": 0.3383, + "step": 34170 + }, + { + "epoch": 6.43, + "grad_norm": 0.48108717799186707, + "learning_rate": 7.133446263881047e-06, + "loss": 0.4345, + "step": 34180 + }, + { + "epoch": 6.44, + "grad_norm": 50.08021926879883, + "learning_rate": 7.129681912290608e-06, + "loss": 0.4781, + "step": 34190 + }, + { + "epoch": 6.44, + "grad_norm": 18.643566131591797, + "learning_rate": 7.125917560700169e-06, + "loss": 0.4631, + "step": 34200 + }, + { + "epoch": 6.44, + "grad_norm": 12.921839714050293, + "learning_rate": 7.122153209109732e-06, + "loss": 0.5039, + "step": 34210 + }, + { + "epoch": 6.44, + "grad_norm": 27.30744171142578, + "learning_rate": 7.118388857519293e-06, + "loss": 0.5912, + "step": 34220 + }, + { + "epoch": 6.44, + "grad_norm": 27.566017150878906, + "learning_rate": 7.1146245059288545e-06, + "loss": 0.4702, + "step": 34230 + }, + { + "epoch": 6.44, + "grad_norm": 12.484590530395508, + "learning_rate": 7.110860154338416e-06, + "loss": 0.5907, + "step": 34240 + }, + { + "epoch": 6.45, + "grad_norm": 19.984474182128906, + "learning_rate": 7.1070958027479774e-06, + "loss": 0.2223, + "step": 34250 + }, + { + "epoch": 6.45, + "grad_norm": 6.380549430847168, + "learning_rate": 7.103331451157539e-06, + "loss": 0.3962, + "step": 34260 + }, + { + "epoch": 6.45, + "grad_norm": 8.179642677307129, + "learning_rate": 7.0995670995671e-06, + "loss": 0.778, + "step": 34270 + }, + { + "epoch": 6.45, + "grad_norm": 0.15643277764320374, + "learning_rate": 7.095802747976661e-06, + "loss": 0.316, + "step": 34280 + }, + { + "epoch": 6.45, + "grad_norm": 17.682392120361328, + "learning_rate": 7.0920383963862224e-06, + "loss": 0.3378, + "step": 34290 + }, + { + "epoch": 6.46, + "grad_norm": 10.891948699951172, + "learning_rate": 7.088274044795784e-06, + "loss": 0.345, + "step": 34300 + }, + { + "epoch": 6.46, + "grad_norm": 37.315853118896484, + "learning_rate": 7.084509693205346e-06, + "loss": 0.5888, + "step": 34310 + }, + { + "epoch": 6.46, + "grad_norm": 0.2774854302406311, + "learning_rate": 7.080745341614908e-06, + "loss": 0.2816, + "step": 34320 + }, + { + "epoch": 6.46, + "grad_norm": 15.914167404174805, + "learning_rate": 7.076980990024469e-06, + "loss": 0.6068, + "step": 34330 + }, + { + "epoch": 6.46, + "grad_norm": 17.352615356445312, + "learning_rate": 7.073216638434031e-06, + "loss": 0.4122, + "step": 34340 + }, + { + "epoch": 6.47, + "grad_norm": 21.448284149169922, + "learning_rate": 7.069452286843592e-06, + "loss": 0.2653, + "step": 34350 + }, + { + "epoch": 6.47, + "grad_norm": 21.003995895385742, + "learning_rate": 7.0656879352531535e-06, + "loss": 0.3752, + "step": 34360 + }, + { + "epoch": 6.47, + "grad_norm": 12.033404350280762, + "learning_rate": 7.061923583662714e-06, + "loss": 0.5186, + "step": 34370 + }, + { + "epoch": 6.47, + "grad_norm": 9.405254364013672, + "learning_rate": 7.058159232072276e-06, + "loss": 0.7365, + "step": 34380 + }, + { + "epoch": 6.47, + "grad_norm": 25.861106872558594, + "learning_rate": 7.054394880481837e-06, + "loss": 0.7919, + "step": 34390 + }, + { + "epoch": 6.47, + "grad_norm": 6.728229522705078, + "learning_rate": 7.050630528891399e-06, + "loss": 0.4589, + "step": 34400 + }, + { + "epoch": 6.48, + "grad_norm": 29.374160766601562, + "learning_rate": 7.046866177300961e-06, + "loss": 0.7846, + "step": 34410 + }, + { + "epoch": 6.48, + "grad_norm": 31.664583206176758, + "learning_rate": 7.043101825710522e-06, + "loss": 0.9016, + "step": 34420 + }, + { + "epoch": 6.48, + "grad_norm": 9.76760482788086, + "learning_rate": 7.039337474120084e-06, + "loss": 0.5766, + "step": 34430 + }, + { + "epoch": 6.48, + "grad_norm": 19.36331558227539, + "learning_rate": 7.035573122529645e-06, + "loss": 0.4637, + "step": 34440 + }, + { + "epoch": 6.48, + "grad_norm": 7.009848594665527, + "learning_rate": 7.031808770939206e-06, + "loss": 0.6186, + "step": 34450 + }, + { + "epoch": 6.49, + "grad_norm": 11.30031967163086, + "learning_rate": 7.028044419348767e-06, + "loss": 0.859, + "step": 34460 + }, + { + "epoch": 6.49, + "grad_norm": 23.84954833984375, + "learning_rate": 7.024280067758329e-06, + "loss": 0.4672, + "step": 34470 + }, + { + "epoch": 6.49, + "grad_norm": 1.282763957977295, + "learning_rate": 7.02051571616789e-06, + "loss": 0.3136, + "step": 34480 + }, + { + "epoch": 6.49, + "grad_norm": 0.5825543999671936, + "learning_rate": 7.0167513645774525e-06, + "loss": 0.4409, + "step": 34490 + }, + { + "epoch": 6.49, + "grad_norm": 23.45937728881836, + "learning_rate": 7.012987012987014e-06, + "loss": 0.5154, + "step": 34500 + }, + { + "epoch": 6.5, + "grad_norm": 0.2989109456539154, + "learning_rate": 7.009222661396575e-06, + "loss": 0.5457, + "step": 34510 + }, + { + "epoch": 6.5, + "grad_norm": 12.64752197265625, + "learning_rate": 7.005458309806137e-06, + "loss": 0.4991, + "step": 34520 + }, + { + "epoch": 6.5, + "grad_norm": 56.25718688964844, + "learning_rate": 7.001693958215698e-06, + "loss": 0.6106, + "step": 34530 + }, + { + "epoch": 6.5, + "grad_norm": 16.893884658813477, + "learning_rate": 6.997929606625259e-06, + "loss": 0.4511, + "step": 34540 + }, + { + "epoch": 6.5, + "grad_norm": 8.532828330993652, + "learning_rate": 6.99416525503482e-06, + "loss": 0.388, + "step": 34550 + }, + { + "epoch": 6.5, + "grad_norm": 12.58056926727295, + "learning_rate": 6.990400903444382e-06, + "loss": 0.8358, + "step": 34560 + }, + { + "epoch": 6.51, + "grad_norm": 20.3173885345459, + "learning_rate": 6.986636551853943e-06, + "loss": 0.3383, + "step": 34570 + }, + { + "epoch": 6.51, + "grad_norm": 7.896786689758301, + "learning_rate": 6.982872200263505e-06, + "loss": 0.5333, + "step": 34580 + }, + { + "epoch": 6.51, + "grad_norm": 6.4286065101623535, + "learning_rate": 6.979107848673067e-06, + "loss": 0.4064, + "step": 34590 + }, + { + "epoch": 6.51, + "grad_norm": 23.140363693237305, + "learning_rate": 6.9753434970826286e-06, + "loss": 0.623, + "step": 34600 + }, + { + "epoch": 6.51, + "grad_norm": 12.2300443649292, + "learning_rate": 6.97157914549219e-06, + "loss": 0.4408, + "step": 34610 + }, + { + "epoch": 6.52, + "grad_norm": 30.439172744750977, + "learning_rate": 6.9678147939017515e-06, + "loss": 0.3854, + "step": 34620 + }, + { + "epoch": 6.52, + "grad_norm": 13.196409225463867, + "learning_rate": 6.964050442311312e-06, + "loss": 0.4278, + "step": 34630 + }, + { + "epoch": 6.52, + "grad_norm": 8.042349815368652, + "learning_rate": 6.9602860907208736e-06, + "loss": 0.4624, + "step": 34640 + }, + { + "epoch": 6.52, + "grad_norm": 7.406027793884277, + "learning_rate": 6.956521739130435e-06, + "loss": 0.4726, + "step": 34650 + }, + { + "epoch": 6.52, + "grad_norm": 8.683379173278809, + "learning_rate": 6.9527573875399965e-06, + "loss": 0.6854, + "step": 34660 + }, + { + "epoch": 6.53, + "grad_norm": 23.16596794128418, + "learning_rate": 6.948993035949558e-06, + "loss": 0.6081, + "step": 34670 + }, + { + "epoch": 6.53, + "grad_norm": 0.07282250374555588, + "learning_rate": 6.94522868435912e-06, + "loss": 0.5896, + "step": 34680 + }, + { + "epoch": 6.53, + "grad_norm": 39.00768280029297, + "learning_rate": 6.941464332768682e-06, + "loss": 0.5436, + "step": 34690 + }, + { + "epoch": 6.53, + "grad_norm": 35.062660217285156, + "learning_rate": 6.937699981178243e-06, + "loss": 0.6628, + "step": 34700 + }, + { + "epoch": 6.53, + "grad_norm": 9.127306938171387, + "learning_rate": 6.933935629587804e-06, + "loss": 0.7176, + "step": 34710 + }, + { + "epoch": 6.53, + "grad_norm": 0.9268749952316284, + "learning_rate": 6.930171277997365e-06, + "loss": 0.438, + "step": 34720 + }, + { + "epoch": 6.54, + "grad_norm": 2.197774648666382, + "learning_rate": 6.926406926406927e-06, + "loss": 0.4989, + "step": 34730 + }, + { + "epoch": 6.54, + "grad_norm": 32.574371337890625, + "learning_rate": 6.922642574816488e-06, + "loss": 0.586, + "step": 34740 + }, + { + "epoch": 6.54, + "grad_norm": 31.461170196533203, + "learning_rate": 6.91887822322605e-06, + "loss": 0.5052, + "step": 34750 + }, + { + "epoch": 6.54, + "grad_norm": 12.898176193237305, + "learning_rate": 6.915113871635611e-06, + "loss": 0.5327, + "step": 34760 + }, + { + "epoch": 6.54, + "grad_norm": 2.2167086601257324, + "learning_rate": 6.911349520045173e-06, + "loss": 0.5105, + "step": 34770 + }, + { + "epoch": 6.55, + "grad_norm": 15.682035446166992, + "learning_rate": 6.907585168454735e-06, + "loss": 0.3067, + "step": 34780 + }, + { + "epoch": 6.55, + "grad_norm": 41.14264678955078, + "learning_rate": 6.903820816864296e-06, + "loss": 0.4904, + "step": 34790 + }, + { + "epoch": 6.55, + "grad_norm": 0.08841241896152496, + "learning_rate": 6.900056465273857e-06, + "loss": 0.4581, + "step": 34800 + }, + { + "epoch": 6.55, + "grad_norm": 38.545257568359375, + "learning_rate": 6.896292113683418e-06, + "loss": 0.4896, + "step": 34810 + }, + { + "epoch": 6.55, + "grad_norm": 35.339149475097656, + "learning_rate": 6.89252776209298e-06, + "loss": 0.5188, + "step": 34820 + }, + { + "epoch": 6.56, + "grad_norm": 14.379655838012695, + "learning_rate": 6.888763410502541e-06, + "loss": 0.3942, + "step": 34830 + }, + { + "epoch": 6.56, + "grad_norm": 21.1137752532959, + "learning_rate": 6.884999058912103e-06, + "loss": 0.5441, + "step": 34840 + }, + { + "epoch": 6.56, + "grad_norm": 20.75609588623047, + "learning_rate": 6.881234707321664e-06, + "loss": 0.6482, + "step": 34850 + }, + { + "epoch": 6.56, + "grad_norm": 19.010169982910156, + "learning_rate": 6.8774703557312265e-06, + "loss": 0.4973, + "step": 34860 + }, + { + "epoch": 6.56, + "grad_norm": 19.815196990966797, + "learning_rate": 6.873706004140788e-06, + "loss": 0.6047, + "step": 34870 + }, + { + "epoch": 6.57, + "grad_norm": 8.65109634399414, + "learning_rate": 6.8699416525503495e-06, + "loss": 0.3104, + "step": 34880 + }, + { + "epoch": 6.57, + "grad_norm": 1.1485213041305542, + "learning_rate": 6.86617730095991e-06, + "loss": 0.4565, + "step": 34890 + }, + { + "epoch": 6.57, + "grad_norm": 1.1633775234222412, + "learning_rate": 6.8624129493694715e-06, + "loss": 0.5844, + "step": 34900 + }, + { + "epoch": 6.57, + "grad_norm": 7.708075523376465, + "learning_rate": 6.858648597779033e-06, + "loss": 0.6197, + "step": 34910 + }, + { + "epoch": 6.57, + "grad_norm": 4.411716938018799, + "learning_rate": 6.8548842461885945e-06, + "loss": 0.5093, + "step": 34920 + }, + { + "epoch": 6.57, + "grad_norm": 18.344188690185547, + "learning_rate": 6.851119894598156e-06, + "loss": 0.693, + "step": 34930 + }, + { + "epoch": 6.58, + "grad_norm": 40.74114990234375, + "learning_rate": 6.847355543007717e-06, + "loss": 0.6018, + "step": 34940 + }, + { + "epoch": 6.58, + "grad_norm": 0.057777296751737595, + "learning_rate": 6.843591191417279e-06, + "loss": 0.5193, + "step": 34950 + }, + { + "epoch": 6.58, + "grad_norm": 13.32892894744873, + "learning_rate": 6.839826839826841e-06, + "loss": 0.5995, + "step": 34960 + }, + { + "epoch": 6.58, + "grad_norm": 28.153160095214844, + "learning_rate": 6.836062488236401e-06, + "loss": 0.5686, + "step": 34970 + }, + { + "epoch": 6.58, + "grad_norm": 7.822689056396484, + "learning_rate": 6.832298136645963e-06, + "loss": 0.4162, + "step": 34980 + }, + { + "epoch": 6.59, + "grad_norm": 25.546289443969727, + "learning_rate": 6.828533785055525e-06, + "loss": 0.3894, + "step": 34990 + }, + { + "epoch": 6.59, + "grad_norm": 0.47211772203445435, + "learning_rate": 6.824769433465086e-06, + "loss": 0.2991, + "step": 35000 + }, + { + "epoch": 6.59, + "grad_norm": 6.658637523651123, + "learning_rate": 6.821005081874648e-06, + "loss": 0.6118, + "step": 35010 + }, + { + "epoch": 6.59, + "grad_norm": 4.372053146362305, + "learning_rate": 6.817240730284209e-06, + "loss": 0.239, + "step": 35020 + }, + { + "epoch": 6.59, + "grad_norm": 25.087793350219727, + "learning_rate": 6.8134763786937705e-06, + "loss": 0.6427, + "step": 35030 + }, + { + "epoch": 6.6, + "grad_norm": 18.80016326904297, + "learning_rate": 6.809712027103332e-06, + "loss": 0.3743, + "step": 35040 + }, + { + "epoch": 6.6, + "grad_norm": 0.2986818552017212, + "learning_rate": 6.805947675512894e-06, + "loss": 0.3221, + "step": 35050 + }, + { + "epoch": 6.6, + "grad_norm": 15.778828620910645, + "learning_rate": 6.802183323922454e-06, + "loss": 0.855, + "step": 35060 + }, + { + "epoch": 6.6, + "grad_norm": 1.3159215450286865, + "learning_rate": 6.798418972332016e-06, + "loss": 0.5559, + "step": 35070 + }, + { + "epoch": 6.6, + "grad_norm": 0.06491217762231827, + "learning_rate": 6.794654620741578e-06, + "loss": 0.2105, + "step": 35080 + }, + { + "epoch": 6.6, + "grad_norm": 19.969703674316406, + "learning_rate": 6.790890269151139e-06, + "loss": 0.4636, + "step": 35090 + }, + { + "epoch": 6.61, + "grad_norm": 11.362780570983887, + "learning_rate": 6.787125917560701e-06, + "loss": 0.5571, + "step": 35100 + }, + { + "epoch": 6.61, + "grad_norm": 8.18557357788086, + "learning_rate": 6.783361565970262e-06, + "loss": 0.4913, + "step": 35110 + }, + { + "epoch": 6.61, + "grad_norm": 10.069347381591797, + "learning_rate": 6.779597214379824e-06, + "loss": 0.2839, + "step": 35120 + }, + { + "epoch": 6.61, + "grad_norm": 2.1053032875061035, + "learning_rate": 6.775832862789385e-06, + "loss": 0.6116, + "step": 35130 + }, + { + "epoch": 6.61, + "grad_norm": 12.72517204284668, + "learning_rate": 6.7720685111989474e-06, + "loss": 0.4671, + "step": 35140 + }, + { + "epoch": 6.62, + "grad_norm": 19.65533447265625, + "learning_rate": 6.768304159608507e-06, + "loss": 0.5756, + "step": 35150 + }, + { + "epoch": 6.62, + "grad_norm": 15.998537063598633, + "learning_rate": 6.764539808018069e-06, + "loss": 0.5428, + "step": 35160 + }, + { + "epoch": 6.62, + "grad_norm": 24.56063461303711, + "learning_rate": 6.760775456427631e-06, + "loss": 0.6397, + "step": 35170 + }, + { + "epoch": 6.62, + "grad_norm": 0.9871610403060913, + "learning_rate": 6.7570111048371924e-06, + "loss": 0.5336, + "step": 35180 + }, + { + "epoch": 6.62, + "grad_norm": 0.14848846197128296, + "learning_rate": 6.753246753246754e-06, + "loss": 0.4377, + "step": 35190 + }, + { + "epoch": 6.63, + "grad_norm": 55.045509338378906, + "learning_rate": 6.749482401656315e-06, + "loss": 0.5341, + "step": 35200 + }, + { + "epoch": 6.63, + "grad_norm": 13.329508781433105, + "learning_rate": 6.745718050065877e-06, + "loss": 0.5933, + "step": 35210 + }, + { + "epoch": 6.63, + "grad_norm": 3.753176212310791, + "learning_rate": 6.741953698475438e-06, + "loss": 0.301, + "step": 35220 + }, + { + "epoch": 6.63, + "grad_norm": 0.3469744622707367, + "learning_rate": 6.738189346885e-06, + "loss": 0.3636, + "step": 35230 + }, + { + "epoch": 6.63, + "grad_norm": 8.834991455078125, + "learning_rate": 6.73442499529456e-06, + "loss": 0.7362, + "step": 35240 + }, + { + "epoch": 6.63, + "grad_norm": 13.375589370727539, + "learning_rate": 6.730660643704122e-06, + "loss": 0.4198, + "step": 35250 + }, + { + "epoch": 6.64, + "grad_norm": 0.9780624508857727, + "learning_rate": 6.726896292113684e-06, + "loss": 0.2504, + "step": 35260 + }, + { + "epoch": 6.64, + "grad_norm": 12.835625648498535, + "learning_rate": 6.723131940523246e-06, + "loss": 0.7162, + "step": 35270 + }, + { + "epoch": 6.64, + "grad_norm": 1.3344264030456543, + "learning_rate": 6.719367588932807e-06, + "loss": 0.3267, + "step": 35280 + }, + { + "epoch": 6.64, + "grad_norm": 33.066993713378906, + "learning_rate": 6.7156032373423685e-06, + "loss": 0.367, + "step": 35290 + }, + { + "epoch": 6.64, + "grad_norm": 5.983303546905518, + "learning_rate": 6.71183888575193e-06, + "loss": 0.8041, + "step": 35300 + }, + { + "epoch": 6.65, + "grad_norm": 34.84541320800781, + "learning_rate": 6.7080745341614914e-06, + "loss": 0.4726, + "step": 35310 + }, + { + "epoch": 6.65, + "grad_norm": 15.427559852600098, + "learning_rate": 6.704310182571052e-06, + "loss": 0.2845, + "step": 35320 + }, + { + "epoch": 6.65, + "grad_norm": 8.32028865814209, + "learning_rate": 6.7005458309806135e-06, + "loss": 0.5386, + "step": 35330 + }, + { + "epoch": 6.65, + "grad_norm": 49.66559982299805, + "learning_rate": 6.696781479390175e-06, + "loss": 0.4041, + "step": 35340 + }, + { + "epoch": 6.65, + "grad_norm": 3.6853106021881104, + "learning_rate": 6.693017127799737e-06, + "loss": 0.6245, + "step": 35350 + }, + { + "epoch": 6.66, + "grad_norm": 2.9729301929473877, + "learning_rate": 6.689252776209299e-06, + "loss": 0.5603, + "step": 35360 + }, + { + "epoch": 6.66, + "grad_norm": 7.048079490661621, + "learning_rate": 6.68548842461886e-06, + "loss": 0.5224, + "step": 35370 + }, + { + "epoch": 6.66, + "grad_norm": 4.4709296226501465, + "learning_rate": 6.681724073028422e-06, + "loss": 0.2556, + "step": 35380 + }, + { + "epoch": 6.66, + "grad_norm": 29.859085083007812, + "learning_rate": 6.677959721437983e-06, + "loss": 0.5444, + "step": 35390 + }, + { + "epoch": 6.66, + "grad_norm": 6.1535797119140625, + "learning_rate": 6.6741953698475446e-06, + "loss": 0.5421, + "step": 35400 + }, + { + "epoch": 6.66, + "grad_norm": 5.314403533935547, + "learning_rate": 6.670431018257105e-06, + "loss": 0.5939, + "step": 35410 + }, + { + "epoch": 6.67, + "grad_norm": 17.58977508544922, + "learning_rate": 6.666666666666667e-06, + "loss": 0.4557, + "step": 35420 + }, + { + "epoch": 6.67, + "grad_norm": 1.5328501462936401, + "learning_rate": 6.662902315076228e-06, + "loss": 0.4192, + "step": 35430 + }, + { + "epoch": 6.67, + "grad_norm": 16.774066925048828, + "learning_rate": 6.65913796348579e-06, + "loss": 0.644, + "step": 35440 + }, + { + "epoch": 6.67, + "grad_norm": 56.260440826416016, + "learning_rate": 6.655373611895352e-06, + "loss": 0.7726, + "step": 35450 + }, + { + "epoch": 6.67, + "grad_norm": 20.503341674804688, + "learning_rate": 6.651609260304913e-06, + "loss": 0.5263, + "step": 35460 + }, + { + "epoch": 6.68, + "grad_norm": 4.7927422523498535, + "learning_rate": 6.647844908714475e-06, + "loss": 0.5455, + "step": 35470 + }, + { + "epoch": 6.68, + "grad_norm": 0.0458342470228672, + "learning_rate": 6.644080557124036e-06, + "loss": 0.3213, + "step": 35480 + }, + { + "epoch": 6.68, + "grad_norm": 12.544048309326172, + "learning_rate": 6.640316205533598e-06, + "loss": 0.7539, + "step": 35490 + }, + { + "epoch": 6.68, + "grad_norm": 23.94100570678711, + "learning_rate": 6.636551853943158e-06, + "loss": 0.7003, + "step": 35500 + }, + { + "epoch": 6.68, + "grad_norm": 1.0551177263259888, + "learning_rate": 6.63278750235272e-06, + "loss": 0.3386, + "step": 35510 + }, + { + "epoch": 6.69, + "grad_norm": 18.300785064697266, + "learning_rate": 6.629023150762281e-06, + "loss": 0.5915, + "step": 35520 + }, + { + "epoch": 6.69, + "grad_norm": 2.241436243057251, + "learning_rate": 6.625258799171843e-06, + "loss": 0.3854, + "step": 35530 + }, + { + "epoch": 6.69, + "grad_norm": 2.476534366607666, + "learning_rate": 6.621494447581405e-06, + "loss": 0.5499, + "step": 35540 + }, + { + "epoch": 6.69, + "grad_norm": 14.764324188232422, + "learning_rate": 6.6177300959909665e-06, + "loss": 0.4017, + "step": 35550 + }, + { + "epoch": 6.69, + "grad_norm": 20.741302490234375, + "learning_rate": 6.613965744400528e-06, + "loss": 0.533, + "step": 35560 + }, + { + "epoch": 6.69, + "grad_norm": 20.3760929107666, + "learning_rate": 6.610201392810089e-06, + "loss": 0.7259, + "step": 35570 + }, + { + "epoch": 6.7, + "grad_norm": 22.055103302001953, + "learning_rate": 6.60643704121965e-06, + "loss": 0.7339, + "step": 35580 + }, + { + "epoch": 6.7, + "grad_norm": 0.13808929920196533, + "learning_rate": 6.6026726896292115e-06, + "loss": 0.2974, + "step": 35590 + }, + { + "epoch": 6.7, + "grad_norm": 0.07140693813562393, + "learning_rate": 6.598908338038773e-06, + "loss": 0.3145, + "step": 35600 + }, + { + "epoch": 6.7, + "grad_norm": 24.929866790771484, + "learning_rate": 6.595143986448334e-06, + "loss": 0.2889, + "step": 35610 + }, + { + "epoch": 6.7, + "grad_norm": 16.1369571685791, + "learning_rate": 6.591379634857896e-06, + "loss": 1.012, + "step": 35620 + }, + { + "epoch": 6.71, + "grad_norm": 29.283899307250977, + "learning_rate": 6.587615283267458e-06, + "loss": 0.5044, + "step": 35630 + }, + { + "epoch": 6.71, + "grad_norm": 8.903387069702148, + "learning_rate": 6.58385093167702e-06, + "loss": 0.3735, + "step": 35640 + }, + { + "epoch": 6.71, + "grad_norm": 32.65473937988281, + "learning_rate": 6.580086580086581e-06, + "loss": 0.3131, + "step": 35650 + }, + { + "epoch": 6.71, + "grad_norm": 3.421217203140259, + "learning_rate": 6.5763222284961426e-06, + "loss": 0.4596, + "step": 35660 + }, + { + "epoch": 6.71, + "grad_norm": 22.2224178314209, + "learning_rate": 6.572557876905703e-06, + "loss": 0.6354, + "step": 35670 + }, + { + "epoch": 6.72, + "grad_norm": 10.610343933105469, + "learning_rate": 6.568793525315265e-06, + "loss": 0.6314, + "step": 35680 + }, + { + "epoch": 6.72, + "grad_norm": 18.17767906188965, + "learning_rate": 6.565029173724826e-06, + "loss": 0.4187, + "step": 35690 + }, + { + "epoch": 6.72, + "grad_norm": 11.569199562072754, + "learning_rate": 6.5612648221343875e-06, + "loss": 0.3357, + "step": 35700 + }, + { + "epoch": 6.72, + "grad_norm": 2.6818912029266357, + "learning_rate": 6.557500470543949e-06, + "loss": 0.3558, + "step": 35710 + }, + { + "epoch": 6.72, + "grad_norm": 7.722105026245117, + "learning_rate": 6.553736118953511e-06, + "loss": 0.3008, + "step": 35720 + }, + { + "epoch": 6.73, + "grad_norm": 47.493011474609375, + "learning_rate": 6.549971767363073e-06, + "loss": 0.6678, + "step": 35730 + }, + { + "epoch": 6.73, + "grad_norm": 21.441015243530273, + "learning_rate": 6.546207415772634e-06, + "loss": 0.5969, + "step": 35740 + }, + { + "epoch": 6.73, + "grad_norm": 0.7386395931243896, + "learning_rate": 6.542443064182196e-06, + "loss": 0.6374, + "step": 35750 + }, + { + "epoch": 6.73, + "grad_norm": 12.457733154296875, + "learning_rate": 6.538678712591756e-06, + "loss": 0.5366, + "step": 35760 + }, + { + "epoch": 6.73, + "grad_norm": 16.376632690429688, + "learning_rate": 6.534914361001318e-06, + "loss": 0.3374, + "step": 35770 + }, + { + "epoch": 6.73, + "grad_norm": 16.77738380432129, + "learning_rate": 6.531150009410879e-06, + "loss": 0.3973, + "step": 35780 + }, + { + "epoch": 6.74, + "grad_norm": 3.2299344539642334, + "learning_rate": 6.527385657820441e-06, + "loss": 0.3064, + "step": 35790 + }, + { + "epoch": 6.74, + "grad_norm": 29.193862915039062, + "learning_rate": 6.523621306230002e-06, + "loss": 0.5058, + "step": 35800 + }, + { + "epoch": 6.74, + "grad_norm": 0.34984686970710754, + "learning_rate": 6.519856954639564e-06, + "loss": 0.4511, + "step": 35810 + }, + { + "epoch": 6.74, + "grad_norm": 14.813493728637695, + "learning_rate": 6.516092603049126e-06, + "loss": 0.4569, + "step": 35820 + }, + { + "epoch": 6.74, + "grad_norm": 15.381288528442383, + "learning_rate": 6.512328251458687e-06, + "loss": 0.5122, + "step": 35830 + }, + { + "epoch": 6.75, + "grad_norm": 4.633648872375488, + "learning_rate": 6.508563899868249e-06, + "loss": 0.4984, + "step": 35840 + }, + { + "epoch": 6.75, + "grad_norm": 9.600768089294434, + "learning_rate": 6.5047995482778095e-06, + "loss": 0.4427, + "step": 35850 + }, + { + "epoch": 6.75, + "grad_norm": 10.763100624084473, + "learning_rate": 6.501035196687371e-06, + "loss": 0.4402, + "step": 35860 + }, + { + "epoch": 6.75, + "grad_norm": 11.68506145477295, + "learning_rate": 6.497270845096932e-06, + "loss": 0.4209, + "step": 35870 + }, + { + "epoch": 6.75, + "grad_norm": 14.833422660827637, + "learning_rate": 6.493506493506494e-06, + "loss": 0.6574, + "step": 35880 + }, + { + "epoch": 6.76, + "grad_norm": 4.88935661315918, + "learning_rate": 6.489742141916055e-06, + "loss": 0.4479, + "step": 35890 + }, + { + "epoch": 6.76, + "grad_norm": 10.467138290405273, + "learning_rate": 6.485977790325617e-06, + "loss": 0.5853, + "step": 35900 + }, + { + "epoch": 6.76, + "grad_norm": 1.4499626159667969, + "learning_rate": 6.482213438735179e-06, + "loss": 0.4039, + "step": 35910 + }, + { + "epoch": 6.76, + "grad_norm": 14.744542121887207, + "learning_rate": 6.4784490871447405e-06, + "loss": 0.333, + "step": 35920 + }, + { + "epoch": 6.76, + "grad_norm": 0.04979345574975014, + "learning_rate": 6.474684735554301e-06, + "loss": 0.4814, + "step": 35930 + }, + { + "epoch": 6.76, + "grad_norm": 15.655257225036621, + "learning_rate": 6.470920383963863e-06, + "loss": 0.4449, + "step": 35940 + }, + { + "epoch": 6.77, + "grad_norm": 0.3292839229106903, + "learning_rate": 6.467156032373424e-06, + "loss": 0.2991, + "step": 35950 + }, + { + "epoch": 6.77, + "grad_norm": 22.464662551879883, + "learning_rate": 6.4633916807829855e-06, + "loss": 0.9058, + "step": 35960 + }, + { + "epoch": 6.77, + "grad_norm": 26.670724868774414, + "learning_rate": 6.459627329192547e-06, + "loss": 0.5506, + "step": 35970 + }, + { + "epoch": 6.77, + "grad_norm": 20.745296478271484, + "learning_rate": 6.4558629776021084e-06, + "loss": 0.2994, + "step": 35980 + }, + { + "epoch": 6.77, + "grad_norm": 25.35480499267578, + "learning_rate": 6.45209862601167e-06, + "loss": 0.5347, + "step": 35990 + }, + { + "epoch": 6.78, + "grad_norm": 0.18468712270259857, + "learning_rate": 6.448334274421232e-06, + "loss": 0.4672, + "step": 36000 + }, + { + "epoch": 6.78, + "grad_norm": 27.896060943603516, + "learning_rate": 6.444569922830794e-06, + "loss": 0.7579, + "step": 36010 + }, + { + "epoch": 6.78, + "grad_norm": 8.810677528381348, + "learning_rate": 6.440805571240354e-06, + "loss": 0.4414, + "step": 36020 + }, + { + "epoch": 6.78, + "grad_norm": 32.96968078613281, + "learning_rate": 6.437041219649916e-06, + "loss": 0.336, + "step": 36030 + }, + { + "epoch": 6.78, + "grad_norm": 35.82292556762695, + "learning_rate": 6.433276868059477e-06, + "loss": 0.5106, + "step": 36040 + }, + { + "epoch": 6.79, + "grad_norm": 27.85030746459961, + "learning_rate": 6.429512516469039e-06, + "loss": 0.5962, + "step": 36050 + }, + { + "epoch": 6.79, + "grad_norm": 27.592071533203125, + "learning_rate": 6.4257481648786e-06, + "loss": 0.7332, + "step": 36060 + }, + { + "epoch": 6.79, + "grad_norm": 23.796886444091797, + "learning_rate": 6.421983813288162e-06, + "loss": 0.4692, + "step": 36070 + }, + { + "epoch": 6.79, + "grad_norm": 14.783629417419434, + "learning_rate": 6.418219461697723e-06, + "loss": 0.4037, + "step": 36080 + }, + { + "epoch": 6.79, + "grad_norm": 14.800186157226562, + "learning_rate": 6.414455110107285e-06, + "loss": 0.3501, + "step": 36090 + }, + { + "epoch": 6.79, + "grad_norm": 31.679777145385742, + "learning_rate": 6.410690758516847e-06, + "loss": 0.3348, + "step": 36100 + }, + { + "epoch": 6.8, + "grad_norm": 27.46072769165039, + "learning_rate": 6.406926406926407e-06, + "loss": 0.7376, + "step": 36110 + }, + { + "epoch": 6.8, + "grad_norm": 5.877180099487305, + "learning_rate": 6.403162055335969e-06, + "loss": 0.6983, + "step": 36120 + }, + { + "epoch": 6.8, + "grad_norm": 2.0558340549468994, + "learning_rate": 6.39939770374553e-06, + "loss": 0.499, + "step": 36130 + }, + { + "epoch": 6.8, + "grad_norm": 7.006868839263916, + "learning_rate": 6.395633352155092e-06, + "loss": 0.544, + "step": 36140 + }, + { + "epoch": 6.8, + "grad_norm": 15.996200561523438, + "learning_rate": 6.391869000564653e-06, + "loss": 0.3828, + "step": 36150 + }, + { + "epoch": 6.81, + "grad_norm": 11.61379337310791, + "learning_rate": 6.388104648974215e-06, + "loss": 0.4883, + "step": 36160 + }, + { + "epoch": 6.81, + "grad_norm": 7.964717388153076, + "learning_rate": 6.384340297383776e-06, + "loss": 0.3079, + "step": 36170 + }, + { + "epoch": 6.81, + "grad_norm": 0.344844788312912, + "learning_rate": 6.380575945793338e-06, + "loss": 0.5976, + "step": 36180 + }, + { + "epoch": 6.81, + "grad_norm": 5.613010883331299, + "learning_rate": 6.376811594202898e-06, + "loss": 0.5082, + "step": 36190 + }, + { + "epoch": 6.81, + "grad_norm": 33.136268615722656, + "learning_rate": 6.37304724261246e-06, + "loss": 0.4065, + "step": 36200 + }, + { + "epoch": 6.82, + "grad_norm": 26.772239685058594, + "learning_rate": 6.369282891022022e-06, + "loss": 0.6971, + "step": 36210 + }, + { + "epoch": 6.82, + "grad_norm": 21.456703186035156, + "learning_rate": 6.3655185394315835e-06, + "loss": 0.3556, + "step": 36220 + }, + { + "epoch": 6.82, + "grad_norm": 26.07500648498535, + "learning_rate": 6.361754187841145e-06, + "loss": 0.5151, + "step": 36230 + }, + { + "epoch": 6.82, + "grad_norm": 19.035114288330078, + "learning_rate": 6.3579898362507064e-06, + "loss": 0.897, + "step": 36240 + }, + { + "epoch": 6.82, + "grad_norm": 11.198712348937988, + "learning_rate": 6.354225484660268e-06, + "loss": 0.6124, + "step": 36250 + }, + { + "epoch": 6.82, + "grad_norm": 7.805197715759277, + "learning_rate": 6.350461133069829e-06, + "loss": 0.6911, + "step": 36260 + }, + { + "epoch": 6.83, + "grad_norm": 29.66564178466797, + "learning_rate": 6.346696781479391e-06, + "loss": 0.2914, + "step": 36270 + }, + { + "epoch": 6.83, + "grad_norm": 42.64144515991211, + "learning_rate": 6.342932429888951e-06, + "loss": 0.8512, + "step": 36280 + }, + { + "epoch": 6.83, + "grad_norm": 6.411401748657227, + "learning_rate": 6.339168078298513e-06, + "loss": 0.5668, + "step": 36290 + }, + { + "epoch": 6.83, + "grad_norm": 11.873297691345215, + "learning_rate": 6.335403726708075e-06, + "loss": 0.6048, + "step": 36300 + }, + { + "epoch": 6.83, + "grad_norm": 18.666488647460938, + "learning_rate": 6.331639375117637e-06, + "loss": 0.6428, + "step": 36310 + }, + { + "epoch": 6.84, + "grad_norm": 13.76421070098877, + "learning_rate": 6.327875023527198e-06, + "loss": 0.4083, + "step": 36320 + }, + { + "epoch": 6.84, + "grad_norm": 10.075658798217773, + "learning_rate": 6.3241106719367596e-06, + "loss": 0.3769, + "step": 36330 + }, + { + "epoch": 6.84, + "grad_norm": 19.852584838867188, + "learning_rate": 6.320346320346321e-06, + "loss": 0.6485, + "step": 36340 + }, + { + "epoch": 6.84, + "grad_norm": 15.061224937438965, + "learning_rate": 6.3165819687558825e-06, + "loss": 0.4621, + "step": 36350 + }, + { + "epoch": 6.84, + "grad_norm": 15.020421981811523, + "learning_rate": 6.312817617165444e-06, + "loss": 0.5403, + "step": 36360 + }, + { + "epoch": 6.85, + "grad_norm": 0.03311420977115631, + "learning_rate": 6.3090532655750046e-06, + "loss": 0.2076, + "step": 36370 + }, + { + "epoch": 6.85, + "grad_norm": 40.68346405029297, + "learning_rate": 6.305288913984566e-06, + "loss": 0.529, + "step": 36380 + }, + { + "epoch": 6.85, + "grad_norm": 6.1792521476745605, + "learning_rate": 6.3015245623941275e-06, + "loss": 0.5218, + "step": 36390 + }, + { + "epoch": 6.85, + "grad_norm": 18.217483520507812, + "learning_rate": 6.29776021080369e-06, + "loss": 0.3166, + "step": 36400 + }, + { + "epoch": 6.85, + "grad_norm": 0.7558429837226868, + "learning_rate": 6.293995859213251e-06, + "loss": 0.6442, + "step": 36410 + }, + { + "epoch": 6.85, + "grad_norm": 2.1329829692840576, + "learning_rate": 6.290231507622813e-06, + "loss": 0.5574, + "step": 36420 + }, + { + "epoch": 6.86, + "grad_norm": 19.163063049316406, + "learning_rate": 6.286467156032374e-06, + "loss": 0.4113, + "step": 36430 + }, + { + "epoch": 6.86, + "grad_norm": 21.05096435546875, + "learning_rate": 6.282702804441936e-06, + "loss": 0.644, + "step": 36440 + }, + { + "epoch": 6.86, + "grad_norm": 15.979854583740234, + "learning_rate": 6.278938452851497e-06, + "loss": 0.4465, + "step": 36450 + }, + { + "epoch": 6.86, + "grad_norm": 0.3811950087547302, + "learning_rate": 6.275174101261058e-06, + "loss": 0.472, + "step": 36460 + }, + { + "epoch": 6.86, + "grad_norm": 2.4271154403686523, + "learning_rate": 6.271409749670619e-06, + "loss": 0.6266, + "step": 36470 + }, + { + "epoch": 6.87, + "grad_norm": 21.372413635253906, + "learning_rate": 6.267645398080181e-06, + "loss": 0.5725, + "step": 36480 + }, + { + "epoch": 6.87, + "grad_norm": 13.645919799804688, + "learning_rate": 6.263881046489743e-06, + "loss": 0.4645, + "step": 36490 + }, + { + "epoch": 6.87, + "grad_norm": 56.878578186035156, + "learning_rate": 6.260116694899304e-06, + "loss": 0.5547, + "step": 36500 + }, + { + "epoch": 6.87, + "grad_norm": 2.6566834449768066, + "learning_rate": 6.256352343308866e-06, + "loss": 0.3839, + "step": 36510 + }, + { + "epoch": 6.87, + "grad_norm": 18.267131805419922, + "learning_rate": 6.252587991718427e-06, + "loss": 0.3353, + "step": 36520 + }, + { + "epoch": 6.88, + "grad_norm": 29.942182540893555, + "learning_rate": 6.248823640127989e-06, + "loss": 0.8058, + "step": 36530 + }, + { + "epoch": 6.88, + "grad_norm": 44.51432418823242, + "learning_rate": 6.245059288537549e-06, + "loss": 0.7491, + "step": 36540 + }, + { + "epoch": 6.88, + "grad_norm": 22.656213760375977, + "learning_rate": 6.241294936947111e-06, + "loss": 0.3901, + "step": 36550 + }, + { + "epoch": 6.88, + "grad_norm": 4.600666046142578, + "learning_rate": 6.237530585356672e-06, + "loss": 0.4892, + "step": 36560 + }, + { + "epoch": 6.88, + "grad_norm": 0.16996584832668304, + "learning_rate": 6.233766233766234e-06, + "loss": 0.3893, + "step": 36570 + }, + { + "epoch": 6.88, + "grad_norm": 7.973287105560303, + "learning_rate": 6.230001882175796e-06, + "loss": 0.4334, + "step": 36580 + }, + { + "epoch": 6.89, + "grad_norm": 6.781386375427246, + "learning_rate": 6.2262375305853575e-06, + "loss": 0.4581, + "step": 36590 + }, + { + "epoch": 6.89, + "grad_norm": 17.54666519165039, + "learning_rate": 6.222473178994919e-06, + "loss": 0.351, + "step": 36600 + }, + { + "epoch": 6.89, + "grad_norm": 1.0639405250549316, + "learning_rate": 6.2187088274044805e-06, + "loss": 0.514, + "step": 36610 + }, + { + "epoch": 6.89, + "grad_norm": 0.03612279146909714, + "learning_rate": 6.214944475814042e-06, + "loss": 0.6789, + "step": 36620 + }, + { + "epoch": 6.89, + "grad_norm": 13.270979881286621, + "learning_rate": 6.2111801242236025e-06, + "loss": 0.4822, + "step": 36630 + }, + { + "epoch": 6.9, + "grad_norm": 1.6162692308425903, + "learning_rate": 6.207415772633164e-06, + "loss": 0.4827, + "step": 36640 + }, + { + "epoch": 6.9, + "grad_norm": 28.315080642700195, + "learning_rate": 6.2036514210427255e-06, + "loss": 0.616, + "step": 36650 + }, + { + "epoch": 6.9, + "grad_norm": 11.296304702758789, + "learning_rate": 6.199887069452287e-06, + "loss": 0.6246, + "step": 36660 + }, + { + "epoch": 6.9, + "grad_norm": 10.235584259033203, + "learning_rate": 6.196122717861849e-06, + "loss": 0.433, + "step": 36670 + }, + { + "epoch": 6.9, + "grad_norm": 26.53406524658203, + "learning_rate": 6.192358366271411e-06, + "loss": 0.3869, + "step": 36680 + }, + { + "epoch": 6.91, + "grad_norm": 3.4729597568511963, + "learning_rate": 6.188594014680972e-06, + "loss": 0.3534, + "step": 36690 + }, + { + "epoch": 6.91, + "grad_norm": 20.50278091430664, + "learning_rate": 6.184829663090534e-06, + "loss": 0.3664, + "step": 36700 + }, + { + "epoch": 6.91, + "grad_norm": 7.9125566482543945, + "learning_rate": 6.181065311500095e-06, + "loss": 0.3867, + "step": 36710 + }, + { + "epoch": 6.91, + "grad_norm": 17.5158634185791, + "learning_rate": 6.177300959909656e-06, + "loss": 0.315, + "step": 36720 + }, + { + "epoch": 6.91, + "grad_norm": 30.33664321899414, + "learning_rate": 6.173536608319217e-06, + "loss": 0.4541, + "step": 36730 + }, + { + "epoch": 6.92, + "grad_norm": 24.297866821289062, + "learning_rate": 6.169772256728779e-06, + "loss": 0.772, + "step": 36740 + }, + { + "epoch": 6.92, + "grad_norm": 16.661861419677734, + "learning_rate": 6.16600790513834e-06, + "loss": 0.5398, + "step": 36750 + }, + { + "epoch": 6.92, + "grad_norm": 0.9989810585975647, + "learning_rate": 6.1622435535479015e-06, + "loss": 0.4729, + "step": 36760 + }, + { + "epoch": 6.92, + "grad_norm": 20.895919799804688, + "learning_rate": 6.158479201957464e-06, + "loss": 0.5922, + "step": 36770 + }, + { + "epoch": 6.92, + "grad_norm": 13.476861953735352, + "learning_rate": 6.154714850367025e-06, + "loss": 0.6265, + "step": 36780 + }, + { + "epoch": 6.92, + "grad_norm": 29.94508171081543, + "learning_rate": 6.150950498776587e-06, + "loss": 0.5422, + "step": 36790 + }, + { + "epoch": 6.93, + "grad_norm": 6.413869380950928, + "learning_rate": 6.147186147186147e-06, + "loss": 0.4729, + "step": 36800 + }, + { + "epoch": 6.93, + "grad_norm": 26.468042373657227, + "learning_rate": 6.143421795595709e-06, + "loss": 0.4204, + "step": 36810 + }, + { + "epoch": 6.93, + "grad_norm": 5.048950672149658, + "learning_rate": 6.13965744400527e-06, + "loss": 0.5322, + "step": 36820 + }, + { + "epoch": 6.93, + "grad_norm": 20.66354751586914, + "learning_rate": 6.135893092414832e-06, + "loss": 0.4366, + "step": 36830 + }, + { + "epoch": 6.93, + "grad_norm": 14.539073944091797, + "learning_rate": 6.132128740824393e-06, + "loss": 0.5468, + "step": 36840 + }, + { + "epoch": 6.94, + "grad_norm": 19.072439193725586, + "learning_rate": 6.128364389233955e-06, + "loss": 0.2972, + "step": 36850 + }, + { + "epoch": 6.94, + "grad_norm": 0.2176668494939804, + "learning_rate": 6.124600037643517e-06, + "loss": 0.4319, + "step": 36860 + }, + { + "epoch": 6.94, + "grad_norm": 1.570940613746643, + "learning_rate": 6.1208356860530784e-06, + "loss": 0.2465, + "step": 36870 + }, + { + "epoch": 6.94, + "grad_norm": 0.4804990291595459, + "learning_rate": 6.11707133446264e-06, + "loss": 0.5241, + "step": 36880 + }, + { + "epoch": 6.94, + "grad_norm": 20.45847511291504, + "learning_rate": 6.1133069828722005e-06, + "loss": 0.5885, + "step": 36890 + }, + { + "epoch": 6.95, + "grad_norm": 0.5880195498466492, + "learning_rate": 6.109542631281762e-06, + "loss": 0.6118, + "step": 36900 + }, + { + "epoch": 6.95, + "grad_norm": 19.450361251831055, + "learning_rate": 6.1057782796913234e-06, + "loss": 0.6389, + "step": 36910 + }, + { + "epoch": 6.95, + "grad_norm": 6.449407577514648, + "learning_rate": 6.102013928100885e-06, + "loss": 0.4828, + "step": 36920 + }, + { + "epoch": 6.95, + "grad_norm": 19.77794075012207, + "learning_rate": 6.098249576510446e-06, + "loss": 0.5739, + "step": 36930 + }, + { + "epoch": 6.95, + "grad_norm": 19.867918014526367, + "learning_rate": 6.094485224920008e-06, + "loss": 0.5181, + "step": 36940 + }, + { + "epoch": 6.95, + "grad_norm": 11.12002944946289, + "learning_rate": 6.09072087332957e-06, + "loss": 0.526, + "step": 36950 + }, + { + "epoch": 6.96, + "grad_norm": 2.037919521331787, + "learning_rate": 6.086956521739132e-06, + "loss": 0.4911, + "step": 36960 + }, + { + "epoch": 6.96, + "grad_norm": 31.55478286743164, + "learning_rate": 6.083192170148693e-06, + "loss": 0.856, + "step": 36970 + }, + { + "epoch": 6.96, + "grad_norm": 18.756759643554688, + "learning_rate": 6.079427818558254e-06, + "loss": 0.7955, + "step": 36980 + }, + { + "epoch": 6.96, + "grad_norm": 18.737565994262695, + "learning_rate": 6.075663466967815e-06, + "loss": 0.5185, + "step": 36990 + }, + { + "epoch": 6.96, + "grad_norm": 8.580978393554688, + "learning_rate": 6.071899115377377e-06, + "loss": 0.5442, + "step": 37000 + }, + { + "epoch": 6.97, + "grad_norm": 16.809736251831055, + "learning_rate": 6.068134763786938e-06, + "loss": 0.7495, + "step": 37010 + }, + { + "epoch": 6.97, + "grad_norm": 11.04421329498291, + "learning_rate": 6.0643704121964995e-06, + "loss": 0.2769, + "step": 37020 + }, + { + "epoch": 6.97, + "grad_norm": 11.780550956726074, + "learning_rate": 6.060606060606061e-06, + "loss": 0.8881, + "step": 37030 + }, + { + "epoch": 6.97, + "grad_norm": 1.5381724834442139, + "learning_rate": 6.0568417090156224e-06, + "loss": 0.4434, + "step": 37040 + }, + { + "epoch": 6.97, + "grad_norm": 2.1745412349700928, + "learning_rate": 6.053077357425185e-06, + "loss": 0.3855, + "step": 37050 + }, + { + "epoch": 6.98, + "grad_norm": 1.2057762145996094, + "learning_rate": 6.049313005834746e-06, + "loss": 0.5586, + "step": 37060 + }, + { + "epoch": 6.98, + "grad_norm": 4.387953758239746, + "learning_rate": 6.045548654244307e-06, + "loss": 0.6215, + "step": 37070 + }, + { + "epoch": 6.98, + "grad_norm": 15.834346771240234, + "learning_rate": 6.041784302653868e-06, + "loss": 0.6887, + "step": 37080 + }, + { + "epoch": 6.98, + "grad_norm": 22.549379348754883, + "learning_rate": 6.03801995106343e-06, + "loss": 0.5226, + "step": 37090 + }, + { + "epoch": 6.98, + "grad_norm": 21.140119552612305, + "learning_rate": 6.034255599472991e-06, + "loss": 0.8414, + "step": 37100 + }, + { + "epoch": 6.98, + "grad_norm": 8.247200012207031, + "learning_rate": 6.030491247882553e-06, + "loss": 0.49, + "step": 37110 + }, + { + "epoch": 6.99, + "grad_norm": 7.904429912567139, + "learning_rate": 6.026726896292114e-06, + "loss": 0.2786, + "step": 37120 + }, + { + "epoch": 6.99, + "grad_norm": 26.10804557800293, + "learning_rate": 6.022962544701676e-06, + "loss": 0.3379, + "step": 37130 + }, + { + "epoch": 6.99, + "grad_norm": 12.558807373046875, + "learning_rate": 6.019198193111238e-06, + "loss": 0.6979, + "step": 37140 + }, + { + "epoch": 6.99, + "grad_norm": 4.539823055267334, + "learning_rate": 6.015433841520798e-06, + "loss": 0.4851, + "step": 37150 + }, + { + "epoch": 6.99, + "grad_norm": 17.593862533569336, + "learning_rate": 6.01166948993036e-06, + "loss": 0.4093, + "step": 37160 + }, + { + "epoch": 7.0, + "grad_norm": 6.471458435058594, + "learning_rate": 6.007905138339921e-06, + "loss": 0.4581, + "step": 37170 + }, + { + "epoch": 7.0, + "grad_norm": 6.4814348220825195, + "learning_rate": 6.004140786749483e-06, + "loss": 0.4462, + "step": 37180 + }, + { + "epoch": 7.0, + "grad_norm": 0.26597660779953003, + "learning_rate": 6.000376435159044e-06, + "loss": 0.303, + "step": 37190 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.9241333333333334, + "eval_loss": 0.30108216404914856, + "eval_runtime": 52.0717, + "eval_samples_per_second": 144.032, + "eval_steps_per_second": 18.014, + "step": 37191 + }, + { + "epoch": 7.0, + "grad_norm": 4.4914164543151855, + "learning_rate": 5.996612083568606e-06, + "loss": 0.5521, + "step": 37200 + }, + { + "epoch": 7.0, + "grad_norm": 20.4888973236084, + "learning_rate": 5.992847731978167e-06, + "loss": 0.3498, + "step": 37210 + }, + { + "epoch": 7.01, + "grad_norm": 18.38087272644043, + "learning_rate": 5.989083380387729e-06, + "loss": 0.6529, + "step": 37220 + }, + { + "epoch": 7.01, + "grad_norm": 9.481815338134766, + "learning_rate": 5.985319028797291e-06, + "loss": 0.5024, + "step": 37230 + }, + { + "epoch": 7.01, + "grad_norm": 2.8433196544647217, + "learning_rate": 5.981554677206851e-06, + "loss": 0.3982, + "step": 37240 + }, + { + "epoch": 7.01, + "grad_norm": 26.133316040039062, + "learning_rate": 5.977790325616413e-06, + "loss": 0.9014, + "step": 37250 + }, + { + "epoch": 7.01, + "grad_norm": 17.805519104003906, + "learning_rate": 5.9740259740259746e-06, + "loss": 0.4383, + "step": 37260 + }, + { + "epoch": 7.01, + "grad_norm": 8.60024642944336, + "learning_rate": 5.970261622435536e-06, + "loss": 0.7659, + "step": 37270 + }, + { + "epoch": 7.02, + "grad_norm": 1.4241914749145508, + "learning_rate": 5.9664972708450975e-06, + "loss": 0.5184, + "step": 37280 + }, + { + "epoch": 7.02, + "grad_norm": 3.293872356414795, + "learning_rate": 5.962732919254659e-06, + "loss": 0.4679, + "step": 37290 + }, + { + "epoch": 7.02, + "grad_norm": 9.334550857543945, + "learning_rate": 5.95896856766422e-06, + "loss": 0.3055, + "step": 37300 + }, + { + "epoch": 7.02, + "grad_norm": 12.182844161987305, + "learning_rate": 5.955204216073782e-06, + "loss": 0.4598, + "step": 37310 + }, + { + "epoch": 7.02, + "grad_norm": 21.008893966674805, + "learning_rate": 5.951439864483344e-06, + "loss": 0.5906, + "step": 37320 + }, + { + "epoch": 7.03, + "grad_norm": 21.775415420532227, + "learning_rate": 5.947675512892904e-06, + "loss": 0.4748, + "step": 37330 + }, + { + "epoch": 7.03, + "grad_norm": 33.95808792114258, + "learning_rate": 5.943911161302465e-06, + "loss": 0.6582, + "step": 37340 + }, + { + "epoch": 7.03, + "grad_norm": 28.52260971069336, + "learning_rate": 5.940146809712028e-06, + "loss": 0.4331, + "step": 37350 + }, + { + "epoch": 7.03, + "grad_norm": 0.2158019095659256, + "learning_rate": 5.936382458121589e-06, + "loss": 0.4771, + "step": 37360 + }, + { + "epoch": 7.03, + "grad_norm": 11.500349998474121, + "learning_rate": 5.932618106531151e-06, + "loss": 0.2201, + "step": 37370 + }, + { + "epoch": 7.04, + "grad_norm": 13.823716163635254, + "learning_rate": 5.928853754940712e-06, + "loss": 0.429, + "step": 37380 + }, + { + "epoch": 7.04, + "grad_norm": 20.627649307250977, + "learning_rate": 5.9250894033502736e-06, + "loss": 0.526, + "step": 37390 + }, + { + "epoch": 7.04, + "grad_norm": 1.0602643489837646, + "learning_rate": 5.921325051759835e-06, + "loss": 0.3343, + "step": 37400 + }, + { + "epoch": 7.04, + "grad_norm": 34.25511932373047, + "learning_rate": 5.917560700169396e-06, + "loss": 0.7096, + "step": 37410 + }, + { + "epoch": 7.04, + "grad_norm": 58.332923889160156, + "learning_rate": 5.913796348578957e-06, + "loss": 0.5007, + "step": 37420 + }, + { + "epoch": 7.04, + "grad_norm": 0.520494282245636, + "learning_rate": 5.9100319969885186e-06, + "loss": 0.6852, + "step": 37430 + }, + { + "epoch": 7.05, + "grad_norm": 14.561441421508789, + "learning_rate": 5.906267645398081e-06, + "loss": 0.7221, + "step": 37440 + }, + { + "epoch": 7.05, + "grad_norm": 7.289056777954102, + "learning_rate": 5.902503293807642e-06, + "loss": 0.4541, + "step": 37450 + }, + { + "epoch": 7.05, + "grad_norm": 15.227532386779785, + "learning_rate": 5.898738942217204e-06, + "loss": 0.6697, + "step": 37460 + }, + { + "epoch": 7.05, + "grad_norm": 33.787776947021484, + "learning_rate": 5.894974590626765e-06, + "loss": 0.5081, + "step": 37470 + }, + { + "epoch": 7.05, + "grad_norm": 18.24729347229004, + "learning_rate": 5.891210239036327e-06, + "loss": 0.5493, + "step": 37480 + }, + { + "epoch": 7.06, + "grad_norm": 17.62848472595215, + "learning_rate": 5.887445887445888e-06, + "loss": 0.6633, + "step": 37490 + }, + { + "epoch": 7.06, + "grad_norm": 13.691800117492676, + "learning_rate": 5.883681535855449e-06, + "loss": 0.5323, + "step": 37500 + }, + { + "epoch": 7.06, + "grad_norm": 33.989376068115234, + "learning_rate": 5.87991718426501e-06, + "loss": 0.5766, + "step": 37510 + }, + { + "epoch": 7.06, + "grad_norm": 6.813748359680176, + "learning_rate": 5.876152832674572e-06, + "loss": 0.4941, + "step": 37520 + }, + { + "epoch": 7.06, + "grad_norm": 10.321836471557617, + "learning_rate": 5.872388481084134e-06, + "loss": 0.6905, + "step": 37530 + }, + { + "epoch": 7.07, + "grad_norm": 0.11116129904985428, + "learning_rate": 5.8686241294936955e-06, + "loss": 0.2757, + "step": 37540 + }, + { + "epoch": 7.07, + "grad_norm": 15.543323516845703, + "learning_rate": 5.864859777903257e-06, + "loss": 0.5171, + "step": 37550 + }, + { + "epoch": 7.07, + "grad_norm": 10.380474090576172, + "learning_rate": 5.861095426312818e-06, + "loss": 0.6989, + "step": 37560 + }, + { + "epoch": 7.07, + "grad_norm": 4.593369007110596, + "learning_rate": 5.85733107472238e-06, + "loss": 0.3715, + "step": 37570 + }, + { + "epoch": 7.07, + "grad_norm": 0.048009395599365234, + "learning_rate": 5.853566723131941e-06, + "loss": 0.4439, + "step": 37580 + }, + { + "epoch": 7.08, + "grad_norm": 0.29102352261543274, + "learning_rate": 5.849802371541502e-06, + "loss": 0.3896, + "step": 37590 + }, + { + "epoch": 7.08, + "grad_norm": 31.79117774963379, + "learning_rate": 5.846038019951063e-06, + "loss": 0.4598, + "step": 37600 + }, + { + "epoch": 7.08, + "grad_norm": 0.6519231796264648, + "learning_rate": 5.842273668360625e-06, + "loss": 0.4029, + "step": 37610 + }, + { + "epoch": 7.08, + "grad_norm": 0.04763523116707802, + "learning_rate": 5.838509316770186e-06, + "loss": 0.4446, + "step": 37620 + }, + { + "epoch": 7.08, + "grad_norm": 5.059743881225586, + "learning_rate": 5.834744965179749e-06, + "loss": 0.2375, + "step": 37630 + }, + { + "epoch": 7.08, + "grad_norm": 18.59803581237793, + "learning_rate": 5.83098061358931e-06, + "loss": 0.2464, + "step": 37640 + }, + { + "epoch": 7.09, + "grad_norm": 10.456582069396973, + "learning_rate": 5.8272162619988715e-06, + "loss": 0.3523, + "step": 37650 + }, + { + "epoch": 7.09, + "grad_norm": 32.90705871582031, + "learning_rate": 5.823451910408433e-06, + "loss": 0.5078, + "step": 37660 + }, + { + "epoch": 7.09, + "grad_norm": 3.806859254837036, + "learning_rate": 5.8196875588179945e-06, + "loss": 0.3444, + "step": 37670 + }, + { + "epoch": 7.09, + "grad_norm": 0.30914196372032166, + "learning_rate": 5.815923207227555e-06, + "loss": 0.4412, + "step": 37680 + }, + { + "epoch": 7.09, + "grad_norm": 1.1022928953170776, + "learning_rate": 5.8121588556371165e-06, + "loss": 0.4646, + "step": 37690 + }, + { + "epoch": 7.1, + "grad_norm": 2.6765952110290527, + "learning_rate": 5.808394504046678e-06, + "loss": 0.4558, + "step": 37700 + }, + { + "epoch": 7.1, + "grad_norm": 18.882295608520508, + "learning_rate": 5.8046301524562395e-06, + "loss": 0.3892, + "step": 37710 + }, + { + "epoch": 7.1, + "grad_norm": 23.711225509643555, + "learning_rate": 5.800865800865802e-06, + "loss": 0.4363, + "step": 37720 + }, + { + "epoch": 7.1, + "grad_norm": 15.44466781616211, + "learning_rate": 5.797101449275363e-06, + "loss": 0.3836, + "step": 37730 + }, + { + "epoch": 7.1, + "grad_norm": 42.25343322753906, + "learning_rate": 5.793337097684925e-06, + "loss": 0.3638, + "step": 37740 + }, + { + "epoch": 7.11, + "grad_norm": 5.034309387207031, + "learning_rate": 5.789572746094486e-06, + "loss": 0.4246, + "step": 37750 + }, + { + "epoch": 7.11, + "grad_norm": 8.63538646697998, + "learning_rate": 5.785808394504047e-06, + "loss": 0.3743, + "step": 37760 + }, + { + "epoch": 7.11, + "grad_norm": 7.230229377746582, + "learning_rate": 5.782044042913608e-06, + "loss": 0.3815, + "step": 37770 + }, + { + "epoch": 7.11, + "grad_norm": 45.37762451171875, + "learning_rate": 5.77827969132317e-06, + "loss": 0.5341, + "step": 37780 + }, + { + "epoch": 7.11, + "grad_norm": 13.632198333740234, + "learning_rate": 5.774515339732731e-06, + "loss": 0.5235, + "step": 37790 + }, + { + "epoch": 7.11, + "grad_norm": 0.051364749670028687, + "learning_rate": 5.770750988142293e-06, + "loss": 0.2958, + "step": 37800 + }, + { + "epoch": 7.12, + "grad_norm": 31.266891479492188, + "learning_rate": 5.766986636551855e-06, + "loss": 0.7418, + "step": 37810 + }, + { + "epoch": 7.12, + "grad_norm": 21.40471649169922, + "learning_rate": 5.763222284961416e-06, + "loss": 0.5075, + "step": 37820 + }, + { + "epoch": 7.12, + "grad_norm": 0.524456262588501, + "learning_rate": 5.759457933370978e-06, + "loss": 0.3901, + "step": 37830 + }, + { + "epoch": 7.12, + "grad_norm": 4.217536926269531, + "learning_rate": 5.755693581780539e-06, + "loss": 0.5058, + "step": 37840 + }, + { + "epoch": 7.12, + "grad_norm": 14.552117347717285, + "learning_rate": 5.7519292301901e-06, + "loss": 0.7943, + "step": 37850 + }, + { + "epoch": 7.13, + "grad_norm": 8.193163871765137, + "learning_rate": 5.748164878599661e-06, + "loss": 0.4663, + "step": 37860 + }, + { + "epoch": 7.13, + "grad_norm": 29.58894920349121, + "learning_rate": 5.744400527009223e-06, + "loss": 0.4782, + "step": 37870 + }, + { + "epoch": 7.13, + "grad_norm": 1.9362828731536865, + "learning_rate": 5.740636175418784e-06, + "loss": 0.4437, + "step": 37880 + }, + { + "epoch": 7.13, + "grad_norm": 11.760843276977539, + "learning_rate": 5.736871823828346e-06, + "loss": 0.6013, + "step": 37890 + }, + { + "epoch": 7.13, + "grad_norm": 11.666337966918945, + "learning_rate": 5.733107472237908e-06, + "loss": 0.477, + "step": 37900 + }, + { + "epoch": 7.14, + "grad_norm": 28.812885284423828, + "learning_rate": 5.7293431206474695e-06, + "loss": 0.5293, + "step": 37910 + }, + { + "epoch": 7.14, + "grad_norm": 38.73482894897461, + "learning_rate": 5.725578769057031e-06, + "loss": 0.5247, + "step": 37920 + }, + { + "epoch": 7.14, + "grad_norm": 14.522818565368652, + "learning_rate": 5.7218144174665924e-06, + "loss": 0.6026, + "step": 37930 + }, + { + "epoch": 7.14, + "grad_norm": 5.249240875244141, + "learning_rate": 5.718050065876153e-06, + "loss": 0.3719, + "step": 37940 + }, + { + "epoch": 7.14, + "grad_norm": 20.325544357299805, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.4992, + "step": 37950 + }, + { + "epoch": 7.14, + "grad_norm": 10.740631103515625, + "learning_rate": 5.710521362695276e-06, + "loss": 0.5879, + "step": 37960 + }, + { + "epoch": 7.15, + "grad_norm": 12.676395416259766, + "learning_rate": 5.7067570111048374e-06, + "loss": 0.46, + "step": 37970 + }, + { + "epoch": 7.15, + "grad_norm": 18.58064079284668, + "learning_rate": 5.702992659514399e-06, + "loss": 0.3504, + "step": 37980 + }, + { + "epoch": 7.15, + "grad_norm": 4.198108196258545, + "learning_rate": 5.69922830792396e-06, + "loss": 0.5009, + "step": 37990 + }, + { + "epoch": 7.15, + "grad_norm": 41.74122619628906, + "learning_rate": 5.695463956333523e-06, + "loss": 0.4566, + "step": 38000 + }, + { + "epoch": 7.15, + "grad_norm": 4.8473219871521, + "learning_rate": 5.691699604743084e-06, + "loss": 0.437, + "step": 38010 + }, + { + "epoch": 7.16, + "grad_norm": 0.8085455298423767, + "learning_rate": 5.687935253152645e-06, + "loss": 0.4446, + "step": 38020 + }, + { + "epoch": 7.16, + "grad_norm": 12.186211585998535, + "learning_rate": 5.684170901562206e-06, + "loss": 0.3344, + "step": 38030 + }, + { + "epoch": 7.16, + "grad_norm": 8.344589233398438, + "learning_rate": 5.680406549971768e-06, + "loss": 0.6024, + "step": 38040 + }, + { + "epoch": 7.16, + "grad_norm": 16.539567947387695, + "learning_rate": 5.676642198381329e-06, + "loss": 0.4564, + "step": 38050 + }, + { + "epoch": 7.16, + "grad_norm": 13.2172269821167, + "learning_rate": 5.672877846790891e-06, + "loss": 0.336, + "step": 38060 + }, + { + "epoch": 7.17, + "grad_norm": 0.8715003728866577, + "learning_rate": 5.669113495200452e-06, + "loss": 0.5934, + "step": 38070 + }, + { + "epoch": 7.17, + "grad_norm": 0.03967274725437164, + "learning_rate": 5.6653491436100135e-06, + "loss": 0.4955, + "step": 38080 + }, + { + "epoch": 7.17, + "grad_norm": 42.34977722167969, + "learning_rate": 5.661584792019576e-06, + "loss": 0.6749, + "step": 38090 + }, + { + "epoch": 7.17, + "grad_norm": 13.210389137268066, + "learning_rate": 5.657820440429137e-06, + "loss": 0.4131, + "step": 38100 + }, + { + "epoch": 7.17, + "grad_norm": 6.826159477233887, + "learning_rate": 5.654056088838698e-06, + "loss": 0.3849, + "step": 38110 + }, + { + "epoch": 7.17, + "grad_norm": 4.8396406173706055, + "learning_rate": 5.650291737248259e-06, + "loss": 0.4347, + "step": 38120 + }, + { + "epoch": 7.18, + "grad_norm": 0.492663711309433, + "learning_rate": 5.646527385657821e-06, + "loss": 0.462, + "step": 38130 + }, + { + "epoch": 7.18, + "grad_norm": 11.567475318908691, + "learning_rate": 5.642763034067382e-06, + "loss": 0.4399, + "step": 38140 + }, + { + "epoch": 7.18, + "grad_norm": 15.863377571105957, + "learning_rate": 5.638998682476944e-06, + "loss": 0.8885, + "step": 38150 + }, + { + "epoch": 7.18, + "grad_norm": 14.544013977050781, + "learning_rate": 5.635234330886505e-06, + "loss": 0.5326, + "step": 38160 + }, + { + "epoch": 7.18, + "grad_norm": 10.913338661193848, + "learning_rate": 5.631469979296067e-06, + "loss": 0.4607, + "step": 38170 + }, + { + "epoch": 7.19, + "grad_norm": 9.55777645111084, + "learning_rate": 5.627705627705629e-06, + "loss": 0.2982, + "step": 38180 + }, + { + "epoch": 7.19, + "grad_norm": 32.77585983276367, + "learning_rate": 5.62394127611519e-06, + "loss": 0.5706, + "step": 38190 + }, + { + "epoch": 7.19, + "grad_norm": 17.475671768188477, + "learning_rate": 5.620176924524751e-06, + "loss": 0.5987, + "step": 38200 + }, + { + "epoch": 7.19, + "grad_norm": 20.316225051879883, + "learning_rate": 5.6164125729343125e-06, + "loss": 0.2997, + "step": 38210 + }, + { + "epoch": 7.19, + "grad_norm": 24.66366958618164, + "learning_rate": 5.612648221343874e-06, + "loss": 0.5617, + "step": 38220 + }, + { + "epoch": 7.2, + "grad_norm": 2.09023380279541, + "learning_rate": 5.608883869753435e-06, + "loss": 0.1539, + "step": 38230 + }, + { + "epoch": 7.2, + "grad_norm": 15.220237731933594, + "learning_rate": 5.605119518162997e-06, + "loss": 0.426, + "step": 38240 + }, + { + "epoch": 7.2, + "grad_norm": 2.988645076751709, + "learning_rate": 5.601355166572558e-06, + "loss": 0.4009, + "step": 38250 + }, + { + "epoch": 7.2, + "grad_norm": 10.845462799072266, + "learning_rate": 5.59759081498212e-06, + "loss": 0.3647, + "step": 38260 + }, + { + "epoch": 7.2, + "grad_norm": 24.533771514892578, + "learning_rate": 5.593826463391681e-06, + "loss": 0.6294, + "step": 38270 + }, + { + "epoch": 7.2, + "grad_norm": 33.65419006347656, + "learning_rate": 5.590062111801242e-06, + "loss": 0.608, + "step": 38280 + }, + { + "epoch": 7.21, + "grad_norm": 22.451557159423828, + "learning_rate": 5.586297760210803e-06, + "loss": 0.421, + "step": 38290 + }, + { + "epoch": 7.21, + "grad_norm": 10.964974403381348, + "learning_rate": 5.582533408620366e-06, + "loss": 0.4638, + "step": 38300 + }, + { + "epoch": 7.21, + "grad_norm": 0.03477161377668381, + "learning_rate": 5.578769057029927e-06, + "loss": 0.4434, + "step": 38310 + }, + { + "epoch": 7.21, + "grad_norm": 0.03130391240119934, + "learning_rate": 5.5750047054394886e-06, + "loss": 0.65, + "step": 38320 + }, + { + "epoch": 7.21, + "grad_norm": 7.138416290283203, + "learning_rate": 5.57124035384905e-06, + "loss": 0.4412, + "step": 38330 + }, + { + "epoch": 7.22, + "grad_norm": 0.3959789574146271, + "learning_rate": 5.5674760022586115e-06, + "loss": 0.3357, + "step": 38340 + }, + { + "epoch": 7.22, + "grad_norm": 14.614726066589355, + "learning_rate": 5.563711650668173e-06, + "loss": 0.6037, + "step": 38350 + }, + { + "epoch": 7.22, + "grad_norm": 9.566722869873047, + "learning_rate": 5.559947299077734e-06, + "loss": 0.4153, + "step": 38360 + }, + { + "epoch": 7.22, + "grad_norm": 24.345317840576172, + "learning_rate": 5.556182947487295e-06, + "loss": 0.3658, + "step": 38370 + }, + { + "epoch": 7.22, + "grad_norm": 45.182655334472656, + "learning_rate": 5.5524185958968565e-06, + "loss": 0.3317, + "step": 38380 + }, + { + "epoch": 7.23, + "grad_norm": 20.688343048095703, + "learning_rate": 5.548654244306419e-06, + "loss": 0.6442, + "step": 38390 + }, + { + "epoch": 7.23, + "grad_norm": 0.04111315310001373, + "learning_rate": 5.54488989271598e-06, + "loss": 0.6456, + "step": 38400 + }, + { + "epoch": 7.23, + "grad_norm": 25.130800247192383, + "learning_rate": 5.541125541125542e-06, + "loss": 0.4863, + "step": 38410 + }, + { + "epoch": 7.23, + "grad_norm": 8.105908393859863, + "learning_rate": 5.537361189535103e-06, + "loss": 0.6085, + "step": 38420 + }, + { + "epoch": 7.23, + "grad_norm": 20.2083740234375, + "learning_rate": 5.533596837944665e-06, + "loss": 0.5963, + "step": 38430 + }, + { + "epoch": 7.24, + "grad_norm": 0.32264116406440735, + "learning_rate": 5.529832486354226e-06, + "loss": 0.2069, + "step": 38440 + }, + { + "epoch": 7.24, + "grad_norm": 15.73507022857666, + "learning_rate": 5.5260681347637875e-06, + "loss": 0.3864, + "step": 38450 + }, + { + "epoch": 7.24, + "grad_norm": 13.899942398071289, + "learning_rate": 5.522303783173348e-06, + "loss": 0.4887, + "step": 38460 + }, + { + "epoch": 7.24, + "grad_norm": 29.511226654052734, + "learning_rate": 5.51853943158291e-06, + "loss": 0.4645, + "step": 38470 + }, + { + "epoch": 7.24, + "grad_norm": 24.286212921142578, + "learning_rate": 5.514775079992472e-06, + "loss": 0.9881, + "step": 38480 + }, + { + "epoch": 7.24, + "grad_norm": 7.7546491622924805, + "learning_rate": 5.511010728402033e-06, + "loss": 0.1187, + "step": 38490 + }, + { + "epoch": 7.25, + "grad_norm": 2.176553726196289, + "learning_rate": 5.507246376811595e-06, + "loss": 0.2799, + "step": 38500 + }, + { + "epoch": 7.25, + "grad_norm": 20.29352378845215, + "learning_rate": 5.503482025221156e-06, + "loss": 0.5774, + "step": 38510 + }, + { + "epoch": 7.25, + "grad_norm": 11.283035278320312, + "learning_rate": 5.499717673630718e-06, + "loss": 0.6969, + "step": 38520 + }, + { + "epoch": 7.25, + "grad_norm": 2.3551344871520996, + "learning_rate": 5.495953322040279e-06, + "loss": 0.6055, + "step": 38530 + }, + { + "epoch": 7.25, + "grad_norm": 17.497970581054688, + "learning_rate": 5.492188970449841e-06, + "loss": 0.5596, + "step": 38540 + }, + { + "epoch": 7.26, + "grad_norm": 0.6391315460205078, + "learning_rate": 5.488424618859401e-06, + "loss": 0.5974, + "step": 38550 + }, + { + "epoch": 7.26, + "grad_norm": 10.734766960144043, + "learning_rate": 5.484660267268963e-06, + "loss": 0.5015, + "step": 38560 + }, + { + "epoch": 7.26, + "grad_norm": 28.943450927734375, + "learning_rate": 5.480895915678524e-06, + "loss": 0.291, + "step": 38570 + }, + { + "epoch": 7.26, + "grad_norm": 30.02581024169922, + "learning_rate": 5.4771315640880865e-06, + "loss": 0.5427, + "step": 38580 + }, + { + "epoch": 7.26, + "grad_norm": 2.238271474838257, + "learning_rate": 5.473367212497648e-06, + "loss": 0.5335, + "step": 38590 + }, + { + "epoch": 7.27, + "grad_norm": 23.164579391479492, + "learning_rate": 5.4696028609072095e-06, + "loss": 0.6102, + "step": 38600 + }, + { + "epoch": 7.27, + "grad_norm": 15.776400566101074, + "learning_rate": 5.465838509316771e-06, + "loss": 0.5805, + "step": 38610 + }, + { + "epoch": 7.27, + "grad_norm": 15.679583549499512, + "learning_rate": 5.462074157726332e-06, + "loss": 0.667, + "step": 38620 + }, + { + "epoch": 7.27, + "grad_norm": 13.96922492980957, + "learning_rate": 5.458309806135893e-06, + "loss": 0.1757, + "step": 38630 + }, + { + "epoch": 7.27, + "grad_norm": 20.9793643951416, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.4583, + "step": 38640 + }, + { + "epoch": 7.27, + "grad_norm": 13.186413764953613, + "learning_rate": 5.450781102955016e-06, + "loss": 0.3714, + "step": 38650 + }, + { + "epoch": 7.28, + "grad_norm": 0.07995478063821793, + "learning_rate": 5.447016751364577e-06, + "loss": 0.3397, + "step": 38660 + }, + { + "epoch": 7.28, + "grad_norm": 0.7243136167526245, + "learning_rate": 5.44325239977414e-06, + "loss": 0.4177, + "step": 38670 + }, + { + "epoch": 7.28, + "grad_norm": 19.715696334838867, + "learning_rate": 5.439488048183701e-06, + "loss": 0.5963, + "step": 38680 + }, + { + "epoch": 7.28, + "grad_norm": 13.43181037902832, + "learning_rate": 5.435723696593263e-06, + "loss": 0.4663, + "step": 38690 + }, + { + "epoch": 7.28, + "grad_norm": 21.230018615722656, + "learning_rate": 5.431959345002824e-06, + "loss": 0.3512, + "step": 38700 + }, + { + "epoch": 7.29, + "grad_norm": 20.25225067138672, + "learning_rate": 5.4281949934123855e-06, + "loss": 0.7707, + "step": 38710 + }, + { + "epoch": 7.29, + "grad_norm": 0.15251778066158295, + "learning_rate": 5.424430641821946e-06, + "loss": 0.3512, + "step": 38720 + }, + { + "epoch": 7.29, + "grad_norm": 25.118371963500977, + "learning_rate": 5.420666290231508e-06, + "loss": 0.5093, + "step": 38730 + }, + { + "epoch": 7.29, + "grad_norm": 5.278887748718262, + "learning_rate": 5.416901938641069e-06, + "loss": 0.5096, + "step": 38740 + }, + { + "epoch": 7.29, + "grad_norm": 2.8506057262420654, + "learning_rate": 5.4131375870506305e-06, + "loss": 0.4723, + "step": 38750 + }, + { + "epoch": 7.3, + "grad_norm": 12.97459602355957, + "learning_rate": 5.409373235460193e-06, + "loss": 0.74, + "step": 38760 + }, + { + "epoch": 7.3, + "grad_norm": 12.791807174682617, + "learning_rate": 5.405608883869754e-06, + "loss": 0.5725, + "step": 38770 + }, + { + "epoch": 7.3, + "grad_norm": 0.048314113169908524, + "learning_rate": 5.401844532279316e-06, + "loss": 0.5425, + "step": 38780 + }, + { + "epoch": 7.3, + "grad_norm": 2.543400287628174, + "learning_rate": 5.398080180688877e-06, + "loss": 0.5112, + "step": 38790 + }, + { + "epoch": 7.3, + "grad_norm": 8.345157623291016, + "learning_rate": 5.394315829098439e-06, + "loss": 0.2803, + "step": 38800 + }, + { + "epoch": 7.3, + "grad_norm": 37.12696838378906, + "learning_rate": 5.390551477507999e-06, + "loss": 0.609, + "step": 38810 + }, + { + "epoch": 7.31, + "grad_norm": 15.115370750427246, + "learning_rate": 5.386787125917561e-06, + "loss": 0.4371, + "step": 38820 + }, + { + "epoch": 7.31, + "grad_norm": 24.323984146118164, + "learning_rate": 5.383022774327122e-06, + "loss": 0.7161, + "step": 38830 + }, + { + "epoch": 7.31, + "grad_norm": 27.755170822143555, + "learning_rate": 5.379258422736684e-06, + "loss": 0.559, + "step": 38840 + }, + { + "epoch": 7.31, + "grad_norm": 14.916714668273926, + "learning_rate": 5.375494071146246e-06, + "loss": 0.3077, + "step": 38850 + }, + { + "epoch": 7.31, + "grad_norm": 7.22851037979126, + "learning_rate": 5.3717297195558074e-06, + "loss": 0.4101, + "step": 38860 + }, + { + "epoch": 7.32, + "grad_norm": 17.815229415893555, + "learning_rate": 5.367965367965369e-06, + "loss": 0.6718, + "step": 38870 + }, + { + "epoch": 7.32, + "grad_norm": 43.300785064697266, + "learning_rate": 5.36420101637493e-06, + "loss": 0.2887, + "step": 38880 + }, + { + "epoch": 7.32, + "grad_norm": 44.66895294189453, + "learning_rate": 5.360436664784491e-06, + "loss": 0.4488, + "step": 38890 + }, + { + "epoch": 7.32, + "grad_norm": 13.247306823730469, + "learning_rate": 5.3566723131940524e-06, + "loss": 0.4967, + "step": 38900 + }, + { + "epoch": 7.32, + "grad_norm": 9.896644592285156, + "learning_rate": 5.352907961603614e-06, + "loss": 0.5352, + "step": 38910 + }, + { + "epoch": 7.33, + "grad_norm": 15.37808609008789, + "learning_rate": 5.349143610013175e-06, + "loss": 0.4616, + "step": 38920 + }, + { + "epoch": 7.33, + "grad_norm": 21.590145111083984, + "learning_rate": 5.345379258422737e-06, + "loss": 0.8465, + "step": 38930 + }, + { + "epoch": 7.33, + "grad_norm": 3.483672618865967, + "learning_rate": 5.341614906832298e-06, + "loss": 0.3051, + "step": 38940 + }, + { + "epoch": 7.33, + "grad_norm": 24.26028060913086, + "learning_rate": 5.337850555241861e-06, + "loss": 0.3835, + "step": 38950 + }, + { + "epoch": 7.33, + "grad_norm": 7.518918991088867, + "learning_rate": 5.334086203651422e-06, + "loss": 0.3848, + "step": 38960 + }, + { + "epoch": 7.33, + "grad_norm": 25.557096481323242, + "learning_rate": 5.3303218520609835e-06, + "loss": 0.511, + "step": 38970 + }, + { + "epoch": 7.34, + "grad_norm": 5.811315059661865, + "learning_rate": 5.326557500470544e-06, + "loss": 0.3622, + "step": 38980 + }, + { + "epoch": 7.34, + "grad_norm": 0.346421480178833, + "learning_rate": 5.322793148880106e-06, + "loss": 0.4936, + "step": 38990 + }, + { + "epoch": 7.34, + "grad_norm": 28.79204559326172, + "learning_rate": 5.319028797289667e-06, + "loss": 0.8451, + "step": 39000 + }, + { + "epoch": 7.34, + "grad_norm": 44.75595474243164, + "learning_rate": 5.3152644456992285e-06, + "loss": 0.4559, + "step": 39010 + }, + { + "epoch": 7.34, + "grad_norm": 10.665853500366211, + "learning_rate": 5.31150009410879e-06, + "loss": 0.5485, + "step": 39020 + }, + { + "epoch": 7.35, + "grad_norm": 11.347060203552246, + "learning_rate": 5.307735742518351e-06, + "loss": 0.5688, + "step": 39030 + }, + { + "epoch": 7.35, + "grad_norm": 0.026885811239480972, + "learning_rate": 5.303971390927914e-06, + "loss": 0.6052, + "step": 39040 + }, + { + "epoch": 7.35, + "grad_norm": 17.71526336669922, + "learning_rate": 5.300207039337475e-06, + "loss": 0.2693, + "step": 39050 + }, + { + "epoch": 7.35, + "grad_norm": 19.759750366210938, + "learning_rate": 5.296442687747037e-06, + "loss": 0.4032, + "step": 39060 + }, + { + "epoch": 7.35, + "grad_norm": 7.999401569366455, + "learning_rate": 5.292678336156597e-06, + "loss": 0.4863, + "step": 39070 + }, + { + "epoch": 7.36, + "grad_norm": 0.07535995543003082, + "learning_rate": 5.288913984566159e-06, + "loss": 0.3263, + "step": 39080 + }, + { + "epoch": 7.36, + "grad_norm": 15.817834854125977, + "learning_rate": 5.28514963297572e-06, + "loss": 0.7172, + "step": 39090 + }, + { + "epoch": 7.36, + "grad_norm": 7.389345169067383, + "learning_rate": 5.281385281385282e-06, + "loss": 0.737, + "step": 39100 + }, + { + "epoch": 7.36, + "grad_norm": 1.000137209892273, + "learning_rate": 5.277620929794843e-06, + "loss": 0.5336, + "step": 39110 + }, + { + "epoch": 7.36, + "grad_norm": 0.07644683867692947, + "learning_rate": 5.2738565782044046e-06, + "loss": 0.3762, + "step": 39120 + }, + { + "epoch": 7.36, + "grad_norm": 34.44324493408203, + "learning_rate": 5.270092226613967e-06, + "loss": 0.434, + "step": 39130 + }, + { + "epoch": 7.37, + "grad_norm": 15.976019859313965, + "learning_rate": 5.266327875023528e-06, + "loss": 0.3836, + "step": 39140 + }, + { + "epoch": 7.37, + "grad_norm": 0.10559114813804626, + "learning_rate": 5.26256352343309e-06, + "loss": 0.3044, + "step": 39150 + }, + { + "epoch": 7.37, + "grad_norm": 13.347066879272461, + "learning_rate": 5.25879917184265e-06, + "loss": 0.5778, + "step": 39160 + }, + { + "epoch": 7.37, + "grad_norm": 49.38035583496094, + "learning_rate": 5.255034820252212e-06, + "loss": 0.3162, + "step": 39170 + }, + { + "epoch": 7.37, + "grad_norm": 3.3886990547180176, + "learning_rate": 5.251270468661773e-06, + "loss": 0.3553, + "step": 39180 + }, + { + "epoch": 7.38, + "grad_norm": 9.105963706970215, + "learning_rate": 5.247506117071335e-06, + "loss": 0.7682, + "step": 39190 + }, + { + "epoch": 7.38, + "grad_norm": 14.80863094329834, + "learning_rate": 5.243741765480896e-06, + "loss": 0.9301, + "step": 39200 + }, + { + "epoch": 7.38, + "grad_norm": 3.077606201171875, + "learning_rate": 5.239977413890458e-06, + "loss": 0.5738, + "step": 39210 + }, + { + "epoch": 7.38, + "grad_norm": 9.167863845825195, + "learning_rate": 5.236213062300019e-06, + "loss": 0.6652, + "step": 39220 + }, + { + "epoch": 7.38, + "grad_norm": 39.81913375854492, + "learning_rate": 5.2324487107095815e-06, + "loss": 0.6311, + "step": 39230 + }, + { + "epoch": 7.39, + "grad_norm": 0.21679580211639404, + "learning_rate": 5.228684359119141e-06, + "loss": 0.4019, + "step": 39240 + }, + { + "epoch": 7.39, + "grad_norm": 42.18350601196289, + "learning_rate": 5.2249200075287036e-06, + "loss": 0.4451, + "step": 39250 + }, + { + "epoch": 7.39, + "grad_norm": 4.216889381408691, + "learning_rate": 5.221155655938265e-06, + "loss": 0.4791, + "step": 39260 + }, + { + "epoch": 7.39, + "grad_norm": 27.158920288085938, + "learning_rate": 5.2173913043478265e-06, + "loss": 0.632, + "step": 39270 + }, + { + "epoch": 7.39, + "grad_norm": 3.5005364418029785, + "learning_rate": 5.213626952757388e-06, + "loss": 0.3373, + "step": 39280 + }, + { + "epoch": 7.4, + "grad_norm": 5.771264553070068, + "learning_rate": 5.209862601166949e-06, + "loss": 0.8069, + "step": 39290 + }, + { + "epoch": 7.4, + "grad_norm": 21.01832389831543, + "learning_rate": 5.206098249576511e-06, + "loss": 0.6151, + "step": 39300 + }, + { + "epoch": 7.4, + "grad_norm": 0.21687443554401398, + "learning_rate": 5.202333897986072e-06, + "loss": 0.4028, + "step": 39310 + }, + { + "epoch": 7.4, + "grad_norm": 17.40138816833496, + "learning_rate": 5.198569546395635e-06, + "loss": 0.3488, + "step": 39320 + }, + { + "epoch": 7.4, + "grad_norm": 18.54778289794922, + "learning_rate": 5.194805194805194e-06, + "loss": 0.4226, + "step": 39330 + }, + { + "epoch": 7.4, + "grad_norm": 0.8926262259483337, + "learning_rate": 5.191040843214757e-06, + "loss": 0.4072, + "step": 39340 + }, + { + "epoch": 7.41, + "grad_norm": 0.043081410229206085, + "learning_rate": 5.187276491624318e-06, + "loss": 0.4525, + "step": 39350 + }, + { + "epoch": 7.41, + "grad_norm": 28.826021194458008, + "learning_rate": 5.18351214003388e-06, + "loss": 0.4743, + "step": 39360 + }, + { + "epoch": 7.41, + "grad_norm": 19.19860076904297, + "learning_rate": 5.179747788443441e-06, + "loss": 0.6083, + "step": 39370 + }, + { + "epoch": 7.41, + "grad_norm": 23.61128044128418, + "learning_rate": 5.1759834368530025e-06, + "loss": 0.5457, + "step": 39380 + }, + { + "epoch": 7.41, + "grad_norm": 15.979894638061523, + "learning_rate": 5.172219085262564e-06, + "loss": 0.4737, + "step": 39390 + }, + { + "epoch": 7.42, + "grad_norm": 16.330854415893555, + "learning_rate": 5.1684547336721255e-06, + "loss": 0.6512, + "step": 39400 + }, + { + "epoch": 7.42, + "grad_norm": 3.0890164375305176, + "learning_rate": 5.164690382081688e-06, + "loss": 0.5336, + "step": 39410 + }, + { + "epoch": 7.42, + "grad_norm": 24.518455505371094, + "learning_rate": 5.1609260304912475e-06, + "loss": 0.4603, + "step": 39420 + }, + { + "epoch": 7.42, + "grad_norm": 0.07051704078912735, + "learning_rate": 5.15716167890081e-06, + "loss": 0.2793, + "step": 39430 + }, + { + "epoch": 7.42, + "grad_norm": 9.986491203308105, + "learning_rate": 5.153397327310371e-06, + "loss": 0.5185, + "step": 39440 + }, + { + "epoch": 7.43, + "grad_norm": 6.8465142250061035, + "learning_rate": 5.149632975719933e-06, + "loss": 0.6369, + "step": 39450 + }, + { + "epoch": 7.43, + "grad_norm": 0.4283851683139801, + "learning_rate": 5.145868624129494e-06, + "loss": 0.5151, + "step": 39460 + }, + { + "epoch": 7.43, + "grad_norm": 13.111185073852539, + "learning_rate": 5.142104272539056e-06, + "loss": 0.4777, + "step": 39470 + }, + { + "epoch": 7.43, + "grad_norm": 18.829418182373047, + "learning_rate": 5.138339920948617e-06, + "loss": 0.5422, + "step": 39480 + }, + { + "epoch": 7.43, + "grad_norm": 18.628862380981445, + "learning_rate": 5.134575569358179e-06, + "loss": 0.5727, + "step": 39490 + }, + { + "epoch": 7.43, + "grad_norm": 8.623425483703613, + "learning_rate": 5.130811217767739e-06, + "loss": 0.5578, + "step": 39500 + }, + { + "epoch": 7.44, + "grad_norm": 7.155189514160156, + "learning_rate": 5.127046866177301e-06, + "loss": 0.6192, + "step": 39510 + }, + { + "epoch": 7.44, + "grad_norm": 2.426950216293335, + "learning_rate": 5.123282514586862e-06, + "loss": 0.5366, + "step": 39520 + }, + { + "epoch": 7.44, + "grad_norm": 5.574533939361572, + "learning_rate": 5.1195181629964245e-06, + "loss": 0.4428, + "step": 39530 + }, + { + "epoch": 7.44, + "grad_norm": 0.15088945627212524, + "learning_rate": 5.115753811405986e-06, + "loss": 0.4344, + "step": 39540 + }, + { + "epoch": 7.44, + "grad_norm": 53.62294387817383, + "learning_rate": 5.111989459815547e-06, + "loss": 0.3419, + "step": 39550 + }, + { + "epoch": 7.45, + "grad_norm": 19.655786514282227, + "learning_rate": 5.108225108225109e-06, + "loss": 0.4297, + "step": 39560 + }, + { + "epoch": 7.45, + "grad_norm": 9.113370895385742, + "learning_rate": 5.10446075663467e-06, + "loss": 0.4997, + "step": 39570 + }, + { + "epoch": 7.45, + "grad_norm": 8.7140474319458, + "learning_rate": 5.100696405044232e-06, + "loss": 0.5054, + "step": 39580 + }, + { + "epoch": 7.45, + "grad_norm": 0.8119073510169983, + "learning_rate": 5.096932053453792e-06, + "loss": 0.4112, + "step": 39590 + }, + { + "epoch": 7.45, + "grad_norm": 24.722349166870117, + "learning_rate": 5.093167701863354e-06, + "loss": 0.4957, + "step": 39600 + }, + { + "epoch": 7.46, + "grad_norm": 23.276451110839844, + "learning_rate": 5.089403350272915e-06, + "loss": 0.0708, + "step": 39610 + }, + { + "epoch": 7.46, + "grad_norm": 10.128479957580566, + "learning_rate": 5.085638998682478e-06, + "loss": 0.3646, + "step": 39620 + }, + { + "epoch": 7.46, + "grad_norm": 79.91693878173828, + "learning_rate": 5.081874647092039e-06, + "loss": 0.567, + "step": 39630 + }, + { + "epoch": 7.46, + "grad_norm": 33.41339111328125, + "learning_rate": 5.0781102955016005e-06, + "loss": 0.3338, + "step": 39640 + }, + { + "epoch": 7.46, + "grad_norm": 21.802555084228516, + "learning_rate": 5.074345943911162e-06, + "loss": 0.3191, + "step": 39650 + }, + { + "epoch": 7.46, + "grad_norm": 18.43767738342285, + "learning_rate": 5.0705815923207234e-06, + "loss": 0.5815, + "step": 39660 + }, + { + "epoch": 7.47, + "grad_norm": 14.148484230041504, + "learning_rate": 5.066817240730285e-06, + "loss": 0.2298, + "step": 39670 + }, + { + "epoch": 7.47, + "grad_norm": 20.37163734436035, + "learning_rate": 5.0630528891398455e-06, + "loss": 0.8356, + "step": 39680 + }, + { + "epoch": 7.47, + "grad_norm": 6.101649284362793, + "learning_rate": 5.059288537549407e-06, + "loss": 0.4301, + "step": 39690 + }, + { + "epoch": 7.47, + "grad_norm": 1.8522956371307373, + "learning_rate": 5.0555241859589684e-06, + "loss": 0.3962, + "step": 39700 + }, + { + "epoch": 7.47, + "grad_norm": 37.072349548339844, + "learning_rate": 5.051759834368531e-06, + "loss": 0.4138, + "step": 39710 + }, + { + "epoch": 7.48, + "grad_norm": 28.832242965698242, + "learning_rate": 5.047995482778092e-06, + "loss": 0.5587, + "step": 39720 + }, + { + "epoch": 7.48, + "grad_norm": 29.438364028930664, + "learning_rate": 5.044231131187654e-06, + "loss": 0.3898, + "step": 39730 + }, + { + "epoch": 7.48, + "grad_norm": 0.1980200558900833, + "learning_rate": 5.040466779597215e-06, + "loss": 0.1686, + "step": 39740 + }, + { + "epoch": 7.48, + "grad_norm": 2.973400592803955, + "learning_rate": 5.036702428006777e-06, + "loss": 0.2205, + "step": 39750 + }, + { + "epoch": 7.48, + "grad_norm": 41.77248001098633, + "learning_rate": 5.032938076416338e-06, + "loss": 0.5068, + "step": 39760 + }, + { + "epoch": 7.49, + "grad_norm": 23.240005493164062, + "learning_rate": 5.029173724825899e-06, + "loss": 0.2396, + "step": 39770 + }, + { + "epoch": 7.49, + "grad_norm": 29.466114044189453, + "learning_rate": 5.02540937323546e-06, + "loss": 0.7611, + "step": 39780 + }, + { + "epoch": 7.49, + "grad_norm": 18.149154663085938, + "learning_rate": 5.021645021645022e-06, + "loss": 0.4035, + "step": 39790 + }, + { + "epoch": 7.49, + "grad_norm": 15.7437105178833, + "learning_rate": 5.017880670054583e-06, + "loss": 0.5208, + "step": 39800 + }, + { + "epoch": 7.49, + "grad_norm": 23.674476623535156, + "learning_rate": 5.014116318464145e-06, + "loss": 0.3757, + "step": 39810 + }, + { + "epoch": 7.49, + "grad_norm": 2.990457057952881, + "learning_rate": 5.010351966873707e-06, + "loss": 0.3227, + "step": 39820 + }, + { + "epoch": 7.5, + "grad_norm": 5.852288246154785, + "learning_rate": 5.006587615283268e-06, + "loss": 0.398, + "step": 39830 + }, + { + "epoch": 7.5, + "grad_norm": 21.586061477661133, + "learning_rate": 5.00282326369283e-06, + "loss": 0.4684, + "step": 39840 + }, + { + "epoch": 7.5, + "grad_norm": 18.58787727355957, + "learning_rate": 4.999058912102391e-06, + "loss": 0.4361, + "step": 39850 + }, + { + "epoch": 7.5, + "grad_norm": 6.530487060546875, + "learning_rate": 4.995294560511953e-06, + "loss": 0.419, + "step": 39860 + }, + { + "epoch": 7.5, + "grad_norm": 18.475072860717773, + "learning_rate": 4.991530208921513e-06, + "loss": 0.5548, + "step": 39870 + }, + { + "epoch": 7.51, + "grad_norm": 14.466314315795898, + "learning_rate": 4.987765857331075e-06, + "loss": 0.5023, + "step": 39880 + }, + { + "epoch": 7.51, + "grad_norm": 15.371254920959473, + "learning_rate": 4.984001505740636e-06, + "loss": 0.4835, + "step": 39890 + }, + { + "epoch": 7.51, + "grad_norm": 3.507498264312744, + "learning_rate": 4.9802371541501985e-06, + "loss": 0.4585, + "step": 39900 + }, + { + "epoch": 7.51, + "grad_norm": 30.916147232055664, + "learning_rate": 4.97647280255976e-06, + "loss": 0.5154, + "step": 39910 + }, + { + "epoch": 7.51, + "grad_norm": 14.572548866271973, + "learning_rate": 4.9727084509693206e-06, + "loss": 0.2884, + "step": 39920 + }, + { + "epoch": 7.52, + "grad_norm": 0.2854940891265869, + "learning_rate": 4.968944099378882e-06, + "loss": 0.4503, + "step": 39930 + }, + { + "epoch": 7.52, + "grad_norm": 30.854312896728516, + "learning_rate": 4.9651797477884435e-06, + "loss": 0.5331, + "step": 39940 + }, + { + "epoch": 7.52, + "grad_norm": 19.74198341369629, + "learning_rate": 4.961415396198006e-06, + "loss": 0.1224, + "step": 39950 + }, + { + "epoch": 7.52, + "grad_norm": 25.249618530273438, + "learning_rate": 4.957651044607566e-06, + "loss": 0.4178, + "step": 39960 + }, + { + "epoch": 7.52, + "grad_norm": 8.16231632232666, + "learning_rate": 4.953886693017128e-06, + "loss": 0.5486, + "step": 39970 + }, + { + "epoch": 7.52, + "grad_norm": 3.6326353549957275, + "learning_rate": 4.950122341426689e-06, + "loss": 0.5058, + "step": 39980 + }, + { + "epoch": 7.53, + "grad_norm": 20.812101364135742, + "learning_rate": 4.946357989836252e-06, + "loss": 0.5612, + "step": 39990 + }, + { + "epoch": 7.53, + "grad_norm": 24.15587615966797, + "learning_rate": 4.942593638245812e-06, + "loss": 0.5528, + "step": 40000 + }, + { + "epoch": 7.53, + "grad_norm": 0.3987553119659424, + "learning_rate": 4.938829286655374e-06, + "loss": 0.3515, + "step": 40010 + }, + { + "epoch": 7.53, + "grad_norm": 0.9960697889328003, + "learning_rate": 4.935064935064935e-06, + "loss": 0.3475, + "step": 40020 + }, + { + "epoch": 7.53, + "grad_norm": 11.244698524475098, + "learning_rate": 4.931300583474497e-06, + "loss": 0.7113, + "step": 40030 + }, + { + "epoch": 7.54, + "grad_norm": 12.09398365020752, + "learning_rate": 4.927536231884059e-06, + "loss": 0.3169, + "step": 40040 + }, + { + "epoch": 7.54, + "grad_norm": 17.471298217773438, + "learning_rate": 4.9237718802936196e-06, + "loss": 0.3766, + "step": 40050 + }, + { + "epoch": 7.54, + "grad_norm": 20.0872802734375, + "learning_rate": 4.920007528703181e-06, + "loss": 0.4371, + "step": 40060 + }, + { + "epoch": 7.54, + "grad_norm": 0.6726735830307007, + "learning_rate": 4.9162431771127425e-06, + "loss": 0.5611, + "step": 40070 + }, + { + "epoch": 7.54, + "grad_norm": 0.362667977809906, + "learning_rate": 4.912478825522305e-06, + "loss": 0.4279, + "step": 40080 + }, + { + "epoch": 7.55, + "grad_norm": 48.890018463134766, + "learning_rate": 4.908714473931865e-06, + "loss": 0.3399, + "step": 40090 + }, + { + "epoch": 7.55, + "grad_norm": 2.0744881629943848, + "learning_rate": 4.904950122341427e-06, + "loss": 0.2226, + "step": 40100 + }, + { + "epoch": 7.55, + "grad_norm": 4.881161212921143, + "learning_rate": 4.901185770750988e-06, + "loss": 0.6647, + "step": 40110 + }, + { + "epoch": 7.55, + "grad_norm": 15.641790390014648, + "learning_rate": 4.89742141916055e-06, + "loss": 0.4884, + "step": 40120 + }, + { + "epoch": 7.55, + "grad_norm": 27.704832077026367, + "learning_rate": 4.893657067570112e-06, + "loss": 0.6596, + "step": 40130 + }, + { + "epoch": 7.56, + "grad_norm": 11.091574668884277, + "learning_rate": 4.889892715979673e-06, + "loss": 0.4945, + "step": 40140 + }, + { + "epoch": 7.56, + "grad_norm": 23.857437133789062, + "learning_rate": 4.886128364389234e-06, + "loss": 0.4619, + "step": 40150 + }, + { + "epoch": 7.56, + "grad_norm": 0.4577654004096985, + "learning_rate": 4.882364012798796e-06, + "loss": 0.2583, + "step": 40160 + }, + { + "epoch": 7.56, + "grad_norm": 0.02999284863471985, + "learning_rate": 4.878599661208357e-06, + "loss": 0.4674, + "step": 40170 + }, + { + "epoch": 7.56, + "grad_norm": 8.35001277923584, + "learning_rate": 4.8748353096179186e-06, + "loss": 0.4048, + "step": 40180 + }, + { + "epoch": 7.56, + "grad_norm": 24.522512435913086, + "learning_rate": 4.87107095802748e-06, + "loss": 0.3845, + "step": 40190 + }, + { + "epoch": 7.57, + "grad_norm": 11.478434562683105, + "learning_rate": 4.8673066064370415e-06, + "loss": 0.4528, + "step": 40200 + }, + { + "epoch": 7.57, + "grad_norm": 34.55609130859375, + "learning_rate": 4.863542254846603e-06, + "loss": 0.3256, + "step": 40210 + }, + { + "epoch": 7.57, + "grad_norm": 30.954259872436523, + "learning_rate": 4.859777903256164e-06, + "loss": 0.6146, + "step": 40220 + }, + { + "epoch": 7.57, + "grad_norm": 17.295373916625977, + "learning_rate": 4.856013551665726e-06, + "loss": 0.3439, + "step": 40230 + }, + { + "epoch": 7.57, + "grad_norm": 4.239554405212402, + "learning_rate": 4.852249200075287e-06, + "loss": 0.4998, + "step": 40240 + }, + { + "epoch": 7.58, + "grad_norm": 9.876043319702148, + "learning_rate": 4.848484848484849e-06, + "loss": 0.4356, + "step": 40250 + }, + { + "epoch": 7.58, + "grad_norm": 14.549151420593262, + "learning_rate": 4.84472049689441e-06, + "loss": 0.3502, + "step": 40260 + }, + { + "epoch": 7.58, + "grad_norm": 51.860286712646484, + "learning_rate": 4.840956145303972e-06, + "loss": 0.6799, + "step": 40270 + }, + { + "epoch": 7.58, + "grad_norm": 14.050992965698242, + "learning_rate": 4.837191793713533e-06, + "loss": 0.2468, + "step": 40280 + }, + { + "epoch": 7.58, + "grad_norm": 43.28739547729492, + "learning_rate": 4.833427442123095e-06, + "loss": 0.4276, + "step": 40290 + }, + { + "epoch": 7.59, + "grad_norm": 0.5742427706718445, + "learning_rate": 4.829663090532656e-06, + "loss": 0.3327, + "step": 40300 + }, + { + "epoch": 7.59, + "grad_norm": 21.811113357543945, + "learning_rate": 4.8258987389422175e-06, + "loss": 0.4486, + "step": 40310 + }, + { + "epoch": 7.59, + "grad_norm": 0.1784685254096985, + "learning_rate": 4.822134387351779e-06, + "loss": 0.2737, + "step": 40320 + }, + { + "epoch": 7.59, + "grad_norm": 38.78385543823242, + "learning_rate": 4.8183700357613405e-06, + "loss": 0.4103, + "step": 40330 + }, + { + "epoch": 7.59, + "grad_norm": 26.003643035888672, + "learning_rate": 4.814605684170902e-06, + "loss": 0.4167, + "step": 40340 + }, + { + "epoch": 7.59, + "grad_norm": 0.23788726329803467, + "learning_rate": 4.810841332580463e-06, + "loss": 0.5304, + "step": 40350 + }, + { + "epoch": 7.6, + "grad_norm": 18.81310272216797, + "learning_rate": 4.807076980990025e-06, + "loss": 0.5927, + "step": 40360 + }, + { + "epoch": 7.6, + "grad_norm": 36.79855728149414, + "learning_rate": 4.803312629399586e-06, + "loss": 0.3944, + "step": 40370 + }, + { + "epoch": 7.6, + "grad_norm": 11.548039436340332, + "learning_rate": 4.799548277809148e-06, + "loss": 0.7024, + "step": 40380 + }, + { + "epoch": 7.6, + "grad_norm": 17.895483016967773, + "learning_rate": 4.795783926218709e-06, + "loss": 0.4029, + "step": 40390 + }, + { + "epoch": 7.6, + "grad_norm": 25.039541244506836, + "learning_rate": 4.792019574628271e-06, + "loss": 0.4372, + "step": 40400 + }, + { + "epoch": 7.61, + "grad_norm": 5.053126811981201, + "learning_rate": 4.788255223037832e-06, + "loss": 0.1909, + "step": 40410 + }, + { + "epoch": 7.61, + "grad_norm": 19.565786361694336, + "learning_rate": 4.784490871447394e-06, + "loss": 0.6079, + "step": 40420 + }, + { + "epoch": 7.61, + "grad_norm": 21.104755401611328, + "learning_rate": 4.780726519856955e-06, + "loss": 0.5151, + "step": 40430 + }, + { + "epoch": 7.61, + "grad_norm": 20.68798828125, + "learning_rate": 4.7769621682665165e-06, + "loss": 0.4636, + "step": 40440 + }, + { + "epoch": 7.61, + "grad_norm": 0.7437390685081482, + "learning_rate": 4.773197816676078e-06, + "loss": 0.4972, + "step": 40450 + }, + { + "epoch": 7.62, + "grad_norm": 5.112244606018066, + "learning_rate": 4.7694334650856395e-06, + "loss": 0.4124, + "step": 40460 + }, + { + "epoch": 7.62, + "grad_norm": 14.092161178588867, + "learning_rate": 4.765669113495201e-06, + "loss": 0.5287, + "step": 40470 + }, + { + "epoch": 7.62, + "grad_norm": 30.17268180847168, + "learning_rate": 4.761904761904762e-06, + "loss": 0.5739, + "step": 40480 + }, + { + "epoch": 7.62, + "grad_norm": 6.536983013153076, + "learning_rate": 4.758140410314324e-06, + "loss": 0.2285, + "step": 40490 + }, + { + "epoch": 7.62, + "grad_norm": 0.2827499508857727, + "learning_rate": 4.754376058723885e-06, + "loss": 0.6735, + "step": 40500 + }, + { + "epoch": 7.62, + "grad_norm": 6.0071563720703125, + "learning_rate": 4.750611707133447e-06, + "loss": 0.5643, + "step": 40510 + }, + { + "epoch": 7.63, + "grad_norm": 8.479032516479492, + "learning_rate": 4.746847355543008e-06, + "loss": 0.3364, + "step": 40520 + }, + { + "epoch": 7.63, + "grad_norm": 18.724328994750977, + "learning_rate": 4.74308300395257e-06, + "loss": 0.3524, + "step": 40530 + }, + { + "epoch": 7.63, + "grad_norm": 9.712454795837402, + "learning_rate": 4.739318652362131e-06, + "loss": 0.4297, + "step": 40540 + }, + { + "epoch": 7.63, + "grad_norm": 22.55899429321289, + "learning_rate": 4.735554300771693e-06, + "loss": 0.3444, + "step": 40550 + }, + { + "epoch": 7.63, + "grad_norm": 1.9720405340194702, + "learning_rate": 4.731789949181254e-06, + "loss": 0.4414, + "step": 40560 + }, + { + "epoch": 7.64, + "grad_norm": 0.06697694957256317, + "learning_rate": 4.7280255975908155e-06, + "loss": 0.4789, + "step": 40570 + }, + { + "epoch": 7.64, + "grad_norm": 11.40739631652832, + "learning_rate": 4.724261246000377e-06, + "loss": 0.5276, + "step": 40580 + }, + { + "epoch": 7.64, + "grad_norm": 17.688581466674805, + "learning_rate": 4.7204968944099384e-06, + "loss": 0.4745, + "step": 40590 + }, + { + "epoch": 7.64, + "grad_norm": 21.510601043701172, + "learning_rate": 4.7167325428195e-06, + "loss": 0.3477, + "step": 40600 + }, + { + "epoch": 7.64, + "grad_norm": 0.09768573194742203, + "learning_rate": 4.7129681912290605e-06, + "loss": 0.2761, + "step": 40610 + }, + { + "epoch": 7.65, + "grad_norm": 19.56352424621582, + "learning_rate": 4.709203839638623e-06, + "loss": 0.4043, + "step": 40620 + }, + { + "epoch": 7.65, + "grad_norm": 29.654693603515625, + "learning_rate": 4.705439488048184e-06, + "loss": 0.4509, + "step": 40630 + }, + { + "epoch": 7.65, + "grad_norm": 3.091660737991333, + "learning_rate": 4.701675136457746e-06, + "loss": 0.6926, + "step": 40640 + }, + { + "epoch": 7.65, + "grad_norm": 0.025056788697838783, + "learning_rate": 4.697910784867307e-06, + "loss": 0.4671, + "step": 40650 + }, + { + "epoch": 7.65, + "grad_norm": 16.61036491394043, + "learning_rate": 4.694146433276869e-06, + "loss": 0.4399, + "step": 40660 + }, + { + "epoch": 7.65, + "grad_norm": 14.356207847595215, + "learning_rate": 4.69038208168643e-06, + "loss": 0.4281, + "step": 40670 + }, + { + "epoch": 7.66, + "grad_norm": 26.823406219482422, + "learning_rate": 4.686617730095992e-06, + "loss": 0.5089, + "step": 40680 + }, + { + "epoch": 7.66, + "grad_norm": 2.588717460632324, + "learning_rate": 4.682853378505553e-06, + "loss": 0.4385, + "step": 40690 + }, + { + "epoch": 7.66, + "grad_norm": 8.470640182495117, + "learning_rate": 4.679089026915114e-06, + "loss": 0.534, + "step": 40700 + }, + { + "epoch": 7.66, + "grad_norm": 1.754682183265686, + "learning_rate": 4.675324675324676e-06, + "loss": 0.5254, + "step": 40710 + }, + { + "epoch": 7.66, + "grad_norm": 37.55604934692383, + "learning_rate": 4.6715603237342374e-06, + "loss": 0.4852, + "step": 40720 + }, + { + "epoch": 7.67, + "grad_norm": 4.329613208770752, + "learning_rate": 4.667795972143799e-06, + "loss": 0.3389, + "step": 40730 + }, + { + "epoch": 7.67, + "grad_norm": 5.099806785583496, + "learning_rate": 4.66403162055336e-06, + "loss": 0.6732, + "step": 40740 + }, + { + "epoch": 7.67, + "grad_norm": 8.931364059448242, + "learning_rate": 4.660267268962921e-06, + "loss": 0.4488, + "step": 40750 + }, + { + "epoch": 7.67, + "grad_norm": 9.485333442687988, + "learning_rate": 4.656502917372483e-06, + "loss": 0.3706, + "step": 40760 + }, + { + "epoch": 7.67, + "grad_norm": 10.791669845581055, + "learning_rate": 4.652738565782045e-06, + "loss": 0.5172, + "step": 40770 + }, + { + "epoch": 7.68, + "grad_norm": 11.53160285949707, + "learning_rate": 4.648974214191606e-06, + "loss": 0.325, + "step": 40780 + }, + { + "epoch": 7.68, + "grad_norm": 0.11603966355323792, + "learning_rate": 4.645209862601167e-06, + "loss": 0.5713, + "step": 40790 + }, + { + "epoch": 7.68, + "grad_norm": 4.958864688873291, + "learning_rate": 4.641445511010729e-06, + "loss": 0.4887, + "step": 40800 + }, + { + "epoch": 7.68, + "grad_norm": 6.0979814529418945, + "learning_rate": 4.637681159420291e-06, + "loss": 0.4514, + "step": 40810 + }, + { + "epoch": 7.68, + "grad_norm": 37.97774124145508, + "learning_rate": 4.633916807829852e-06, + "loss": 0.5397, + "step": 40820 + }, + { + "epoch": 7.68, + "grad_norm": 6.384561061859131, + "learning_rate": 4.630152456239413e-06, + "loss": 0.2419, + "step": 40830 + }, + { + "epoch": 7.69, + "grad_norm": 32.47441101074219, + "learning_rate": 4.626388104648974e-06, + "loss": 0.7723, + "step": 40840 + }, + { + "epoch": 7.69, + "grad_norm": 58.51771545410156, + "learning_rate": 4.622623753058536e-06, + "loss": 0.4001, + "step": 40850 + }, + { + "epoch": 7.69, + "grad_norm": 1.2339733839035034, + "learning_rate": 4.618859401468098e-06, + "loss": 0.2751, + "step": 40860 + }, + { + "epoch": 7.69, + "grad_norm": 1.42243492603302, + "learning_rate": 4.615095049877659e-06, + "loss": 0.4036, + "step": 40870 + }, + { + "epoch": 7.69, + "grad_norm": 0.05874158814549446, + "learning_rate": 4.61133069828722e-06, + "loss": 0.2054, + "step": 40880 + }, + { + "epoch": 7.7, + "grad_norm": 0.06559593230485916, + "learning_rate": 4.607566346696781e-06, + "loss": 0.3678, + "step": 40890 + }, + { + "epoch": 7.7, + "grad_norm": 37.953216552734375, + "learning_rate": 4.603801995106344e-06, + "loss": 0.7121, + "step": 40900 + }, + { + "epoch": 7.7, + "grad_norm": 28.273658752441406, + "learning_rate": 4.600037643515905e-06, + "loss": 0.7176, + "step": 40910 + }, + { + "epoch": 7.7, + "grad_norm": 0.9303427934646606, + "learning_rate": 4.596273291925466e-06, + "loss": 0.4192, + "step": 40920 + }, + { + "epoch": 7.7, + "grad_norm": 6.678520202636719, + "learning_rate": 4.592508940335027e-06, + "loss": 0.4673, + "step": 40930 + }, + { + "epoch": 7.71, + "grad_norm": 10.7068510055542, + "learning_rate": 4.5887445887445896e-06, + "loss": 0.5474, + "step": 40940 + }, + { + "epoch": 7.71, + "grad_norm": 24.268388748168945, + "learning_rate": 4.584980237154151e-06, + "loss": 0.4693, + "step": 40950 + }, + { + "epoch": 7.71, + "grad_norm": 0.8732045292854309, + "learning_rate": 4.581215885563712e-06, + "loss": 0.3434, + "step": 40960 + }, + { + "epoch": 7.71, + "grad_norm": 4.5128865242004395, + "learning_rate": 4.577451533973273e-06, + "loss": 0.5951, + "step": 40970 + }, + { + "epoch": 7.71, + "grad_norm": 29.08911895751953, + "learning_rate": 4.5736871823828346e-06, + "loss": 0.8518, + "step": 40980 + }, + { + "epoch": 7.72, + "grad_norm": 34.50959014892578, + "learning_rate": 4.569922830792397e-06, + "loss": 0.3967, + "step": 40990 + }, + { + "epoch": 7.72, + "grad_norm": 12.16829776763916, + "learning_rate": 4.566158479201958e-06, + "loss": 0.568, + "step": 41000 + }, + { + "epoch": 7.72, + "grad_norm": 8.815430641174316, + "learning_rate": 4.562394127611519e-06, + "loss": 0.6214, + "step": 41010 + }, + { + "epoch": 7.72, + "grad_norm": 0.7041884064674377, + "learning_rate": 4.55862977602108e-06, + "loss": 0.3887, + "step": 41020 + }, + { + "epoch": 7.72, + "grad_norm": 5.827664852142334, + "learning_rate": 4.554865424430642e-06, + "loss": 0.3346, + "step": 41030 + }, + { + "epoch": 7.72, + "grad_norm": 13.621718406677246, + "learning_rate": 4.551101072840204e-06, + "loss": 0.5485, + "step": 41040 + }, + { + "epoch": 7.73, + "grad_norm": 0.27952587604522705, + "learning_rate": 4.547336721249765e-06, + "loss": 0.4817, + "step": 41050 + }, + { + "epoch": 7.73, + "grad_norm": 7.287542343139648, + "learning_rate": 4.543572369659326e-06, + "loss": 0.674, + "step": 41060 + }, + { + "epoch": 7.73, + "grad_norm": 8.12995433807373, + "learning_rate": 4.539808018068888e-06, + "loss": 0.6428, + "step": 41070 + }, + { + "epoch": 7.73, + "grad_norm": 20.937484741210938, + "learning_rate": 4.53604366647845e-06, + "loss": 0.5712, + "step": 41080 + }, + { + "epoch": 7.73, + "grad_norm": 15.425496101379395, + "learning_rate": 4.532279314888011e-06, + "loss": 0.7567, + "step": 41090 + }, + { + "epoch": 7.74, + "grad_norm": 14.051750183105469, + "learning_rate": 4.528514963297572e-06, + "loss": 0.5802, + "step": 41100 + }, + { + "epoch": 7.74, + "grad_norm": 0.16733068227767944, + "learning_rate": 4.5247506117071336e-06, + "loss": 0.2608, + "step": 41110 + }, + { + "epoch": 7.74, + "grad_norm": 17.37752914428711, + "learning_rate": 4.520986260116695e-06, + "loss": 0.5201, + "step": 41120 + }, + { + "epoch": 7.74, + "grad_norm": 18.55728530883789, + "learning_rate": 4.517221908526257e-06, + "loss": 0.4982, + "step": 41130 + }, + { + "epoch": 7.74, + "grad_norm": 15.515111923217773, + "learning_rate": 4.513457556935818e-06, + "loss": 0.5779, + "step": 41140 + }, + { + "epoch": 7.75, + "grad_norm": 2.7760565280914307, + "learning_rate": 4.509693205345379e-06, + "loss": 0.2145, + "step": 41150 + }, + { + "epoch": 7.75, + "grad_norm": 7.785154342651367, + "learning_rate": 4.505928853754941e-06, + "loss": 0.2332, + "step": 41160 + }, + { + "epoch": 7.75, + "grad_norm": 27.99343490600586, + "learning_rate": 4.502164502164502e-06, + "loss": 0.4307, + "step": 41170 + }, + { + "epoch": 7.75, + "grad_norm": 19.354997634887695, + "learning_rate": 4.498400150574064e-06, + "loss": 0.8148, + "step": 41180 + }, + { + "epoch": 7.75, + "grad_norm": 2.94992995262146, + "learning_rate": 4.494635798983625e-06, + "loss": 0.1812, + "step": 41190 + }, + { + "epoch": 7.75, + "grad_norm": 10.769566535949707, + "learning_rate": 4.490871447393187e-06, + "loss": 0.4019, + "step": 41200 + }, + { + "epoch": 7.76, + "grad_norm": 15.878639221191406, + "learning_rate": 4.487107095802748e-06, + "loss": 0.4287, + "step": 41210 + }, + { + "epoch": 7.76, + "grad_norm": 0.04310622438788414, + "learning_rate": 4.48334274421231e-06, + "loss": 0.333, + "step": 41220 + }, + { + "epoch": 7.76, + "grad_norm": 5.737595081329346, + "learning_rate": 4.479578392621871e-06, + "loss": 0.4404, + "step": 41230 + }, + { + "epoch": 7.76, + "grad_norm": 14.339284896850586, + "learning_rate": 4.4758140410314325e-06, + "loss": 0.4577, + "step": 41240 + }, + { + "epoch": 7.76, + "grad_norm": 10.48139762878418, + "learning_rate": 4.472049689440994e-06, + "loss": 0.3453, + "step": 41250 + }, + { + "epoch": 7.77, + "grad_norm": 4.904701232910156, + "learning_rate": 4.4682853378505555e-06, + "loss": 0.3275, + "step": 41260 + }, + { + "epoch": 7.77, + "grad_norm": 0.7866021990776062, + "learning_rate": 4.464520986260117e-06, + "loss": 0.4991, + "step": 41270 + }, + { + "epoch": 7.77, + "grad_norm": 13.277933120727539, + "learning_rate": 4.460756634669678e-06, + "loss": 0.4167, + "step": 41280 + }, + { + "epoch": 7.77, + "grad_norm": 31.82375717163086, + "learning_rate": 4.45699228307924e-06, + "loss": 0.4556, + "step": 41290 + }, + { + "epoch": 7.77, + "grad_norm": 36.45254898071289, + "learning_rate": 4.453227931488801e-06, + "loss": 0.4507, + "step": 41300 + }, + { + "epoch": 7.78, + "grad_norm": 38.87356948852539, + "learning_rate": 4.449463579898363e-06, + "loss": 0.4436, + "step": 41310 + }, + { + "epoch": 7.78, + "grad_norm": 0.09693789482116699, + "learning_rate": 4.445699228307924e-06, + "loss": 0.3846, + "step": 41320 + }, + { + "epoch": 7.78, + "grad_norm": 17.44363784790039, + "learning_rate": 4.441934876717486e-06, + "loss": 0.3734, + "step": 41330 + }, + { + "epoch": 7.78, + "grad_norm": 3.033371925354004, + "learning_rate": 4.438170525127047e-06, + "loss": 0.3748, + "step": 41340 + }, + { + "epoch": 7.78, + "grad_norm": 14.98725414276123, + "learning_rate": 4.434406173536609e-06, + "loss": 0.8152, + "step": 41350 + }, + { + "epoch": 7.78, + "grad_norm": 6.687844753265381, + "learning_rate": 4.43064182194617e-06, + "loss": 0.6698, + "step": 41360 + }, + { + "epoch": 7.79, + "grad_norm": 0.14900249242782593, + "learning_rate": 4.4268774703557315e-06, + "loss": 0.3195, + "step": 41370 + }, + { + "epoch": 7.79, + "grad_norm": 9.252775192260742, + "learning_rate": 4.423113118765293e-06, + "loss": 0.4316, + "step": 41380 + }, + { + "epoch": 7.79, + "grad_norm": 7.680717945098877, + "learning_rate": 4.4193487671748545e-06, + "loss": 0.164, + "step": 41390 + }, + { + "epoch": 7.79, + "grad_norm": 7.291797637939453, + "learning_rate": 4.415584415584416e-06, + "loss": 0.5361, + "step": 41400 + }, + { + "epoch": 7.79, + "grad_norm": 3.572070360183716, + "learning_rate": 4.411820063993977e-06, + "loss": 0.4278, + "step": 41410 + }, + { + "epoch": 7.8, + "grad_norm": 0.2998538911342621, + "learning_rate": 4.408055712403539e-06, + "loss": 0.3525, + "step": 41420 + }, + { + "epoch": 7.8, + "grad_norm": 0.47384247183799744, + "learning_rate": 4.4042913608131e-06, + "loss": 0.2823, + "step": 41430 + }, + { + "epoch": 7.8, + "grad_norm": 0.05279775708913803, + "learning_rate": 4.400527009222662e-06, + "loss": 0.4339, + "step": 41440 + }, + { + "epoch": 7.8, + "grad_norm": 26.150938034057617, + "learning_rate": 4.396762657632223e-06, + "loss": 0.537, + "step": 41450 + }, + { + "epoch": 7.8, + "grad_norm": 29.126239776611328, + "learning_rate": 4.392998306041785e-06, + "loss": 0.4351, + "step": 41460 + }, + { + "epoch": 7.81, + "grad_norm": 13.385824203491211, + "learning_rate": 4.389233954451346e-06, + "loss": 0.2548, + "step": 41470 + }, + { + "epoch": 7.81, + "grad_norm": 110.32833099365234, + "learning_rate": 4.385469602860908e-06, + "loss": 0.6887, + "step": 41480 + }, + { + "epoch": 7.81, + "grad_norm": 2.3352975845336914, + "learning_rate": 4.381705251270469e-06, + "loss": 0.4305, + "step": 41490 + }, + { + "epoch": 7.81, + "grad_norm": 18.318313598632812, + "learning_rate": 4.3779408996800305e-06, + "loss": 0.2915, + "step": 41500 + }, + { + "epoch": 7.81, + "grad_norm": 14.995280265808105, + "learning_rate": 4.374176548089592e-06, + "loss": 0.4918, + "step": 41510 + }, + { + "epoch": 7.81, + "grad_norm": 8.48552417755127, + "learning_rate": 4.3704121964991534e-06, + "loss": 0.502, + "step": 41520 + }, + { + "epoch": 7.82, + "grad_norm": 6.212337017059326, + "learning_rate": 4.366647844908715e-06, + "loss": 0.5833, + "step": 41530 + }, + { + "epoch": 7.82, + "grad_norm": 7.40939474105835, + "learning_rate": 4.362883493318276e-06, + "loss": 0.4578, + "step": 41540 + }, + { + "epoch": 7.82, + "grad_norm": 7.82789945602417, + "learning_rate": 4.359119141727838e-06, + "loss": 0.4524, + "step": 41550 + }, + { + "epoch": 7.82, + "grad_norm": 20.18194580078125, + "learning_rate": 4.355354790137399e-06, + "loss": 0.7412, + "step": 41560 + }, + { + "epoch": 7.82, + "grad_norm": 1.9940485954284668, + "learning_rate": 4.351590438546961e-06, + "loss": 0.3181, + "step": 41570 + }, + { + "epoch": 7.83, + "grad_norm": 4.829334259033203, + "learning_rate": 4.347826086956522e-06, + "loss": 0.19, + "step": 41580 + }, + { + "epoch": 7.83, + "grad_norm": 0.15990076959133148, + "learning_rate": 4.344061735366084e-06, + "loss": 0.2216, + "step": 41590 + }, + { + "epoch": 7.83, + "grad_norm": 6.221506595611572, + "learning_rate": 4.340297383775645e-06, + "loss": 0.389, + "step": 41600 + }, + { + "epoch": 7.83, + "grad_norm": 8.496952056884766, + "learning_rate": 4.336533032185207e-06, + "loss": 0.5286, + "step": 41610 + }, + { + "epoch": 7.83, + "grad_norm": 23.715831756591797, + "learning_rate": 4.332768680594768e-06, + "loss": 0.4569, + "step": 41620 + }, + { + "epoch": 7.84, + "grad_norm": 5.149649143218994, + "learning_rate": 4.3290043290043295e-06, + "loss": 0.4774, + "step": 41630 + }, + { + "epoch": 7.84, + "grad_norm": 17.749088287353516, + "learning_rate": 4.325239977413891e-06, + "loss": 0.514, + "step": 41640 + }, + { + "epoch": 7.84, + "grad_norm": 1.3218231201171875, + "learning_rate": 4.3214756258234524e-06, + "loss": 0.2201, + "step": 41650 + }, + { + "epoch": 7.84, + "grad_norm": 10.610811233520508, + "learning_rate": 4.317711274233014e-06, + "loss": 0.5015, + "step": 41660 + }, + { + "epoch": 7.84, + "grad_norm": 30.114301681518555, + "learning_rate": 4.313946922642575e-06, + "loss": 0.456, + "step": 41670 + }, + { + "epoch": 7.84, + "grad_norm": 6.920668601989746, + "learning_rate": 4.310182571052137e-06, + "loss": 0.4653, + "step": 41680 + }, + { + "epoch": 7.85, + "grad_norm": 0.0986671969294548, + "learning_rate": 4.306418219461698e-06, + "loss": 0.4564, + "step": 41690 + }, + { + "epoch": 7.85, + "grad_norm": 10.98770523071289, + "learning_rate": 4.302653867871259e-06, + "loss": 0.4449, + "step": 41700 + }, + { + "epoch": 7.85, + "grad_norm": 7.872487545013428, + "learning_rate": 4.298889516280821e-06, + "loss": 0.3511, + "step": 41710 + }, + { + "epoch": 7.85, + "grad_norm": 0.12211643904447556, + "learning_rate": 4.295125164690383e-06, + "loss": 0.3521, + "step": 41720 + }, + { + "epoch": 7.85, + "grad_norm": 2.3334715366363525, + "learning_rate": 4.291360813099944e-06, + "loss": 0.4879, + "step": 41730 + }, + { + "epoch": 7.86, + "grad_norm": 1.1513813734054565, + "learning_rate": 4.287596461509506e-06, + "loss": 0.2625, + "step": 41740 + }, + { + "epoch": 7.86, + "grad_norm": 3.9860312938690186, + "learning_rate": 4.283832109919067e-06, + "loss": 0.3835, + "step": 41750 + }, + { + "epoch": 7.86, + "grad_norm": 16.243440628051758, + "learning_rate": 4.2800677583286285e-06, + "loss": 0.4477, + "step": 41760 + }, + { + "epoch": 7.86, + "grad_norm": 23.092260360717773, + "learning_rate": 4.27630340673819e-06, + "loss": 0.4379, + "step": 41770 + }, + { + "epoch": 7.86, + "grad_norm": 0.4055626094341278, + "learning_rate": 4.272539055147751e-06, + "loss": 0.5019, + "step": 41780 + }, + { + "epoch": 7.87, + "grad_norm": 15.21871566772461, + "learning_rate": 4.268774703557312e-06, + "loss": 0.4496, + "step": 41790 + }, + { + "epoch": 7.87, + "grad_norm": 22.797069549560547, + "learning_rate": 4.265010351966874e-06, + "loss": 0.5968, + "step": 41800 + }, + { + "epoch": 7.87, + "grad_norm": 0.028871312737464905, + "learning_rate": 4.261246000376436e-06, + "loss": 0.5744, + "step": 41810 + }, + { + "epoch": 7.87, + "grad_norm": 14.592629432678223, + "learning_rate": 4.257481648785997e-06, + "loss": 0.2576, + "step": 41820 + }, + { + "epoch": 7.87, + "grad_norm": 50.91252899169922, + "learning_rate": 4.253717297195558e-06, + "loss": 0.4805, + "step": 41830 + }, + { + "epoch": 7.88, + "grad_norm": 0.8403691649436951, + "learning_rate": 4.249952945605119e-06, + "loss": 0.3705, + "step": 41840 + }, + { + "epoch": 7.88, + "grad_norm": 0.6822836995124817, + "learning_rate": 4.246188594014682e-06, + "loss": 0.2433, + "step": 41850 + }, + { + "epoch": 7.88, + "grad_norm": 4.622705936431885, + "learning_rate": 4.242424242424243e-06, + "loss": 0.6058, + "step": 41860 + }, + { + "epoch": 7.88, + "grad_norm": 17.88176155090332, + "learning_rate": 4.2386598908338046e-06, + "loss": 0.5387, + "step": 41870 + }, + { + "epoch": 7.88, + "grad_norm": 0.5473073124885559, + "learning_rate": 4.234895539243365e-06, + "loss": 0.6625, + "step": 41880 + }, + { + "epoch": 7.88, + "grad_norm": 0.17052659392356873, + "learning_rate": 4.2311311876529275e-06, + "loss": 0.3257, + "step": 41890 + }, + { + "epoch": 7.89, + "grad_norm": 0.7636673450469971, + "learning_rate": 4.227366836062489e-06, + "loss": 0.3624, + "step": 41900 + }, + { + "epoch": 7.89, + "grad_norm": 20.62749481201172, + "learning_rate": 4.22360248447205e-06, + "loss": 0.4521, + "step": 41910 + }, + { + "epoch": 7.89, + "grad_norm": 6.845425128936768, + "learning_rate": 4.219838132881611e-06, + "loss": 0.3049, + "step": 41920 + }, + { + "epoch": 7.89, + "grad_norm": 26.27695655822754, + "learning_rate": 4.2160737812911725e-06, + "loss": 0.699, + "step": 41930 + }, + { + "epoch": 7.89, + "grad_norm": 10.913443565368652, + "learning_rate": 4.212309429700735e-06, + "loss": 0.7243, + "step": 41940 + }, + { + "epoch": 7.9, + "grad_norm": 29.593870162963867, + "learning_rate": 4.208545078110296e-06, + "loss": 0.5249, + "step": 41950 + }, + { + "epoch": 7.9, + "grad_norm": 7.848869323730469, + "learning_rate": 4.204780726519857e-06, + "loss": 0.4757, + "step": 41960 + }, + { + "epoch": 7.9, + "grad_norm": 10.693510055541992, + "learning_rate": 4.201016374929418e-06, + "loss": 0.4779, + "step": 41970 + }, + { + "epoch": 7.9, + "grad_norm": 2.0626046657562256, + "learning_rate": 4.19725202333898e-06, + "loss": 0.412, + "step": 41980 + }, + { + "epoch": 7.9, + "grad_norm": 7.025002479553223, + "learning_rate": 4.193487671748542e-06, + "loss": 0.2752, + "step": 41990 + }, + { + "epoch": 7.91, + "grad_norm": 23.475351333618164, + "learning_rate": 4.1897233201581036e-06, + "loss": 0.4691, + "step": 42000 + }, + { + "epoch": 7.91, + "grad_norm": 17.180540084838867, + "learning_rate": 4.185958968567664e-06, + "loss": 0.3662, + "step": 42010 + }, + { + "epoch": 7.91, + "grad_norm": 30.194869995117188, + "learning_rate": 4.182194616977226e-06, + "loss": 0.3635, + "step": 42020 + }, + { + "epoch": 7.91, + "grad_norm": 0.9849434494972229, + "learning_rate": 4.178430265386788e-06, + "loss": 0.5017, + "step": 42030 + }, + { + "epoch": 7.91, + "grad_norm": 0.7494462132453918, + "learning_rate": 4.174665913796349e-06, + "loss": 0.3769, + "step": 42040 + }, + { + "epoch": 7.91, + "grad_norm": 15.633011817932129, + "learning_rate": 4.17090156220591e-06, + "loss": 0.4248, + "step": 42050 + }, + { + "epoch": 7.92, + "grad_norm": 0.03347090259194374, + "learning_rate": 4.1671372106154715e-06, + "loss": 0.6362, + "step": 42060 + }, + { + "epoch": 7.92, + "grad_norm": 2.5611062049865723, + "learning_rate": 4.163372859025033e-06, + "loss": 0.5368, + "step": 42070 + }, + { + "epoch": 7.92, + "grad_norm": 3.9073028564453125, + "learning_rate": 4.159608507434595e-06, + "loss": 0.6604, + "step": 42080 + }, + { + "epoch": 7.92, + "grad_norm": 30.027442932128906, + "learning_rate": 4.155844155844157e-06, + "loss": 0.521, + "step": 42090 + }, + { + "epoch": 7.92, + "grad_norm": 22.11733055114746, + "learning_rate": 4.152079804253717e-06, + "loss": 0.5561, + "step": 42100 + }, + { + "epoch": 7.93, + "grad_norm": 13.414361000061035, + "learning_rate": 4.148315452663279e-06, + "loss": 0.5497, + "step": 42110 + }, + { + "epoch": 7.93, + "grad_norm": 0.3089528977870941, + "learning_rate": 4.14455110107284e-06, + "loss": 0.3887, + "step": 42120 + }, + { + "epoch": 7.93, + "grad_norm": 31.682811737060547, + "learning_rate": 4.1407867494824025e-06, + "loss": 0.7196, + "step": 42130 + }, + { + "epoch": 7.93, + "grad_norm": 43.80996322631836, + "learning_rate": 4.137022397891963e-06, + "loss": 0.3509, + "step": 42140 + }, + { + "epoch": 7.93, + "grad_norm": 3.049875020980835, + "learning_rate": 4.133258046301525e-06, + "loss": 0.4552, + "step": 42150 + }, + { + "epoch": 7.94, + "grad_norm": 22.50640296936035, + "learning_rate": 4.129493694711086e-06, + "loss": 0.298, + "step": 42160 + }, + { + "epoch": 7.94, + "grad_norm": 19.20767593383789, + "learning_rate": 4.125729343120648e-06, + "loss": 0.3963, + "step": 42170 + }, + { + "epoch": 7.94, + "grad_norm": 15.141514778137207, + "learning_rate": 4.121964991530209e-06, + "loss": 0.4502, + "step": 42180 + }, + { + "epoch": 7.94, + "grad_norm": 17.997087478637695, + "learning_rate": 4.1182006399397705e-06, + "loss": 0.2375, + "step": 42190 + }, + { + "epoch": 7.94, + "grad_norm": 1.2107799053192139, + "learning_rate": 4.114436288349332e-06, + "loss": 0.4342, + "step": 42200 + }, + { + "epoch": 7.94, + "grad_norm": 17.25642967224121, + "learning_rate": 4.110671936758893e-06, + "loss": 0.599, + "step": 42210 + }, + { + "epoch": 7.95, + "grad_norm": 23.210981369018555, + "learning_rate": 4.106907585168456e-06, + "loss": 0.6262, + "step": 42220 + }, + { + "epoch": 7.95, + "grad_norm": 18.647939682006836, + "learning_rate": 4.103143233578016e-06, + "loss": 0.6408, + "step": 42230 + }, + { + "epoch": 7.95, + "grad_norm": 24.76263999938965, + "learning_rate": 4.099378881987578e-06, + "loss": 0.4443, + "step": 42240 + }, + { + "epoch": 7.95, + "grad_norm": 6.9933013916015625, + "learning_rate": 4.095614530397139e-06, + "loss": 0.6158, + "step": 42250 + }, + { + "epoch": 7.95, + "grad_norm": 0.15161870419979095, + "learning_rate": 4.091850178806701e-06, + "loss": 0.2637, + "step": 42260 + }, + { + "epoch": 7.96, + "grad_norm": 5.794905185699463, + "learning_rate": 4.088085827216262e-06, + "loss": 1.0222, + "step": 42270 + }, + { + "epoch": 7.96, + "grad_norm": 37.5270881652832, + "learning_rate": 4.084321475625824e-06, + "loss": 0.3388, + "step": 42280 + }, + { + "epoch": 7.96, + "grad_norm": 20.47008514404297, + "learning_rate": 4.080557124035385e-06, + "loss": 0.4998, + "step": 42290 + }, + { + "epoch": 7.96, + "grad_norm": 10.392271041870117, + "learning_rate": 4.0767927724449465e-06, + "loss": 0.2468, + "step": 42300 + }, + { + "epoch": 7.96, + "grad_norm": 22.866073608398438, + "learning_rate": 4.073028420854508e-06, + "loss": 0.361, + "step": 42310 + }, + { + "epoch": 7.97, + "grad_norm": 3.4224941730499268, + "learning_rate": 4.0692640692640695e-06, + "loss": 0.5056, + "step": 42320 + }, + { + "epoch": 7.97, + "grad_norm": 15.538722038269043, + "learning_rate": 4.065499717673631e-06, + "loss": 0.5327, + "step": 42330 + }, + { + "epoch": 7.97, + "grad_norm": 17.44469451904297, + "learning_rate": 4.061735366083192e-06, + "loss": 0.628, + "step": 42340 + }, + { + "epoch": 7.97, + "grad_norm": 24.630922317504883, + "learning_rate": 4.057971014492754e-06, + "loss": 0.5497, + "step": 42350 + }, + { + "epoch": 7.97, + "grad_norm": 2.710216760635376, + "learning_rate": 4.054206662902315e-06, + "loss": 0.2458, + "step": 42360 + }, + { + "epoch": 7.97, + "grad_norm": 2.76000714302063, + "learning_rate": 4.050442311311877e-06, + "loss": 0.4532, + "step": 42370 + }, + { + "epoch": 7.98, + "grad_norm": 20.97554588317871, + "learning_rate": 4.046677959721438e-06, + "loss": 0.559, + "step": 42380 + }, + { + "epoch": 7.98, + "grad_norm": 53.717735290527344, + "learning_rate": 4.042913608131e-06, + "loss": 0.8006, + "step": 42390 + }, + { + "epoch": 7.98, + "grad_norm": 11.499624252319336, + "learning_rate": 4.039149256540561e-06, + "loss": 0.593, + "step": 42400 + }, + { + "epoch": 7.98, + "grad_norm": 1.2157829999923706, + "learning_rate": 4.035384904950123e-06, + "loss": 0.4126, + "step": 42410 + }, + { + "epoch": 7.98, + "grad_norm": 6.117697715759277, + "learning_rate": 4.031620553359684e-06, + "loss": 0.4347, + "step": 42420 + }, + { + "epoch": 7.99, + "grad_norm": 6.3184051513671875, + "learning_rate": 4.0278562017692455e-06, + "loss": 0.2998, + "step": 42430 + }, + { + "epoch": 7.99, + "grad_norm": 20.870182037353516, + "learning_rate": 4.024091850178807e-06, + "loss": 0.6725, + "step": 42440 + }, + { + "epoch": 7.99, + "grad_norm": 17.98334503173828, + "learning_rate": 4.0203274985883684e-06, + "loss": 0.3782, + "step": 42450 + }, + { + "epoch": 7.99, + "grad_norm": 7.8092193603515625, + "learning_rate": 4.01656314699793e-06, + "loss": 0.3603, + "step": 42460 + }, + { + "epoch": 7.99, + "grad_norm": 1.0869126319885254, + "learning_rate": 4.012798795407491e-06, + "loss": 0.4338, + "step": 42470 + }, + { + "epoch": 8.0, + "grad_norm": 4.888917446136475, + "learning_rate": 4.009034443817053e-06, + "loss": 0.3939, + "step": 42480 + }, + { + "epoch": 8.0, + "grad_norm": 0.40398913621902466, + "learning_rate": 4.005270092226614e-06, + "loss": 0.6347, + "step": 42490 + }, + { + "epoch": 8.0, + "grad_norm": 17.665483474731445, + "learning_rate": 4.001505740636176e-06, + "loss": 0.6128, + "step": 42500 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.9261333333333334, + "eval_loss": 0.2983257472515106, + "eval_runtime": 51.1739, + "eval_samples_per_second": 146.559, + "eval_steps_per_second": 18.33, + "step": 42504 + }, + { + "epoch": 8.0, + "grad_norm": 8.759658813476562, + "learning_rate": 3.997741389045737e-06, + "loss": 0.6272, + "step": 42510 + }, + { + "epoch": 8.0, + "grad_norm": 1.319584846496582, + "learning_rate": 3.993977037455299e-06, + "loss": 0.4546, + "step": 42520 + }, + { + "epoch": 8.0, + "grad_norm": 16.56199836730957, + "learning_rate": 3.99021268586486e-06, + "loss": 0.7, + "step": 42530 + }, + { + "epoch": 8.01, + "grad_norm": 2.113156318664551, + "learning_rate": 3.986448334274422e-06, + "loss": 0.2512, + "step": 42540 + }, + { + "epoch": 8.01, + "grad_norm": 5.923486709594727, + "learning_rate": 3.982683982683983e-06, + "loss": 0.4087, + "step": 42550 + }, + { + "epoch": 8.01, + "grad_norm": 10.338817596435547, + "learning_rate": 3.9789196310935445e-06, + "loss": 0.4118, + "step": 42560 + }, + { + "epoch": 8.01, + "grad_norm": 16.17352294921875, + "learning_rate": 3.975155279503106e-06, + "loss": 0.6563, + "step": 42570 + }, + { + "epoch": 8.01, + "grad_norm": 12.729879379272461, + "learning_rate": 3.9713909279126674e-06, + "loss": 0.5468, + "step": 42580 + }, + { + "epoch": 8.02, + "grad_norm": 40.52555847167969, + "learning_rate": 3.967626576322229e-06, + "loss": 0.3984, + "step": 42590 + }, + { + "epoch": 8.02, + "grad_norm": 22.066959381103516, + "learning_rate": 3.96386222473179e-06, + "loss": 0.6228, + "step": 42600 + }, + { + "epoch": 8.02, + "grad_norm": 10.428549766540527, + "learning_rate": 3.960097873141352e-06, + "loss": 0.5729, + "step": 42610 + }, + { + "epoch": 8.02, + "grad_norm": 9.915628433227539, + "learning_rate": 3.956333521550913e-06, + "loss": 0.3231, + "step": 42620 + }, + { + "epoch": 8.02, + "grad_norm": 33.11208724975586, + "learning_rate": 3.952569169960475e-06, + "loss": 0.3779, + "step": 42630 + }, + { + "epoch": 8.03, + "grad_norm": 4.810709476470947, + "learning_rate": 3.948804818370036e-06, + "loss": 0.6932, + "step": 42640 + }, + { + "epoch": 8.03, + "grad_norm": 21.556655883789062, + "learning_rate": 3.945040466779598e-06, + "loss": 0.3488, + "step": 42650 + }, + { + "epoch": 8.03, + "grad_norm": 12.891266822814941, + "learning_rate": 3.941276115189159e-06, + "loss": 0.4167, + "step": 42660 + }, + { + "epoch": 8.03, + "grad_norm": 0.5069437026977539, + "learning_rate": 3.9375117635987206e-06, + "loss": 0.2696, + "step": 42670 + }, + { + "epoch": 8.03, + "grad_norm": 8.775614738464355, + "learning_rate": 3.933747412008282e-06, + "loss": 0.5591, + "step": 42680 + }, + { + "epoch": 8.04, + "grad_norm": 6.012180805206299, + "learning_rate": 3.9299830604178435e-06, + "loss": 0.5263, + "step": 42690 + }, + { + "epoch": 8.04, + "grad_norm": 7.999334812164307, + "learning_rate": 3.926218708827404e-06, + "loss": 0.3107, + "step": 42700 + }, + { + "epoch": 8.04, + "grad_norm": 12.329937934875488, + "learning_rate": 3.922454357236966e-06, + "loss": 0.3968, + "step": 42710 + }, + { + "epoch": 8.04, + "grad_norm": 16.49465560913086, + "learning_rate": 3.918690005646528e-06, + "loss": 0.6177, + "step": 42720 + }, + { + "epoch": 8.04, + "grad_norm": 21.259458541870117, + "learning_rate": 3.914925654056089e-06, + "loss": 0.5283, + "step": 42730 + }, + { + "epoch": 8.04, + "grad_norm": 17.6918888092041, + "learning_rate": 3.911161302465651e-06, + "loss": 0.2778, + "step": 42740 + }, + { + "epoch": 8.05, + "grad_norm": 8.05215072631836, + "learning_rate": 3.907396950875212e-06, + "loss": 0.4152, + "step": 42750 + }, + { + "epoch": 8.05, + "grad_norm": 17.829729080200195, + "learning_rate": 3.903632599284774e-06, + "loss": 0.2527, + "step": 42760 + }, + { + "epoch": 8.05, + "grad_norm": 28.24970054626465, + "learning_rate": 3.899868247694335e-06, + "loss": 0.3636, + "step": 42770 + }, + { + "epoch": 8.05, + "grad_norm": 13.935927391052246, + "learning_rate": 3.896103896103897e-06, + "loss": 0.2224, + "step": 42780 + }, + { + "epoch": 8.05, + "grad_norm": 21.196792602539062, + "learning_rate": 3.892339544513457e-06, + "loss": 0.5058, + "step": 42790 + }, + { + "epoch": 8.06, + "grad_norm": 18.93883514404297, + "learning_rate": 3.8885751929230196e-06, + "loss": 0.4206, + "step": 42800 + }, + { + "epoch": 8.06, + "grad_norm": 0.547015368938446, + "learning_rate": 3.884810841332581e-06, + "loss": 0.6522, + "step": 42810 + }, + { + "epoch": 8.06, + "grad_norm": 9.596744537353516, + "learning_rate": 3.8810464897421425e-06, + "loss": 0.6818, + "step": 42820 + }, + { + "epoch": 8.06, + "grad_norm": 17.509187698364258, + "learning_rate": 3.877282138151704e-06, + "loss": 0.3802, + "step": 42830 + }, + { + "epoch": 8.06, + "grad_norm": 25.433029174804688, + "learning_rate": 3.8735177865612646e-06, + "loss": 0.2126, + "step": 42840 + }, + { + "epoch": 8.07, + "grad_norm": 41.067604064941406, + "learning_rate": 3.869753434970827e-06, + "loss": 0.4666, + "step": 42850 + }, + { + "epoch": 8.07, + "grad_norm": 14.244887351989746, + "learning_rate": 3.865989083380388e-06, + "loss": 0.4267, + "step": 42860 + }, + { + "epoch": 8.07, + "grad_norm": 19.82988929748535, + "learning_rate": 3.86222473178995e-06, + "loss": 0.6026, + "step": 42870 + }, + { + "epoch": 8.07, + "grad_norm": 4.339291572570801, + "learning_rate": 3.85846038019951e-06, + "loss": 0.2404, + "step": 42880 + }, + { + "epoch": 8.07, + "grad_norm": 17.403106689453125, + "learning_rate": 3.854696028609073e-06, + "loss": 0.6288, + "step": 42890 + }, + { + "epoch": 8.07, + "grad_norm": 44.81716537475586, + "learning_rate": 3.850931677018634e-06, + "loss": 0.4931, + "step": 42900 + }, + { + "epoch": 8.08, + "grad_norm": 6.8778510093688965, + "learning_rate": 3.847167325428196e-06, + "loss": 0.4437, + "step": 42910 + }, + { + "epoch": 8.08, + "grad_norm": 0.9462854862213135, + "learning_rate": 3.843402973837756e-06, + "loss": 0.3791, + "step": 42920 + }, + { + "epoch": 8.08, + "grad_norm": 13.075630187988281, + "learning_rate": 3.839638622247318e-06, + "loss": 0.4265, + "step": 42930 + }, + { + "epoch": 8.08, + "grad_norm": 34.70516586303711, + "learning_rate": 3.83587427065688e-06, + "loss": 0.3808, + "step": 42940 + }, + { + "epoch": 8.08, + "grad_norm": 0.2037682831287384, + "learning_rate": 3.8321099190664415e-06, + "loss": 0.3715, + "step": 42950 + }, + { + "epoch": 8.09, + "grad_norm": 1.690575361251831, + "learning_rate": 3.828345567476003e-06, + "loss": 0.3384, + "step": 42960 + }, + { + "epoch": 8.09, + "grad_norm": 4.726039886474609, + "learning_rate": 3.8245812158855635e-06, + "loss": 0.3805, + "step": 42970 + }, + { + "epoch": 8.09, + "grad_norm": 10.409626007080078, + "learning_rate": 3.820816864295126e-06, + "loss": 0.2682, + "step": 42980 + }, + { + "epoch": 8.09, + "grad_norm": 12.783897399902344, + "learning_rate": 3.817052512704687e-06, + "loss": 0.6112, + "step": 42990 + }, + { + "epoch": 8.09, + "grad_norm": 0.31685903668403625, + "learning_rate": 3.8132881611142488e-06, + "loss": 0.4119, + "step": 43000 + }, + { + "epoch": 8.1, + "grad_norm": 43.89032745361328, + "learning_rate": 3.80952380952381e-06, + "loss": 0.4113, + "step": 43010 + }, + { + "epoch": 8.1, + "grad_norm": 3.8857526779174805, + "learning_rate": 3.8057594579333713e-06, + "loss": 0.6077, + "step": 43020 + }, + { + "epoch": 8.1, + "grad_norm": 7.122863292694092, + "learning_rate": 3.8019951063429327e-06, + "loss": 0.1793, + "step": 43030 + }, + { + "epoch": 8.1, + "grad_norm": 21.782251358032227, + "learning_rate": 3.798230754752494e-06, + "loss": 0.5502, + "step": 43040 + }, + { + "epoch": 8.1, + "grad_norm": 12.728435516357422, + "learning_rate": 3.7944664031620552e-06, + "loss": 0.6658, + "step": 43050 + }, + { + "epoch": 8.1, + "grad_norm": 20.162574768066406, + "learning_rate": 3.790702051571617e-06, + "loss": 0.5913, + "step": 43060 + }, + { + "epoch": 8.11, + "grad_norm": 0.8549222946166992, + "learning_rate": 3.7869376999811786e-06, + "loss": 0.2046, + "step": 43070 + }, + { + "epoch": 8.11, + "grad_norm": 19.518613815307617, + "learning_rate": 3.78317334839074e-06, + "loss": 0.4695, + "step": 43080 + }, + { + "epoch": 8.11, + "grad_norm": 8.266684532165527, + "learning_rate": 3.7794089968003015e-06, + "loss": 0.4126, + "step": 43090 + }, + { + "epoch": 8.11, + "grad_norm": 21.11467170715332, + "learning_rate": 3.7756446452098625e-06, + "loss": 0.6289, + "step": 43100 + }, + { + "epoch": 8.11, + "grad_norm": 25.66094970703125, + "learning_rate": 3.7718802936194244e-06, + "loss": 0.2911, + "step": 43110 + }, + { + "epoch": 8.12, + "grad_norm": 23.889019012451172, + "learning_rate": 3.768115942028986e-06, + "loss": 0.6902, + "step": 43120 + }, + { + "epoch": 8.12, + "grad_norm": 11.190979957580566, + "learning_rate": 3.7643515904385473e-06, + "loss": 0.4301, + "step": 43130 + }, + { + "epoch": 8.12, + "grad_norm": 1.0701913833618164, + "learning_rate": 3.7605872388481084e-06, + "loss": 0.356, + "step": 43140 + }, + { + "epoch": 8.12, + "grad_norm": 9.612865447998047, + "learning_rate": 3.7568228872576703e-06, + "loss": 0.487, + "step": 43150 + }, + { + "epoch": 8.12, + "grad_norm": 0.3060283064842224, + "learning_rate": 3.7530585356672317e-06, + "loss": 0.2213, + "step": 43160 + }, + { + "epoch": 8.13, + "grad_norm": 1.2015522718429565, + "learning_rate": 3.749294184076793e-06, + "loss": 0.4213, + "step": 43170 + }, + { + "epoch": 8.13, + "grad_norm": 9.978540420532227, + "learning_rate": 3.7455298324863542e-06, + "loss": 0.6853, + "step": 43180 + }, + { + "epoch": 8.13, + "grad_norm": 10.406660079956055, + "learning_rate": 3.7417654808959157e-06, + "loss": 0.4558, + "step": 43190 + }, + { + "epoch": 8.13, + "grad_norm": 0.04582936689257622, + "learning_rate": 3.7380011293054776e-06, + "loss": 0.468, + "step": 43200 + }, + { + "epoch": 8.13, + "grad_norm": 4.475496292114258, + "learning_rate": 3.734236777715039e-06, + "loss": 0.5617, + "step": 43210 + }, + { + "epoch": 8.13, + "grad_norm": 11.952781677246094, + "learning_rate": 3.7304724261246005e-06, + "loss": 0.4628, + "step": 43220 + }, + { + "epoch": 8.14, + "grad_norm": 23.401905059814453, + "learning_rate": 3.7267080745341615e-06, + "loss": 0.3671, + "step": 43230 + }, + { + "epoch": 8.14, + "grad_norm": 11.260015487670898, + "learning_rate": 3.722943722943723e-06, + "loss": 0.5275, + "step": 43240 + }, + { + "epoch": 8.14, + "grad_norm": 7.514461994171143, + "learning_rate": 3.719179371353285e-06, + "loss": 0.6261, + "step": 43250 + }, + { + "epoch": 8.14, + "grad_norm": 13.519734382629395, + "learning_rate": 3.7154150197628463e-06, + "loss": 0.5934, + "step": 43260 + }, + { + "epoch": 8.14, + "grad_norm": 26.910083770751953, + "learning_rate": 3.7116506681724074e-06, + "loss": 0.7417, + "step": 43270 + }, + { + "epoch": 8.15, + "grad_norm": 8.547293663024902, + "learning_rate": 3.707886316581969e-06, + "loss": 0.2699, + "step": 43280 + }, + { + "epoch": 8.15, + "grad_norm": 0.09624161571264267, + "learning_rate": 3.7041219649915307e-06, + "loss": 0.5396, + "step": 43290 + }, + { + "epoch": 8.15, + "grad_norm": 10.193184852600098, + "learning_rate": 3.700357613401092e-06, + "loss": 0.6458, + "step": 43300 + }, + { + "epoch": 8.15, + "grad_norm": 4.950165271759033, + "learning_rate": 3.6965932618106532e-06, + "loss": 0.3573, + "step": 43310 + }, + { + "epoch": 8.15, + "grad_norm": 20.236759185791016, + "learning_rate": 3.6928289102202147e-06, + "loss": 0.5585, + "step": 43320 + }, + { + "epoch": 8.16, + "grad_norm": 7.94288969039917, + "learning_rate": 3.689064558629776e-06, + "loss": 0.5142, + "step": 43330 + }, + { + "epoch": 8.16, + "grad_norm": 4.145201206207275, + "learning_rate": 3.685300207039338e-06, + "loss": 0.3554, + "step": 43340 + }, + { + "epoch": 8.16, + "grad_norm": 14.267728805541992, + "learning_rate": 3.6815358554488995e-06, + "loss": 0.3611, + "step": 43350 + }, + { + "epoch": 8.16, + "grad_norm": 0.9065011143684387, + "learning_rate": 3.6777715038584605e-06, + "loss": 0.2953, + "step": 43360 + }, + { + "epoch": 8.16, + "grad_norm": 9.731773376464844, + "learning_rate": 3.674007152268022e-06, + "loss": 0.5442, + "step": 43370 + }, + { + "epoch": 8.16, + "grad_norm": 23.468883514404297, + "learning_rate": 3.670242800677584e-06, + "loss": 0.6701, + "step": 43380 + }, + { + "epoch": 8.17, + "grad_norm": 9.353164672851562, + "learning_rate": 3.6664784490871453e-06, + "loss": 0.4046, + "step": 43390 + }, + { + "epoch": 8.17, + "grad_norm": 10.205562591552734, + "learning_rate": 3.6627140974967064e-06, + "loss": 0.3539, + "step": 43400 + }, + { + "epoch": 8.17, + "grad_norm": 7.568580150604248, + "learning_rate": 3.658949745906268e-06, + "loss": 0.7413, + "step": 43410 + }, + { + "epoch": 8.17, + "grad_norm": 0.29625949263572693, + "learning_rate": 3.6551853943158293e-06, + "loss": 0.3291, + "step": 43420 + }, + { + "epoch": 8.17, + "grad_norm": 8.475521087646484, + "learning_rate": 3.651421042725391e-06, + "loss": 0.239, + "step": 43430 + }, + { + "epoch": 8.18, + "grad_norm": 21.264280319213867, + "learning_rate": 3.6476566911349526e-06, + "loss": 0.6242, + "step": 43440 + }, + { + "epoch": 8.18, + "grad_norm": 7.065421104431152, + "learning_rate": 3.6438923395445137e-06, + "loss": 0.4781, + "step": 43450 + }, + { + "epoch": 8.18, + "grad_norm": 11.760064125061035, + "learning_rate": 3.640127987954075e-06, + "loss": 0.2676, + "step": 43460 + }, + { + "epoch": 8.18, + "grad_norm": 0.0757002905011177, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.4388, + "step": 43470 + }, + { + "epoch": 8.18, + "grad_norm": 6.151671886444092, + "learning_rate": 3.6325992847731985e-06, + "loss": 0.5118, + "step": 43480 + }, + { + "epoch": 8.19, + "grad_norm": 0.9541302919387817, + "learning_rate": 3.6288349331827595e-06, + "loss": 0.7873, + "step": 43490 + }, + { + "epoch": 8.19, + "grad_norm": 0.43069586157798767, + "learning_rate": 3.625070581592321e-06, + "loss": 0.4353, + "step": 43500 + }, + { + "epoch": 8.19, + "grad_norm": 20.052248001098633, + "learning_rate": 3.6213062300018824e-06, + "loss": 0.4903, + "step": 43510 + }, + { + "epoch": 8.19, + "grad_norm": 8.033267974853516, + "learning_rate": 3.6175418784114443e-06, + "loss": 0.5027, + "step": 43520 + }, + { + "epoch": 8.19, + "grad_norm": 12.000602722167969, + "learning_rate": 3.613777526821005e-06, + "loss": 0.4379, + "step": 43530 + }, + { + "epoch": 8.19, + "grad_norm": 0.025615401566028595, + "learning_rate": 3.610013175230567e-06, + "loss": 0.362, + "step": 43540 + }, + { + "epoch": 8.2, + "grad_norm": 0.06578720360994339, + "learning_rate": 3.6062488236401283e-06, + "loss": 0.4048, + "step": 43550 + }, + { + "epoch": 8.2, + "grad_norm": 19.945615768432617, + "learning_rate": 3.6024844720496897e-06, + "loss": 0.4744, + "step": 43560 + }, + { + "epoch": 8.2, + "grad_norm": 1.860599398612976, + "learning_rate": 3.5987201204592516e-06, + "loss": 0.3348, + "step": 43570 + }, + { + "epoch": 8.2, + "grad_norm": 0.18426920473575592, + "learning_rate": 3.5949557688688127e-06, + "loss": 0.3012, + "step": 43580 + }, + { + "epoch": 8.2, + "grad_norm": 18.49991798400879, + "learning_rate": 3.591191417278374e-06, + "loss": 0.5334, + "step": 43590 + }, + { + "epoch": 8.21, + "grad_norm": 7.533271312713623, + "learning_rate": 3.5874270656879356e-06, + "loss": 0.269, + "step": 43600 + }, + { + "epoch": 8.21, + "grad_norm": 31.758325576782227, + "learning_rate": 3.583662714097497e-06, + "loss": 0.6352, + "step": 43610 + }, + { + "epoch": 8.21, + "grad_norm": 24.36396026611328, + "learning_rate": 3.579898362507058e-06, + "loss": 0.3615, + "step": 43620 + }, + { + "epoch": 8.21, + "grad_norm": 0.032064277678728104, + "learning_rate": 3.57613401091662e-06, + "loss": 0.7085, + "step": 43630 + }, + { + "epoch": 8.21, + "grad_norm": 10.80846881866455, + "learning_rate": 3.5723696593261814e-06, + "loss": 0.6805, + "step": 43640 + }, + { + "epoch": 8.22, + "grad_norm": 7.788702964782715, + "learning_rate": 3.568605307735743e-06, + "loss": 0.6523, + "step": 43650 + }, + { + "epoch": 8.22, + "grad_norm": 6.078163146972656, + "learning_rate": 3.564840956145304e-06, + "loss": 0.3491, + "step": 43660 + }, + { + "epoch": 8.22, + "grad_norm": 34.41270065307617, + "learning_rate": 3.561076604554866e-06, + "loss": 0.3832, + "step": 43670 + }, + { + "epoch": 8.22, + "grad_norm": 0.11715062707662582, + "learning_rate": 3.5573122529644273e-06, + "loss": 0.4516, + "step": 43680 + }, + { + "epoch": 8.22, + "grad_norm": 25.467695236206055, + "learning_rate": 3.5535479013739887e-06, + "loss": 0.3377, + "step": 43690 + }, + { + "epoch": 8.23, + "grad_norm": 0.11090195924043655, + "learning_rate": 3.54978354978355e-06, + "loss": 0.4038, + "step": 43700 + }, + { + "epoch": 8.23, + "grad_norm": 12.662226676940918, + "learning_rate": 3.5460191981931112e-06, + "loss": 0.379, + "step": 43710 + }, + { + "epoch": 8.23, + "grad_norm": 5.7085652351379395, + "learning_rate": 3.542254846602673e-06, + "loss": 0.3542, + "step": 43720 + }, + { + "epoch": 8.23, + "grad_norm": 15.676560401916504, + "learning_rate": 3.5384904950122346e-06, + "loss": 0.4124, + "step": 43730 + }, + { + "epoch": 8.23, + "grad_norm": 34.08810806274414, + "learning_rate": 3.534726143421796e-06, + "loss": 0.3937, + "step": 43740 + }, + { + "epoch": 8.23, + "grad_norm": 9.684732437133789, + "learning_rate": 3.530961791831357e-06, + "loss": 0.2242, + "step": 43750 + }, + { + "epoch": 8.24, + "grad_norm": 8.743865013122559, + "learning_rate": 3.5271974402409185e-06, + "loss": 0.3141, + "step": 43760 + }, + { + "epoch": 8.24, + "grad_norm": 9.374190330505371, + "learning_rate": 3.5234330886504804e-06, + "loss": 0.3993, + "step": 43770 + }, + { + "epoch": 8.24, + "grad_norm": 6.62257719039917, + "learning_rate": 3.519668737060042e-06, + "loss": 0.4389, + "step": 43780 + }, + { + "epoch": 8.24, + "grad_norm": 126.70000457763672, + "learning_rate": 3.515904385469603e-06, + "loss": 0.3082, + "step": 43790 + }, + { + "epoch": 8.24, + "grad_norm": 16.551645278930664, + "learning_rate": 3.5121400338791644e-06, + "loss": 0.3682, + "step": 43800 + }, + { + "epoch": 8.25, + "grad_norm": 0.19622400403022766, + "learning_rate": 3.5083756822887262e-06, + "loss": 0.389, + "step": 43810 + }, + { + "epoch": 8.25, + "grad_norm": 11.304600715637207, + "learning_rate": 3.5046113306982877e-06, + "loss": 0.4126, + "step": 43820 + }, + { + "epoch": 8.25, + "grad_norm": 13.223969459533691, + "learning_rate": 3.500846979107849e-06, + "loss": 0.3159, + "step": 43830 + }, + { + "epoch": 8.25, + "grad_norm": 0.057929929345846176, + "learning_rate": 3.49708262751741e-06, + "loss": 0.6223, + "step": 43840 + }, + { + "epoch": 8.25, + "grad_norm": 28.158432006835938, + "learning_rate": 3.4933182759269717e-06, + "loss": 0.3654, + "step": 43850 + }, + { + "epoch": 8.26, + "grad_norm": 0.9253088235855103, + "learning_rate": 3.4895539243365336e-06, + "loss": 0.5879, + "step": 43860 + }, + { + "epoch": 8.26, + "grad_norm": 25.360929489135742, + "learning_rate": 3.485789572746095e-06, + "loss": 0.5482, + "step": 43870 + }, + { + "epoch": 8.26, + "grad_norm": 25.015487670898438, + "learning_rate": 3.482025221155656e-06, + "loss": 0.5676, + "step": 43880 + }, + { + "epoch": 8.26, + "grad_norm": 25.207012176513672, + "learning_rate": 3.4782608695652175e-06, + "loss": 0.5568, + "step": 43890 + }, + { + "epoch": 8.26, + "grad_norm": 29.287954330444336, + "learning_rate": 3.474496517974779e-06, + "loss": 0.5971, + "step": 43900 + }, + { + "epoch": 8.26, + "grad_norm": 0.47074535489082336, + "learning_rate": 3.470732166384341e-06, + "loss": 0.3605, + "step": 43910 + }, + { + "epoch": 8.27, + "grad_norm": 7.249814510345459, + "learning_rate": 3.466967814793902e-06, + "loss": 0.4114, + "step": 43920 + }, + { + "epoch": 8.27, + "grad_norm": 16.79249382019043, + "learning_rate": 3.4632034632034634e-06, + "loss": 0.4512, + "step": 43930 + }, + { + "epoch": 8.27, + "grad_norm": 20.726879119873047, + "learning_rate": 3.459439111613025e-06, + "loss": 0.4038, + "step": 43940 + }, + { + "epoch": 8.27, + "grad_norm": 46.845523834228516, + "learning_rate": 3.4556747600225867e-06, + "loss": 0.4775, + "step": 43950 + }, + { + "epoch": 8.27, + "grad_norm": 18.067689895629883, + "learning_rate": 3.451910408432148e-06, + "loss": 0.4175, + "step": 43960 + }, + { + "epoch": 8.28, + "grad_norm": 42.61848831176758, + "learning_rate": 3.448146056841709e-06, + "loss": 0.6115, + "step": 43970 + }, + { + "epoch": 8.28, + "grad_norm": 6.6008687019348145, + "learning_rate": 3.4443817052512707e-06, + "loss": 0.3097, + "step": 43980 + }, + { + "epoch": 8.28, + "grad_norm": 27.931385040283203, + "learning_rate": 3.440617353660832e-06, + "loss": 0.3939, + "step": 43990 + }, + { + "epoch": 8.28, + "grad_norm": 17.77448844909668, + "learning_rate": 3.436853002070394e-06, + "loss": 0.299, + "step": 44000 + }, + { + "epoch": 8.28, + "grad_norm": 8.479036331176758, + "learning_rate": 3.433088650479955e-06, + "loss": 0.4371, + "step": 44010 + }, + { + "epoch": 8.29, + "grad_norm": 12.390946388244629, + "learning_rate": 3.4293242988895165e-06, + "loss": 0.2047, + "step": 44020 + }, + { + "epoch": 8.29, + "grad_norm": 12.151750564575195, + "learning_rate": 3.425559947299078e-06, + "loss": 0.5079, + "step": 44030 + }, + { + "epoch": 8.29, + "grad_norm": 17.92153549194336, + "learning_rate": 3.4217955957086394e-06, + "loss": 0.543, + "step": 44040 + }, + { + "epoch": 8.29, + "grad_norm": 8.07863998413086, + "learning_rate": 3.4180312441182005e-06, + "loss": 0.5345, + "step": 44050 + }, + { + "epoch": 8.29, + "grad_norm": 0.12737701833248138, + "learning_rate": 3.4142668925277623e-06, + "loss": 0.7064, + "step": 44060 + }, + { + "epoch": 8.29, + "grad_norm": 10.987157821655273, + "learning_rate": 3.410502540937324e-06, + "loss": 0.6823, + "step": 44070 + }, + { + "epoch": 8.3, + "grad_norm": 12.050490379333496, + "learning_rate": 3.4067381893468853e-06, + "loss": 0.5315, + "step": 44080 + }, + { + "epoch": 8.3, + "grad_norm": 17.466691970825195, + "learning_rate": 3.402973837756447e-06, + "loss": 0.5675, + "step": 44090 + }, + { + "epoch": 8.3, + "grad_norm": 19.85411262512207, + "learning_rate": 3.399209486166008e-06, + "loss": 0.3984, + "step": 44100 + }, + { + "epoch": 8.3, + "grad_norm": 12.507843971252441, + "learning_rate": 3.3954451345755696e-06, + "loss": 0.413, + "step": 44110 + }, + { + "epoch": 8.3, + "grad_norm": 3.8064775466918945, + "learning_rate": 3.391680782985131e-06, + "loss": 0.4874, + "step": 44120 + }, + { + "epoch": 8.31, + "grad_norm": 20.31875228881836, + "learning_rate": 3.3879164313946926e-06, + "loss": 0.5857, + "step": 44130 + }, + { + "epoch": 8.31, + "grad_norm": 0.29422613978385925, + "learning_rate": 3.3841520798042536e-06, + "loss": 0.3885, + "step": 44140 + }, + { + "epoch": 8.31, + "grad_norm": 13.232914924621582, + "learning_rate": 3.3803877282138155e-06, + "loss": 0.2175, + "step": 44150 + }, + { + "epoch": 8.31, + "grad_norm": 0.48562586307525635, + "learning_rate": 3.376623376623377e-06, + "loss": 0.2482, + "step": 44160 + }, + { + "epoch": 8.31, + "grad_norm": 18.276710510253906, + "learning_rate": 3.3728590250329384e-06, + "loss": 0.5239, + "step": 44170 + }, + { + "epoch": 8.32, + "grad_norm": 1.2684160470962524, + "learning_rate": 3.3690946734425e-06, + "loss": 0.2591, + "step": 44180 + }, + { + "epoch": 8.32, + "grad_norm": 11.328255653381348, + "learning_rate": 3.365330321852061e-06, + "loss": 0.3205, + "step": 44190 + }, + { + "epoch": 8.32, + "grad_norm": 8.913219451904297, + "learning_rate": 3.361565970261623e-06, + "loss": 0.4611, + "step": 44200 + }, + { + "epoch": 8.32, + "grad_norm": 1.8168461322784424, + "learning_rate": 3.3578016186711843e-06, + "loss": 0.2763, + "step": 44210 + }, + { + "epoch": 8.32, + "grad_norm": 9.876787185668945, + "learning_rate": 3.3540372670807457e-06, + "loss": 0.3158, + "step": 44220 + }, + { + "epoch": 8.32, + "grad_norm": 26.141977310180664, + "learning_rate": 3.3502729154903068e-06, + "loss": 0.519, + "step": 44230 + }, + { + "epoch": 8.33, + "grad_norm": 0.03895975649356842, + "learning_rate": 3.3465085638998686e-06, + "loss": 0.3647, + "step": 44240 + }, + { + "epoch": 8.33, + "grad_norm": 28.49066162109375, + "learning_rate": 3.34274421230943e-06, + "loss": 0.5051, + "step": 44250 + }, + { + "epoch": 8.33, + "grad_norm": 48.42610168457031, + "learning_rate": 3.3389798607189916e-06, + "loss": 0.5039, + "step": 44260 + }, + { + "epoch": 8.33, + "grad_norm": 33.13661575317383, + "learning_rate": 3.3352155091285526e-06, + "loss": 0.6969, + "step": 44270 + }, + { + "epoch": 8.33, + "grad_norm": 24.356779098510742, + "learning_rate": 3.331451157538114e-06, + "loss": 0.646, + "step": 44280 + }, + { + "epoch": 8.34, + "grad_norm": 26.351469039916992, + "learning_rate": 3.327686805947676e-06, + "loss": 0.4839, + "step": 44290 + }, + { + "epoch": 8.34, + "grad_norm": 3.9066474437713623, + "learning_rate": 3.3239224543572374e-06, + "loss": 0.4166, + "step": 44300 + }, + { + "epoch": 8.34, + "grad_norm": 0.12449698895215988, + "learning_rate": 3.320158102766799e-06, + "loss": 0.213, + "step": 44310 + }, + { + "epoch": 8.34, + "grad_norm": 6.33676290512085, + "learning_rate": 3.31639375117636e-06, + "loss": 0.3959, + "step": 44320 + }, + { + "epoch": 8.34, + "grad_norm": 12.31187629699707, + "learning_rate": 3.3126293995859214e-06, + "loss": 0.6402, + "step": 44330 + }, + { + "epoch": 8.35, + "grad_norm": 9.686624526977539, + "learning_rate": 3.3088650479954832e-06, + "loss": 0.4905, + "step": 44340 + }, + { + "epoch": 8.35, + "grad_norm": 15.566709518432617, + "learning_rate": 3.3051006964050447e-06, + "loss": 0.3201, + "step": 44350 + }, + { + "epoch": 8.35, + "grad_norm": 19.786592483520508, + "learning_rate": 3.3013363448146057e-06, + "loss": 0.4149, + "step": 44360 + }, + { + "epoch": 8.35, + "grad_norm": 43.112449645996094, + "learning_rate": 3.297571993224167e-06, + "loss": 0.5619, + "step": 44370 + }, + { + "epoch": 8.35, + "grad_norm": 32.787532806396484, + "learning_rate": 3.293807641633729e-06, + "loss": 0.4769, + "step": 44380 + }, + { + "epoch": 8.35, + "grad_norm": 21.896869659423828, + "learning_rate": 3.2900432900432905e-06, + "loss": 0.3075, + "step": 44390 + }, + { + "epoch": 8.36, + "grad_norm": 2.6966474056243896, + "learning_rate": 3.2862789384528516e-06, + "loss": 0.6221, + "step": 44400 + }, + { + "epoch": 8.36, + "grad_norm": 0.19632263481616974, + "learning_rate": 3.282514586862413e-06, + "loss": 0.4079, + "step": 44410 + }, + { + "epoch": 8.36, + "grad_norm": 23.635337829589844, + "learning_rate": 3.2787502352719745e-06, + "loss": 0.3567, + "step": 44420 + }, + { + "epoch": 8.36, + "grad_norm": 13.236377716064453, + "learning_rate": 3.2749858836815364e-06, + "loss": 0.3691, + "step": 44430 + }, + { + "epoch": 8.36, + "grad_norm": 19.794361114501953, + "learning_rate": 3.271221532091098e-06, + "loss": 0.5006, + "step": 44440 + }, + { + "epoch": 8.37, + "grad_norm": 0.1621648520231247, + "learning_rate": 3.267457180500659e-06, + "loss": 0.6175, + "step": 44450 + }, + { + "epoch": 8.37, + "grad_norm": 0.08422687649726868, + "learning_rate": 3.2636928289102203e-06, + "loss": 0.4767, + "step": 44460 + }, + { + "epoch": 8.37, + "grad_norm": 6.153237819671631, + "learning_rate": 3.259928477319782e-06, + "loss": 0.3885, + "step": 44470 + }, + { + "epoch": 8.37, + "grad_norm": 18.663501739501953, + "learning_rate": 3.2561641257293437e-06, + "loss": 0.3928, + "step": 44480 + }, + { + "epoch": 8.37, + "grad_norm": 12.590583801269531, + "learning_rate": 3.2523997741389047e-06, + "loss": 0.6693, + "step": 44490 + }, + { + "epoch": 8.38, + "grad_norm": 0.5464389324188232, + "learning_rate": 3.248635422548466e-06, + "loss": 0.4404, + "step": 44500 + }, + { + "epoch": 8.38, + "grad_norm": 2.3248085975646973, + "learning_rate": 3.2448710709580277e-06, + "loss": 0.6251, + "step": 44510 + }, + { + "epoch": 8.38, + "grad_norm": 27.805416107177734, + "learning_rate": 3.2411067193675895e-06, + "loss": 0.1487, + "step": 44520 + }, + { + "epoch": 8.38, + "grad_norm": 31.039339065551758, + "learning_rate": 3.2373423677771506e-06, + "loss": 0.4554, + "step": 44530 + }, + { + "epoch": 8.38, + "grad_norm": 20.5913143157959, + "learning_rate": 3.233578016186712e-06, + "loss": 0.3836, + "step": 44540 + }, + { + "epoch": 8.39, + "grad_norm": 43.38515853881836, + "learning_rate": 3.2298136645962735e-06, + "loss": 0.4318, + "step": 44550 + }, + { + "epoch": 8.39, + "grad_norm": 12.96084976196289, + "learning_rate": 3.226049313005835e-06, + "loss": 0.3663, + "step": 44560 + }, + { + "epoch": 8.39, + "grad_norm": 8.591901779174805, + "learning_rate": 3.222284961415397e-06, + "loss": 0.4199, + "step": 44570 + }, + { + "epoch": 8.39, + "grad_norm": 14.201598167419434, + "learning_rate": 3.218520609824958e-06, + "loss": 0.6438, + "step": 44580 + }, + { + "epoch": 8.39, + "grad_norm": 20.398212432861328, + "learning_rate": 3.2147562582345193e-06, + "loss": 0.2309, + "step": 44590 + }, + { + "epoch": 8.39, + "grad_norm": 4.721048355102539, + "learning_rate": 3.210991906644081e-06, + "loss": 0.5085, + "step": 44600 + }, + { + "epoch": 8.4, + "grad_norm": 11.639357566833496, + "learning_rate": 3.2072275550536427e-06, + "loss": 0.5171, + "step": 44610 + }, + { + "epoch": 8.4, + "grad_norm": 9.085111618041992, + "learning_rate": 3.2034632034632033e-06, + "loss": 0.4878, + "step": 44620 + }, + { + "epoch": 8.4, + "grad_norm": 14.544848442077637, + "learning_rate": 3.199698851872765e-06, + "loss": 0.4514, + "step": 44630 + }, + { + "epoch": 8.4, + "grad_norm": 3.9646859169006348, + "learning_rate": 3.1959345002823266e-06, + "loss": 0.4601, + "step": 44640 + }, + { + "epoch": 8.4, + "grad_norm": 25.853208541870117, + "learning_rate": 3.192170148691888e-06, + "loss": 0.5069, + "step": 44650 + }, + { + "epoch": 8.41, + "grad_norm": 0.13335028290748596, + "learning_rate": 3.188405797101449e-06, + "loss": 0.4518, + "step": 44660 + }, + { + "epoch": 8.41, + "grad_norm": 4.05998420715332, + "learning_rate": 3.184641445511011e-06, + "loss": 0.2396, + "step": 44670 + }, + { + "epoch": 8.41, + "grad_norm": 15.973206520080566, + "learning_rate": 3.1808770939205725e-06, + "loss": 0.3994, + "step": 44680 + }, + { + "epoch": 8.41, + "grad_norm": 5.575290679931641, + "learning_rate": 3.177112742330134e-06, + "loss": 0.8244, + "step": 44690 + }, + { + "epoch": 8.41, + "grad_norm": 78.56654357910156, + "learning_rate": 3.1733483907396954e-06, + "loss": 0.8038, + "step": 44700 + }, + { + "epoch": 8.42, + "grad_norm": 0.16877728700637817, + "learning_rate": 3.1695840391492564e-06, + "loss": 0.3286, + "step": 44710 + }, + { + "epoch": 8.42, + "grad_norm": 9.68952751159668, + "learning_rate": 3.1658196875588183e-06, + "loss": 0.5651, + "step": 44720 + }, + { + "epoch": 8.42, + "grad_norm": 47.49652862548828, + "learning_rate": 3.1620553359683798e-06, + "loss": 0.3635, + "step": 44730 + }, + { + "epoch": 8.42, + "grad_norm": 19.1754207611084, + "learning_rate": 3.1582909843779412e-06, + "loss": 0.5277, + "step": 44740 + }, + { + "epoch": 8.42, + "grad_norm": 21.708391189575195, + "learning_rate": 3.1545266327875023e-06, + "loss": 0.4516, + "step": 44750 + }, + { + "epoch": 8.42, + "grad_norm": 2.858497381210327, + "learning_rate": 3.1507622811970637e-06, + "loss": 0.5113, + "step": 44760 + }, + { + "epoch": 8.43, + "grad_norm": 14.189351081848145, + "learning_rate": 3.1469979296066256e-06, + "loss": 0.3809, + "step": 44770 + }, + { + "epoch": 8.43, + "grad_norm": 15.705812454223633, + "learning_rate": 3.143233578016187e-06, + "loss": 0.4126, + "step": 44780 + }, + { + "epoch": 8.43, + "grad_norm": 66.25598907470703, + "learning_rate": 3.1394692264257485e-06, + "loss": 0.6253, + "step": 44790 + }, + { + "epoch": 8.43, + "grad_norm": 7.330162048339844, + "learning_rate": 3.1357048748353096e-06, + "loss": 0.5187, + "step": 44800 + }, + { + "epoch": 8.43, + "grad_norm": 5.743100166320801, + "learning_rate": 3.1319405232448715e-06, + "loss": 0.4984, + "step": 44810 + }, + { + "epoch": 8.44, + "grad_norm": 8.980222702026367, + "learning_rate": 3.128176171654433e-06, + "loss": 0.4996, + "step": 44820 + }, + { + "epoch": 8.44, + "grad_norm": 0.5218448042869568, + "learning_rate": 3.1244118200639944e-06, + "loss": 0.5344, + "step": 44830 + }, + { + "epoch": 8.44, + "grad_norm": 1.2095797061920166, + "learning_rate": 3.1206474684735554e-06, + "loss": 0.2706, + "step": 44840 + }, + { + "epoch": 8.44, + "grad_norm": 44.43700408935547, + "learning_rate": 3.116883116883117e-06, + "loss": 0.4895, + "step": 44850 + }, + { + "epoch": 8.44, + "grad_norm": 5.458186626434326, + "learning_rate": 3.1131187652926788e-06, + "loss": 0.3371, + "step": 44860 + }, + { + "epoch": 8.45, + "grad_norm": 6.718016624450684, + "learning_rate": 3.1093544137022402e-06, + "loss": 0.2024, + "step": 44870 + }, + { + "epoch": 8.45, + "grad_norm": 0.143607959151268, + "learning_rate": 3.1055900621118013e-06, + "loss": 0.2257, + "step": 44880 + }, + { + "epoch": 8.45, + "grad_norm": 10.796164512634277, + "learning_rate": 3.1018257105213627e-06, + "loss": 0.6509, + "step": 44890 + }, + { + "epoch": 8.45, + "grad_norm": 9.629598617553711, + "learning_rate": 3.0980613589309246e-06, + "loss": 0.5034, + "step": 44900 + }, + { + "epoch": 8.45, + "grad_norm": 0.05811280012130737, + "learning_rate": 3.094297007340486e-06, + "loss": 0.4932, + "step": 44910 + }, + { + "epoch": 8.45, + "grad_norm": 9.126168251037598, + "learning_rate": 3.0905326557500475e-06, + "loss": 0.4464, + "step": 44920 + }, + { + "epoch": 8.46, + "grad_norm": 15.563265800476074, + "learning_rate": 3.0867683041596086e-06, + "loss": 0.4157, + "step": 44930 + }, + { + "epoch": 8.46, + "grad_norm": 14.518123626708984, + "learning_rate": 3.08300395256917e-06, + "loss": 0.3727, + "step": 44940 + }, + { + "epoch": 8.46, + "grad_norm": 5.1271138191223145, + "learning_rate": 3.079239600978732e-06, + "loss": 0.6179, + "step": 44950 + }, + { + "epoch": 8.46, + "grad_norm": 0.39322352409362793, + "learning_rate": 3.0754752493882934e-06, + "loss": 0.2365, + "step": 44960 + }, + { + "epoch": 8.46, + "grad_norm": 3.110018014907837, + "learning_rate": 3.0717108977978544e-06, + "loss": 0.4637, + "step": 44970 + }, + { + "epoch": 8.47, + "grad_norm": 0.4475475251674652, + "learning_rate": 3.067946546207416e-06, + "loss": 0.4693, + "step": 44980 + }, + { + "epoch": 8.47, + "grad_norm": 31.466312408447266, + "learning_rate": 3.0641821946169773e-06, + "loss": 0.6135, + "step": 44990 + }, + { + "epoch": 8.47, + "grad_norm": 8.0161714553833, + "learning_rate": 3.0604178430265392e-06, + "loss": 0.4308, + "step": 45000 + }, + { + "epoch": 8.47, + "grad_norm": 0.21481114625930786, + "learning_rate": 3.0566534914361003e-06, + "loss": 0.3452, + "step": 45010 + }, + { + "epoch": 8.47, + "grad_norm": 6.250048637390137, + "learning_rate": 3.0528891398456617e-06, + "loss": 0.3198, + "step": 45020 + }, + { + "epoch": 8.48, + "grad_norm": 14.705049514770508, + "learning_rate": 3.049124788255223e-06, + "loss": 0.5674, + "step": 45030 + }, + { + "epoch": 8.48, + "grad_norm": 13.137300491333008, + "learning_rate": 3.045360436664785e-06, + "loss": 0.4053, + "step": 45040 + }, + { + "epoch": 8.48, + "grad_norm": 5.976210594177246, + "learning_rate": 3.0415960850743465e-06, + "loss": 0.5598, + "step": 45050 + }, + { + "epoch": 8.48, + "grad_norm": 23.372941970825195, + "learning_rate": 3.0378317334839076e-06, + "loss": 0.9162, + "step": 45060 + }, + { + "epoch": 8.48, + "grad_norm": 16.597759246826172, + "learning_rate": 3.034067381893469e-06, + "loss": 0.4671, + "step": 45070 + }, + { + "epoch": 8.48, + "grad_norm": 21.51179313659668, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.5754, + "step": 45080 + }, + { + "epoch": 8.49, + "grad_norm": 1.9006558656692505, + "learning_rate": 3.0265386787125924e-06, + "loss": 0.4043, + "step": 45090 + }, + { + "epoch": 8.49, + "grad_norm": 12.068243980407715, + "learning_rate": 3.0227743271221534e-06, + "loss": 0.3859, + "step": 45100 + }, + { + "epoch": 8.49, + "grad_norm": 7.060889720916748, + "learning_rate": 3.019009975531715e-06, + "loss": 0.3461, + "step": 45110 + }, + { + "epoch": 8.49, + "grad_norm": 8.987565040588379, + "learning_rate": 3.0152456239412763e-06, + "loss": 0.5619, + "step": 45120 + }, + { + "epoch": 8.49, + "grad_norm": 7.495594501495361, + "learning_rate": 3.011481272350838e-06, + "loss": 0.2997, + "step": 45130 + }, + { + "epoch": 8.5, + "grad_norm": 19.95897102355957, + "learning_rate": 3.007716920760399e-06, + "loss": 0.5481, + "step": 45140 + }, + { + "epoch": 8.5, + "grad_norm": 13.875322341918945, + "learning_rate": 3.0039525691699607e-06, + "loss": 0.4323, + "step": 45150 + }, + { + "epoch": 8.5, + "grad_norm": 13.79182243347168, + "learning_rate": 3.000188217579522e-06, + "loss": 0.2615, + "step": 45160 + }, + { + "epoch": 8.5, + "grad_norm": 1.8631694316864014, + "learning_rate": 2.9964238659890836e-06, + "loss": 0.7032, + "step": 45170 + }, + { + "epoch": 8.5, + "grad_norm": 5.367626190185547, + "learning_rate": 2.9926595143986455e-06, + "loss": 0.509, + "step": 45180 + }, + { + "epoch": 8.51, + "grad_norm": 6.53688907623291, + "learning_rate": 2.9888951628082066e-06, + "loss": 0.4337, + "step": 45190 + }, + { + "epoch": 8.51, + "grad_norm": 17.974353790283203, + "learning_rate": 2.985130811217768e-06, + "loss": 0.7414, + "step": 45200 + }, + { + "epoch": 8.51, + "grad_norm": 17.594377517700195, + "learning_rate": 2.9813664596273295e-06, + "loss": 0.5211, + "step": 45210 + }, + { + "epoch": 8.51, + "grad_norm": 29.985471725463867, + "learning_rate": 2.977602108036891e-06, + "loss": 0.6543, + "step": 45220 + }, + { + "epoch": 8.51, + "grad_norm": 0.37220674753189087, + "learning_rate": 2.973837756446452e-06, + "loss": 0.183, + "step": 45230 + }, + { + "epoch": 8.51, + "grad_norm": 51.8664436340332, + "learning_rate": 2.970073404856014e-06, + "loss": 0.4546, + "step": 45240 + }, + { + "epoch": 8.52, + "grad_norm": 0.12566237151622772, + "learning_rate": 2.9663090532655753e-06, + "loss": 0.6928, + "step": 45250 + }, + { + "epoch": 8.52, + "grad_norm": 27.250438690185547, + "learning_rate": 2.9625447016751368e-06, + "loss": 0.6348, + "step": 45260 + }, + { + "epoch": 8.52, + "grad_norm": 1.0455703735351562, + "learning_rate": 2.958780350084698e-06, + "loss": 0.51, + "step": 45270 + }, + { + "epoch": 8.52, + "grad_norm": 18.566070556640625, + "learning_rate": 2.9550159984942593e-06, + "loss": 0.4247, + "step": 45280 + }, + { + "epoch": 8.52, + "grad_norm": 22.149303436279297, + "learning_rate": 2.951251646903821e-06, + "loss": 0.5396, + "step": 45290 + }, + { + "epoch": 8.53, + "grad_norm": 7.4732561111450195, + "learning_rate": 2.9474872953133826e-06, + "loss": 0.6225, + "step": 45300 + }, + { + "epoch": 8.53, + "grad_norm": 21.741275787353516, + "learning_rate": 2.943722943722944e-06, + "loss": 0.7131, + "step": 45310 + }, + { + "epoch": 8.53, + "grad_norm": 3.925509214401245, + "learning_rate": 2.939958592132505e-06, + "loss": 0.3896, + "step": 45320 + }, + { + "epoch": 8.53, + "grad_norm": 0.22670765221118927, + "learning_rate": 2.936194240542067e-06, + "loss": 0.2686, + "step": 45330 + }, + { + "epoch": 8.53, + "grad_norm": 9.519122123718262, + "learning_rate": 2.9324298889516285e-06, + "loss": 0.5187, + "step": 45340 + }, + { + "epoch": 8.54, + "grad_norm": 1.0992772579193115, + "learning_rate": 2.92866553736119e-06, + "loss": 0.2774, + "step": 45350 + }, + { + "epoch": 8.54, + "grad_norm": 16.670631408691406, + "learning_rate": 2.924901185770751e-06, + "loss": 0.7732, + "step": 45360 + }, + { + "epoch": 8.54, + "grad_norm": 0.2704927623271942, + "learning_rate": 2.9211368341803124e-06, + "loss": 0.3941, + "step": 45370 + }, + { + "epoch": 8.54, + "grad_norm": 0.42437541484832764, + "learning_rate": 2.9173724825898743e-06, + "loss": 0.689, + "step": 45380 + }, + { + "epoch": 8.54, + "grad_norm": 1.1275883913040161, + "learning_rate": 2.9136081309994358e-06, + "loss": 0.3139, + "step": 45390 + }, + { + "epoch": 8.55, + "grad_norm": 19.983110427856445, + "learning_rate": 2.9098437794089972e-06, + "loss": 0.5787, + "step": 45400 + }, + { + "epoch": 8.55, + "grad_norm": 17.412151336669922, + "learning_rate": 2.9060794278185583e-06, + "loss": 0.4265, + "step": 45410 + }, + { + "epoch": 8.55, + "grad_norm": 1.204263687133789, + "learning_rate": 2.9023150762281197e-06, + "loss": 0.8414, + "step": 45420 + }, + { + "epoch": 8.55, + "grad_norm": 18.92685890197754, + "learning_rate": 2.8985507246376816e-06, + "loss": 0.5895, + "step": 45430 + }, + { + "epoch": 8.55, + "grad_norm": 20.907590866088867, + "learning_rate": 2.894786373047243e-06, + "loss": 0.6537, + "step": 45440 + }, + { + "epoch": 8.55, + "grad_norm": 0.030812498182058334, + "learning_rate": 2.891022021456804e-06, + "loss": 0.8201, + "step": 45450 + }, + { + "epoch": 8.56, + "grad_norm": 19.68631935119629, + "learning_rate": 2.8872576698663656e-06, + "loss": 0.27, + "step": 45460 + }, + { + "epoch": 8.56, + "grad_norm": 0.13562491536140442, + "learning_rate": 2.8834933182759275e-06, + "loss": 0.7518, + "step": 45470 + }, + { + "epoch": 8.56, + "grad_norm": 13.175758361816406, + "learning_rate": 2.879728966685489e-06, + "loss": 0.2638, + "step": 45480 + }, + { + "epoch": 8.56, + "grad_norm": 38.72014617919922, + "learning_rate": 2.87596461509505e-06, + "loss": 0.6186, + "step": 45490 + }, + { + "epoch": 8.56, + "grad_norm": 5.701416969299316, + "learning_rate": 2.8722002635046114e-06, + "loss": 0.4767, + "step": 45500 + }, + { + "epoch": 8.57, + "grad_norm": 6.803305625915527, + "learning_rate": 2.868435911914173e-06, + "loss": 0.3493, + "step": 45510 + }, + { + "epoch": 8.57, + "grad_norm": 34.136417388916016, + "learning_rate": 2.8646715603237348e-06, + "loss": 0.3706, + "step": 45520 + }, + { + "epoch": 8.57, + "grad_norm": 15.489117622375488, + "learning_rate": 2.8609072087332962e-06, + "loss": 0.5064, + "step": 45530 + }, + { + "epoch": 8.57, + "grad_norm": 5.791557788848877, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.4265, + "step": 45540 + }, + { + "epoch": 8.57, + "grad_norm": 5.909194469451904, + "learning_rate": 2.8533785055524187e-06, + "loss": 0.4563, + "step": 45550 + }, + { + "epoch": 8.58, + "grad_norm": 21.65177345275879, + "learning_rate": 2.84961415396198e-06, + "loss": 0.3891, + "step": 45560 + }, + { + "epoch": 8.58, + "grad_norm": 3.773228645324707, + "learning_rate": 2.845849802371542e-06, + "loss": 0.501, + "step": 45570 + }, + { + "epoch": 8.58, + "grad_norm": 9.513245582580566, + "learning_rate": 2.842085450781103e-06, + "loss": 0.4636, + "step": 45580 + }, + { + "epoch": 8.58, + "grad_norm": 8.289735794067383, + "learning_rate": 2.8383210991906646e-06, + "loss": 0.4985, + "step": 45590 + }, + { + "epoch": 8.58, + "grad_norm": 0.9724116921424866, + "learning_rate": 2.834556747600226e-06, + "loss": 0.5116, + "step": 45600 + }, + { + "epoch": 8.58, + "grad_norm": 21.775800704956055, + "learning_rate": 2.830792396009788e-06, + "loss": 0.5441, + "step": 45610 + }, + { + "epoch": 8.59, + "grad_norm": 48.54513931274414, + "learning_rate": 2.827028044419349e-06, + "loss": 0.3705, + "step": 45620 + }, + { + "epoch": 8.59, + "grad_norm": 0.5952854752540588, + "learning_rate": 2.8232636928289104e-06, + "loss": 0.3866, + "step": 45630 + }, + { + "epoch": 8.59, + "grad_norm": 12.538860321044922, + "learning_rate": 2.819499341238472e-06, + "loss": 0.4496, + "step": 45640 + }, + { + "epoch": 8.59, + "grad_norm": 46.39492416381836, + "learning_rate": 2.8157349896480333e-06, + "loss": 0.3195, + "step": 45650 + }, + { + "epoch": 8.59, + "grad_norm": 13.821070671081543, + "learning_rate": 2.811970638057595e-06, + "loss": 0.2962, + "step": 45660 + }, + { + "epoch": 8.6, + "grad_norm": 6.4973225593566895, + "learning_rate": 2.8082062864671562e-06, + "loss": 0.4642, + "step": 45670 + }, + { + "epoch": 8.6, + "grad_norm": 9.997817039489746, + "learning_rate": 2.8044419348767177e-06, + "loss": 0.2811, + "step": 45680 + }, + { + "epoch": 8.6, + "grad_norm": 25.398149490356445, + "learning_rate": 2.800677583286279e-06, + "loss": 0.5049, + "step": 45690 + }, + { + "epoch": 8.6, + "grad_norm": 22.4417667388916, + "learning_rate": 2.7969132316958406e-06, + "loss": 0.5439, + "step": 45700 + }, + { + "epoch": 8.6, + "grad_norm": 19.63978385925293, + "learning_rate": 2.7931488801054017e-06, + "loss": 0.5902, + "step": 45710 + }, + { + "epoch": 8.61, + "grad_norm": 0.3071557581424713, + "learning_rate": 2.7893845285149635e-06, + "loss": 0.3403, + "step": 45720 + }, + { + "epoch": 8.61, + "grad_norm": 35.829811096191406, + "learning_rate": 2.785620176924525e-06, + "loss": 0.3616, + "step": 45730 + }, + { + "epoch": 8.61, + "grad_norm": 14.984370231628418, + "learning_rate": 2.7818558253340865e-06, + "loss": 0.7215, + "step": 45740 + }, + { + "epoch": 8.61, + "grad_norm": 8.33390998840332, + "learning_rate": 2.7780914737436475e-06, + "loss": 0.6237, + "step": 45750 + }, + { + "epoch": 8.61, + "grad_norm": 22.571699142456055, + "learning_rate": 2.7743271221532094e-06, + "loss": 0.6739, + "step": 45760 + }, + { + "epoch": 8.61, + "grad_norm": 17.001461029052734, + "learning_rate": 2.770562770562771e-06, + "loss": 0.2547, + "step": 45770 + }, + { + "epoch": 8.62, + "grad_norm": 19.975543975830078, + "learning_rate": 2.7667984189723323e-06, + "loss": 0.5098, + "step": 45780 + }, + { + "epoch": 8.62, + "grad_norm": 8.409460067749023, + "learning_rate": 2.7630340673818938e-06, + "loss": 0.407, + "step": 45790 + }, + { + "epoch": 8.62, + "grad_norm": 11.9586820602417, + "learning_rate": 2.759269715791455e-06, + "loss": 0.6268, + "step": 45800 + }, + { + "epoch": 8.62, + "grad_norm": 25.05490493774414, + "learning_rate": 2.7555053642010167e-06, + "loss": 0.3221, + "step": 45810 + }, + { + "epoch": 8.62, + "grad_norm": 16.877796173095703, + "learning_rate": 2.751741012610578e-06, + "loss": 0.3368, + "step": 45820 + }, + { + "epoch": 8.63, + "grad_norm": 37.895652770996094, + "learning_rate": 2.7479766610201396e-06, + "loss": 0.5498, + "step": 45830 + }, + { + "epoch": 8.63, + "grad_norm": 16.549964904785156, + "learning_rate": 2.7442123094297007e-06, + "loss": 0.6585, + "step": 45840 + }, + { + "epoch": 8.63, + "grad_norm": 23.333738327026367, + "learning_rate": 2.740447957839262e-06, + "loss": 0.4209, + "step": 45850 + }, + { + "epoch": 8.63, + "grad_norm": 17.801307678222656, + "learning_rate": 2.736683606248824e-06, + "loss": 0.3118, + "step": 45860 + }, + { + "epoch": 8.63, + "grad_norm": 4.754730224609375, + "learning_rate": 2.7329192546583855e-06, + "loss": 0.3706, + "step": 45870 + }, + { + "epoch": 8.64, + "grad_norm": 12.892595291137695, + "learning_rate": 2.7291549030679465e-06, + "loss": 0.5805, + "step": 45880 + }, + { + "epoch": 8.64, + "grad_norm": 16.939796447753906, + "learning_rate": 2.725390551477508e-06, + "loss": 0.5106, + "step": 45890 + }, + { + "epoch": 8.64, + "grad_norm": 16.29544448852539, + "learning_rate": 2.72162619988707e-06, + "loss": 0.4779, + "step": 45900 + }, + { + "epoch": 8.64, + "grad_norm": 28.001670837402344, + "learning_rate": 2.7178618482966313e-06, + "loss": 0.1302, + "step": 45910 + }, + { + "epoch": 8.64, + "grad_norm": 0.1437859982252121, + "learning_rate": 2.7140974967061928e-06, + "loss": 0.4312, + "step": 45920 + }, + { + "epoch": 8.64, + "grad_norm": 18.679183959960938, + "learning_rate": 2.710333145115754e-06, + "loss": 0.2055, + "step": 45930 + }, + { + "epoch": 8.65, + "grad_norm": 19.28763198852539, + "learning_rate": 2.7065687935253153e-06, + "loss": 0.828, + "step": 45940 + }, + { + "epoch": 8.65, + "grad_norm": 77.62793731689453, + "learning_rate": 2.702804441934877e-06, + "loss": 0.5013, + "step": 45950 + }, + { + "epoch": 8.65, + "grad_norm": 21.08930015563965, + "learning_rate": 2.6990400903444386e-06, + "loss": 0.5488, + "step": 45960 + }, + { + "epoch": 8.65, + "grad_norm": 25.826997756958008, + "learning_rate": 2.6952757387539996e-06, + "loss": 0.3493, + "step": 45970 + }, + { + "epoch": 8.65, + "grad_norm": 4.74186897277832, + "learning_rate": 2.691511387163561e-06, + "loss": 0.5234, + "step": 45980 + }, + { + "epoch": 8.66, + "grad_norm": 8.202046394348145, + "learning_rate": 2.687747035573123e-06, + "loss": 0.5571, + "step": 45990 + }, + { + "epoch": 8.66, + "grad_norm": 19.04949188232422, + "learning_rate": 2.6839826839826844e-06, + "loss": 0.4959, + "step": 46000 + }, + { + "epoch": 8.66, + "grad_norm": 10.953725814819336, + "learning_rate": 2.6802183323922455e-06, + "loss": 0.5771, + "step": 46010 + }, + { + "epoch": 8.66, + "grad_norm": 1.0045043230056763, + "learning_rate": 2.676453980801807e-06, + "loss": 0.3382, + "step": 46020 + }, + { + "epoch": 8.66, + "grad_norm": 0.04298378899693489, + "learning_rate": 2.6726896292113684e-06, + "loss": 0.4584, + "step": 46030 + }, + { + "epoch": 8.67, + "grad_norm": 0.0919141098856926, + "learning_rate": 2.6689252776209303e-06, + "loss": 0.5683, + "step": 46040 + }, + { + "epoch": 8.67, + "grad_norm": 27.5792293548584, + "learning_rate": 2.6651609260304918e-06, + "loss": 0.5817, + "step": 46050 + }, + { + "epoch": 8.67, + "grad_norm": 5.107207775115967, + "learning_rate": 2.661396574440053e-06, + "loss": 0.3344, + "step": 46060 + }, + { + "epoch": 8.67, + "grad_norm": 0.08540444076061249, + "learning_rate": 2.6576322228496142e-06, + "loss": 0.4315, + "step": 46070 + }, + { + "epoch": 8.67, + "grad_norm": 6.959580898284912, + "learning_rate": 2.6538678712591757e-06, + "loss": 0.7812, + "step": 46080 + }, + { + "epoch": 8.67, + "grad_norm": 21.82856559753418, + "learning_rate": 2.6501035196687376e-06, + "loss": 0.4448, + "step": 46090 + }, + { + "epoch": 8.68, + "grad_norm": 17.987428665161133, + "learning_rate": 2.6463391680782986e-06, + "loss": 0.5252, + "step": 46100 + }, + { + "epoch": 8.68, + "grad_norm": 29.11836814880371, + "learning_rate": 2.64257481648786e-06, + "loss": 0.5055, + "step": 46110 + }, + { + "epoch": 8.68, + "grad_norm": 10.755464553833008, + "learning_rate": 2.6388104648974216e-06, + "loss": 0.2324, + "step": 46120 + }, + { + "epoch": 8.68, + "grad_norm": 20.402507781982422, + "learning_rate": 2.6350461133069834e-06, + "loss": 0.3029, + "step": 46130 + }, + { + "epoch": 8.68, + "grad_norm": 4.016020774841309, + "learning_rate": 2.631281761716545e-06, + "loss": 0.3907, + "step": 46140 + }, + { + "epoch": 8.69, + "grad_norm": 14.392875671386719, + "learning_rate": 2.627517410126106e-06, + "loss": 0.5084, + "step": 46150 + }, + { + "epoch": 8.69, + "grad_norm": 1.0872794389724731, + "learning_rate": 2.6237530585356674e-06, + "loss": 0.658, + "step": 46160 + }, + { + "epoch": 8.69, + "grad_norm": 5.119357109069824, + "learning_rate": 2.619988706945229e-06, + "loss": 0.3549, + "step": 46170 + }, + { + "epoch": 8.69, + "grad_norm": 0.06370334327220917, + "learning_rate": 2.6162243553547907e-06, + "loss": 0.4486, + "step": 46180 + }, + { + "epoch": 8.69, + "grad_norm": 8.075560569763184, + "learning_rate": 2.6124600037643518e-06, + "loss": 0.4195, + "step": 46190 + }, + { + "epoch": 8.7, + "grad_norm": 27.81960678100586, + "learning_rate": 2.6086956521739132e-06, + "loss": 0.7454, + "step": 46200 + }, + { + "epoch": 8.7, + "grad_norm": 16.909664154052734, + "learning_rate": 2.6049313005834747e-06, + "loss": 0.5945, + "step": 46210 + }, + { + "epoch": 8.7, + "grad_norm": 1.0878101587295532, + "learning_rate": 2.601166948993036e-06, + "loss": 0.3299, + "step": 46220 + }, + { + "epoch": 8.7, + "grad_norm": 19.037450790405273, + "learning_rate": 2.597402597402597e-06, + "loss": 0.338, + "step": 46230 + }, + { + "epoch": 8.7, + "grad_norm": 14.639829635620117, + "learning_rate": 2.593638245812159e-06, + "loss": 0.425, + "step": 46240 + }, + { + "epoch": 8.71, + "grad_norm": 0.7264255881309509, + "learning_rate": 2.5898738942217205e-06, + "loss": 0.3609, + "step": 46250 + }, + { + "epoch": 8.71, + "grad_norm": 8.234590530395508, + "learning_rate": 2.586109542631282e-06, + "loss": 0.4549, + "step": 46260 + }, + { + "epoch": 8.71, + "grad_norm": 9.959664344787598, + "learning_rate": 2.582345191040844e-06, + "loss": 0.3512, + "step": 46270 + }, + { + "epoch": 8.71, + "grad_norm": 0.06254367530345917, + "learning_rate": 2.578580839450405e-06, + "loss": 0.2918, + "step": 46280 + }, + { + "epoch": 8.71, + "grad_norm": 2.8327713012695312, + "learning_rate": 2.5748164878599664e-06, + "loss": 0.5665, + "step": 46290 + }, + { + "epoch": 8.71, + "grad_norm": 6.886368751525879, + "learning_rate": 2.571052136269528e-06, + "loss": 0.4747, + "step": 46300 + }, + { + "epoch": 8.72, + "grad_norm": 10.857604026794434, + "learning_rate": 2.5672877846790893e-06, + "loss": 0.7911, + "step": 46310 + }, + { + "epoch": 8.72, + "grad_norm": 17.762399673461914, + "learning_rate": 2.5635234330886503e-06, + "loss": 0.2114, + "step": 46320 + }, + { + "epoch": 8.72, + "grad_norm": 9.51750659942627, + "learning_rate": 2.5597590814982122e-06, + "loss": 0.2595, + "step": 46330 + }, + { + "epoch": 8.72, + "grad_norm": 9.107878684997559, + "learning_rate": 2.5559947299077737e-06, + "loss": 0.514, + "step": 46340 + }, + { + "epoch": 8.72, + "grad_norm": 0.17868681252002716, + "learning_rate": 2.552230378317335e-06, + "loss": 0.3083, + "step": 46350 + }, + { + "epoch": 8.73, + "grad_norm": 0.14755873382091522, + "learning_rate": 2.548466026726896e-06, + "loss": 0.6239, + "step": 46360 + }, + { + "epoch": 8.73, + "grad_norm": 4.853813648223877, + "learning_rate": 2.5447016751364576e-06, + "loss": 0.4185, + "step": 46370 + }, + { + "epoch": 8.73, + "grad_norm": 20.324190139770508, + "learning_rate": 2.5409373235460195e-06, + "loss": 0.6337, + "step": 46380 + }, + { + "epoch": 8.73, + "grad_norm": 0.38305485248565674, + "learning_rate": 2.537172971955581e-06, + "loss": 0.3451, + "step": 46390 + }, + { + "epoch": 8.73, + "grad_norm": 15.589489936828613, + "learning_rate": 2.5334086203651425e-06, + "loss": 0.7042, + "step": 46400 + }, + { + "epoch": 8.74, + "grad_norm": 35.21918869018555, + "learning_rate": 2.5296442687747035e-06, + "loss": 0.6307, + "step": 46410 + }, + { + "epoch": 8.74, + "grad_norm": 0.9967852830886841, + "learning_rate": 2.5258799171842654e-06, + "loss": 0.457, + "step": 46420 + }, + { + "epoch": 8.74, + "grad_norm": 18.615461349487305, + "learning_rate": 2.522115565593827e-06, + "loss": 0.4258, + "step": 46430 + }, + { + "epoch": 8.74, + "grad_norm": 7.5972981452941895, + "learning_rate": 2.5183512140033883e-06, + "loss": 0.3654, + "step": 46440 + }, + { + "epoch": 8.74, + "grad_norm": 0.10862822085618973, + "learning_rate": 2.5145868624129493e-06, + "loss": 0.386, + "step": 46450 + }, + { + "epoch": 8.74, + "grad_norm": 13.601619720458984, + "learning_rate": 2.510822510822511e-06, + "loss": 0.4652, + "step": 46460 + }, + { + "epoch": 8.75, + "grad_norm": 11.241927146911621, + "learning_rate": 2.5070581592320727e-06, + "loss": 0.2974, + "step": 46470 + }, + { + "epoch": 8.75, + "grad_norm": 4.576162338256836, + "learning_rate": 2.503293807641634e-06, + "loss": 0.4565, + "step": 46480 + }, + { + "epoch": 8.75, + "grad_norm": 1.5839916467666626, + "learning_rate": 2.4995294560511956e-06, + "loss": 0.3777, + "step": 46490 + }, + { + "epoch": 8.75, + "grad_norm": 25.34798812866211, + "learning_rate": 2.4957651044607566e-06, + "loss": 0.4831, + "step": 46500 + }, + { + "epoch": 8.75, + "grad_norm": 0.46405714750289917, + "learning_rate": 2.492000752870318e-06, + "loss": 0.3859, + "step": 46510 + }, + { + "epoch": 8.76, + "grad_norm": 12.911664962768555, + "learning_rate": 2.48823640127988e-06, + "loss": 0.4369, + "step": 46520 + }, + { + "epoch": 8.76, + "grad_norm": 10.62582778930664, + "learning_rate": 2.484472049689441e-06, + "loss": 0.2246, + "step": 46530 + }, + { + "epoch": 8.76, + "grad_norm": 15.52188777923584, + "learning_rate": 2.480707698099003e-06, + "loss": 0.3334, + "step": 46540 + }, + { + "epoch": 8.76, + "grad_norm": 8.674419403076172, + "learning_rate": 2.476943346508564e-06, + "loss": 0.4668, + "step": 46550 + }, + { + "epoch": 8.76, + "grad_norm": 17.727657318115234, + "learning_rate": 2.473178994918126e-06, + "loss": 0.4731, + "step": 46560 + }, + { + "epoch": 8.77, + "grad_norm": 11.835549354553223, + "learning_rate": 2.469414643327687e-06, + "loss": 0.4562, + "step": 46570 + }, + { + "epoch": 8.77, + "grad_norm": 5.226273536682129, + "learning_rate": 2.4656502917372483e-06, + "loss": 0.4373, + "step": 46580 + }, + { + "epoch": 8.77, + "grad_norm": 16.807971954345703, + "learning_rate": 2.4618859401468098e-06, + "loss": 0.5285, + "step": 46590 + }, + { + "epoch": 8.77, + "grad_norm": 11.890029907226562, + "learning_rate": 2.4581215885563712e-06, + "loss": 0.3881, + "step": 46600 + }, + { + "epoch": 8.77, + "grad_norm": 7.499889373779297, + "learning_rate": 2.4543572369659327e-06, + "loss": 0.3529, + "step": 46610 + }, + { + "epoch": 8.77, + "grad_norm": 2.0104305744171143, + "learning_rate": 2.450592885375494e-06, + "loss": 0.5556, + "step": 46620 + }, + { + "epoch": 8.78, + "grad_norm": 1.1118526458740234, + "learning_rate": 2.446828533785056e-06, + "loss": 0.3103, + "step": 46630 + }, + { + "epoch": 8.78, + "grad_norm": 10.730484008789062, + "learning_rate": 2.443064182194617e-06, + "loss": 0.4211, + "step": 46640 + }, + { + "epoch": 8.78, + "grad_norm": 0.4229322075843811, + "learning_rate": 2.4392998306041785e-06, + "loss": 0.7197, + "step": 46650 + }, + { + "epoch": 8.78, + "grad_norm": 7.243539810180664, + "learning_rate": 2.43553547901374e-06, + "loss": 0.2981, + "step": 46660 + }, + { + "epoch": 8.78, + "grad_norm": 0.10506740212440491, + "learning_rate": 2.4317711274233015e-06, + "loss": 0.4107, + "step": 46670 + }, + { + "epoch": 8.79, + "grad_norm": 0.9739378690719604, + "learning_rate": 2.428006775832863e-06, + "loss": 0.4228, + "step": 46680 + }, + { + "epoch": 8.79, + "grad_norm": 21.565500259399414, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.554, + "step": 46690 + }, + { + "epoch": 8.79, + "grad_norm": 8.53973388671875, + "learning_rate": 2.420478072651986e-06, + "loss": 0.7312, + "step": 46700 + }, + { + "epoch": 8.79, + "grad_norm": 15.285102844238281, + "learning_rate": 2.4167137210615473e-06, + "loss": 0.4223, + "step": 46710 + }, + { + "epoch": 8.79, + "grad_norm": 27.030920028686523, + "learning_rate": 2.4129493694711088e-06, + "loss": 0.3982, + "step": 46720 + }, + { + "epoch": 8.8, + "grad_norm": 1.0703567266464233, + "learning_rate": 2.4091850178806702e-06, + "loss": 0.5266, + "step": 46730 + }, + { + "epoch": 8.8, + "grad_norm": 1.5598318576812744, + "learning_rate": 2.4054206662902317e-06, + "loss": 0.4147, + "step": 46740 + }, + { + "epoch": 8.8, + "grad_norm": 5.893131732940674, + "learning_rate": 2.401656314699793e-06, + "loss": 0.9074, + "step": 46750 + }, + { + "epoch": 8.8, + "grad_norm": 0.11686406284570694, + "learning_rate": 2.3978919631093546e-06, + "loss": 0.3176, + "step": 46760 + }, + { + "epoch": 8.8, + "grad_norm": 17.671430587768555, + "learning_rate": 2.394127611518916e-06, + "loss": 0.6666, + "step": 46770 + }, + { + "epoch": 8.8, + "grad_norm": 6.053145408630371, + "learning_rate": 2.3903632599284775e-06, + "loss": 0.453, + "step": 46780 + }, + { + "epoch": 8.81, + "grad_norm": 17.766340255737305, + "learning_rate": 2.386598908338039e-06, + "loss": 0.292, + "step": 46790 + }, + { + "epoch": 8.81, + "grad_norm": 7.656116485595703, + "learning_rate": 2.3828345567476005e-06, + "loss": 0.4534, + "step": 46800 + }, + { + "epoch": 8.81, + "grad_norm": 9.28414249420166, + "learning_rate": 2.379070205157162e-06, + "loss": 0.2725, + "step": 46810 + }, + { + "epoch": 8.81, + "grad_norm": 0.04184451699256897, + "learning_rate": 2.3753058535667234e-06, + "loss": 0.4548, + "step": 46820 + }, + { + "epoch": 8.81, + "grad_norm": 51.41383743286133, + "learning_rate": 2.371541501976285e-06, + "loss": 0.6366, + "step": 46830 + }, + { + "epoch": 8.82, + "grad_norm": 0.9687474370002747, + "learning_rate": 2.3677771503858463e-06, + "loss": 0.1807, + "step": 46840 + }, + { + "epoch": 8.82, + "grad_norm": 10.670173645019531, + "learning_rate": 2.3640127987954078e-06, + "loss": 0.2247, + "step": 46850 + }, + { + "epoch": 8.82, + "grad_norm": 25.308748245239258, + "learning_rate": 2.3602484472049692e-06, + "loss": 0.4082, + "step": 46860 + }, + { + "epoch": 8.82, + "grad_norm": 6.221033573150635, + "learning_rate": 2.3564840956145303e-06, + "loss": 0.4089, + "step": 46870 + }, + { + "epoch": 8.82, + "grad_norm": 6.021878719329834, + "learning_rate": 2.352719744024092e-06, + "loss": 0.5282, + "step": 46880 + }, + { + "epoch": 8.83, + "grad_norm": 4.055378437042236, + "learning_rate": 2.3489553924336536e-06, + "loss": 0.3669, + "step": 46890 + }, + { + "epoch": 8.83, + "grad_norm": 2.8399837017059326, + "learning_rate": 2.345191040843215e-06, + "loss": 0.4622, + "step": 46900 + }, + { + "epoch": 8.83, + "grad_norm": 13.155150413513184, + "learning_rate": 2.3414266892527765e-06, + "loss": 0.3145, + "step": 46910 + }, + { + "epoch": 8.83, + "grad_norm": 6.751850605010986, + "learning_rate": 2.337662337662338e-06, + "loss": 0.6429, + "step": 46920 + }, + { + "epoch": 8.83, + "grad_norm": 25.934751510620117, + "learning_rate": 2.3338979860718994e-06, + "loss": 0.3962, + "step": 46930 + }, + { + "epoch": 8.83, + "grad_norm": 0.046511393040418625, + "learning_rate": 2.3301336344814605e-06, + "loss": 0.6207, + "step": 46940 + }, + { + "epoch": 8.84, + "grad_norm": 26.9798641204834, + "learning_rate": 2.3263692828910224e-06, + "loss": 0.4355, + "step": 46950 + }, + { + "epoch": 8.84, + "grad_norm": 9.000409126281738, + "learning_rate": 2.3226049313005834e-06, + "loss": 0.5159, + "step": 46960 + }, + { + "epoch": 8.84, + "grad_norm": 26.408952713012695, + "learning_rate": 2.3188405797101453e-06, + "loss": 0.3787, + "step": 46970 + }, + { + "epoch": 8.84, + "grad_norm": 53.70569610595703, + "learning_rate": 2.3150762281197063e-06, + "loss": 0.5017, + "step": 46980 + }, + { + "epoch": 8.84, + "grad_norm": 24.432947158813477, + "learning_rate": 2.311311876529268e-06, + "loss": 0.2867, + "step": 46990 + }, + { + "epoch": 8.85, + "grad_norm": 0.2684187591075897, + "learning_rate": 2.3075475249388297e-06, + "loss": 0.3527, + "step": 47000 + }, + { + "epoch": 8.85, + "grad_norm": 7.198283672332764, + "learning_rate": 2.3037831733483907e-06, + "loss": 0.4833, + "step": 47010 + }, + { + "epoch": 8.85, + "grad_norm": 15.57585334777832, + "learning_rate": 2.3000188217579526e-06, + "loss": 0.3947, + "step": 47020 + }, + { + "epoch": 8.85, + "grad_norm": 34.16131591796875, + "learning_rate": 2.2962544701675136e-06, + "loss": 0.4851, + "step": 47030 + }, + { + "epoch": 8.85, + "grad_norm": 1.565393090248108, + "learning_rate": 2.2924901185770755e-06, + "loss": 0.5857, + "step": 47040 + }, + { + "epoch": 8.86, + "grad_norm": 43.33597946166992, + "learning_rate": 2.2887257669866366e-06, + "loss": 0.6679, + "step": 47050 + }, + { + "epoch": 8.86, + "grad_norm": 11.061119079589844, + "learning_rate": 2.2849614153961984e-06, + "loss": 0.5468, + "step": 47060 + }, + { + "epoch": 8.86, + "grad_norm": 0.20168103277683258, + "learning_rate": 2.2811970638057595e-06, + "loss": 0.5677, + "step": 47070 + }, + { + "epoch": 8.86, + "grad_norm": 5.765427112579346, + "learning_rate": 2.277432712215321e-06, + "loss": 0.6151, + "step": 47080 + }, + { + "epoch": 8.86, + "grad_norm": 14.285442352294922, + "learning_rate": 2.2736683606248824e-06, + "loss": 0.2915, + "step": 47090 + }, + { + "epoch": 8.87, + "grad_norm": 11.015250205993652, + "learning_rate": 2.269904009034444e-06, + "loss": 0.4167, + "step": 47100 + }, + { + "epoch": 8.87, + "grad_norm": 42.790672302246094, + "learning_rate": 2.2661396574440053e-06, + "loss": 0.632, + "step": 47110 + }, + { + "epoch": 8.87, + "grad_norm": 23.58126449584961, + "learning_rate": 2.2623753058535668e-06, + "loss": 0.5522, + "step": 47120 + }, + { + "epoch": 8.87, + "grad_norm": 0.10688672214746475, + "learning_rate": 2.2586109542631287e-06, + "loss": 0.5342, + "step": 47130 + }, + { + "epoch": 8.87, + "grad_norm": 22.022897720336914, + "learning_rate": 2.2548466026726897e-06, + "loss": 0.5221, + "step": 47140 + }, + { + "epoch": 8.87, + "grad_norm": 3.2762904167175293, + "learning_rate": 2.251082251082251e-06, + "loss": 0.508, + "step": 47150 + }, + { + "epoch": 8.88, + "grad_norm": 8.347319602966309, + "learning_rate": 2.2473178994918126e-06, + "loss": 0.3269, + "step": 47160 + }, + { + "epoch": 8.88, + "grad_norm": 0.2670442461967468, + "learning_rate": 2.243553547901374e-06, + "loss": 0.4172, + "step": 47170 + }, + { + "epoch": 8.88, + "grad_norm": 22.307165145874023, + "learning_rate": 2.2397891963109355e-06, + "loss": 0.6191, + "step": 47180 + }, + { + "epoch": 8.88, + "grad_norm": 12.482026100158691, + "learning_rate": 2.236024844720497e-06, + "loss": 0.4978, + "step": 47190 + }, + { + "epoch": 8.88, + "grad_norm": 12.081169128417969, + "learning_rate": 2.2322604931300585e-06, + "loss": 0.4355, + "step": 47200 + }, + { + "epoch": 8.89, + "grad_norm": 16.33040428161621, + "learning_rate": 2.22849614153962e-06, + "loss": 0.5954, + "step": 47210 + }, + { + "epoch": 8.89, + "grad_norm": 15.850767135620117, + "learning_rate": 2.2247317899491814e-06, + "loss": 0.4529, + "step": 47220 + }, + { + "epoch": 8.89, + "grad_norm": 39.77294158935547, + "learning_rate": 2.220967438358743e-06, + "loss": 0.2148, + "step": 47230 + }, + { + "epoch": 8.89, + "grad_norm": 18.885601043701172, + "learning_rate": 2.2172030867683043e-06, + "loss": 0.4973, + "step": 47240 + }, + { + "epoch": 8.89, + "grad_norm": 16.801286697387695, + "learning_rate": 2.2134387351778658e-06, + "loss": 0.4566, + "step": 47250 + }, + { + "epoch": 8.9, + "grad_norm": 18.179574966430664, + "learning_rate": 2.2096743835874272e-06, + "loss": 0.7612, + "step": 47260 + }, + { + "epoch": 8.9, + "grad_norm": 3.8040480613708496, + "learning_rate": 2.2059100319969887e-06, + "loss": 0.4395, + "step": 47270 + }, + { + "epoch": 8.9, + "grad_norm": 26.792110443115234, + "learning_rate": 2.20214568040655e-06, + "loss": 0.6617, + "step": 47280 + }, + { + "epoch": 8.9, + "grad_norm": 10.858789443969727, + "learning_rate": 2.1983813288161116e-06, + "loss": 0.4822, + "step": 47290 + }, + { + "epoch": 8.9, + "grad_norm": 0.04407741501927376, + "learning_rate": 2.194616977225673e-06, + "loss": 0.6258, + "step": 47300 + }, + { + "epoch": 8.9, + "grad_norm": 23.669815063476562, + "learning_rate": 2.1908526256352345e-06, + "loss": 0.4182, + "step": 47310 + }, + { + "epoch": 8.91, + "grad_norm": 15.9193115234375, + "learning_rate": 2.187088274044796e-06, + "loss": 0.3994, + "step": 47320 + }, + { + "epoch": 8.91, + "grad_norm": 35.726600646972656, + "learning_rate": 2.1833239224543575e-06, + "loss": 0.2649, + "step": 47330 + }, + { + "epoch": 8.91, + "grad_norm": 23.043758392333984, + "learning_rate": 2.179559570863919e-06, + "loss": 0.3575, + "step": 47340 + }, + { + "epoch": 8.91, + "grad_norm": 1.2536325454711914, + "learning_rate": 2.1757952192734804e-06, + "loss": 0.3955, + "step": 47350 + }, + { + "epoch": 8.91, + "grad_norm": 19.012310028076172, + "learning_rate": 2.172030867683042e-06, + "loss": 0.5392, + "step": 47360 + }, + { + "epoch": 8.92, + "grad_norm": 7.179901599884033, + "learning_rate": 2.1682665160926033e-06, + "loss": 0.5012, + "step": 47370 + }, + { + "epoch": 8.92, + "grad_norm": 15.418367385864258, + "learning_rate": 2.1645021645021648e-06, + "loss": 0.5293, + "step": 47380 + }, + { + "epoch": 8.92, + "grad_norm": 25.173019409179688, + "learning_rate": 2.1607378129117262e-06, + "loss": 0.4274, + "step": 47390 + }, + { + "epoch": 8.92, + "grad_norm": 1.2999225854873657, + "learning_rate": 2.1569734613212877e-06, + "loss": 0.3006, + "step": 47400 + }, + { + "epoch": 8.92, + "grad_norm": 0.06721743196249008, + "learning_rate": 2.153209109730849e-06, + "loss": 0.5072, + "step": 47410 + }, + { + "epoch": 8.93, + "grad_norm": 43.51065444946289, + "learning_rate": 2.1494447581404106e-06, + "loss": 0.5687, + "step": 47420 + }, + { + "epoch": 8.93, + "grad_norm": 1.2883092164993286, + "learning_rate": 2.145680406549972e-06, + "loss": 0.2178, + "step": 47430 + }, + { + "epoch": 8.93, + "grad_norm": 3.8370001316070557, + "learning_rate": 2.1419160549595335e-06, + "loss": 0.4656, + "step": 47440 + }, + { + "epoch": 8.93, + "grad_norm": 1.2734487056732178, + "learning_rate": 2.138151703369095e-06, + "loss": 0.3782, + "step": 47450 + }, + { + "epoch": 8.93, + "grad_norm": 7.262458324432373, + "learning_rate": 2.134387351778656e-06, + "loss": 0.4573, + "step": 47460 + }, + { + "epoch": 8.93, + "grad_norm": 33.35980987548828, + "learning_rate": 2.130623000188218e-06, + "loss": 0.3935, + "step": 47470 + }, + { + "epoch": 8.94, + "grad_norm": 5.109274387359619, + "learning_rate": 2.126858648597779e-06, + "loss": 0.4141, + "step": 47480 + }, + { + "epoch": 8.94, + "grad_norm": 5.794544219970703, + "learning_rate": 2.123094297007341e-06, + "loss": 0.4369, + "step": 47490 + }, + { + "epoch": 8.94, + "grad_norm": 1.0066646337509155, + "learning_rate": 2.1193299454169023e-06, + "loss": 0.6685, + "step": 47500 + }, + { + "epoch": 8.94, + "grad_norm": 11.47095012664795, + "learning_rate": 2.1155655938264637e-06, + "loss": 0.3829, + "step": 47510 + }, + { + "epoch": 8.94, + "grad_norm": 14.297951698303223, + "learning_rate": 2.111801242236025e-06, + "loss": 0.4893, + "step": 47520 + }, + { + "epoch": 8.95, + "grad_norm": 11.840413093566895, + "learning_rate": 2.1080368906455862e-06, + "loss": 0.7612, + "step": 47530 + }, + { + "epoch": 8.95, + "grad_norm": 38.1298713684082, + "learning_rate": 2.104272539055148e-06, + "loss": 0.6616, + "step": 47540 + }, + { + "epoch": 8.95, + "grad_norm": 25.78653907775879, + "learning_rate": 2.100508187464709e-06, + "loss": 0.2917, + "step": 47550 + }, + { + "epoch": 8.95, + "grad_norm": 19.186622619628906, + "learning_rate": 2.096743835874271e-06, + "loss": 0.5563, + "step": 47560 + }, + { + "epoch": 8.95, + "grad_norm": 6.531424522399902, + "learning_rate": 2.092979484283832e-06, + "loss": 0.3453, + "step": 47570 + }, + { + "epoch": 8.96, + "grad_norm": 0.02262808382511139, + "learning_rate": 2.089215132693394e-06, + "loss": 0.4988, + "step": 47580 + }, + { + "epoch": 8.96, + "grad_norm": 18.038076400756836, + "learning_rate": 2.085450781102955e-06, + "loss": 0.5999, + "step": 47590 + }, + { + "epoch": 8.96, + "grad_norm": 12.460607528686523, + "learning_rate": 2.0816864295125165e-06, + "loss": 0.7356, + "step": 47600 + }, + { + "epoch": 8.96, + "grad_norm": 45.871124267578125, + "learning_rate": 2.0779220779220784e-06, + "loss": 0.2332, + "step": 47610 + }, + { + "epoch": 8.96, + "grad_norm": 6.987379550933838, + "learning_rate": 2.0741577263316394e-06, + "loss": 0.4788, + "step": 47620 + }, + { + "epoch": 8.96, + "grad_norm": 27.808523178100586, + "learning_rate": 2.0703933747412013e-06, + "loss": 0.6439, + "step": 47630 + }, + { + "epoch": 8.97, + "grad_norm": 15.022832870483398, + "learning_rate": 2.0666290231507623e-06, + "loss": 0.4583, + "step": 47640 + }, + { + "epoch": 8.97, + "grad_norm": 1.9445959329605103, + "learning_rate": 2.062864671560324e-06, + "loss": 0.6531, + "step": 47650 + }, + { + "epoch": 8.97, + "grad_norm": 19.67501449584961, + "learning_rate": 2.0591003199698852e-06, + "loss": 0.5846, + "step": 47660 + }, + { + "epoch": 8.97, + "grad_norm": 4.834009647369385, + "learning_rate": 2.0553359683794467e-06, + "loss": 0.3116, + "step": 47670 + }, + { + "epoch": 8.97, + "grad_norm": 41.817848205566406, + "learning_rate": 2.051571616789008e-06, + "loss": 0.5561, + "step": 47680 + }, + { + "epoch": 8.98, + "grad_norm": 10.177002906799316, + "learning_rate": 2.0478072651985696e-06, + "loss": 0.4742, + "step": 47690 + }, + { + "epoch": 8.98, + "grad_norm": 9.804389953613281, + "learning_rate": 2.044042913608131e-06, + "loss": 0.5149, + "step": 47700 + }, + { + "epoch": 8.98, + "grad_norm": 0.5583279132843018, + "learning_rate": 2.0402785620176925e-06, + "loss": 0.1706, + "step": 47710 + }, + { + "epoch": 8.98, + "grad_norm": 17.16972541809082, + "learning_rate": 2.036514210427254e-06, + "loss": 0.5996, + "step": 47720 + }, + { + "epoch": 8.98, + "grad_norm": 12.238551139831543, + "learning_rate": 2.0327498588368155e-06, + "loss": 0.3095, + "step": 47730 + }, + { + "epoch": 8.99, + "grad_norm": 23.57866859436035, + "learning_rate": 2.028985507246377e-06, + "loss": 0.332, + "step": 47740 + }, + { + "epoch": 8.99, + "grad_norm": 40.92672348022461, + "learning_rate": 2.0252211556559384e-06, + "loss": 0.6045, + "step": 47750 + }, + { + "epoch": 8.99, + "grad_norm": 0.26052126288414, + "learning_rate": 2.0214568040655e-06, + "loss": 0.5199, + "step": 47760 + }, + { + "epoch": 8.99, + "grad_norm": 14.104440689086914, + "learning_rate": 2.0176924524750613e-06, + "loss": 0.3415, + "step": 47770 + }, + { + "epoch": 8.99, + "grad_norm": 5.497945785522461, + "learning_rate": 2.0139281008846228e-06, + "loss": 0.3376, + "step": 47780 + }, + { + "epoch": 8.99, + "grad_norm": 16.819019317626953, + "learning_rate": 2.0101637492941842e-06, + "loss": 0.2294, + "step": 47790 + }, + { + "epoch": 9.0, + "grad_norm": 26.409992218017578, + "learning_rate": 2.0063993977037457e-06, + "loss": 0.4863, + "step": 47800 + }, + { + "epoch": 9.0, + "grad_norm": 37.679100036621094, + "learning_rate": 2.002635046113307e-06, + "loss": 0.3794, + "step": 47810 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.926, + "eval_loss": 0.29454880952835083, + "eval_runtime": 51.0107, + "eval_samples_per_second": 147.028, + "eval_steps_per_second": 18.388, + "step": 47817 + }, + { + "epoch": 9.0, + "grad_norm": 0.07574218511581421, + "learning_rate": 1.9988706945228686e-06, + "loss": 0.384, + "step": 47820 + }, + { + "epoch": 9.0, + "grad_norm": 0.08334751427173615, + "learning_rate": 1.99510634293243e-06, + "loss": 0.3326, + "step": 47830 + }, + { + "epoch": 9.0, + "grad_norm": 22.99428367614746, + "learning_rate": 1.9913419913419915e-06, + "loss": 0.3852, + "step": 47840 + }, + { + "epoch": 9.01, + "grad_norm": 2.2758443355560303, + "learning_rate": 1.987577639751553e-06, + "loss": 0.3487, + "step": 47850 + }, + { + "epoch": 9.01, + "grad_norm": 16.36058807373047, + "learning_rate": 1.9838132881611144e-06, + "loss": 0.732, + "step": 47860 + }, + { + "epoch": 9.01, + "grad_norm": 18.320423126220703, + "learning_rate": 1.980048936570676e-06, + "loss": 0.5723, + "step": 47870 + }, + { + "epoch": 9.01, + "grad_norm": 0.30943912267684937, + "learning_rate": 1.9762845849802374e-06, + "loss": 0.5259, + "step": 47880 + }, + { + "epoch": 9.01, + "grad_norm": 10.592155456542969, + "learning_rate": 1.972520233389799e-06, + "loss": 0.357, + "step": 47890 + }, + { + "epoch": 9.02, + "grad_norm": 20.643972396850586, + "learning_rate": 1.9687558817993603e-06, + "loss": 0.4431, + "step": 47900 + }, + { + "epoch": 9.02, + "grad_norm": 3.8286397457122803, + "learning_rate": 1.9649915302089217e-06, + "loss": 0.4602, + "step": 47910 + }, + { + "epoch": 9.02, + "grad_norm": 13.922072410583496, + "learning_rate": 1.961227178618483e-06, + "loss": 0.4626, + "step": 47920 + }, + { + "epoch": 9.02, + "grad_norm": 14.62498950958252, + "learning_rate": 1.9574628270280447e-06, + "loss": 0.1669, + "step": 47930 + }, + { + "epoch": 9.02, + "grad_norm": 13.237974166870117, + "learning_rate": 1.953698475437606e-06, + "loss": 0.3758, + "step": 47940 + }, + { + "epoch": 9.03, + "grad_norm": 47.09726333618164, + "learning_rate": 1.9499341238471676e-06, + "loss": 0.6936, + "step": 47950 + }, + { + "epoch": 9.03, + "grad_norm": 7.44305419921875, + "learning_rate": 1.9461697722567286e-06, + "loss": 0.5368, + "step": 47960 + }, + { + "epoch": 9.03, + "grad_norm": 10.614571571350098, + "learning_rate": 1.9424054206662905e-06, + "loss": 0.3991, + "step": 47970 + }, + { + "epoch": 9.03, + "grad_norm": 14.058085441589355, + "learning_rate": 1.938641069075852e-06, + "loss": 0.2912, + "step": 47980 + }, + { + "epoch": 9.03, + "grad_norm": 10.969157218933105, + "learning_rate": 1.9348767174854134e-06, + "loss": 0.4087, + "step": 47990 + }, + { + "epoch": 9.03, + "grad_norm": 10.787178039550781, + "learning_rate": 1.931112365894975e-06, + "loss": 0.4769, + "step": 48000 + }, + { + "epoch": 9.04, + "grad_norm": 15.043909072875977, + "learning_rate": 1.9273480143045364e-06, + "loss": 0.1376, + "step": 48010 + }, + { + "epoch": 9.04, + "grad_norm": 1.5361169576644897, + "learning_rate": 1.923583662714098e-06, + "loss": 0.4009, + "step": 48020 + }, + { + "epoch": 9.04, + "grad_norm": 9.03320598602295, + "learning_rate": 1.919819311123659e-06, + "loss": 0.6094, + "step": 48030 + }, + { + "epoch": 9.04, + "grad_norm": 0.36295774579048157, + "learning_rate": 1.9160549595332207e-06, + "loss": 0.282, + "step": 48040 + }, + { + "epoch": 9.04, + "grad_norm": 5.759310722351074, + "learning_rate": 1.9122906079427818e-06, + "loss": 0.2708, + "step": 48050 + }, + { + "epoch": 9.05, + "grad_norm": 0.02767598256468773, + "learning_rate": 1.9085262563523437e-06, + "loss": 0.5305, + "step": 48060 + }, + { + "epoch": 9.05, + "grad_norm": 1.4058023691177368, + "learning_rate": 1.904761904761905e-06, + "loss": 0.4482, + "step": 48070 + }, + { + "epoch": 9.05, + "grad_norm": 0.3036022186279297, + "learning_rate": 1.9009975531714664e-06, + "loss": 0.5453, + "step": 48080 + }, + { + "epoch": 9.05, + "grad_norm": 4.839325904846191, + "learning_rate": 1.8972332015810276e-06, + "loss": 0.3741, + "step": 48090 + }, + { + "epoch": 9.05, + "grad_norm": 14.845388412475586, + "learning_rate": 1.8934688499905893e-06, + "loss": 0.5632, + "step": 48100 + }, + { + "epoch": 9.06, + "grad_norm": 16.649734497070312, + "learning_rate": 1.8897044984001508e-06, + "loss": 0.1853, + "step": 48110 + }, + { + "epoch": 9.06, + "grad_norm": 25.03996467590332, + "learning_rate": 1.8859401468097122e-06, + "loss": 0.3492, + "step": 48120 + }, + { + "epoch": 9.06, + "grad_norm": 30.69152069091797, + "learning_rate": 1.8821757952192737e-06, + "loss": 0.4833, + "step": 48130 + }, + { + "epoch": 9.06, + "grad_norm": 13.534026145935059, + "learning_rate": 1.8784114436288351e-06, + "loss": 0.3608, + "step": 48140 + }, + { + "epoch": 9.06, + "grad_norm": 23.936994552612305, + "learning_rate": 1.8746470920383966e-06, + "loss": 0.4873, + "step": 48150 + }, + { + "epoch": 9.06, + "grad_norm": 41.1656608581543, + "learning_rate": 1.8708827404479578e-06, + "loss": 0.683, + "step": 48160 + }, + { + "epoch": 9.07, + "grad_norm": 12.103029251098633, + "learning_rate": 1.8671183888575195e-06, + "loss": 0.3607, + "step": 48170 + }, + { + "epoch": 9.07, + "grad_norm": 0.34992146492004395, + "learning_rate": 1.8633540372670808e-06, + "loss": 0.3565, + "step": 48180 + }, + { + "epoch": 9.07, + "grad_norm": 5.494448661804199, + "learning_rate": 1.8595896856766424e-06, + "loss": 0.3514, + "step": 48190 + }, + { + "epoch": 9.07, + "grad_norm": 19.909494400024414, + "learning_rate": 1.8558253340862037e-06, + "loss": 0.3663, + "step": 48200 + }, + { + "epoch": 9.07, + "grad_norm": 23.300514221191406, + "learning_rate": 1.8520609824957654e-06, + "loss": 0.6781, + "step": 48210 + }, + { + "epoch": 9.08, + "grad_norm": 10.3643798828125, + "learning_rate": 1.8482966309053266e-06, + "loss": 0.4268, + "step": 48220 + }, + { + "epoch": 9.08, + "grad_norm": 23.494760513305664, + "learning_rate": 1.844532279314888e-06, + "loss": 0.4987, + "step": 48230 + }, + { + "epoch": 9.08, + "grad_norm": 3.709768772125244, + "learning_rate": 1.8407679277244497e-06, + "loss": 0.3542, + "step": 48240 + }, + { + "epoch": 9.08, + "grad_norm": 0.16881948709487915, + "learning_rate": 1.837003576134011e-06, + "loss": 0.5134, + "step": 48250 + }, + { + "epoch": 9.08, + "grad_norm": 9.188733100891113, + "learning_rate": 1.8332392245435727e-06, + "loss": 0.6763, + "step": 48260 + }, + { + "epoch": 9.09, + "grad_norm": 12.953929901123047, + "learning_rate": 1.829474872953134e-06, + "loss": 0.3074, + "step": 48270 + }, + { + "epoch": 9.09, + "grad_norm": 6.657861232757568, + "learning_rate": 1.8257105213626956e-06, + "loss": 0.3391, + "step": 48280 + }, + { + "epoch": 9.09, + "grad_norm": 22.9435977935791, + "learning_rate": 1.8219461697722568e-06, + "loss": 0.7275, + "step": 48290 + }, + { + "epoch": 9.09, + "grad_norm": 25.13018035888672, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.1599, + "step": 48300 + }, + { + "epoch": 9.09, + "grad_norm": 16.501449584960938, + "learning_rate": 1.8144174665913798e-06, + "loss": 0.577, + "step": 48310 + }, + { + "epoch": 9.09, + "grad_norm": 39.59345626831055, + "learning_rate": 1.8106531150009412e-06, + "loss": 0.7677, + "step": 48320 + }, + { + "epoch": 9.1, + "grad_norm": 14.423561096191406, + "learning_rate": 1.8068887634105025e-06, + "loss": 0.5273, + "step": 48330 + }, + { + "epoch": 9.1, + "grad_norm": 24.364534378051758, + "learning_rate": 1.8031244118200641e-06, + "loss": 0.2518, + "step": 48340 + }, + { + "epoch": 9.1, + "grad_norm": 12.524968147277832, + "learning_rate": 1.7993600602296258e-06, + "loss": 0.6041, + "step": 48350 + }, + { + "epoch": 9.1, + "grad_norm": 23.22210693359375, + "learning_rate": 1.795595708639187e-06, + "loss": 0.6273, + "step": 48360 + }, + { + "epoch": 9.1, + "grad_norm": 9.35206413269043, + "learning_rate": 1.7918313570487485e-06, + "loss": 0.363, + "step": 48370 + }, + { + "epoch": 9.11, + "grad_norm": 11.951454162597656, + "learning_rate": 1.78806700545831e-06, + "loss": 0.2939, + "step": 48380 + }, + { + "epoch": 9.11, + "grad_norm": 8.72339153289795, + "learning_rate": 1.7843026538678714e-06, + "loss": 0.61, + "step": 48390 + }, + { + "epoch": 9.11, + "grad_norm": 36.56248092651367, + "learning_rate": 1.780538302277433e-06, + "loss": 0.5281, + "step": 48400 + }, + { + "epoch": 9.11, + "grad_norm": 13.832367897033691, + "learning_rate": 1.7767739506869944e-06, + "loss": 0.2781, + "step": 48410 + }, + { + "epoch": 9.11, + "grad_norm": 24.075450897216797, + "learning_rate": 1.7730095990965556e-06, + "loss": 0.6961, + "step": 48420 + }, + { + "epoch": 9.12, + "grad_norm": 2.9399142265319824, + "learning_rate": 1.7692452475061173e-06, + "loss": 0.2952, + "step": 48430 + }, + { + "epoch": 9.12, + "grad_norm": 16.432899475097656, + "learning_rate": 1.7654808959156785e-06, + "loss": 0.4107, + "step": 48440 + }, + { + "epoch": 9.12, + "grad_norm": 21.72768211364746, + "learning_rate": 1.7617165443252402e-06, + "loss": 0.7116, + "step": 48450 + }, + { + "epoch": 9.12, + "grad_norm": 20.015758514404297, + "learning_rate": 1.7579521927348015e-06, + "loss": 0.8156, + "step": 48460 + }, + { + "epoch": 9.12, + "grad_norm": 7.275087833404541, + "learning_rate": 1.7541878411443631e-06, + "loss": 0.7025, + "step": 48470 + }, + { + "epoch": 9.12, + "grad_norm": 59.83738708496094, + "learning_rate": 1.7504234895539246e-06, + "loss": 0.363, + "step": 48480 + }, + { + "epoch": 9.13, + "grad_norm": 19.102800369262695, + "learning_rate": 1.7466591379634858e-06, + "loss": 0.6994, + "step": 48490 + }, + { + "epoch": 9.13, + "grad_norm": 37.69584274291992, + "learning_rate": 1.7428947863730475e-06, + "loss": 0.6699, + "step": 48500 + }, + { + "epoch": 9.13, + "grad_norm": 7.24312686920166, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.9191, + "step": 48510 + }, + { + "epoch": 9.13, + "grad_norm": 28.769311904907227, + "learning_rate": 1.7353660831921704e-06, + "loss": 0.3994, + "step": 48520 + }, + { + "epoch": 9.13, + "grad_norm": 0.12715600430965424, + "learning_rate": 1.7316017316017317e-06, + "loss": 0.3204, + "step": 48530 + }, + { + "epoch": 9.14, + "grad_norm": 29.197851181030273, + "learning_rate": 1.7278373800112933e-06, + "loss": 0.6342, + "step": 48540 + }, + { + "epoch": 9.14, + "grad_norm": 4.771313667297363, + "learning_rate": 1.7240730284208546e-06, + "loss": 0.4141, + "step": 48550 + }, + { + "epoch": 9.14, + "grad_norm": 10.536064147949219, + "learning_rate": 1.720308676830416e-06, + "loss": 0.407, + "step": 48560 + }, + { + "epoch": 9.14, + "grad_norm": 19.451534271240234, + "learning_rate": 1.7165443252399775e-06, + "loss": 0.4267, + "step": 48570 + }, + { + "epoch": 9.14, + "grad_norm": 15.74153995513916, + "learning_rate": 1.712779973649539e-06, + "loss": 0.3457, + "step": 48580 + }, + { + "epoch": 9.15, + "grad_norm": 8.152856826782227, + "learning_rate": 1.7090156220591002e-06, + "loss": 0.2276, + "step": 48590 + }, + { + "epoch": 9.15, + "grad_norm": 6.447235584259033, + "learning_rate": 1.705251270468662e-06, + "loss": 0.2864, + "step": 48600 + }, + { + "epoch": 9.15, + "grad_norm": 15.62842845916748, + "learning_rate": 1.7014869188782236e-06, + "loss": 0.5409, + "step": 48610 + }, + { + "epoch": 9.15, + "grad_norm": 7.044529438018799, + "learning_rate": 1.6977225672877848e-06, + "loss": 0.6466, + "step": 48620 + }, + { + "epoch": 9.15, + "grad_norm": 17.935876846313477, + "learning_rate": 1.6939582156973463e-06, + "loss": 0.4593, + "step": 48630 + }, + { + "epoch": 9.15, + "grad_norm": 18.33387565612793, + "learning_rate": 1.6901938641069077e-06, + "loss": 0.5601, + "step": 48640 + }, + { + "epoch": 9.16, + "grad_norm": 11.096664428710938, + "learning_rate": 1.6864295125164692e-06, + "loss": 0.7234, + "step": 48650 + }, + { + "epoch": 9.16, + "grad_norm": 31.238327026367188, + "learning_rate": 1.6826651609260305e-06, + "loss": 0.496, + "step": 48660 + }, + { + "epoch": 9.16, + "grad_norm": 16.450227737426758, + "learning_rate": 1.6789008093355921e-06, + "loss": 0.4201, + "step": 48670 + }, + { + "epoch": 9.16, + "grad_norm": 7.500368118286133, + "learning_rate": 1.6751364577451534e-06, + "loss": 0.3319, + "step": 48680 + }, + { + "epoch": 9.16, + "grad_norm": 3.6829004287719727, + "learning_rate": 1.671372106154715e-06, + "loss": 0.5277, + "step": 48690 + }, + { + "epoch": 9.17, + "grad_norm": 5.226105213165283, + "learning_rate": 1.6676077545642763e-06, + "loss": 0.4659, + "step": 48700 + }, + { + "epoch": 9.17, + "grad_norm": 17.099241256713867, + "learning_rate": 1.663843402973838e-06, + "loss": 0.1683, + "step": 48710 + }, + { + "epoch": 9.17, + "grad_norm": 19.311145782470703, + "learning_rate": 1.6600790513833994e-06, + "loss": 0.3761, + "step": 48720 + }, + { + "epoch": 9.17, + "grad_norm": 11.647774696350098, + "learning_rate": 1.6563146997929607e-06, + "loss": 0.4523, + "step": 48730 + }, + { + "epoch": 9.17, + "grad_norm": 18.899429321289062, + "learning_rate": 1.6525503482025224e-06, + "loss": 0.4971, + "step": 48740 + }, + { + "epoch": 9.18, + "grad_norm": 0.5511684417724609, + "learning_rate": 1.6487859966120836e-06, + "loss": 0.2433, + "step": 48750 + }, + { + "epoch": 9.18, + "grad_norm": 7.991838455200195, + "learning_rate": 1.6450216450216453e-06, + "loss": 0.4022, + "step": 48760 + }, + { + "epoch": 9.18, + "grad_norm": 25.697017669677734, + "learning_rate": 1.6412572934312065e-06, + "loss": 0.4417, + "step": 48770 + }, + { + "epoch": 9.18, + "grad_norm": 12.145305633544922, + "learning_rate": 1.6374929418407682e-06, + "loss": 0.4612, + "step": 48780 + }, + { + "epoch": 9.18, + "grad_norm": 5.5945892333984375, + "learning_rate": 1.6337285902503294e-06, + "loss": 0.3146, + "step": 48790 + }, + { + "epoch": 9.19, + "grad_norm": 12.325715065002441, + "learning_rate": 1.629964238659891e-06, + "loss": 0.6039, + "step": 48800 + }, + { + "epoch": 9.19, + "grad_norm": 5.670255184173584, + "learning_rate": 1.6261998870694524e-06, + "loss": 0.5651, + "step": 48810 + }, + { + "epoch": 9.19, + "grad_norm": 80.41706848144531, + "learning_rate": 1.6224355354790138e-06, + "loss": 0.5711, + "step": 48820 + }, + { + "epoch": 9.19, + "grad_norm": 8.065768241882324, + "learning_rate": 1.6186711838885753e-06, + "loss": 0.6124, + "step": 48830 + }, + { + "epoch": 9.19, + "grad_norm": 0.024990417063236237, + "learning_rate": 1.6149068322981367e-06, + "loss": 0.269, + "step": 48840 + }, + { + "epoch": 9.19, + "grad_norm": 44.296573638916016, + "learning_rate": 1.6111424807076984e-06, + "loss": 0.4364, + "step": 48850 + }, + { + "epoch": 9.2, + "grad_norm": 6.221458435058594, + "learning_rate": 1.6073781291172597e-06, + "loss": 0.2955, + "step": 48860 + }, + { + "epoch": 9.2, + "grad_norm": 17.889265060424805, + "learning_rate": 1.6036137775268213e-06, + "loss": 0.3683, + "step": 48870 + }, + { + "epoch": 9.2, + "grad_norm": 9.34254264831543, + "learning_rate": 1.5998494259363826e-06, + "loss": 0.5364, + "step": 48880 + }, + { + "epoch": 9.2, + "grad_norm": 0.1648782640695572, + "learning_rate": 1.596085074345944e-06, + "loss": 0.3695, + "step": 48890 + }, + { + "epoch": 9.2, + "grad_norm": 1.0630319118499756, + "learning_rate": 1.5923207227555055e-06, + "loss": 0.3158, + "step": 48900 + }, + { + "epoch": 9.21, + "grad_norm": 28.75983428955078, + "learning_rate": 1.588556371165067e-06, + "loss": 0.3781, + "step": 48910 + }, + { + "epoch": 9.21, + "grad_norm": 4.615087509155273, + "learning_rate": 1.5847920195746282e-06, + "loss": 0.3918, + "step": 48920 + }, + { + "epoch": 9.21, + "grad_norm": 15.744338035583496, + "learning_rate": 1.5810276679841899e-06, + "loss": 0.153, + "step": 48930 + }, + { + "epoch": 9.21, + "grad_norm": 5.790966510772705, + "learning_rate": 1.5772633163937511e-06, + "loss": 0.3341, + "step": 48940 + }, + { + "epoch": 9.21, + "grad_norm": 0.05763714388012886, + "learning_rate": 1.5734989648033128e-06, + "loss": 0.4415, + "step": 48950 + }, + { + "epoch": 9.22, + "grad_norm": 15.090495109558105, + "learning_rate": 1.5697346132128743e-06, + "loss": 0.5551, + "step": 48960 + }, + { + "epoch": 9.22, + "grad_norm": 14.567575454711914, + "learning_rate": 1.5659702616224357e-06, + "loss": 0.7791, + "step": 48970 + }, + { + "epoch": 9.22, + "grad_norm": 5.712708473205566, + "learning_rate": 1.5622059100319972e-06, + "loss": 0.3734, + "step": 48980 + }, + { + "epoch": 9.22, + "grad_norm": 64.14569091796875, + "learning_rate": 1.5584415584415584e-06, + "loss": 0.4575, + "step": 48990 + }, + { + "epoch": 9.22, + "grad_norm": 20.8106632232666, + "learning_rate": 1.5546772068511201e-06, + "loss": 0.4518, + "step": 49000 + }, + { + "epoch": 9.22, + "grad_norm": 9.944857597351074, + "learning_rate": 1.5509128552606814e-06, + "loss": 0.4911, + "step": 49010 + }, + { + "epoch": 9.23, + "grad_norm": 8.021162986755371, + "learning_rate": 1.547148503670243e-06, + "loss": 0.6, + "step": 49020 + }, + { + "epoch": 9.23, + "grad_norm": 13.243287086486816, + "learning_rate": 1.5433841520798043e-06, + "loss": 0.507, + "step": 49030 + }, + { + "epoch": 9.23, + "grad_norm": 32.78733825683594, + "learning_rate": 1.539619800489366e-06, + "loss": 0.5617, + "step": 49040 + }, + { + "epoch": 9.23, + "grad_norm": 3.1205811500549316, + "learning_rate": 1.5358554488989272e-06, + "loss": 0.4243, + "step": 49050 + }, + { + "epoch": 9.23, + "grad_norm": 14.635000228881836, + "learning_rate": 1.5320910973084887e-06, + "loss": 0.5009, + "step": 49060 + }, + { + "epoch": 9.24, + "grad_norm": 18.14510154724121, + "learning_rate": 1.5283267457180501e-06, + "loss": 0.3399, + "step": 49070 + }, + { + "epoch": 9.24, + "grad_norm": 0.11989280581474304, + "learning_rate": 1.5245623941276116e-06, + "loss": 0.3495, + "step": 49080 + }, + { + "epoch": 9.24, + "grad_norm": 24.832670211791992, + "learning_rate": 1.5207980425371733e-06, + "loss": 0.3462, + "step": 49090 + }, + { + "epoch": 9.24, + "grad_norm": 12.079398155212402, + "learning_rate": 1.5170336909467345e-06, + "loss": 0.3772, + "step": 49100 + }, + { + "epoch": 9.24, + "grad_norm": 0.3867708146572113, + "learning_rate": 1.5132693393562962e-06, + "loss": 0.4475, + "step": 49110 + }, + { + "epoch": 9.25, + "grad_norm": 26.119632720947266, + "learning_rate": 1.5095049877658574e-06, + "loss": 0.6135, + "step": 49120 + }, + { + "epoch": 9.25, + "grad_norm": 1.460374116897583, + "learning_rate": 1.505740636175419e-06, + "loss": 0.3183, + "step": 49130 + }, + { + "epoch": 9.25, + "grad_norm": 5.709151268005371, + "learning_rate": 1.5019762845849804e-06, + "loss": 0.6727, + "step": 49140 + }, + { + "epoch": 9.25, + "grad_norm": 3.2548608779907227, + "learning_rate": 1.4982119329945418e-06, + "loss": 0.3298, + "step": 49150 + }, + { + "epoch": 9.25, + "grad_norm": 0.8151334524154663, + "learning_rate": 1.4944475814041033e-06, + "loss": 0.4324, + "step": 49160 + }, + { + "epoch": 9.25, + "grad_norm": 8.766486167907715, + "learning_rate": 1.4906832298136647e-06, + "loss": 0.5849, + "step": 49170 + }, + { + "epoch": 9.26, + "grad_norm": 12.971768379211426, + "learning_rate": 1.486918878223226e-06, + "loss": 0.5602, + "step": 49180 + }, + { + "epoch": 9.26, + "grad_norm": 0.07424553483724594, + "learning_rate": 1.4831545266327877e-06, + "loss": 0.3006, + "step": 49190 + }, + { + "epoch": 9.26, + "grad_norm": 40.09004592895508, + "learning_rate": 1.479390175042349e-06, + "loss": 0.4148, + "step": 49200 + }, + { + "epoch": 9.26, + "grad_norm": 40.092464447021484, + "learning_rate": 1.4756258234519106e-06, + "loss": 0.312, + "step": 49210 + }, + { + "epoch": 9.26, + "grad_norm": 28.163589477539062, + "learning_rate": 1.471861471861472e-06, + "loss": 0.281, + "step": 49220 + }, + { + "epoch": 9.27, + "grad_norm": 12.769076347351074, + "learning_rate": 1.4680971202710335e-06, + "loss": 0.5183, + "step": 49230 + }, + { + "epoch": 9.27, + "grad_norm": 2.987441301345825, + "learning_rate": 1.464332768680595e-06, + "loss": 0.5749, + "step": 49240 + }, + { + "epoch": 9.27, + "grad_norm": 6.323493003845215, + "learning_rate": 1.4605684170901562e-06, + "loss": 0.4274, + "step": 49250 + }, + { + "epoch": 9.27, + "grad_norm": 10.778115272521973, + "learning_rate": 1.4568040654997179e-06, + "loss": 0.5296, + "step": 49260 + }, + { + "epoch": 9.27, + "grad_norm": 0.0840461477637291, + "learning_rate": 1.4530397139092791e-06, + "loss": 0.3179, + "step": 49270 + }, + { + "epoch": 9.28, + "grad_norm": 0.9376565217971802, + "learning_rate": 1.4492753623188408e-06, + "loss": 0.2639, + "step": 49280 + }, + { + "epoch": 9.28, + "grad_norm": 13.167097091674805, + "learning_rate": 1.445511010728402e-06, + "loss": 0.3673, + "step": 49290 + }, + { + "epoch": 9.28, + "grad_norm": 0.04004925489425659, + "learning_rate": 1.4417466591379637e-06, + "loss": 0.349, + "step": 49300 + }, + { + "epoch": 9.28, + "grad_norm": 21.248655319213867, + "learning_rate": 1.437982307547525e-06, + "loss": 0.6205, + "step": 49310 + }, + { + "epoch": 9.28, + "grad_norm": 1.5469751358032227, + "learning_rate": 1.4342179559570864e-06, + "loss": 0.3818, + "step": 49320 + }, + { + "epoch": 9.28, + "grad_norm": 0.027218230068683624, + "learning_rate": 1.4304536043666481e-06, + "loss": 0.3784, + "step": 49330 + }, + { + "epoch": 9.29, + "grad_norm": 21.65924072265625, + "learning_rate": 1.4266892527762094e-06, + "loss": 0.4164, + "step": 49340 + }, + { + "epoch": 9.29, + "grad_norm": 32.658966064453125, + "learning_rate": 1.422924901185771e-06, + "loss": 0.4811, + "step": 49350 + }, + { + "epoch": 9.29, + "grad_norm": 0.4044090509414673, + "learning_rate": 1.4191605495953323e-06, + "loss": 0.3344, + "step": 49360 + }, + { + "epoch": 9.29, + "grad_norm": 25.323366165161133, + "learning_rate": 1.415396198004894e-06, + "loss": 0.2475, + "step": 49370 + }, + { + "epoch": 9.29, + "grad_norm": 10.626187324523926, + "learning_rate": 1.4116318464144552e-06, + "loss": 0.4904, + "step": 49380 + }, + { + "epoch": 9.3, + "grad_norm": 11.54336166381836, + "learning_rate": 1.4078674948240167e-06, + "loss": 0.4524, + "step": 49390 + }, + { + "epoch": 9.3, + "grad_norm": 26.418764114379883, + "learning_rate": 1.4041031432335781e-06, + "loss": 0.5615, + "step": 49400 + }, + { + "epoch": 9.3, + "grad_norm": 3.740413188934326, + "learning_rate": 1.4003387916431396e-06, + "loss": 0.1174, + "step": 49410 + }, + { + "epoch": 9.3, + "grad_norm": 15.656806945800781, + "learning_rate": 1.3965744400527008e-06, + "loss": 0.5396, + "step": 49420 + }, + { + "epoch": 9.3, + "grad_norm": 0.4741308391094208, + "learning_rate": 1.3928100884622625e-06, + "loss": 0.5084, + "step": 49430 + }, + { + "epoch": 9.31, + "grad_norm": 3.0225412845611572, + "learning_rate": 1.3890457368718238e-06, + "loss": 0.4074, + "step": 49440 + }, + { + "epoch": 9.31, + "grad_norm": 0.20880745351314545, + "learning_rate": 1.3852813852813854e-06, + "loss": 0.2169, + "step": 49450 + }, + { + "epoch": 9.31, + "grad_norm": 0.055619094520807266, + "learning_rate": 1.3815170336909469e-06, + "loss": 0.2984, + "step": 49460 + }, + { + "epoch": 9.31, + "grad_norm": 0.15215404331684113, + "learning_rate": 1.3777526821005083e-06, + "loss": 0.2482, + "step": 49470 + }, + { + "epoch": 9.31, + "grad_norm": 5.14688777923584, + "learning_rate": 1.3739883305100698e-06, + "loss": 0.285, + "step": 49480 + }, + { + "epoch": 9.31, + "grad_norm": 19.21902084350586, + "learning_rate": 1.370223978919631e-06, + "loss": 0.4919, + "step": 49490 + }, + { + "epoch": 9.32, + "grad_norm": 23.251541137695312, + "learning_rate": 1.3664596273291927e-06, + "loss": 0.4524, + "step": 49500 + }, + { + "epoch": 9.32, + "grad_norm": 5.6620588302612305, + "learning_rate": 1.362695275738754e-06, + "loss": 0.3456, + "step": 49510 + }, + { + "epoch": 9.32, + "grad_norm": 0.2966133654117584, + "learning_rate": 1.3589309241483157e-06, + "loss": 0.4254, + "step": 49520 + }, + { + "epoch": 9.32, + "grad_norm": 1.9271348714828491, + "learning_rate": 1.355166572557877e-06, + "loss": 0.75, + "step": 49530 + }, + { + "epoch": 9.32, + "grad_norm": 47.30406188964844, + "learning_rate": 1.3514022209674386e-06, + "loss": 0.2837, + "step": 49540 + }, + { + "epoch": 9.33, + "grad_norm": 6.934784889221191, + "learning_rate": 1.3476378693769998e-06, + "loss": 0.1063, + "step": 49550 + }, + { + "epoch": 9.33, + "grad_norm": 0.4049472212791443, + "learning_rate": 1.3438735177865615e-06, + "loss": 0.2103, + "step": 49560 + }, + { + "epoch": 9.33, + "grad_norm": 8.255983352661133, + "learning_rate": 1.3401091661961227e-06, + "loss": 0.4509, + "step": 49570 + }, + { + "epoch": 9.33, + "grad_norm": 14.6072416305542, + "learning_rate": 1.3363448146056842e-06, + "loss": 0.6148, + "step": 49580 + }, + { + "epoch": 9.33, + "grad_norm": 0.24846920371055603, + "learning_rate": 1.3325804630152459e-06, + "loss": 0.2994, + "step": 49590 + }, + { + "epoch": 9.34, + "grad_norm": 10.933160781860352, + "learning_rate": 1.3288161114248071e-06, + "loss": 0.3762, + "step": 49600 + }, + { + "epoch": 9.34, + "grad_norm": 19.94293975830078, + "learning_rate": 1.3250517598343688e-06, + "loss": 0.6265, + "step": 49610 + }, + { + "epoch": 9.34, + "grad_norm": 0.2474469691514969, + "learning_rate": 1.32128740824393e-06, + "loss": 0.5879, + "step": 49620 + }, + { + "epoch": 9.34, + "grad_norm": 4.398937225341797, + "learning_rate": 1.3175230566534917e-06, + "loss": 0.4399, + "step": 49630 + }, + { + "epoch": 9.34, + "grad_norm": 0.6015844941139221, + "learning_rate": 1.313758705063053e-06, + "loss": 0.2968, + "step": 49640 + }, + { + "epoch": 9.35, + "grad_norm": 0.09437728673219681, + "learning_rate": 1.3099943534726144e-06, + "loss": 0.4135, + "step": 49650 + }, + { + "epoch": 9.35, + "grad_norm": 26.453380584716797, + "learning_rate": 1.3062300018821759e-06, + "loss": 0.3049, + "step": 49660 + }, + { + "epoch": 9.35, + "grad_norm": 1.22520911693573, + "learning_rate": 1.3024656502917373e-06, + "loss": 0.4276, + "step": 49670 + }, + { + "epoch": 9.35, + "grad_norm": 26.06831169128418, + "learning_rate": 1.2987012987012986e-06, + "loss": 0.4042, + "step": 49680 + }, + { + "epoch": 9.35, + "grad_norm": 34.11174392700195, + "learning_rate": 1.2949369471108603e-06, + "loss": 0.2479, + "step": 49690 + }, + { + "epoch": 9.35, + "grad_norm": 6.261451721191406, + "learning_rate": 1.291172595520422e-06, + "loss": 0.4995, + "step": 49700 + }, + { + "epoch": 9.36, + "grad_norm": 25.64111328125, + "learning_rate": 1.2874082439299832e-06, + "loss": 0.4061, + "step": 49710 + }, + { + "epoch": 9.36, + "grad_norm": 12.963571548461914, + "learning_rate": 1.2836438923395447e-06, + "loss": 0.3339, + "step": 49720 + }, + { + "epoch": 9.36, + "grad_norm": 22.063447952270508, + "learning_rate": 1.2798795407491061e-06, + "loss": 0.5355, + "step": 49730 + }, + { + "epoch": 9.36, + "grad_norm": 5.219394207000732, + "learning_rate": 1.2761151891586676e-06, + "loss": 0.3802, + "step": 49740 + }, + { + "epoch": 9.36, + "grad_norm": 1.1139286756515503, + "learning_rate": 1.2723508375682288e-06, + "loss": 0.1823, + "step": 49750 + }, + { + "epoch": 9.37, + "grad_norm": 36.65229034423828, + "learning_rate": 1.2685864859777905e-06, + "loss": 0.4732, + "step": 49760 + }, + { + "epoch": 9.37, + "grad_norm": 1.4162871837615967, + "learning_rate": 1.2648221343873517e-06, + "loss": 0.255, + "step": 49770 + }, + { + "epoch": 9.37, + "grad_norm": 1.9202029705047607, + "learning_rate": 1.2610577827969134e-06, + "loss": 0.5858, + "step": 49780 + }, + { + "epoch": 9.37, + "grad_norm": 0.02834530733525753, + "learning_rate": 1.2572934312064747e-06, + "loss": 0.5202, + "step": 49790 + }, + { + "epoch": 9.37, + "grad_norm": 8.294178009033203, + "learning_rate": 1.2535290796160363e-06, + "loss": 0.1606, + "step": 49800 + }, + { + "epoch": 9.38, + "grad_norm": 10.036473274230957, + "learning_rate": 1.2497647280255978e-06, + "loss": 0.2661, + "step": 49810 + }, + { + "epoch": 9.38, + "grad_norm": 0.019691968336701393, + "learning_rate": 1.246000376435159e-06, + "loss": 0.4195, + "step": 49820 + }, + { + "epoch": 9.38, + "grad_norm": 14.037164688110352, + "learning_rate": 1.2422360248447205e-06, + "loss": 0.3664, + "step": 49830 + }, + { + "epoch": 9.38, + "grad_norm": 20.38290786743164, + "learning_rate": 1.238471673254282e-06, + "loss": 0.4841, + "step": 49840 + }, + { + "epoch": 9.38, + "grad_norm": 6.970554351806641, + "learning_rate": 1.2347073216638434e-06, + "loss": 0.2273, + "step": 49850 + }, + { + "epoch": 9.38, + "grad_norm": 22.626224517822266, + "learning_rate": 1.2309429700734049e-06, + "loss": 0.482, + "step": 49860 + }, + { + "epoch": 9.39, + "grad_norm": 17.559032440185547, + "learning_rate": 1.2271786184829664e-06, + "loss": 0.4027, + "step": 49870 + }, + { + "epoch": 9.39, + "grad_norm": 7.236875057220459, + "learning_rate": 1.223414266892528e-06, + "loss": 0.278, + "step": 49880 + }, + { + "epoch": 9.39, + "grad_norm": 10.33447265625, + "learning_rate": 1.2196499153020893e-06, + "loss": 0.8987, + "step": 49890 + }, + { + "epoch": 9.39, + "grad_norm": 15.287224769592285, + "learning_rate": 1.2158855637116507e-06, + "loss": 0.4645, + "step": 49900 + }, + { + "epoch": 9.39, + "grad_norm": 0.09853526204824448, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.7842, + "step": 49910 + }, + { + "epoch": 9.4, + "grad_norm": 7.803390979766846, + "learning_rate": 1.2083568605307737e-06, + "loss": 0.5503, + "step": 49920 + }, + { + "epoch": 9.4, + "grad_norm": 34.554622650146484, + "learning_rate": 1.2045925089403351e-06, + "loss": 0.3182, + "step": 49930 + }, + { + "epoch": 9.4, + "grad_norm": 0.17311842739582062, + "learning_rate": 1.2008281573498966e-06, + "loss": 0.5634, + "step": 49940 + }, + { + "epoch": 9.4, + "grad_norm": 0.07609983533620834, + "learning_rate": 1.197063805759458e-06, + "loss": 0.3777, + "step": 49950 + }, + { + "epoch": 9.4, + "grad_norm": 0.10443427413702011, + "learning_rate": 1.1932994541690195e-06, + "loss": 0.3804, + "step": 49960 + }, + { + "epoch": 9.41, + "grad_norm": 1.417898178100586, + "learning_rate": 1.189535102578581e-06, + "loss": 0.3145, + "step": 49970 + }, + { + "epoch": 9.41, + "grad_norm": 0.049046795815229416, + "learning_rate": 1.1857707509881424e-06, + "loss": 0.5672, + "step": 49980 + }, + { + "epoch": 9.41, + "grad_norm": 0.9617329835891724, + "learning_rate": 1.1820063993977039e-06, + "loss": 0.5691, + "step": 49990 + }, + { + "epoch": 9.41, + "grad_norm": 10.128259658813477, + "learning_rate": 1.1782420478072651e-06, + "loss": 0.5053, + "step": 50000 + }, + { + "epoch": 9.41, + "grad_norm": 5.14787483215332, + "learning_rate": 1.1744776962168268e-06, + "loss": 0.5338, + "step": 50010 + }, + { + "epoch": 9.41, + "grad_norm": 0.5360021591186523, + "learning_rate": 1.1707133446263883e-06, + "loss": 0.3056, + "step": 50020 + }, + { + "epoch": 9.42, + "grad_norm": 10.969987869262695, + "learning_rate": 1.1669489930359497e-06, + "loss": 0.4361, + "step": 50030 + }, + { + "epoch": 9.42, + "grad_norm": 26.70008659362793, + "learning_rate": 1.1631846414455112e-06, + "loss": 0.3068, + "step": 50040 + }, + { + "epoch": 9.42, + "grad_norm": 10.523760795593262, + "learning_rate": 1.1594202898550726e-06, + "loss": 0.3849, + "step": 50050 + }, + { + "epoch": 9.42, + "grad_norm": 10.677010536193848, + "learning_rate": 1.155655938264634e-06, + "loss": 0.4415, + "step": 50060 + }, + { + "epoch": 9.42, + "grad_norm": 7.661884307861328, + "learning_rate": 1.1518915866741954e-06, + "loss": 0.0457, + "step": 50070 + }, + { + "epoch": 9.43, + "grad_norm": 15.285215377807617, + "learning_rate": 1.1481272350837568e-06, + "loss": 0.2535, + "step": 50080 + }, + { + "epoch": 9.43, + "grad_norm": 42.20638656616211, + "learning_rate": 1.1443628834933183e-06, + "loss": 0.4552, + "step": 50090 + }, + { + "epoch": 9.43, + "grad_norm": 35.964088439941406, + "learning_rate": 1.1405985319028797e-06, + "loss": 0.589, + "step": 50100 + }, + { + "epoch": 9.43, + "grad_norm": 2.185014486312866, + "learning_rate": 1.1368341803124412e-06, + "loss": 0.6969, + "step": 50110 + }, + { + "epoch": 9.43, + "grad_norm": 0.04032859951257706, + "learning_rate": 1.1330698287220027e-06, + "loss": 0.3934, + "step": 50120 + }, + { + "epoch": 9.44, + "grad_norm": 62.85636520385742, + "learning_rate": 1.1293054771315643e-06, + "loss": 0.4421, + "step": 50130 + }, + { + "epoch": 9.44, + "grad_norm": 11.45727252960205, + "learning_rate": 1.1255411255411256e-06, + "loss": 0.5068, + "step": 50140 + }, + { + "epoch": 9.44, + "grad_norm": 0.05326225981116295, + "learning_rate": 1.121776773950687e-06, + "loss": 0.3979, + "step": 50150 + }, + { + "epoch": 9.44, + "grad_norm": 16.516693115234375, + "learning_rate": 1.1180124223602485e-06, + "loss": 0.2904, + "step": 50160 + }, + { + "epoch": 9.44, + "grad_norm": 20.180761337280273, + "learning_rate": 1.11424807076981e-06, + "loss": 0.3479, + "step": 50170 + }, + { + "epoch": 9.44, + "grad_norm": 7.504915714263916, + "learning_rate": 1.1104837191793714e-06, + "loss": 0.4464, + "step": 50180 + }, + { + "epoch": 9.45, + "grad_norm": 8.153180122375488, + "learning_rate": 1.1067193675889329e-06, + "loss": 0.4644, + "step": 50190 + }, + { + "epoch": 9.45, + "grad_norm": 0.3960212171077728, + "learning_rate": 1.1029550159984943e-06, + "loss": 0.3971, + "step": 50200 + }, + { + "epoch": 9.45, + "grad_norm": 20.968027114868164, + "learning_rate": 1.0991906644080558e-06, + "loss": 0.2904, + "step": 50210 + }, + { + "epoch": 9.45, + "grad_norm": 28.1608943939209, + "learning_rate": 1.0954263128176173e-06, + "loss": 0.3588, + "step": 50220 + }, + { + "epoch": 9.45, + "grad_norm": 15.617292404174805, + "learning_rate": 1.0916619612271787e-06, + "loss": 0.4479, + "step": 50230 + }, + { + "epoch": 9.46, + "grad_norm": 0.6835996508598328, + "learning_rate": 1.0878976096367402e-06, + "loss": 0.3493, + "step": 50240 + }, + { + "epoch": 9.46, + "grad_norm": 11.191716194152832, + "learning_rate": 1.0841332580463016e-06, + "loss": 0.3882, + "step": 50250 + }, + { + "epoch": 9.46, + "grad_norm": 4.108872890472412, + "learning_rate": 1.0803689064558631e-06, + "loss": 0.4316, + "step": 50260 + }, + { + "epoch": 9.46, + "grad_norm": 6.614520072937012, + "learning_rate": 1.0766045548654246e-06, + "loss": 0.7346, + "step": 50270 + }, + { + "epoch": 9.46, + "grad_norm": 11.33967113494873, + "learning_rate": 1.072840203274986e-06, + "loss": 0.5321, + "step": 50280 + }, + { + "epoch": 9.47, + "grad_norm": 4.6037492752075195, + "learning_rate": 1.0690758516845475e-06, + "loss": 0.2836, + "step": 50290 + }, + { + "epoch": 9.47, + "grad_norm": 13.846996307373047, + "learning_rate": 1.065311500094109e-06, + "loss": 0.5464, + "step": 50300 + }, + { + "epoch": 9.47, + "grad_norm": 0.2525990307331085, + "learning_rate": 1.0615471485036704e-06, + "loss": 0.3565, + "step": 50310 + }, + { + "epoch": 9.47, + "grad_norm": 15.951204299926758, + "learning_rate": 1.0577827969132319e-06, + "loss": 0.4183, + "step": 50320 + }, + { + "epoch": 9.47, + "grad_norm": 5.232489585876465, + "learning_rate": 1.0540184453227931e-06, + "loss": 0.3319, + "step": 50330 + }, + { + "epoch": 9.47, + "grad_norm": 0.023767894133925438, + "learning_rate": 1.0502540937323546e-06, + "loss": 0.5855, + "step": 50340 + }, + { + "epoch": 9.48, + "grad_norm": 0.39152446389198303, + "learning_rate": 1.046489742141916e-06, + "loss": 0.3209, + "step": 50350 + }, + { + "epoch": 9.48, + "grad_norm": 15.136631965637207, + "learning_rate": 1.0427253905514775e-06, + "loss": 0.4202, + "step": 50360 + }, + { + "epoch": 9.48, + "grad_norm": 5.339528560638428, + "learning_rate": 1.0389610389610392e-06, + "loss": 0.2623, + "step": 50370 + }, + { + "epoch": 9.48, + "grad_norm": 5.4745097160339355, + "learning_rate": 1.0351966873706006e-06, + "loss": 0.317, + "step": 50380 + }, + { + "epoch": 9.48, + "grad_norm": 6.607236385345459, + "learning_rate": 1.031432335780162e-06, + "loss": 0.4808, + "step": 50390 + }, + { + "epoch": 9.49, + "grad_norm": 16.006132125854492, + "learning_rate": 1.0276679841897233e-06, + "loss": 0.195, + "step": 50400 + }, + { + "epoch": 9.49, + "grad_norm": 3.6388096809387207, + "learning_rate": 1.0239036325992848e-06, + "loss": 0.3289, + "step": 50410 + }, + { + "epoch": 9.49, + "grad_norm": 11.516218185424805, + "learning_rate": 1.0201392810088463e-06, + "loss": 0.4734, + "step": 50420 + }, + { + "epoch": 9.49, + "grad_norm": 30.199953079223633, + "learning_rate": 1.0163749294184077e-06, + "loss": 0.5322, + "step": 50430 + }, + { + "epoch": 9.49, + "grad_norm": 2.062737464904785, + "learning_rate": 1.0126105778279692e-06, + "loss": 0.2732, + "step": 50440 + }, + { + "epoch": 9.5, + "grad_norm": 0.2747786343097687, + "learning_rate": 1.0088462262375306e-06, + "loss": 0.389, + "step": 50450 + }, + { + "epoch": 9.5, + "grad_norm": 23.68992042541504, + "learning_rate": 1.0050818746470921e-06, + "loss": 0.7078, + "step": 50460 + }, + { + "epoch": 9.5, + "grad_norm": 10.194795608520508, + "learning_rate": 1.0013175230566536e-06, + "loss": 0.3264, + "step": 50470 + }, + { + "epoch": 9.5, + "grad_norm": 21.266271591186523, + "learning_rate": 9.97553171466215e-07, + "loss": 0.5062, + "step": 50480 + }, + { + "epoch": 9.5, + "grad_norm": 19.471515655517578, + "learning_rate": 9.937888198757765e-07, + "loss": 0.5086, + "step": 50490 + }, + { + "epoch": 9.5, + "grad_norm": 17.782821655273438, + "learning_rate": 9.90024468285338e-07, + "loss": 0.4353, + "step": 50500 + }, + { + "epoch": 9.51, + "grad_norm": 12.496779441833496, + "learning_rate": 9.862601166948994e-07, + "loss": 0.4012, + "step": 50510 + }, + { + "epoch": 9.51, + "grad_norm": 36.61666488647461, + "learning_rate": 9.824957651044609e-07, + "loss": 0.6408, + "step": 50520 + }, + { + "epoch": 9.51, + "grad_norm": 21.555734634399414, + "learning_rate": 9.787314135140223e-07, + "loss": 0.3065, + "step": 50530 + }, + { + "epoch": 9.51, + "grad_norm": 19.377365112304688, + "learning_rate": 9.749670619235838e-07, + "loss": 0.249, + "step": 50540 + }, + { + "epoch": 9.51, + "grad_norm": 20.095693588256836, + "learning_rate": 9.712027103331453e-07, + "loss": 0.3249, + "step": 50550 + }, + { + "epoch": 9.52, + "grad_norm": 18.395681381225586, + "learning_rate": 9.674383587427067e-07, + "loss": 0.3726, + "step": 50560 + }, + { + "epoch": 9.52, + "grad_norm": 0.07585670053958893, + "learning_rate": 9.636740071522682e-07, + "loss": 0.5645, + "step": 50570 + }, + { + "epoch": 9.52, + "grad_norm": 19.90555763244629, + "learning_rate": 9.599096555618294e-07, + "loss": 0.4348, + "step": 50580 + }, + { + "epoch": 9.52, + "grad_norm": 12.60806655883789, + "learning_rate": 9.561453039713909e-07, + "loss": 0.5422, + "step": 50590 + }, + { + "epoch": 9.52, + "grad_norm": 6.923961162567139, + "learning_rate": 9.523809523809525e-07, + "loss": 0.5583, + "step": 50600 + }, + { + "epoch": 9.53, + "grad_norm": 27.019386291503906, + "learning_rate": 9.486166007905138e-07, + "loss": 0.4477, + "step": 50610 + }, + { + "epoch": 9.53, + "grad_norm": 22.43336296081543, + "learning_rate": 9.448522492000754e-07, + "loss": 0.2844, + "step": 50620 + }, + { + "epoch": 9.53, + "grad_norm": 22.106285095214844, + "learning_rate": 9.410878976096368e-07, + "loss": 0.4775, + "step": 50630 + }, + { + "epoch": 9.53, + "grad_norm": 14.707420349121094, + "learning_rate": 9.373235460191983e-07, + "loss": 0.3398, + "step": 50640 + }, + { + "epoch": 9.53, + "grad_norm": 3.7978923320770264, + "learning_rate": 9.335591944287598e-07, + "loss": 0.3592, + "step": 50650 + }, + { + "epoch": 9.54, + "grad_norm": 16.250072479248047, + "learning_rate": 9.297948428383212e-07, + "loss": 0.5696, + "step": 50660 + }, + { + "epoch": 9.54, + "grad_norm": 5.815196990966797, + "learning_rate": 9.260304912478827e-07, + "loss": 0.2838, + "step": 50670 + }, + { + "epoch": 9.54, + "grad_norm": 0.15209926664829254, + "learning_rate": 9.22266139657444e-07, + "loss": 0.4138, + "step": 50680 + }, + { + "epoch": 9.54, + "grad_norm": 1.1849756240844727, + "learning_rate": 9.185017880670055e-07, + "loss": 0.5292, + "step": 50690 + }, + { + "epoch": 9.54, + "grad_norm": 22.73065948486328, + "learning_rate": 9.14737436476567e-07, + "loss": 0.367, + "step": 50700 + }, + { + "epoch": 9.54, + "grad_norm": 5.767063140869141, + "learning_rate": 9.109730848861284e-07, + "loss": 0.2446, + "step": 50710 + }, + { + "epoch": 9.55, + "grad_norm": 0.5526207089424133, + "learning_rate": 9.072087332956899e-07, + "loss": 0.3764, + "step": 50720 + }, + { + "epoch": 9.55, + "grad_norm": 13.495482444763184, + "learning_rate": 9.034443817052512e-07, + "loss": 0.4672, + "step": 50730 + }, + { + "epoch": 9.55, + "grad_norm": 3.9866769313812256, + "learning_rate": 8.996800301148129e-07, + "loss": 0.5576, + "step": 50740 + }, + { + "epoch": 9.55, + "grad_norm": 25.318235397338867, + "learning_rate": 8.959156785243743e-07, + "loss": 0.3028, + "step": 50750 + }, + { + "epoch": 9.55, + "grad_norm": 7.270644187927246, + "learning_rate": 8.921513269339357e-07, + "loss": 0.4609, + "step": 50760 + }, + { + "epoch": 9.56, + "grad_norm": 1.1293132305145264, + "learning_rate": 8.883869753434972e-07, + "loss": 0.3103, + "step": 50770 + }, + { + "epoch": 9.56, + "grad_norm": 2.1888303756713867, + "learning_rate": 8.846226237530586e-07, + "loss": 0.151, + "step": 50780 + }, + { + "epoch": 9.56, + "grad_norm": 16.56170082092285, + "learning_rate": 8.808582721626201e-07, + "loss": 0.4869, + "step": 50790 + }, + { + "epoch": 9.56, + "grad_norm": 15.505925178527832, + "learning_rate": 8.770939205721816e-07, + "loss": 0.6466, + "step": 50800 + }, + { + "epoch": 9.56, + "grad_norm": 37.347965240478516, + "learning_rate": 8.733295689817429e-07, + "loss": 0.5309, + "step": 50810 + }, + { + "epoch": 9.57, + "grad_norm": 9.028761863708496, + "learning_rate": 8.695652173913044e-07, + "loss": 0.4925, + "step": 50820 + }, + { + "epoch": 9.57, + "grad_norm": 18.40884780883789, + "learning_rate": 8.658008658008658e-07, + "loss": 0.4293, + "step": 50830 + }, + { + "epoch": 9.57, + "grad_norm": 20.18854331970215, + "learning_rate": 8.620365142104273e-07, + "loss": 0.4924, + "step": 50840 + }, + { + "epoch": 9.57, + "grad_norm": 17.72684097290039, + "learning_rate": 8.582721626199888e-07, + "loss": 0.3837, + "step": 50850 + }, + { + "epoch": 9.57, + "grad_norm": 8.206231117248535, + "learning_rate": 8.545078110295501e-07, + "loss": 0.463, + "step": 50860 + }, + { + "epoch": 9.57, + "grad_norm": 0.24733124673366547, + "learning_rate": 8.507434594391118e-07, + "loss": 0.5556, + "step": 50870 + }, + { + "epoch": 9.58, + "grad_norm": 21.102048873901367, + "learning_rate": 8.469791078486731e-07, + "loss": 0.3003, + "step": 50880 + }, + { + "epoch": 9.58, + "grad_norm": 9.614502906799316, + "learning_rate": 8.432147562582346e-07, + "loss": 0.4534, + "step": 50890 + }, + { + "epoch": 9.58, + "grad_norm": 0.05175158008933067, + "learning_rate": 8.394504046677961e-07, + "loss": 0.3368, + "step": 50900 + }, + { + "epoch": 9.58, + "grad_norm": 18.473926544189453, + "learning_rate": 8.356860530773575e-07, + "loss": 0.6013, + "step": 50910 + }, + { + "epoch": 9.58, + "grad_norm": 18.408864974975586, + "learning_rate": 8.31921701486919e-07, + "loss": 0.4173, + "step": 50920 + }, + { + "epoch": 9.59, + "grad_norm": 27.239898681640625, + "learning_rate": 8.281573498964803e-07, + "loss": 0.4574, + "step": 50930 + }, + { + "epoch": 9.59, + "grad_norm": 6.1605753898620605, + "learning_rate": 8.243929983060418e-07, + "loss": 0.6228, + "step": 50940 + }, + { + "epoch": 9.59, + "grad_norm": 4.264019012451172, + "learning_rate": 8.206286467156033e-07, + "loss": 0.2077, + "step": 50950 + }, + { + "epoch": 9.59, + "grad_norm": 9.811421394348145, + "learning_rate": 8.168642951251647e-07, + "loss": 0.4051, + "step": 50960 + }, + { + "epoch": 9.59, + "grad_norm": 5.31566858291626, + "learning_rate": 8.130999435347262e-07, + "loss": 0.37, + "step": 50970 + }, + { + "epoch": 9.6, + "grad_norm": 0.32057228684425354, + "learning_rate": 8.093355919442876e-07, + "loss": 0.6192, + "step": 50980 + }, + { + "epoch": 9.6, + "grad_norm": 5.027352809906006, + "learning_rate": 8.055712403538492e-07, + "loss": 0.7235, + "step": 50990 + }, + { + "epoch": 9.6, + "grad_norm": 0.16423563659191132, + "learning_rate": 8.018068887634107e-07, + "loss": 0.3614, + "step": 51000 + }, + { + "epoch": 9.6, + "grad_norm": 7.00469970703125, + "learning_rate": 7.98042537172972e-07, + "loss": 0.2669, + "step": 51010 + }, + { + "epoch": 9.6, + "grad_norm": 13.766214370727539, + "learning_rate": 7.942781855825335e-07, + "loss": 0.3001, + "step": 51020 + }, + { + "epoch": 9.6, + "grad_norm": 15.029500007629395, + "learning_rate": 7.905138339920949e-07, + "loss": 0.5506, + "step": 51030 + }, + { + "epoch": 9.61, + "grad_norm": 18.616168975830078, + "learning_rate": 7.867494824016564e-07, + "loss": 0.8327, + "step": 51040 + }, + { + "epoch": 9.61, + "grad_norm": 1.1364197731018066, + "learning_rate": 7.829851308112179e-07, + "loss": 0.3129, + "step": 51050 + }, + { + "epoch": 9.61, + "grad_norm": 29.443477630615234, + "learning_rate": 7.792207792207792e-07, + "loss": 0.5467, + "step": 51060 + }, + { + "epoch": 9.61, + "grad_norm": 34.73600387573242, + "learning_rate": 7.754564276303407e-07, + "loss": 0.4792, + "step": 51070 + }, + { + "epoch": 9.61, + "grad_norm": 0.39562925696372986, + "learning_rate": 7.716920760399021e-07, + "loss": 0.4972, + "step": 51080 + }, + { + "epoch": 9.62, + "grad_norm": 21.628002166748047, + "learning_rate": 7.679277244494636e-07, + "loss": 0.6476, + "step": 51090 + }, + { + "epoch": 9.62, + "grad_norm": 0.8809748888015747, + "learning_rate": 7.641633728590251e-07, + "loss": 0.5343, + "step": 51100 + }, + { + "epoch": 9.62, + "grad_norm": 28.88115882873535, + "learning_rate": 7.603990212685866e-07, + "loss": 0.3758, + "step": 51110 + }, + { + "epoch": 9.62, + "grad_norm": 8.242012977600098, + "learning_rate": 7.566346696781481e-07, + "loss": 0.3154, + "step": 51120 + }, + { + "epoch": 9.62, + "grad_norm": 0.03372791409492493, + "learning_rate": 7.528703180877094e-07, + "loss": 0.4118, + "step": 51130 + }, + { + "epoch": 9.63, + "grad_norm": 3.514137029647827, + "learning_rate": 7.491059664972709e-07, + "loss": 0.3894, + "step": 51140 + }, + { + "epoch": 9.63, + "grad_norm": 0.49889636039733887, + "learning_rate": 7.453416149068324e-07, + "loss": 0.3594, + "step": 51150 + }, + { + "epoch": 9.63, + "grad_norm": 0.3000124394893646, + "learning_rate": 7.415772633163938e-07, + "loss": 0.3072, + "step": 51160 + }, + { + "epoch": 9.63, + "grad_norm": 0.06081683188676834, + "learning_rate": 7.378129117259553e-07, + "loss": 0.3226, + "step": 51170 + }, + { + "epoch": 9.63, + "grad_norm": 5.96708345413208, + "learning_rate": 7.340485601355168e-07, + "loss": 0.4402, + "step": 51180 + }, + { + "epoch": 9.63, + "grad_norm": 0.35737451910972595, + "learning_rate": 7.302842085450781e-07, + "loss": 0.7907, + "step": 51190 + }, + { + "epoch": 9.64, + "grad_norm": 4.599313735961914, + "learning_rate": 7.265198569546396e-07, + "loss": 0.1227, + "step": 51200 + }, + { + "epoch": 9.64, + "grad_norm": 4.307950973510742, + "learning_rate": 7.22755505364201e-07, + "loss": 0.2469, + "step": 51210 + }, + { + "epoch": 9.64, + "grad_norm": 22.732894897460938, + "learning_rate": 7.189911537737625e-07, + "loss": 0.4584, + "step": 51220 + }, + { + "epoch": 9.64, + "grad_norm": 21.445600509643555, + "learning_rate": 7.152268021833241e-07, + "loss": 0.355, + "step": 51230 + }, + { + "epoch": 9.64, + "grad_norm": 7.818153381347656, + "learning_rate": 7.114624505928855e-07, + "loss": 0.3512, + "step": 51240 + }, + { + "epoch": 9.65, + "grad_norm": 11.798916816711426, + "learning_rate": 7.07698099002447e-07, + "loss": 0.4811, + "step": 51250 + }, + { + "epoch": 9.65, + "grad_norm": 21.753093719482422, + "learning_rate": 7.039337474120083e-07, + "loss": 0.3904, + "step": 51260 + }, + { + "epoch": 9.65, + "grad_norm": 9.064457893371582, + "learning_rate": 7.001693958215698e-07, + "loss": 0.3594, + "step": 51270 + }, + { + "epoch": 9.65, + "grad_norm": 0.17999929189682007, + "learning_rate": 6.964050442311313e-07, + "loss": 0.5093, + "step": 51280 + }, + { + "epoch": 9.65, + "grad_norm": 27.64767837524414, + "learning_rate": 6.926406926406927e-07, + "loss": 0.4248, + "step": 51290 + }, + { + "epoch": 9.66, + "grad_norm": 13.119711875915527, + "learning_rate": 6.888763410502542e-07, + "loss": 0.2889, + "step": 51300 + }, + { + "epoch": 9.66, + "grad_norm": 17.546010971069336, + "learning_rate": 6.851119894598155e-07, + "loss": 0.402, + "step": 51310 + }, + { + "epoch": 9.66, + "grad_norm": 3.10264253616333, + "learning_rate": 6.81347637869377e-07, + "loss": 0.3537, + "step": 51320 + }, + { + "epoch": 9.66, + "grad_norm": 34.127784729003906, + "learning_rate": 6.775832862789384e-07, + "loss": 0.4739, + "step": 51330 + }, + { + "epoch": 9.66, + "grad_norm": 20.95803451538086, + "learning_rate": 6.738189346884999e-07, + "loss": 0.2397, + "step": 51340 + }, + { + "epoch": 9.66, + "grad_norm": 20.92197036743164, + "learning_rate": 6.700545830980614e-07, + "loss": 0.7235, + "step": 51350 + }, + { + "epoch": 9.67, + "grad_norm": 0.09385684132575989, + "learning_rate": 6.662902315076229e-07, + "loss": 0.612, + "step": 51360 + }, + { + "epoch": 9.67, + "grad_norm": 11.531614303588867, + "learning_rate": 6.625258799171844e-07, + "loss": 0.1948, + "step": 51370 + }, + { + "epoch": 9.67, + "grad_norm": 15.86019515991211, + "learning_rate": 6.587615283267459e-07, + "loss": 0.2973, + "step": 51380 + }, + { + "epoch": 9.67, + "grad_norm": 12.996696472167969, + "learning_rate": 6.549971767363072e-07, + "loss": 0.3127, + "step": 51390 + }, + { + "epoch": 9.67, + "grad_norm": 12.861435890197754, + "learning_rate": 6.512328251458687e-07, + "loss": 0.5937, + "step": 51400 + }, + { + "epoch": 9.68, + "grad_norm": 8.911890029907227, + "learning_rate": 6.474684735554301e-07, + "loss": 0.364, + "step": 51410 + }, + { + "epoch": 9.68, + "grad_norm": 24.71923828125, + "learning_rate": 6.437041219649916e-07, + "loss": 0.6153, + "step": 51420 + }, + { + "epoch": 9.68, + "grad_norm": 17.451587677001953, + "learning_rate": 6.399397703745531e-07, + "loss": 0.6571, + "step": 51430 + }, + { + "epoch": 9.68, + "grad_norm": 20.86277198791504, + "learning_rate": 6.361754187841144e-07, + "loss": 0.3892, + "step": 51440 + }, + { + "epoch": 9.68, + "grad_norm": 32.68037414550781, + "learning_rate": 6.324110671936759e-07, + "loss": 0.2615, + "step": 51450 + }, + { + "epoch": 9.69, + "grad_norm": 25.63720703125, + "learning_rate": 6.286467156032373e-07, + "loss": 0.3267, + "step": 51460 + }, + { + "epoch": 9.69, + "grad_norm": 5.5767645835876465, + "learning_rate": 6.248823640127989e-07, + "loss": 0.2016, + "step": 51470 + }, + { + "epoch": 9.69, + "grad_norm": 27.157485961914062, + "learning_rate": 6.211180124223603e-07, + "loss": 0.6425, + "step": 51480 + }, + { + "epoch": 9.69, + "grad_norm": 8.377914428710938, + "learning_rate": 6.173536608319217e-07, + "loss": 0.2027, + "step": 51490 + }, + { + "epoch": 9.69, + "grad_norm": 21.33051300048828, + "learning_rate": 6.135893092414832e-07, + "loss": 0.2035, + "step": 51500 + }, + { + "epoch": 9.7, + "grad_norm": 17.3638916015625, + "learning_rate": 6.098249576510446e-07, + "loss": 0.1943, + "step": 51510 + }, + { + "epoch": 9.7, + "grad_norm": 0.3384857475757599, + "learning_rate": 6.060606060606061e-07, + "loss": 0.1849, + "step": 51520 + }, + { + "epoch": 9.7, + "grad_norm": 1.3415716886520386, + "learning_rate": 6.022962544701676e-07, + "loss": 0.5632, + "step": 51530 + }, + { + "epoch": 9.7, + "grad_norm": 30.25799560546875, + "learning_rate": 5.98531902879729e-07, + "loss": 0.43, + "step": 51540 + }, + { + "epoch": 9.7, + "grad_norm": 32.7429084777832, + "learning_rate": 5.947675512892905e-07, + "loss": 0.4043, + "step": 51550 + }, + { + "epoch": 9.7, + "grad_norm": 17.513532638549805, + "learning_rate": 5.910031996988519e-07, + "loss": 0.4679, + "step": 51560 + }, + { + "epoch": 9.71, + "grad_norm": 25.66455841064453, + "learning_rate": 5.872388481084134e-07, + "loss": 0.3619, + "step": 51570 + }, + { + "epoch": 9.71, + "grad_norm": 20.97486114501953, + "learning_rate": 5.834744965179749e-07, + "loss": 0.6867, + "step": 51580 + }, + { + "epoch": 9.71, + "grad_norm": 7.469673156738281, + "learning_rate": 5.797101449275363e-07, + "loss": 0.3848, + "step": 51590 + }, + { + "epoch": 9.71, + "grad_norm": 32.72283935546875, + "learning_rate": 5.759457933370977e-07, + "loss": 0.3702, + "step": 51600 + }, + { + "epoch": 9.71, + "grad_norm": 7.483311176300049, + "learning_rate": 5.721814417466591e-07, + "loss": 0.29, + "step": 51610 + }, + { + "epoch": 9.72, + "grad_norm": 12.17415714263916, + "learning_rate": 5.684170901562206e-07, + "loss": 0.3262, + "step": 51620 + }, + { + "epoch": 9.72, + "grad_norm": 18.75547218322754, + "learning_rate": 5.646527385657822e-07, + "loss": 0.4394, + "step": 51630 + }, + { + "epoch": 9.72, + "grad_norm": 12.477697372436523, + "learning_rate": 5.608883869753435e-07, + "loss": 0.4251, + "step": 51640 + }, + { + "epoch": 9.72, + "grad_norm": 15.879676818847656, + "learning_rate": 5.57124035384905e-07, + "loss": 0.4689, + "step": 51650 + }, + { + "epoch": 9.72, + "grad_norm": 31.482437133789062, + "learning_rate": 5.533596837944664e-07, + "loss": 0.525, + "step": 51660 + }, + { + "epoch": 9.73, + "grad_norm": 9.857966423034668, + "learning_rate": 5.495953322040279e-07, + "loss": 0.313, + "step": 51670 + }, + { + "epoch": 9.73, + "grad_norm": 0.783555269241333, + "learning_rate": 5.458309806135894e-07, + "loss": 0.5831, + "step": 51680 + }, + { + "epoch": 9.73, + "grad_norm": 0.14815597236156464, + "learning_rate": 5.420666290231508e-07, + "loss": 0.5301, + "step": 51690 + }, + { + "epoch": 9.73, + "grad_norm": 4.29640531539917, + "learning_rate": 5.383022774327123e-07, + "loss": 0.4009, + "step": 51700 + }, + { + "epoch": 9.73, + "grad_norm": 14.452727317810059, + "learning_rate": 5.345379258422737e-07, + "loss": 0.3563, + "step": 51710 + }, + { + "epoch": 9.73, + "grad_norm": 1.4909330606460571, + "learning_rate": 5.307735742518352e-07, + "loss": 0.3447, + "step": 51720 + }, + { + "epoch": 9.74, + "grad_norm": 0.06365931034088135, + "learning_rate": 5.270092226613966e-07, + "loss": 0.1957, + "step": 51730 + }, + { + "epoch": 9.74, + "grad_norm": 1.6421184539794922, + "learning_rate": 5.23244871070958e-07, + "loss": 0.5981, + "step": 51740 + }, + { + "epoch": 9.74, + "grad_norm": 0.5028958320617676, + "learning_rate": 5.194805194805196e-07, + "loss": 0.3268, + "step": 51750 + }, + { + "epoch": 9.74, + "grad_norm": 4.667960166931152, + "learning_rate": 5.15716167890081e-07, + "loss": 0.1761, + "step": 51760 + }, + { + "epoch": 9.74, + "grad_norm": 1.6822726726531982, + "learning_rate": 5.119518162996424e-07, + "loss": 0.5087, + "step": 51770 + }, + { + "epoch": 9.75, + "grad_norm": 0.5095930099487305, + "learning_rate": 5.081874647092039e-07, + "loss": 0.6654, + "step": 51780 + }, + { + "epoch": 9.75, + "grad_norm": 0.034996677190065384, + "learning_rate": 5.044231131187653e-07, + "loss": 0.22, + "step": 51790 + }, + { + "epoch": 9.75, + "grad_norm": 0.16648948192596436, + "learning_rate": 5.006587615283268e-07, + "loss": 0.1263, + "step": 51800 + }, + { + "epoch": 9.75, + "grad_norm": 32.999053955078125, + "learning_rate": 4.968944099378882e-07, + "loss": 0.2778, + "step": 51810 + }, + { + "epoch": 9.75, + "grad_norm": 54.918148040771484, + "learning_rate": 4.931300583474497e-07, + "loss": 0.5931, + "step": 51820 + }, + { + "epoch": 9.76, + "grad_norm": 28.86396026611328, + "learning_rate": 4.893657067570112e-07, + "loss": 0.2479, + "step": 51830 + }, + { + "epoch": 9.76, + "grad_norm": 33.95524597167969, + "learning_rate": 4.856013551665726e-07, + "loss": 0.2287, + "step": 51840 + }, + { + "epoch": 9.76, + "grad_norm": 20.653039932250977, + "learning_rate": 4.818370035761341e-07, + "loss": 0.4423, + "step": 51850 + }, + { + "epoch": 9.76, + "grad_norm": 13.929657936096191, + "learning_rate": 4.780726519856954e-07, + "loss": 0.5723, + "step": 51860 + }, + { + "epoch": 9.76, + "grad_norm": 7.612604141235352, + "learning_rate": 4.743083003952569e-07, + "loss": 0.6222, + "step": 51870 + }, + { + "epoch": 9.76, + "grad_norm": 14.222421646118164, + "learning_rate": 4.705439488048184e-07, + "loss": 0.3785, + "step": 51880 + }, + { + "epoch": 9.77, + "grad_norm": 0.42615869641304016, + "learning_rate": 4.667795972143799e-07, + "loss": 0.4963, + "step": 51890 + }, + { + "epoch": 9.77, + "grad_norm": 7.262185096740723, + "learning_rate": 4.6301524562394134e-07, + "loss": 0.6145, + "step": 51900 + }, + { + "epoch": 9.77, + "grad_norm": 11.753568649291992, + "learning_rate": 4.5925089403350275e-07, + "loss": 0.5361, + "step": 51910 + }, + { + "epoch": 9.77, + "grad_norm": 0.5254884362220764, + "learning_rate": 4.554865424430642e-07, + "loss": 0.6324, + "step": 51920 + }, + { + "epoch": 9.77, + "grad_norm": 0.03477970510721207, + "learning_rate": 4.517221908526256e-07, + "loss": 0.401, + "step": 51930 + }, + { + "epoch": 9.78, + "grad_norm": 15.322552680969238, + "learning_rate": 4.4795783926218713e-07, + "loss": 0.5914, + "step": 51940 + }, + { + "epoch": 9.78, + "grad_norm": 8.4584379196167, + "learning_rate": 4.441934876717486e-07, + "loss": 0.4317, + "step": 51950 + }, + { + "epoch": 9.78, + "grad_norm": 15.443913459777832, + "learning_rate": 4.4042913608131005e-07, + "loss": 0.4615, + "step": 51960 + }, + { + "epoch": 9.78, + "grad_norm": 6.165141582489014, + "learning_rate": 4.3666478449087146e-07, + "loss": 0.3485, + "step": 51970 + }, + { + "epoch": 9.78, + "grad_norm": 22.44362449645996, + "learning_rate": 4.329004329004329e-07, + "loss": 0.493, + "step": 51980 + }, + { + "epoch": 9.79, + "grad_norm": 0.9760726690292358, + "learning_rate": 4.291360813099944e-07, + "loss": 0.2653, + "step": 51990 + }, + { + "epoch": 9.79, + "grad_norm": 0.020068148151040077, + "learning_rate": 4.253717297195559e-07, + "loss": 0.2711, + "step": 52000 + }, + { + "epoch": 9.79, + "grad_norm": 14.025554656982422, + "learning_rate": 4.216073781291173e-07, + "loss": 0.4435, + "step": 52010 + }, + { + "epoch": 9.79, + "grad_norm": 0.10281354933977127, + "learning_rate": 4.1784302653867876e-07, + "loss": 0.2737, + "step": 52020 + }, + { + "epoch": 9.79, + "grad_norm": 17.83829116821289, + "learning_rate": 4.1407867494824017e-07, + "loss": 0.5229, + "step": 52030 + }, + { + "epoch": 9.79, + "grad_norm": 0.09223546832799911, + "learning_rate": 4.1031432335780163e-07, + "loss": 0.4226, + "step": 52040 + }, + { + "epoch": 9.8, + "grad_norm": 0.02461303025484085, + "learning_rate": 4.065499717673631e-07, + "loss": 0.2307, + "step": 52050 + }, + { + "epoch": 9.8, + "grad_norm": 12.551608085632324, + "learning_rate": 4.027856201769246e-07, + "loss": 0.1268, + "step": 52060 + }, + { + "epoch": 9.8, + "grad_norm": 12.722687721252441, + "learning_rate": 3.99021268586486e-07, + "loss": 0.5292, + "step": 52070 + }, + { + "epoch": 9.8, + "grad_norm": 7.628570556640625, + "learning_rate": 3.9525691699604747e-07, + "loss": 0.2919, + "step": 52080 + }, + { + "epoch": 9.8, + "grad_norm": 7.446393013000488, + "learning_rate": 3.9149256540560893e-07, + "loss": 0.426, + "step": 52090 + }, + { + "epoch": 9.81, + "grad_norm": 24.474695205688477, + "learning_rate": 3.8772821381517034e-07, + "loss": 0.4843, + "step": 52100 + }, + { + "epoch": 9.81, + "grad_norm": 14.59511661529541, + "learning_rate": 3.839638622247318e-07, + "loss": 0.4363, + "step": 52110 + }, + { + "epoch": 9.81, + "grad_norm": 0.08716907352209091, + "learning_rate": 3.801995106342933e-07, + "loss": 0.3595, + "step": 52120 + }, + { + "epoch": 9.81, + "grad_norm": 32.84730529785156, + "learning_rate": 3.764351590438547e-07, + "loss": 0.5217, + "step": 52130 + }, + { + "epoch": 9.81, + "grad_norm": 1.3554562330245972, + "learning_rate": 3.726708074534162e-07, + "loss": 0.2144, + "step": 52140 + }, + { + "epoch": 9.82, + "grad_norm": 14.616629600524902, + "learning_rate": 3.6890645586297765e-07, + "loss": 0.4909, + "step": 52150 + }, + { + "epoch": 9.82, + "grad_norm": 15.903406143188477, + "learning_rate": 3.6514210427253905e-07, + "loss": 0.6216, + "step": 52160 + }, + { + "epoch": 9.82, + "grad_norm": 10.447539329528809, + "learning_rate": 3.613777526821005e-07, + "loss": 0.3794, + "step": 52170 + }, + { + "epoch": 9.82, + "grad_norm": 5.395421981811523, + "learning_rate": 3.5761340109166203e-07, + "loss": 0.4563, + "step": 52180 + }, + { + "epoch": 9.82, + "grad_norm": 2.901611328125, + "learning_rate": 3.538490495012235e-07, + "loss": 0.3466, + "step": 52190 + }, + { + "epoch": 9.82, + "grad_norm": 7.655229568481445, + "learning_rate": 3.500846979107849e-07, + "loss": 0.5837, + "step": 52200 + }, + { + "epoch": 9.83, + "grad_norm": 46.457275390625, + "learning_rate": 3.4632034632034636e-07, + "loss": 0.3432, + "step": 52210 + }, + { + "epoch": 9.83, + "grad_norm": 0.17484161257743835, + "learning_rate": 3.4255599472990776e-07, + "loss": 0.3191, + "step": 52220 + }, + { + "epoch": 9.83, + "grad_norm": 21.340837478637695, + "learning_rate": 3.387916431394692e-07, + "loss": 0.9333, + "step": 52230 + }, + { + "epoch": 9.83, + "grad_norm": 9.154580116271973, + "learning_rate": 3.350272915490307e-07, + "loss": 0.2877, + "step": 52240 + }, + { + "epoch": 9.83, + "grad_norm": 31.7108097076416, + "learning_rate": 3.312629399585922e-07, + "loss": 0.5454, + "step": 52250 + }, + { + "epoch": 9.84, + "grad_norm": 0.7132735252380371, + "learning_rate": 3.274985883681536e-07, + "loss": 0.2906, + "step": 52260 + }, + { + "epoch": 9.84, + "grad_norm": 9.988203048706055, + "learning_rate": 3.2373423677771507e-07, + "loss": 0.3561, + "step": 52270 + }, + { + "epoch": 9.84, + "grad_norm": 20.169540405273438, + "learning_rate": 3.1996988518727653e-07, + "loss": 0.5719, + "step": 52280 + }, + { + "epoch": 9.84, + "grad_norm": 2.338794708251953, + "learning_rate": 3.1620553359683794e-07, + "loss": 0.5348, + "step": 52290 + }, + { + "epoch": 9.84, + "grad_norm": 9.757970809936523, + "learning_rate": 3.1244118200639945e-07, + "loss": 0.3251, + "step": 52300 + }, + { + "epoch": 9.85, + "grad_norm": 16.40189552307129, + "learning_rate": 3.0867683041596086e-07, + "loss": 0.4938, + "step": 52310 + }, + { + "epoch": 9.85, + "grad_norm": 0.104170061647892, + "learning_rate": 3.049124788255223e-07, + "loss": 0.473, + "step": 52320 + }, + { + "epoch": 9.85, + "grad_norm": 16.414779663085938, + "learning_rate": 3.011481272350838e-07, + "loss": 0.4607, + "step": 52330 + }, + { + "epoch": 9.85, + "grad_norm": 10.880690574645996, + "learning_rate": 2.9738377564464524e-07, + "loss": 0.5413, + "step": 52340 + }, + { + "epoch": 9.85, + "grad_norm": 0.09918955713510513, + "learning_rate": 2.936194240542067e-07, + "loss": 0.3942, + "step": 52350 + }, + { + "epoch": 9.86, + "grad_norm": 31.41875457763672, + "learning_rate": 2.8985507246376816e-07, + "loss": 0.5384, + "step": 52360 + }, + { + "epoch": 9.86, + "grad_norm": 15.386962890625, + "learning_rate": 2.8609072087332957e-07, + "loss": 0.4931, + "step": 52370 + }, + { + "epoch": 9.86, + "grad_norm": 6.390880584716797, + "learning_rate": 2.823263692828911e-07, + "loss": 0.3672, + "step": 52380 + }, + { + "epoch": 9.86, + "grad_norm": 29.264728546142578, + "learning_rate": 2.785620176924525e-07, + "loss": 0.2607, + "step": 52390 + }, + { + "epoch": 9.86, + "grad_norm": 10.851874351501465, + "learning_rate": 2.7479766610201395e-07, + "loss": 0.6167, + "step": 52400 + }, + { + "epoch": 9.86, + "grad_norm": 23.39576530456543, + "learning_rate": 2.710333145115754e-07, + "loss": 0.4207, + "step": 52410 + }, + { + "epoch": 9.87, + "grad_norm": 20.70049476623535, + "learning_rate": 2.6726896292113687e-07, + "loss": 0.502, + "step": 52420 + }, + { + "epoch": 9.87, + "grad_norm": 12.70500373840332, + "learning_rate": 2.635046113306983e-07, + "loss": 0.4077, + "step": 52430 + }, + { + "epoch": 9.87, + "grad_norm": 0.4478881359100342, + "learning_rate": 2.597402597402598e-07, + "loss": 0.184, + "step": 52440 + }, + { + "epoch": 9.87, + "grad_norm": 8.121723175048828, + "learning_rate": 2.559759081498212e-07, + "loss": 0.2444, + "step": 52450 + }, + { + "epoch": 9.87, + "grad_norm": 0.5330722332000732, + "learning_rate": 2.5221155655938266e-07, + "loss": 0.4361, + "step": 52460 + }, + { + "epoch": 9.88, + "grad_norm": 31.016155242919922, + "learning_rate": 2.484472049689441e-07, + "loss": 0.5419, + "step": 52470 + }, + { + "epoch": 9.88, + "grad_norm": 7.886919975280762, + "learning_rate": 2.446828533785056e-07, + "loss": 0.4408, + "step": 52480 + }, + { + "epoch": 9.88, + "grad_norm": 29.892057418823242, + "learning_rate": 2.4091850178806704e-07, + "loss": 0.4313, + "step": 52490 + }, + { + "epoch": 9.88, + "grad_norm": 0.040985073894262314, + "learning_rate": 2.3715415019762845e-07, + "loss": 0.3838, + "step": 52500 + }, + { + "epoch": 9.88, + "grad_norm": 20.565820693969727, + "learning_rate": 2.3338979860718994e-07, + "loss": 0.5992, + "step": 52510 + }, + { + "epoch": 9.89, + "grad_norm": 9.025721549987793, + "learning_rate": 2.2962544701675137e-07, + "loss": 0.3285, + "step": 52520 + }, + { + "epoch": 9.89, + "grad_norm": 18.053125381469727, + "learning_rate": 2.258610954263128e-07, + "loss": 0.4098, + "step": 52530 + }, + { + "epoch": 9.89, + "grad_norm": 20.12868309020996, + "learning_rate": 2.220967438358743e-07, + "loss": 0.5116, + "step": 52540 + }, + { + "epoch": 9.89, + "grad_norm": 18.908733367919922, + "learning_rate": 2.1833239224543573e-07, + "loss": 0.5385, + "step": 52550 + }, + { + "epoch": 9.89, + "grad_norm": 1.1563811302185059, + "learning_rate": 2.145680406549972e-07, + "loss": 0.5077, + "step": 52560 + }, + { + "epoch": 9.89, + "grad_norm": 4.780964374542236, + "learning_rate": 2.1080368906455865e-07, + "loss": 0.4955, + "step": 52570 + }, + { + "epoch": 9.9, + "grad_norm": 9.796252250671387, + "learning_rate": 2.0703933747412008e-07, + "loss": 0.4149, + "step": 52580 + }, + { + "epoch": 9.9, + "grad_norm": 0.043842703104019165, + "learning_rate": 2.0327498588368155e-07, + "loss": 0.2674, + "step": 52590 + }, + { + "epoch": 9.9, + "grad_norm": 17.572214126586914, + "learning_rate": 1.99510634293243e-07, + "loss": 0.3753, + "step": 52600 + }, + { + "epoch": 9.9, + "grad_norm": 1.6240875720977783, + "learning_rate": 1.9574628270280447e-07, + "loss": 0.2265, + "step": 52610 + }, + { + "epoch": 9.9, + "grad_norm": 23.174564361572266, + "learning_rate": 1.919819311123659e-07, + "loss": 0.7401, + "step": 52620 + }, + { + "epoch": 9.91, + "grad_norm": 6.317526817321777, + "learning_rate": 1.8821757952192736e-07, + "loss": 0.5374, + "step": 52630 + }, + { + "epoch": 9.91, + "grad_norm": 15.101905822753906, + "learning_rate": 1.8445322793148882e-07, + "loss": 0.3169, + "step": 52640 + }, + { + "epoch": 9.91, + "grad_norm": 12.77408218383789, + "learning_rate": 1.8068887634105026e-07, + "loss": 0.6194, + "step": 52650 + }, + { + "epoch": 9.91, + "grad_norm": 7.3459248542785645, + "learning_rate": 1.7692452475061174e-07, + "loss": 0.2904, + "step": 52660 + }, + { + "epoch": 9.91, + "grad_norm": 12.190526962280273, + "learning_rate": 1.7316017316017318e-07, + "loss": 0.4855, + "step": 52670 + }, + { + "epoch": 9.92, + "grad_norm": 7.406374931335449, + "learning_rate": 1.693958215697346e-07, + "loss": 0.2343, + "step": 52680 + }, + { + "epoch": 9.92, + "grad_norm": 0.23926931619644165, + "learning_rate": 1.656314699792961e-07, + "loss": 0.2742, + "step": 52690 + }, + { + "epoch": 9.92, + "grad_norm": 2.192960739135742, + "learning_rate": 1.6186711838885753e-07, + "loss": 0.3677, + "step": 52700 + }, + { + "epoch": 9.92, + "grad_norm": 12.674443244934082, + "learning_rate": 1.5810276679841897e-07, + "loss": 0.4027, + "step": 52710 + }, + { + "epoch": 9.92, + "grad_norm": 11.900289535522461, + "learning_rate": 1.5433841520798043e-07, + "loss": 0.3339, + "step": 52720 + }, + { + "epoch": 9.92, + "grad_norm": 14.372079849243164, + "learning_rate": 1.505740636175419e-07, + "loss": 0.5075, + "step": 52730 + }, + { + "epoch": 9.93, + "grad_norm": 2.849031925201416, + "learning_rate": 1.4680971202710335e-07, + "loss": 0.1418, + "step": 52740 + }, + { + "epoch": 9.93, + "grad_norm": 5.773308753967285, + "learning_rate": 1.4304536043666478e-07, + "loss": 0.4116, + "step": 52750 + }, + { + "epoch": 9.93, + "grad_norm": 10.172654151916504, + "learning_rate": 1.3928100884622625e-07, + "loss": 0.4328, + "step": 52760 + }, + { + "epoch": 9.93, + "grad_norm": 13.555858612060547, + "learning_rate": 1.355166572557877e-07, + "loss": 0.4646, + "step": 52770 + }, + { + "epoch": 9.93, + "grad_norm": 7.2199177742004395, + "learning_rate": 1.3175230566534914e-07, + "loss": 0.4224, + "step": 52780 + }, + { + "epoch": 9.94, + "grad_norm": 21.8734130859375, + "learning_rate": 1.279879540749106e-07, + "loss": 0.3808, + "step": 52790 + }, + { + "epoch": 9.94, + "grad_norm": 16.765634536743164, + "learning_rate": 1.2422360248447206e-07, + "loss": 0.615, + "step": 52800 + }, + { + "epoch": 9.94, + "grad_norm": 13.97728157043457, + "learning_rate": 1.2045925089403352e-07, + "loss": 0.516, + "step": 52810 + }, + { + "epoch": 9.94, + "grad_norm": 17.42995834350586, + "learning_rate": 1.1669489930359497e-07, + "loss": 0.3234, + "step": 52820 + }, + { + "epoch": 9.94, + "grad_norm": 0.2923338711261749, + "learning_rate": 1.129305477131564e-07, + "loss": 0.2994, + "step": 52830 + }, + { + "epoch": 9.95, + "grad_norm": 7.7803497314453125, + "learning_rate": 1.0916619612271786e-07, + "loss": 0.3944, + "step": 52840 + }, + { + "epoch": 9.95, + "grad_norm": 27.952133178710938, + "learning_rate": 1.0540184453227933e-07, + "loss": 0.3718, + "step": 52850 + }, + { + "epoch": 9.95, + "grad_norm": 34.63023376464844, + "learning_rate": 1.0163749294184077e-07, + "loss": 0.7521, + "step": 52860 + }, + { + "epoch": 9.95, + "grad_norm": 24.050615310668945, + "learning_rate": 9.787314135140223e-08, + "loss": 0.4687, + "step": 52870 + }, + { + "epoch": 9.95, + "grad_norm": 0.22404804825782776, + "learning_rate": 9.410878976096368e-08, + "loss": 0.3324, + "step": 52880 + }, + { + "epoch": 9.95, + "grad_norm": 8.329122543334961, + "learning_rate": 9.034443817052513e-08, + "loss": 0.5473, + "step": 52890 + }, + { + "epoch": 9.96, + "grad_norm": 2.2988033294677734, + "learning_rate": 8.658008658008659e-08, + "loss": 0.1834, + "step": 52900 + }, + { + "epoch": 9.96, + "grad_norm": 16.856712341308594, + "learning_rate": 8.281573498964805e-08, + "loss": 0.3419, + "step": 52910 + }, + { + "epoch": 9.96, + "grad_norm": 22.70846176147461, + "learning_rate": 7.905138339920948e-08, + "loss": 0.4666, + "step": 52920 + }, + { + "epoch": 9.96, + "grad_norm": 8.477212905883789, + "learning_rate": 7.528703180877094e-08, + "loss": 0.1898, + "step": 52930 + }, + { + "epoch": 9.96, + "grad_norm": 7.554108619689941, + "learning_rate": 7.152268021833239e-08, + "loss": 0.4636, + "step": 52940 + }, + { + "epoch": 9.97, + "grad_norm": 35.77024841308594, + "learning_rate": 6.775832862789385e-08, + "loss": 0.5227, + "step": 52950 + }, + { + "epoch": 9.97, + "grad_norm": 0.06595637649297714, + "learning_rate": 6.39939770374553e-08, + "loss": 0.3577, + "step": 52960 + }, + { + "epoch": 9.97, + "grad_norm": 19.368412017822266, + "learning_rate": 6.022962544701676e-08, + "loss": 0.5107, + "step": 52970 + }, + { + "epoch": 9.97, + "grad_norm": 0.05418463796377182, + "learning_rate": 5.64652738565782e-08, + "loss": 0.4543, + "step": 52980 + }, + { + "epoch": 9.97, + "grad_norm": 10.112632751464844, + "learning_rate": 5.270092226613966e-08, + "loss": 0.6876, + "step": 52990 + }, + { + "epoch": 9.98, + "grad_norm": 22.078157424926758, + "learning_rate": 4.893657067570112e-08, + "loss": 0.505, + "step": 53000 + }, + { + "epoch": 9.98, + "grad_norm": 7.689653396606445, + "learning_rate": 4.5172219085262564e-08, + "loss": 0.4111, + "step": 53010 + }, + { + "epoch": 9.98, + "grad_norm": 17.780681610107422, + "learning_rate": 4.1407867494824025e-08, + "loss": 0.6067, + "step": 53020 + }, + { + "epoch": 9.98, + "grad_norm": 54.270999908447266, + "learning_rate": 3.764351590438547e-08, + "loss": 0.3095, + "step": 53030 + }, + { + "epoch": 9.98, + "grad_norm": 12.391640663146973, + "learning_rate": 3.3879164313946926e-08, + "loss": 0.6079, + "step": 53040 + }, + { + "epoch": 9.98, + "grad_norm": 15.050522804260254, + "learning_rate": 3.011481272350838e-08, + "loss": 0.6605, + "step": 53050 + }, + { + "epoch": 9.99, + "grad_norm": 10.04753303527832, + "learning_rate": 2.635046113306983e-08, + "loss": 0.5057, + "step": 53060 + }, + { + "epoch": 9.99, + "grad_norm": 14.447115898132324, + "learning_rate": 2.2586109542631282e-08, + "loss": 0.5889, + "step": 53070 + }, + { + "epoch": 9.99, + "grad_norm": 15.600353240966797, + "learning_rate": 1.8821757952192736e-08, + "loss": 0.3044, + "step": 53080 + }, + { + "epoch": 9.99, + "grad_norm": 41.5399284362793, + "learning_rate": 1.505740636175419e-08, + "loss": 0.3476, + "step": 53090 + }, + { + "epoch": 9.99, + "grad_norm": 0.05904613435268402, + "learning_rate": 1.1293054771315641e-08, + "loss": 0.3915, + "step": 53100 + }, + { + "epoch": 10.0, + "grad_norm": 0.6749778389930725, + "learning_rate": 7.528703180877095e-09, + "loss": 0.4269, + "step": 53110 + }, + { + "epoch": 10.0, + "grad_norm": 0.3194694519042969, + "learning_rate": 3.7643515904385476e-09, + "loss": 0.2406, + "step": 53120 + }, + { + "epoch": 10.0, + "grad_norm": 0.023369356989860535, + "learning_rate": 0.0, + "loss": 0.3274, + "step": 53130 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.9269333333333334, + "eval_loss": 0.3031522035598755, + "eval_runtime": 53.4027, + "eval_samples_per_second": 140.442, + "eval_steps_per_second": 17.565, + "step": 53130 + }, + { + "epoch": 10.0, + "step": 53130, + "total_flos": 3.29630230185984e+19, + "train_loss": 0.8302146609071148, + "train_runtime": 9599.828, + "train_samples_per_second": 44.272, + "train_steps_per_second": 5.534 + } + ], + "logging_steps": 10, + "max_steps": 53130, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 3.29630230185984e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}