{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9420444444444445, "eval_steps": 500, "global_step": 5624, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 9.912928587976149, "learning_rate": 1.4e-07, "loss": 1.0098, "step": 10 }, { "epoch": 0.01, "grad_norm": 9.065321089366511, "learning_rate": 2.8e-07, "loss": 1.0032, "step": 20 }, { "epoch": 0.02, "grad_norm": 4.529282680442283, "learning_rate": 4.2e-07, "loss": 0.9767, "step": 30 }, { "epoch": 0.03, "grad_norm": 4.079773534866118, "learning_rate": 5.6e-07, "loss": 0.9341, "step": 40 }, { "epoch": 0.04, "grad_norm": 1.7504240808921168, "learning_rate": 7.000000000000001e-07, "loss": 0.8727, "step": 50 }, { "epoch": 0.04, "grad_norm": 0.7446189301316752, "learning_rate": 8.4e-07, "loss": 0.7997, "step": 60 }, { "epoch": 0.05, "grad_norm": 0.5893489037586176, "learning_rate": 9.800000000000001e-07, "loss": 0.7828, "step": 70 }, { "epoch": 0.06, "grad_norm": 0.5798012459841311, "learning_rate": 1.12e-06, "loss": 0.7671, "step": 80 }, { "epoch": 0.06, "grad_norm": 0.5143143417454488, "learning_rate": 1.26e-06, "loss": 0.777, "step": 90 }, { "epoch": 0.07, "grad_norm": 0.5006881361687121, "learning_rate": 1.4000000000000001e-06, "loss": 0.7709, "step": 100 }, { "epoch": 0.08, "grad_norm": 0.5268772561224019, "learning_rate": 1.54e-06, "loss": 0.7751, "step": 110 }, { "epoch": 0.09, "grad_norm": 0.49059329535011015, "learning_rate": 1.68e-06, "loss": 0.7588, "step": 120 }, { "epoch": 0.09, "grad_norm": 0.548982179156723, "learning_rate": 1.82e-06, "loss": 0.758, "step": 130 }, { "epoch": 0.1, "grad_norm": 0.5118740800557817, "learning_rate": 1.9600000000000003e-06, "loss": 0.7492, "step": 140 }, { "epoch": 0.11, "grad_norm": 0.47988356348194033, "learning_rate": 2.1e-06, "loss": 0.7479, "step": 150 }, { "epoch": 0.11, "grad_norm": 0.5324095582498372, "learning_rate": 2.24e-06, "loss": 0.7344, "step": 160 }, { "epoch": 0.12, "grad_norm": 0.49578185528674784, "learning_rate": 2.38e-06, "loss": 0.7379, "step": 170 }, { "epoch": 0.13, "grad_norm": 0.4751722809020323, "learning_rate": 2.52e-06, "loss": 0.7515, "step": 180 }, { "epoch": 0.14, "grad_norm": 0.4898512842949614, "learning_rate": 2.66e-06, "loss": 0.7428, "step": 190 }, { "epoch": 0.14, "grad_norm": 0.4938014103724035, "learning_rate": 2.8000000000000003e-06, "loss": 0.7356, "step": 200 }, { "epoch": 0.15, "grad_norm": 0.4853179196888149, "learning_rate": 2.94e-06, "loss": 0.7338, "step": 210 }, { "epoch": 0.16, "grad_norm": 0.5006261354893382, "learning_rate": 3.08e-06, "loss": 0.7228, "step": 220 }, { "epoch": 0.16, "grad_norm": 0.49494536099466524, "learning_rate": 3.22e-06, "loss": 0.7371, "step": 230 }, { "epoch": 0.17, "grad_norm": 0.4745560090617258, "learning_rate": 3.36e-06, "loss": 0.7374, "step": 240 }, { "epoch": 0.18, "grad_norm": 0.458424659300056, "learning_rate": 3.5e-06, "loss": 0.7284, "step": 250 }, { "epoch": 0.18, "grad_norm": 0.4918105642778609, "learning_rate": 3.64e-06, "loss": 0.719, "step": 260 }, { "epoch": 0.19, "grad_norm": 0.45994092727545755, "learning_rate": 3.7800000000000002e-06, "loss": 0.7328, "step": 270 }, { "epoch": 0.2, "grad_norm": 0.4888877840053054, "learning_rate": 3.920000000000001e-06, "loss": 0.7257, "step": 280 }, { "epoch": 0.21, "grad_norm": 0.4891132357037931, "learning_rate": 4.059999999999999e-06, "loss": 0.7146, "step": 290 }, { "epoch": 0.21, "grad_norm": 0.4659780107286472, "learning_rate": 4.2e-06, "loss": 0.7207, "step": 300 }, { "epoch": 0.22, "grad_norm": 0.4747662452681582, "learning_rate": 4.34e-06, "loss": 0.7196, "step": 310 }, { "epoch": 0.23, "grad_norm": 0.46183058951309874, "learning_rate": 4.48e-06, "loss": 0.7166, "step": 320 }, { "epoch": 0.23, "grad_norm": 0.47556837186042844, "learning_rate": 4.62e-06, "loss": 0.7138, "step": 330 }, { "epoch": 0.24, "grad_norm": 0.4646419935884572, "learning_rate": 4.76e-06, "loss": 0.7166, "step": 340 }, { "epoch": 0.25, "grad_norm": 0.47208612393069765, "learning_rate": 4.9e-06, "loss": 0.7071, "step": 350 }, { "epoch": 0.26, "grad_norm": 0.47395551626034477, "learning_rate": 5.04e-06, "loss": 0.7081, "step": 360 }, { "epoch": 0.26, "grad_norm": 0.46256038389399284, "learning_rate": 5.1799999999999995e-06, "loss": 0.7112, "step": 370 }, { "epoch": 0.27, "grad_norm": 0.44989559880311664, "learning_rate": 5.32e-06, "loss": 0.7157, "step": 380 }, { "epoch": 0.28, "grad_norm": 0.4759980664139243, "learning_rate": 5.46e-06, "loss": 0.716, "step": 390 }, { "epoch": 0.28, "grad_norm": 0.47761427911509746, "learning_rate": 5.600000000000001e-06, "loss": 0.6936, "step": 400 }, { "epoch": 0.29, "grad_norm": 0.4823631066912239, "learning_rate": 5.739999999999999e-06, "loss": 0.7096, "step": 410 }, { "epoch": 0.3, "grad_norm": 0.4692563644972781, "learning_rate": 5.88e-06, "loss": 0.6955, "step": 420 }, { "epoch": 0.31, "grad_norm": 0.4758216043542266, "learning_rate": 6.02e-06, "loss": 0.7046, "step": 430 }, { "epoch": 0.31, "grad_norm": 0.4607724176991764, "learning_rate": 6.16e-06, "loss": 0.7071, "step": 440 }, { "epoch": 0.32, "grad_norm": 0.47650098464440593, "learning_rate": 6.3e-06, "loss": 0.6948, "step": 450 }, { "epoch": 0.33, "grad_norm": 0.4927763843500283, "learning_rate": 6.44e-06, "loss": 0.7138, "step": 460 }, { "epoch": 0.33, "grad_norm": 0.44343044028786904, "learning_rate": 6.58e-06, "loss": 0.7033, "step": 470 }, { "epoch": 0.34, "grad_norm": 0.45708129790603597, "learning_rate": 6.72e-06, "loss": 0.7038, "step": 480 }, { "epoch": 0.35, "grad_norm": 0.47564264251663835, "learning_rate": 6.8599999999999995e-06, "loss": 0.6974, "step": 490 }, { "epoch": 0.36, "grad_norm": 0.4561386006973232, "learning_rate": 7e-06, "loss": 0.702, "step": 500 }, { "epoch": 0.36, "grad_norm": 0.4318637464381274, "learning_rate": 6.999934216315939e-06, "loss": 0.7054, "step": 510 }, { "epoch": 0.37, "grad_norm": 0.47772094451329594, "learning_rate": 6.999736867736609e-06, "loss": 0.6946, "step": 520 }, { "epoch": 0.38, "grad_norm": 0.45891608711087106, "learning_rate": 6.9994079616804764e-06, "loss": 0.6952, "step": 530 }, { "epoch": 0.38, "grad_norm": 0.46731862765960264, "learning_rate": 6.9989475105113426e-06, "loss": 0.6888, "step": 540 }, { "epoch": 0.39, "grad_norm": 0.4667223098464595, "learning_rate": 6.998355531537879e-06, "loss": 0.7017, "step": 550 }, { "epoch": 0.4, "grad_norm": 0.46285196540927176, "learning_rate": 6.997632047012975e-06, "loss": 0.7051, "step": 560 }, { "epoch": 0.41, "grad_norm": 0.48044807815149254, "learning_rate": 6.996777084132904e-06, "loss": 0.701, "step": 570 }, { "epoch": 0.41, "grad_norm": 0.47600970966063727, "learning_rate": 6.995790675036298e-06, "loss": 0.7001, "step": 580 }, { "epoch": 0.42, "grad_norm": 0.4494522317826872, "learning_rate": 6.994672856802944e-06, "loss": 0.7042, "step": 590 }, { "epoch": 0.43, "grad_norm": 0.4623294450089233, "learning_rate": 6.993423671452386e-06, "loss": 0.69, "step": 600 }, { "epoch": 0.43, "grad_norm": 0.43825456028915594, "learning_rate": 6.9920431659423436e-06, "loss": 0.6996, "step": 610 }, { "epoch": 0.44, "grad_norm": 0.4568055452742323, "learning_rate": 6.990531392166956e-06, "loss": 0.6939, "step": 620 }, { "epoch": 0.45, "grad_norm": 0.4302767633743081, "learning_rate": 6.988888406954821e-06, "loss": 0.6898, "step": 630 }, { "epoch": 0.46, "grad_norm": 0.4762852616798798, "learning_rate": 6.9871142720668644e-06, "loss": 0.703, "step": 640 }, { "epoch": 0.46, "grad_norm": 0.4572026337069386, "learning_rate": 6.985209054194017e-06, "loss": 0.7004, "step": 650 }, { "epoch": 0.47, "grad_norm": 0.45803902960498666, "learning_rate": 6.983172824954708e-06, "loss": 0.6853, "step": 660 }, { "epoch": 0.48, "grad_norm": 0.44353624606381903, "learning_rate": 6.9810056608921725e-06, "loss": 0.7074, "step": 670 }, { "epoch": 0.48, "grad_norm": 0.44517458769087626, "learning_rate": 6.978707643471573e-06, "loss": 0.6988, "step": 680 }, { "epoch": 0.49, "grad_norm": 0.4616555458392388, "learning_rate": 6.97627885907694e-06, "loss": 0.7034, "step": 690 }, { "epoch": 0.5, "grad_norm": 0.4770896081066365, "learning_rate": 6.973719399007923e-06, "loss": 0.6935, "step": 700 }, { "epoch": 0.5, "grad_norm": 0.45665921054521347, "learning_rate": 6.9710293594763545e-06, "loss": 0.6773, "step": 710 }, { "epoch": 0.51, "grad_norm": 0.48834217157342125, "learning_rate": 6.968208841602645e-06, "loss": 0.6974, "step": 720 }, { "epoch": 0.52, "grad_norm": 0.4661409470252182, "learning_rate": 6.965257951411967e-06, "loss": 0.6796, "step": 730 }, { "epoch": 0.53, "grad_norm": 0.4249423447942054, "learning_rate": 6.962176799830279e-06, "loss": 0.686, "step": 740 }, { "epoch": 0.53, "grad_norm": 0.4517631229399239, "learning_rate": 6.958965502680155e-06, "loss": 0.6968, "step": 750 }, { "epoch": 0.54, "grad_norm": 0.4334006789419362, "learning_rate": 6.955624180676427e-06, "loss": 0.705, "step": 760 }, { "epoch": 0.55, "grad_norm": 0.44354874837116653, "learning_rate": 6.9521529594216516e-06, "loss": 0.6954, "step": 770 }, { "epoch": 0.55, "grad_norm": 0.4606606226964418, "learning_rate": 6.948551969401381e-06, "loss": 0.6965, "step": 780 }, { "epoch": 0.56, "grad_norm": 0.46221163538458165, "learning_rate": 6.94482134597927e-06, "loss": 0.695, "step": 790 }, { "epoch": 0.57, "grad_norm": 0.4636824720485381, "learning_rate": 6.940961229391975e-06, "loss": 0.6919, "step": 800 }, { "epoch": 0.58, "grad_norm": 0.4450527833539268, "learning_rate": 6.936971764743891e-06, "loss": 0.6977, "step": 810 }, { "epoch": 0.58, "grad_norm": 0.4358125416971688, "learning_rate": 6.932853102001694e-06, "loss": 0.6998, "step": 820 }, { "epoch": 0.59, "grad_norm": 0.45623590289661414, "learning_rate": 6.928605395988701e-06, "loss": 0.6954, "step": 830 }, { "epoch": 0.6, "grad_norm": 0.4536975058820564, "learning_rate": 6.924228806379058e-06, "loss": 0.6742, "step": 840 }, { "epoch": 0.6, "grad_norm": 0.4563719379438227, "learning_rate": 6.919723497691728e-06, "loss": 0.6921, "step": 850 }, { "epoch": 0.61, "grad_norm": 0.45279224746852664, "learning_rate": 6.915089639284313e-06, "loss": 0.6861, "step": 860 }, { "epoch": 0.62, "grad_norm": 0.466062080319079, "learning_rate": 6.910327405346686e-06, "loss": 0.6895, "step": 870 }, { "epoch": 0.63, "grad_norm": 0.443881137156012, "learning_rate": 6.905436974894443e-06, "loss": 0.7008, "step": 880 }, { "epoch": 0.63, "grad_norm": 0.47752762402129206, "learning_rate": 6.900418531762173e-06, "loss": 0.6985, "step": 890 }, { "epoch": 0.64, "grad_norm": 0.4542692407893758, "learning_rate": 6.89527226459655e-06, "loss": 0.6822, "step": 900 }, { "epoch": 0.65, "grad_norm": 0.4314820719874765, "learning_rate": 6.889998366849237e-06, "loss": 0.691, "step": 910 }, { "epoch": 0.65, "grad_norm": 0.4278370127210443, "learning_rate": 6.884597036769621e-06, "loss": 0.689, "step": 920 }, { "epoch": 0.66, "grad_norm": 0.45134601911703476, "learning_rate": 6.879068477397353e-06, "loss": 0.6898, "step": 930 }, { "epoch": 0.67, "grad_norm": 0.45160503192413054, "learning_rate": 6.87341289655472e-06, "loss": 0.6869, "step": 940 }, { "epoch": 0.68, "grad_norm": 0.41025143635863104, "learning_rate": 6.867630506838833e-06, "loss": 0.6984, "step": 950 }, { "epoch": 0.68, "grad_norm": 0.46520301654074564, "learning_rate": 6.861721525613633e-06, "loss": 0.6843, "step": 960 }, { "epoch": 0.69, "grad_norm": 0.451991102882798, "learning_rate": 6.8556861750017235e-06, "loss": 0.6962, "step": 970 }, { "epoch": 0.7, "grad_norm": 0.418111038766468, "learning_rate": 6.849524681876018e-06, "loss": 0.6797, "step": 980 }, { "epoch": 0.7, "grad_norm": 0.4403261547939229, "learning_rate": 6.843237277851211e-06, "loss": 0.6965, "step": 990 }, { "epoch": 0.71, "grad_norm": 0.426598785059419, "learning_rate": 6.836824199275074e-06, "loss": 0.6821, "step": 1000 }, { "epoch": 0.72, "grad_norm": 0.42988247771547117, "learning_rate": 6.830285687219569e-06, "loss": 0.6911, "step": 1010 }, { "epoch": 0.73, "grad_norm": 0.452230475071558, "learning_rate": 6.823621987471789e-06, "loss": 0.6851, "step": 1020 }, { "epoch": 0.73, "grad_norm": 0.4267205539811686, "learning_rate": 6.816833350524716e-06, "loss": 0.6777, "step": 1030 }, { "epoch": 0.74, "grad_norm": 0.44148424584394874, "learning_rate": 6.809920031567808e-06, "loss": 0.6838, "step": 1040 }, { "epoch": 0.75, "grad_norm": 0.43306877795839893, "learning_rate": 6.802882290477399e-06, "loss": 0.6864, "step": 1050 }, { "epoch": 0.75, "grad_norm": 0.4952482617663558, "learning_rate": 6.79572039180694e-06, "loss": 0.6904, "step": 1060 }, { "epoch": 0.76, "grad_norm": 0.45382453893592856, "learning_rate": 6.788434604777048e-06, "loss": 0.6795, "step": 1070 }, { "epoch": 0.77, "grad_norm": 0.452960843334945, "learning_rate": 6.781025203265388e-06, "loss": 0.6891, "step": 1080 }, { "epoch": 0.78, "grad_norm": 0.4537364245497661, "learning_rate": 6.773492465796373e-06, "loss": 0.6907, "step": 1090 }, { "epoch": 0.78, "grad_norm": 0.44929090527897886, "learning_rate": 6.765836675530703e-06, "loss": 0.6798, "step": 1100 }, { "epoch": 0.79, "grad_norm": 0.46381413350008455, "learning_rate": 6.758058120254715e-06, "loss": 0.6716, "step": 1110 }, { "epoch": 0.8, "grad_norm": 0.4309028536458763, "learning_rate": 6.750157092369563e-06, "loss": 0.6799, "step": 1120 }, { "epoch": 0.8, "grad_norm": 0.43717422966700575, "learning_rate": 6.742133888880233e-06, "loss": 0.6883, "step": 1130 }, { "epoch": 0.81, "grad_norm": 0.4459700930425581, "learning_rate": 6.7339888113843696e-06, "loss": 0.6891, "step": 1140 }, { "epoch": 0.82, "grad_norm": 0.44045298948848877, "learning_rate": 6.725722166060951e-06, "loss": 0.6817, "step": 1150 }, { "epoch": 0.82, "grad_norm": 0.4485899862146157, "learning_rate": 6.717334263658766e-06, "loss": 0.6897, "step": 1160 }, { "epoch": 0.83, "grad_norm": 0.45682000330961775, "learning_rate": 6.70882541948474e-06, "loss": 0.6776, "step": 1170 }, { "epoch": 0.84, "grad_norm": 0.48037041295136884, "learning_rate": 6.700195953392085e-06, "loss": 0.6872, "step": 1180 }, { "epoch": 0.85, "grad_norm": 0.44334741491819346, "learning_rate": 6.691446189768268e-06, "loss": 0.6798, "step": 1190 }, { "epoch": 0.85, "grad_norm": 0.4674740757760583, "learning_rate": 6.682576457522825e-06, "loss": 0.6977, "step": 1200 }, { "epoch": 0.86, "grad_norm": 0.4696181980144796, "learning_rate": 6.673587090074993e-06, "loss": 0.6896, "step": 1210 }, { "epoch": 0.87, "grad_norm": 0.4593954697303246, "learning_rate": 6.664478425341176e-06, "loss": 0.6749, "step": 1220 }, { "epoch": 0.87, "grad_norm": 0.41647753357217115, "learning_rate": 6.655250805722244e-06, "loss": 0.6894, "step": 1230 }, { "epoch": 0.88, "grad_norm": 0.4245409839045758, "learning_rate": 6.645904578090662e-06, "loss": 0.6693, "step": 1240 }, { "epoch": 0.89, "grad_norm": 0.45490183172736, "learning_rate": 6.636440093777451e-06, "loss": 0.6881, "step": 1250 }, { "epoch": 0.9, "grad_norm": 0.4633877447287089, "learning_rate": 6.626857708558979e-06, "loss": 0.6953, "step": 1260 }, { "epoch": 0.9, "grad_norm": 0.45069656102358646, "learning_rate": 6.617157782643591e-06, "loss": 0.6787, "step": 1270 }, { "epoch": 0.91, "grad_norm": 0.44438426822862237, "learning_rate": 6.6073406806580646e-06, "loss": 0.6859, "step": 1280 }, { "epoch": 0.92, "grad_norm": 0.4335460798475662, "learning_rate": 6.597406771633906e-06, "loss": 0.6829, "step": 1290 }, { "epoch": 0.92, "grad_norm": 0.4282672786086354, "learning_rate": 6.587356428993477e-06, "loss": 0.6831, "step": 1300 }, { "epoch": 0.93, "grad_norm": 0.46465171297436636, "learning_rate": 6.577190030535957e-06, "loss": 0.6778, "step": 1310 }, { "epoch": 0.94, "grad_norm": 0.4590812961346198, "learning_rate": 6.566907958423142e-06, "loss": 0.6701, "step": 1320 }, { "epoch": 0.95, "grad_norm": 0.4180631333820519, "learning_rate": 6.5565105991650815e-06, "loss": 0.6825, "step": 1330 }, { "epoch": 0.95, "grad_norm": 0.42684427340923925, "learning_rate": 6.545998343605544e-06, "loss": 0.6823, "step": 1340 }, { "epoch": 0.96, "grad_norm": 0.6515643833482546, "learning_rate": 6.5353715869073275e-06, "loss": 0.6748, "step": 1350 }, { "epoch": 0.97, "grad_norm": 0.42995190312179654, "learning_rate": 6.524630728537408e-06, "loss": 0.6896, "step": 1360 }, { "epoch": 0.97, "grad_norm": 0.4307066820527156, "learning_rate": 6.513776172251919e-06, "loss": 0.6821, "step": 1370 }, { "epoch": 0.98, "grad_norm": 0.4401373902110004, "learning_rate": 6.5028083260809735e-06, "loss": 0.6729, "step": 1380 }, { "epoch": 0.99, "grad_norm": 0.420372235119902, "learning_rate": 6.491727602313334e-06, "loss": 0.6812, "step": 1390 }, { "epoch": 1.0, "grad_norm": 0.44387468527179835, "learning_rate": 6.4805344174808986e-06, "loss": 0.6713, "step": 1400 }, { "epoch": 1.0, "grad_norm": 0.4224291568526637, "learning_rate": 6.4692291923430634e-06, "loss": 0.6928, "step": 1410 }, { "epoch": 1.01, "grad_norm": 0.42342827072921446, "learning_rate": 6.457812351870889e-06, "loss": 0.6925, "step": 1420 }, { "epoch": 1.02, "grad_norm": 0.4614687139520872, "learning_rate": 6.446284325231132e-06, "loss": 0.6804, "step": 1430 }, { "epoch": 1.01, "grad_norm": 0.4513094113300999, "learning_rate": 6.434645545770116e-06, "loss": 0.649, "step": 1440 }, { "epoch": 1.01, "grad_norm": 0.46129242006354043, "learning_rate": 6.422896450997434e-06, "loss": 0.6244, "step": 1450 }, { "epoch": 1.02, "grad_norm": 0.44352477273420793, "learning_rate": 6.411037482569509e-06, "loss": 0.6231, "step": 1460 }, { "epoch": 1.03, "grad_norm": 0.43347730975194065, "learning_rate": 6.399069086272988e-06, "loss": 0.6163, "step": 1470 }, { "epoch": 1.03, "grad_norm": 0.5042235757137699, "learning_rate": 6.386991712007985e-06, "loss": 0.6295, "step": 1480 }, { "epoch": 1.04, "grad_norm": 0.4635765704926019, "learning_rate": 6.374805813771171e-06, "loss": 0.6145, "step": 1490 }, { "epoch": 1.05, "grad_norm": 0.4672283056367441, "learning_rate": 6.362511849638706e-06, "loss": 0.6248, "step": 1500 }, { "epoch": 1.05, "grad_norm": 0.44386378239345664, "learning_rate": 6.3501102817490184e-06, "loss": 0.6208, "step": 1510 }, { "epoch": 1.06, "grad_norm": 0.45014512458671113, "learning_rate": 6.337601576285438e-06, "loss": 0.6241, "step": 1520 }, { "epoch": 1.07, "grad_norm": 0.47077991205008496, "learning_rate": 6.324986203458665e-06, "loss": 0.637, "step": 1530 }, { "epoch": 1.08, "grad_norm": 0.43971957336428713, "learning_rate": 6.3122646374891014e-06, "loss": 0.6274, "step": 1540 }, { "epoch": 1.08, "grad_norm": 0.45398595356146343, "learning_rate": 6.299437356589018e-06, "loss": 0.6172, "step": 1550 }, { "epoch": 1.09, "grad_norm": 0.4638039927896387, "learning_rate": 6.2865048429445835e-06, "loss": 0.6162, "step": 1560 }, { "epoch": 1.1, "grad_norm": 0.456884430778857, "learning_rate": 6.273467582697736e-06, "loss": 0.6358, "step": 1570 }, { "epoch": 1.1, "grad_norm": 0.4513273711536076, "learning_rate": 6.260326065927908e-06, "loss": 0.6256, "step": 1580 }, { "epoch": 1.11, "grad_norm": 0.4585546365167011, "learning_rate": 6.247080786633608e-06, "loss": 0.6343, "step": 1590 }, { "epoch": 1.12, "grad_norm": 0.4837809920582229, "learning_rate": 6.233732242713847e-06, "loss": 0.6205, "step": 1600 }, { "epoch": 1.13, "grad_norm": 0.45062031874118463, "learning_rate": 6.220280935949423e-06, "loss": 0.6181, "step": 1610 }, { "epoch": 1.13, "grad_norm": 0.4934582241182996, "learning_rate": 6.206727371984055e-06, "loss": 0.6101, "step": 1620 }, { "epoch": 1.14, "grad_norm": 0.45848465100131724, "learning_rate": 6.193072060305386e-06, "loss": 0.6274, "step": 1630 }, { "epoch": 1.15, "grad_norm": 0.49225379713590917, "learning_rate": 6.17931551422582e-06, "loss": 0.6287, "step": 1640 }, { "epoch": 1.15, "grad_norm": 0.43783738072351636, "learning_rate": 6.165458250863233e-06, "loss": 0.6322, "step": 1650 }, { "epoch": 1.16, "grad_norm": 0.45111919610212603, "learning_rate": 6.15150079112153e-06, "loss": 0.6343, "step": 1660 }, { "epoch": 1.17, "grad_norm": 0.7283719867926337, "learning_rate": 6.137443659671066e-06, "loss": 0.6245, "step": 1670 }, { "epoch": 1.18, "grad_norm": 0.4317614230374671, "learning_rate": 6.123287384928924e-06, "loss": 0.6252, "step": 1680 }, { "epoch": 1.18, "grad_norm": 0.43630742763076885, "learning_rate": 6.1090324990390505e-06, "loss": 0.6281, "step": 1690 }, { "epoch": 1.19, "grad_norm": 0.49179102646470696, "learning_rate": 6.09467953785225e-06, "loss": 0.6304, "step": 1700 }, { "epoch": 1.2, "grad_norm": 0.4269421327683836, "learning_rate": 6.080229040906045e-06, "loss": 0.6205, "step": 1710 }, { "epoch": 1.2, "grad_norm": 0.44873848635658836, "learning_rate": 6.065681551404392e-06, "loss": 0.6203, "step": 1720 }, { "epoch": 1.21, "grad_norm": 0.43522811508044484, "learning_rate": 6.051037616197267e-06, "loss": 0.6233, "step": 1730 }, { "epoch": 1.22, "grad_norm": 0.43363424076560303, "learning_rate": 6.036297785760099e-06, "loss": 0.6274, "step": 1740 }, { "epoch": 1.23, "grad_norm": 0.4420787259752861, "learning_rate": 6.0214626141730895e-06, "loss": 0.6388, "step": 1750 }, { "epoch": 1.23, "grad_norm": 0.445119846862499, "learning_rate": 6.006532659100377e-06, "loss": 0.6107, "step": 1760 }, { "epoch": 1.24, "grad_norm": 0.4380767674114949, "learning_rate": 5.991508481769071e-06, "loss": 0.6341, "step": 1770 }, { "epoch": 1.25, "grad_norm": 0.44003117819419657, "learning_rate": 5.976390646948166e-06, "loss": 0.6344, "step": 1780 }, { "epoch": 1.25, "grad_norm": 0.45806509086322245, "learning_rate": 5.961179722927302e-06, "loss": 0.6283, "step": 1790 }, { "epoch": 1.26, "grad_norm": 0.4545928600817147, "learning_rate": 5.9458762814954016e-06, "loss": 0.6254, "step": 1800 }, { "epoch": 1.27, "grad_norm": 0.4438181707408447, "learning_rate": 5.930480897919185e-06, "loss": 0.631, "step": 1810 }, { "epoch": 1.28, "grad_norm": 0.44695115171581695, "learning_rate": 5.9149941509215366e-06, "loss": 0.6338, "step": 1820 }, { "epoch": 1.28, "grad_norm": 0.4280430227739119, "learning_rate": 5.899416622659754e-06, "loss": 0.6182, "step": 1830 }, { "epoch": 1.29, "grad_norm": 0.458726186518369, "learning_rate": 5.883748898703666e-06, "loss": 0.6162, "step": 1840 }, { "epoch": 1.3, "grad_norm": 0.43445566304338457, "learning_rate": 5.8679915680136155e-06, "loss": 0.6228, "step": 1850 }, { "epoch": 1.3, "grad_norm": 0.44895947980462597, "learning_rate": 5.852145222918326e-06, "loss": 0.6373, "step": 1860 }, { "epoch": 1.31, "grad_norm": 0.43403817083393664, "learning_rate": 5.83621045909263e-06, "loss": 0.6376, "step": 1870 }, { "epoch": 1.32, "grad_norm": 0.4673939224968789, "learning_rate": 5.820187875535083e-06, "loss": 0.6215, "step": 1880 }, { "epoch": 1.33, "grad_norm": 0.46323588428022766, "learning_rate": 5.804078074545439e-06, "loss": 0.6187, "step": 1890 }, { "epoch": 1.33, "grad_norm": 0.4530033509696719, "learning_rate": 5.7878816617020204e-06, "loss": 0.6239, "step": 1900 }, { "epoch": 1.34, "grad_norm": 0.4317929663828983, "learning_rate": 5.771599245838943e-06, "loss": 0.6168, "step": 1910 }, { "epoch": 1.35, "grad_norm": 0.436592310414347, "learning_rate": 5.7552314390232364e-06, "loss": 0.6179, "step": 1920 }, { "epoch": 1.35, "grad_norm": 0.4702835623046126, "learning_rate": 5.738778856531832e-06, "loss": 0.6272, "step": 1930 }, { "epoch": 1.36, "grad_norm": 0.4619318889613922, "learning_rate": 5.72224211682844e-06, "loss": 0.6256, "step": 1940 }, { "epoch": 1.37, "grad_norm": 0.49429029776316813, "learning_rate": 5.705621841540292e-06, "loss": 0.6283, "step": 1950 }, { "epoch": 1.37, "grad_norm": 0.47054367378052575, "learning_rate": 5.688918655434783e-06, "loss": 0.6156, "step": 1960 }, { "epoch": 1.38, "grad_norm": 0.45638233691668284, "learning_rate": 5.67213318639598e-06, "loss": 0.6257, "step": 1970 }, { "epoch": 1.39, "grad_norm": 0.43819489071261747, "learning_rate": 5.655266065401021e-06, "loss": 0.6255, "step": 1980 }, { "epoch": 1.4, "grad_norm": 0.45603698357049277, "learning_rate": 5.638317926496398e-06, "loss": 0.6267, "step": 1990 }, { "epoch": 1.4, "grad_norm": 0.45518318702227223, "learning_rate": 5.6212894067741176e-06, "loss": 0.6357, "step": 2000 }, { "epoch": 1.41, "grad_norm": 0.4402683023420712, "learning_rate": 5.604181146347758e-06, "loss": 0.6311, "step": 2010 }, { "epoch": 1.42, "grad_norm": 0.4498808898227514, "learning_rate": 5.5869937883284065e-06, "loss": 0.6213, "step": 2020 }, { "epoch": 1.42, "grad_norm": 0.46040698115780887, "learning_rate": 5.569727978800478e-06, "loss": 0.6223, "step": 2030 }, { "epoch": 1.43, "grad_norm": 0.44168864627397236, "learning_rate": 5.552384366797435e-06, "loss": 0.6268, "step": 2040 }, { "epoch": 1.44, "grad_norm": 0.45494321235524204, "learning_rate": 5.534963604277388e-06, "loss": 0.6193, "step": 2050 }, { "epoch": 1.45, "grad_norm": 0.44543538788588954, "learning_rate": 5.517466346098587e-06, "loss": 0.6311, "step": 2060 }, { "epoch": 1.45, "grad_norm": 0.45370006917207745, "learning_rate": 5.4998932499948055e-06, "loss": 0.6263, "step": 2070 }, { "epoch": 1.46, "grad_norm": 0.4457705866746906, "learning_rate": 5.482244976550616e-06, "loss": 0.6267, "step": 2080 }, { "epoch": 1.47, "grad_norm": 0.44178347775287935, "learning_rate": 5.464522189176559e-06, "loss": 0.6168, "step": 2090 }, { "epoch": 1.47, "grad_norm": 0.4510685099498634, "learning_rate": 5.446725554084202e-06, "loss": 0.6071, "step": 2100 }, { "epoch": 1.48, "grad_norm": 0.4463056440103558, "learning_rate": 5.4288557402611e-06, "loss": 0.6193, "step": 2110 }, { "epoch": 1.49, "grad_norm": 0.4450825773000299, "learning_rate": 5.410913419445647e-06, "loss": 0.6114, "step": 2120 }, { "epoch": 1.5, "grad_norm": 0.4609214677792106, "learning_rate": 5.3928992661018194e-06, "loss": 0.6255, "step": 2130 }, { "epoch": 1.5, "grad_norm": 0.48687583594807843, "learning_rate": 5.374813957393832e-06, "loss": 0.6286, "step": 2140 }, { "epoch": 1.51, "grad_norm": 0.47549284042607015, "learning_rate": 5.356658173160674e-06, "loss": 0.6143, "step": 2150 }, { "epoch": 1.52, "grad_norm": 0.49532165280916113, "learning_rate": 5.338432595890562e-06, "loss": 0.6249, "step": 2160 }, { "epoch": 1.52, "grad_norm": 0.45253915740067313, "learning_rate": 5.320137910695275e-06, "loss": 0.6257, "step": 2170 }, { "epoch": 1.53, "grad_norm": 0.43721435814923637, "learning_rate": 5.301774805284408e-06, "loss": 0.6178, "step": 2180 }, { "epoch": 1.54, "grad_norm": 0.4683301857922748, "learning_rate": 5.2833439699395175e-06, "loss": 0.6173, "step": 2190 }, { "epoch": 1.55, "grad_norm": 0.43871464981194036, "learning_rate": 5.264846097488175e-06, "loss": 0.6214, "step": 2200 }, { "epoch": 1.55, "grad_norm": 0.4524085111628937, "learning_rate": 5.246281883277922e-06, "loss": 0.6346, "step": 2210 }, { "epoch": 1.56, "grad_norm": 0.4468406698869542, "learning_rate": 5.227652025150132e-06, "loss": 0.614, "step": 2220 }, { "epoch": 1.57, "grad_norm": 0.468252187542662, "learning_rate": 5.208957223413776e-06, "loss": 0.6057, "step": 2230 }, { "epoch": 1.57, "grad_norm": 0.46458186348478814, "learning_rate": 5.1901981808191e-06, "loss": 0.6192, "step": 2240 }, { "epoch": 1.58, "grad_norm": 0.4589397282179608, "learning_rate": 5.1713756025312095e-06, "loss": 0.6197, "step": 2250 }, { "epoch": 1.59, "grad_norm": 0.4733441471283767, "learning_rate": 5.1524901961035555e-06, "loss": 0.6146, "step": 2260 }, { "epoch": 1.6, "grad_norm": 0.49573981085967583, "learning_rate": 5.1335426714513436e-06, "loss": 0.6205, "step": 2270 }, { "epoch": 1.6, "grad_norm": 0.45753588591278177, "learning_rate": 5.114533740824848e-06, "loss": 0.6194, "step": 2280 }, { "epoch": 1.61, "grad_norm": 0.44981584915327405, "learning_rate": 5.095464118782631e-06, "loss": 0.6285, "step": 2290 }, { "epoch": 1.62, "grad_norm": 0.44941448245640475, "learning_rate": 5.076334522164687e-06, "loss": 0.6183, "step": 2300 }, { "epoch": 1.62, "grad_norm": 0.46348841235648264, "learning_rate": 5.057145670065498e-06, "loss": 0.6178, "step": 2310 }, { "epoch": 1.63, "grad_norm": 0.4819885899523623, "learning_rate": 5.037898283806995e-06, "loss": 0.6209, "step": 2320 }, { "epoch": 1.64, "grad_norm": 0.45974762343297226, "learning_rate": 5.018593086911453e-06, "loss": 0.6144, "step": 2330 }, { "epoch": 1.65, "grad_norm": 0.4832719455105882, "learning_rate": 4.999230805074284e-06, "loss": 0.6255, "step": 2340 }, { "epoch": 1.65, "grad_norm": 0.4580501245903807, "learning_rate": 4.979812166136764e-06, "loss": 0.622, "step": 2350 }, { "epoch": 1.66, "grad_norm": 0.4869292416366864, "learning_rate": 4.960337900058668e-06, "loss": 0.6295, "step": 2360 }, { "epoch": 1.67, "grad_norm": 0.44734991176527494, "learning_rate": 4.940808738890834e-06, "loss": 0.61, "step": 2370 }, { "epoch": 1.67, "grad_norm": 0.4836741219786191, "learning_rate": 4.921225416747647e-06, "loss": 0.6131, "step": 2380 }, { "epoch": 1.68, "grad_norm": 0.43868937063180397, "learning_rate": 4.901588669779433e-06, "loss": 0.6261, "step": 2390 }, { "epoch": 1.69, "grad_norm": 0.4549440779907735, "learning_rate": 4.881899236144797e-06, "loss": 0.6216, "step": 2400 }, { "epoch": 1.69, "grad_norm": 0.4561309327019534, "learning_rate": 4.862157855982875e-06, "loss": 0.6262, "step": 2410 }, { "epoch": 1.7, "grad_norm": 0.4521274007767562, "learning_rate": 4.8423652713855e-06, "loss": 0.6214, "step": 2420 }, { "epoch": 1.71, "grad_norm": 0.4876373591113174, "learning_rate": 4.822522226369323e-06, "loss": 0.6303, "step": 2430 }, { "epoch": 1.72, "grad_norm": 0.4403247558369275, "learning_rate": 4.802629466847827e-06, "loss": 0.6236, "step": 2440 }, { "epoch": 1.72, "grad_norm": 0.4392883872725244, "learning_rate": 4.782687740603308e-06, "loss": 0.6125, "step": 2450 }, { "epoch": 1.73, "grad_norm": 0.44359149108855517, "learning_rate": 4.762697797258742e-06, "loss": 0.6208, "step": 2460 }, { "epoch": 1.74, "grad_norm": 0.45892783125410747, "learning_rate": 4.742660388249629e-06, "loss": 0.6146, "step": 2470 }, { "epoch": 1.74, "grad_norm": 0.46353318895549067, "learning_rate": 4.722576266795729e-06, "loss": 0.6199, "step": 2480 }, { "epoch": 1.75, "grad_norm": 0.4642990741363008, "learning_rate": 4.702446187872758e-06, "loss": 0.6182, "step": 2490 }, { "epoch": 1.76, "grad_norm": 0.44827792507065956, "learning_rate": 4.682270908184003e-06, "loss": 0.6246, "step": 2500 }, { "epoch": 1.77, "grad_norm": 0.45544933714150454, "learning_rate": 4.662051186131876e-06, "loss": 0.6256, "step": 2510 }, { "epoch": 1.77, "grad_norm": 0.4485500362120205, "learning_rate": 4.641787781789412e-06, "loss": 0.6181, "step": 2520 }, { "epoch": 1.78, "grad_norm": 0.42631048877270405, "learning_rate": 4.6214814568716894e-06, "loss": 0.6331, "step": 2530 }, { "epoch": 1.79, "grad_norm": 0.4714279586473698, "learning_rate": 4.601132974707202e-06, "loss": 0.628, "step": 2540 }, { "epoch": 1.79, "grad_norm": 0.4228608375782349, "learning_rate": 4.5807431002091605e-06, "loss": 0.6054, "step": 2550 }, { "epoch": 1.8, "grad_norm": 0.46872660848782277, "learning_rate": 4.560312599846746e-06, "loss": 0.6102, "step": 2560 }, { "epoch": 1.81, "grad_norm": 0.4379038714391558, "learning_rate": 4.539842241616287e-06, "loss": 0.6143, "step": 2570 }, { "epoch": 1.82, "grad_norm": 0.4719919574560488, "learning_rate": 4.519332795012404e-06, "loss": 0.6197, "step": 2580 }, { "epoch": 1.82, "grad_norm": 0.4560470541146194, "learning_rate": 4.498785030999068e-06, "loss": 0.6132, "step": 2590 }, { "epoch": 1.83, "grad_norm": 0.48502107778992737, "learning_rate": 4.478199721980633e-06, "loss": 0.631, "step": 2600 }, { "epoch": 1.84, "grad_norm": 0.45288928959662245, "learning_rate": 4.457577641772792e-06, "loss": 0.6148, "step": 2610 }, { "epoch": 1.84, "grad_norm": 0.45740004712492455, "learning_rate": 4.436919565573495e-06, "loss": 0.613, "step": 2620 }, { "epoch": 1.85, "grad_norm": 0.4680089016865197, "learning_rate": 4.416226269933802e-06, "loss": 0.6109, "step": 2630 }, { "epoch": 1.86, "grad_norm": 0.4498754217059588, "learning_rate": 4.395498532728697e-06, "loss": 0.627, "step": 2640 }, { "epoch": 1.87, "grad_norm": 0.490510820257092, "learning_rate": 4.374737133127847e-06, "loss": 0.6287, "step": 2650 }, { "epoch": 1.87, "grad_norm": 0.4384793154811805, "learning_rate": 4.35394285156631e-06, "loss": 0.6265, "step": 2660 }, { "epoch": 1.88, "grad_norm": 0.42053564372682345, "learning_rate": 4.3331164697151995e-06, "loss": 0.6123, "step": 2670 }, { "epoch": 1.89, "grad_norm": 0.44499220286710817, "learning_rate": 4.3122587704523015e-06, "loss": 0.6196, "step": 2680 }, { "epoch": 1.89, "grad_norm": 0.4681953108721627, "learning_rate": 4.291370537832641e-06, "loss": 0.6301, "step": 2690 }, { "epoch": 1.9, "grad_norm": 0.4245150987038812, "learning_rate": 4.2704525570590185e-06, "loss": 0.6203, "step": 2700 }, { "epoch": 1.91, "grad_norm": 0.4738423212960381, "learning_rate": 4.2495056144524824e-06, "loss": 0.6159, "step": 2710 }, { "epoch": 1.92, "grad_norm": 0.49926406862961464, "learning_rate": 4.228530497422779e-06, "loss": 0.6193, "step": 2720 }, { "epoch": 1.92, "grad_norm": 0.4423739374256911, "learning_rate": 4.207527994438748e-06, "loss": 0.617, "step": 2730 }, { "epoch": 1.93, "grad_norm": 0.44692873617751755, "learning_rate": 4.186498894998689e-06, "loss": 0.6135, "step": 2740 }, { "epoch": 1.94, "grad_norm": 0.4358994979972626, "learning_rate": 4.165443989600678e-06, "loss": 0.6121, "step": 2750 }, { "epoch": 1.94, "grad_norm": 0.46452930431844286, "learning_rate": 4.144364069712854e-06, "loss": 0.6167, "step": 2760 }, { "epoch": 1.95, "grad_norm": 0.4816111574015236, "learning_rate": 4.123259927743669e-06, "loss": 0.6203, "step": 2770 }, { "epoch": 1.96, "grad_norm": 0.45232518080467465, "learning_rate": 4.102132357012098e-06, "loss": 0.6199, "step": 2780 }, { "epoch": 1.97, "grad_norm": 0.45515782747165817, "learning_rate": 4.08098215171782e-06, "loss": 0.6174, "step": 2790 }, { "epoch": 1.97, "grad_norm": 0.44933646029392305, "learning_rate": 4.059810106911363e-06, "loss": 0.6188, "step": 2800 }, { "epoch": 1.98, "grad_norm": 0.45633219759975596, "learning_rate": 4.038617018464217e-06, "loss": 0.6168, "step": 2810 }, { "epoch": 1.99, "grad_norm": 0.4663774750339656, "learning_rate": 4.017403683038914e-06, "loss": 0.6199, "step": 2820 }, { "epoch": 1.99, "grad_norm": 0.4565589400061048, "learning_rate": 3.996170898059087e-06, "loss": 0.6187, "step": 2830 }, { "epoch": 2.0, "grad_norm": 0.45638098232431645, "learning_rate": 3.97491946167949e-06, "loss": 0.6133, "step": 2840 }, { "epoch": 2.01, "grad_norm": 0.4330737687010161, "learning_rate": 3.9536501727559956e-06, "loss": 0.6179, "step": 2850 }, { "epoch": 2.01, "grad_norm": 0.44620897297773393, "learning_rate": 3.932363830815563e-06, "loss": 0.606, "step": 2860 }, { "epoch": 2.0, "grad_norm": 0.4727298461430969, "learning_rate": 3.911061236026192e-06, "loss": 0.5804, "step": 2870 }, { "epoch": 2.01, "grad_norm": 0.5332182751767908, "learning_rate": 3.889743189166831e-06, "loss": 0.5552, "step": 2880 }, { "epoch": 2.02, "grad_norm": 0.471875496548638, "learning_rate": 3.868410491597286e-06, "loss": 0.5467, "step": 2890 }, { "epoch": 2.02, "grad_norm": 0.4869637805163024, "learning_rate": 3.847063945228094e-06, "loss": 0.5691, "step": 2900 }, { "epoch": 2.03, "grad_norm": 0.4714418364302173, "learning_rate": 3.825704352490375e-06, "loss": 0.5788, "step": 2910 }, { "epoch": 2.04, "grad_norm": 0.49636094733662106, "learning_rate": 3.804332516305672e-06, "loss": 0.5583, "step": 2920 }, { "epoch": 2.05, "grad_norm": 0.48087980189754664, "learning_rate": 3.782949240055768e-06, "loss": 0.5632, "step": 2930 }, { "epoch": 2.05, "grad_norm": 0.4873147689537464, "learning_rate": 3.7615553275524852e-06, "loss": 0.5602, "step": 2940 }, { "epoch": 2.06, "grad_norm": 0.4603275098510104, "learning_rate": 3.74015158300747e-06, "loss": 0.5641, "step": 2950 }, { "epoch": 2.07, "grad_norm": 0.5162191764305892, "learning_rate": 3.7187388110019604e-06, "loss": 0.5628, "step": 2960 }, { "epoch": 2.07, "grad_norm": 0.49005627074608765, "learning_rate": 3.697317816456546e-06, "loss": 0.559, "step": 2970 }, { "epoch": 2.08, "grad_norm": 0.4585568665283943, "learning_rate": 3.6758894046009037e-06, "loss": 0.547, "step": 2980 }, { "epoch": 2.09, "grad_norm": 0.4506260874603515, "learning_rate": 3.6544543809435346e-06, "loss": 0.5433, "step": 2990 }, { "epoch": 2.1, "grad_norm": 0.46595533436834136, "learning_rate": 3.6330135512414822e-06, "loss": 0.5666, "step": 3000 }, { "epoch": 2.1, "grad_norm": 0.4690150184503375, "learning_rate": 3.6115677214700397e-06, "loss": 0.5596, "step": 3010 }, { "epoch": 2.11, "grad_norm": 0.4683369095498927, "learning_rate": 3.5901176977924606e-06, "loss": 0.5458, "step": 3020 }, { "epoch": 2.12, "grad_norm": 0.4710288608351933, "learning_rate": 3.568664286529646e-06, "loss": 0.5507, "step": 3030 }, { "epoch": 2.12, "grad_norm": 0.4928542807361932, "learning_rate": 3.5472082941298433e-06, "loss": 0.5665, "step": 3040 }, { "epoch": 2.13, "grad_norm": 0.4972921543225756, "learning_rate": 3.5257505271383217e-06, "loss": 0.5586, "step": 3050 }, { "epoch": 2.14, "grad_norm": 0.4855107426051562, "learning_rate": 3.504291792167063e-06, "loss": 0.5615, "step": 3060 }, { "epoch": 2.15, "grad_norm": 0.4623236179613674, "learning_rate": 3.4828328958644326e-06, "loss": 0.5638, "step": 3070 }, { "epoch": 2.15, "grad_norm": 0.46028741167647896, "learning_rate": 3.4613746448848622e-06, "loss": 0.5464, "step": 3080 }, { "epoch": 2.16, "grad_norm": 0.46156508300115645, "learning_rate": 3.439917845858524e-06, "loss": 0.567, "step": 3090 }, { "epoch": 2.17, "grad_norm": 0.5669489602625127, "learning_rate": 3.418463305361013e-06, "loss": 0.5524, "step": 3100 }, { "epoch": 2.17, "grad_norm": 0.49099941076825016, "learning_rate": 3.3970118298830207e-06, "loss": 0.5591, "step": 3110 }, { "epoch": 2.18, "grad_norm": 0.5207064606888653, "learning_rate": 3.3755642258000265e-06, "loss": 0.5538, "step": 3120 }, { "epoch": 2.19, "grad_norm": 0.4830219809120518, "learning_rate": 3.3541212993419773e-06, "loss": 0.5475, "step": 3130 }, { "epoch": 2.19, "grad_norm": 0.4801836621711601, "learning_rate": 3.3326838565629895e-06, "loss": 0.5413, "step": 3140 }, { "epoch": 2.2, "grad_norm": 0.47387958333244534, "learning_rate": 3.31125270331104e-06, "loss": 0.5537, "step": 3150 }, { "epoch": 2.21, "grad_norm": 0.5090490511350312, "learning_rate": 3.289828645197681e-06, "loss": 0.5567, "step": 3160 }, { "epoch": 2.22, "grad_norm": 0.5286353188714713, "learning_rate": 3.2684124875677518e-06, "loss": 0.5589, "step": 3170 }, { "epoch": 2.22, "grad_norm": 0.4927074981163475, "learning_rate": 3.247005035469109e-06, "loss": 0.5697, "step": 3180 }, { "epoch": 2.23, "grad_norm": 0.47340856305327644, "learning_rate": 3.2256070936223603e-06, "loss": 0.5687, "step": 3190 }, { "epoch": 2.24, "grad_norm": 0.5115028667136483, "learning_rate": 3.2042194663906193e-06, "loss": 0.5625, "step": 3200 }, { "epoch": 2.24, "grad_norm": 0.4723602653535651, "learning_rate": 3.182842957749263e-06, "loss": 0.5633, "step": 3210 }, { "epoch": 2.25, "grad_norm": 0.4679538952450783, "learning_rate": 3.1614783712557156e-06, "loss": 0.5572, "step": 3220 }, { "epoch": 2.26, "grad_norm": 0.48051919166640805, "learning_rate": 3.1401265100192383e-06, "loss": 0.5648, "step": 3230 }, { "epoch": 2.27, "grad_norm": 0.4594423765819446, "learning_rate": 3.1187881766707425e-06, "loss": 0.5595, "step": 3240 }, { "epoch": 2.27, "grad_norm": 0.49220125939314296, "learning_rate": 3.0974641733326154e-06, "loss": 0.5479, "step": 3250 }, { "epoch": 2.28, "grad_norm": 0.4944265110257382, "learning_rate": 3.0761553015885717e-06, "loss": 0.5502, "step": 3260 }, { "epoch": 2.29, "grad_norm": 0.495744161270211, "learning_rate": 3.0548623624535165e-06, "loss": 0.5629, "step": 3270 }, { "epoch": 2.29, "grad_norm": 0.478561888744776, "learning_rate": 3.0335861563434403e-06, "loss": 0.5597, "step": 3280 }, { "epoch": 2.3, "grad_norm": 0.4946624980435279, "learning_rate": 3.012327483045325e-06, "loss": 0.556, "step": 3290 }, { "epoch": 2.31, "grad_norm": 0.4913013156645444, "learning_rate": 2.9910871416870855e-06, "loss": 0.5638, "step": 3300 }, { "epoch": 2.32, "grad_norm": 0.46629667333688474, "learning_rate": 2.9698659307075224e-06, "loss": 0.5508, "step": 3310 }, { "epoch": 2.32, "grad_norm": 0.47577400823898375, "learning_rate": 2.948664647826318e-06, "loss": 0.5518, "step": 3320 }, { "epoch": 2.33, "grad_norm": 0.48528006049817207, "learning_rate": 2.9274840900140375e-06, "loss": 0.5582, "step": 3330 }, { "epoch": 2.34, "grad_norm": 0.5499143301618472, "learning_rate": 2.906325053462181e-06, "loss": 0.548, "step": 3340 }, { "epoch": 2.34, "grad_norm": 0.4772816560553211, "learning_rate": 2.8851883335532496e-06, "loss": 0.5523, "step": 3350 }, { "epoch": 2.35, "grad_norm": 0.49887071761697505, "learning_rate": 2.8640747248308445e-06, "loss": 0.5544, "step": 3360 }, { "epoch": 2.36, "grad_norm": 0.4853842631362592, "learning_rate": 2.8429850209698053e-06, "loss": 0.5558, "step": 3370 }, { "epoch": 2.37, "grad_norm": 0.45895465861964546, "learning_rate": 2.8219200147463677e-06, "loss": 0.5598, "step": 3380 }, { "epoch": 2.37, "grad_norm": 0.4662802877247775, "learning_rate": 2.8008804980083695e-06, "loss": 0.5551, "step": 3390 }, { "epoch": 2.38, "grad_norm": 0.4881083174435456, "learning_rate": 2.7798672616454785e-06, "loss": 0.5511, "step": 3400 }, { "epoch": 2.39, "grad_norm": 0.5016617932642891, "learning_rate": 2.75888109555947e-06, "loss": 0.5438, "step": 3410 }, { "epoch": 2.39, "grad_norm": 0.4831166076149674, "learning_rate": 2.7379227886345244e-06, "loss": 0.5598, "step": 3420 }, { "epoch": 2.4, "grad_norm": 0.4953933886035155, "learning_rate": 2.716993128707581e-06, "loss": 0.5609, "step": 3430 }, { "epoch": 2.41, "grad_norm": 0.503170266490847, "learning_rate": 2.696092902538716e-06, "loss": 0.5488, "step": 3440 }, { "epoch": 2.42, "grad_norm": 0.5098380667106547, "learning_rate": 2.675222895781574e-06, "loss": 0.5539, "step": 3450 }, { "epoch": 2.42, "grad_norm": 0.49948084086860606, "learning_rate": 2.6543838929538285e-06, "loss": 0.5581, "step": 3460 }, { "epoch": 2.43, "grad_norm": 0.4872613273522286, "learning_rate": 2.6335766774076965e-06, "loss": 0.5562, "step": 3470 }, { "epoch": 2.44, "grad_norm": 0.47926716145131487, "learning_rate": 2.6128020313004875e-06, "loss": 0.5561, "step": 3480 }, { "epoch": 2.44, "grad_norm": 0.49339314189894584, "learning_rate": 2.592060735565206e-06, "loss": 0.5633, "step": 3490 }, { "epoch": 2.45, "grad_norm": 0.4888816777932096, "learning_rate": 2.5713535698811926e-06, "loss": 0.5623, "step": 3500 }, { "epoch": 2.46, "grad_norm": 0.47873225411797143, "learning_rate": 2.550681312644815e-06, "loss": 0.5629, "step": 3510 }, { "epoch": 2.47, "grad_norm": 0.4985498589688127, "learning_rate": 2.5300447409402104e-06, "loss": 0.5517, "step": 3520 }, { "epoch": 2.47, "grad_norm": 0.4699404709889953, "learning_rate": 2.509444630510071e-06, "loss": 0.5542, "step": 3530 }, { "epoch": 2.48, "grad_norm": 0.5471742855253533, "learning_rate": 2.4888817557264883e-06, "loss": 0.5573, "step": 3540 }, { "epoch": 2.49, "grad_norm": 0.4890601716460387, "learning_rate": 2.468356889561835e-06, "loss": 0.5496, "step": 3550 }, { "epoch": 2.49, "grad_norm": 0.4884550896007432, "learning_rate": 2.4478708035597206e-06, "loss": 0.5517, "step": 3560 }, { "epoch": 2.5, "grad_norm": 0.53082092791935, "learning_rate": 2.427424267805977e-06, "loss": 0.5643, "step": 3570 }, { "epoch": 2.51, "grad_norm": 0.4588900957688972, "learning_rate": 2.407018050899719e-06, "loss": 0.5588, "step": 3580 }, { "epoch": 2.51, "grad_norm": 0.4930240761419014, "learning_rate": 2.3866529199244454e-06, "loss": 0.5534, "step": 3590 }, { "epoch": 2.52, "grad_norm": 0.4995410840918172, "learning_rate": 2.36632964041921e-06, "loss": 0.5526, "step": 3600 }, { "epoch": 2.53, "grad_norm": 0.4889682103736911, "learning_rate": 2.3460489763498393e-06, "loss": 0.5575, "step": 3610 }, { "epoch": 2.54, "grad_norm": 0.47254332660748083, "learning_rate": 2.3258116900802188e-06, "loss": 0.5641, "step": 3620 }, { "epoch": 2.54, "grad_norm": 0.5271806756431864, "learning_rate": 2.3056185423436304e-06, "loss": 0.5515, "step": 3630 }, { "epoch": 2.55, "grad_norm": 0.5014716634327129, "learning_rate": 2.2854702922141627e-06, "loss": 0.5578, "step": 3640 }, { "epoch": 2.56, "grad_norm": 0.48930981901485066, "learning_rate": 2.265367697078168e-06, "loss": 0.5648, "step": 3650 }, { "epoch": 2.56, "grad_norm": 0.4822043988267899, "learning_rate": 2.245311512605801e-06, "loss": 0.5554, "step": 3660 }, { "epoch": 2.57, "grad_norm": 0.4978119631671631, "learning_rate": 2.2253024927226053e-06, "loss": 0.5586, "step": 3670 }, { "epoch": 2.58, "grad_norm": 0.49756480432664524, "learning_rate": 2.2053413895811764e-06, "loss": 0.5578, "step": 3680 }, { "epoch": 2.59, "grad_norm": 0.4671920108876918, "learning_rate": 2.1854289535328864e-06, "loss": 0.5557, "step": 3690 }, { "epoch": 2.59, "grad_norm": 0.513655855548841, "learning_rate": 2.165565933099682e-06, "loss": 0.5589, "step": 3700 }, { "epoch": 2.6, "grad_norm": 0.46274876339767745, "learning_rate": 2.1457530749459373e-06, "loss": 0.5588, "step": 3710 }, { "epoch": 2.61, "grad_norm": 0.48340392958868733, "learning_rate": 2.1259911238503988e-06, "loss": 0.5481, "step": 3720 }, { "epoch": 2.61, "grad_norm": 0.5024001177410511, "learning_rate": 2.1062808226781767e-06, "loss": 0.5604, "step": 3730 }, { "epoch": 2.62, "grad_norm": 0.4794062865649958, "learning_rate": 2.0866229123528305e-06, "loss": 0.552, "step": 3740 }, { "epoch": 2.63, "grad_norm": 0.49502474291815657, "learning_rate": 2.0670181318285076e-06, "loss": 0.5526, "step": 3750 }, { "epoch": 2.64, "grad_norm": 0.4912138589836612, "learning_rate": 2.0474672180621754e-06, "loss": 0.5433, "step": 3760 }, { "epoch": 2.64, "grad_norm": 0.46287983551015915, "learning_rate": 2.027970905985908e-06, "loss": 0.5607, "step": 3770 }, { "epoch": 2.65, "grad_norm": 0.4818908530273005, "learning_rate": 2.008529928479269e-06, "loss": 0.5552, "step": 3780 }, { "epoch": 2.66, "grad_norm": 0.49475825963312386, "learning_rate": 1.9891450163417574e-06, "loss": 0.5473, "step": 3790 }, { "epoch": 2.66, "grad_norm": 0.5090335613659759, "learning_rate": 1.9698168982653334e-06, "loss": 0.5469, "step": 3800 }, { "epoch": 2.67, "grad_norm": 0.48712846229296525, "learning_rate": 1.950546300807037e-06, "loss": 0.5526, "step": 3810 }, { "epoch": 2.68, "grad_norm": 0.5087151308068611, "learning_rate": 1.931333948361664e-06, "loss": 0.563, "step": 3820 }, { "epoch": 2.69, "grad_norm": 0.4770122954574883, "learning_rate": 1.9121805631345406e-06, "loss": 0.5588, "step": 3830 }, { "epoch": 2.69, "grad_norm": 0.49875337542296333, "learning_rate": 1.8930868651143776e-06, "loss": 0.5556, "step": 3840 }, { "epoch": 2.7, "grad_norm": 0.46661280379905284, "learning_rate": 1.8740535720462034e-06, "loss": 0.5518, "step": 3850 }, { "epoch": 2.71, "grad_norm": 0.49444595207088565, "learning_rate": 1.8550813994043814e-06, "loss": 0.5679, "step": 3860 }, { "epoch": 2.71, "grad_norm": 0.48381227476419236, "learning_rate": 1.8361710603657162e-06, "loss": 0.5572, "step": 3870 }, { "epoch": 2.72, "grad_norm": 0.5055312948711096, "learning_rate": 1.8173232657826508e-06, "loss": 0.5538, "step": 3880 }, { "epoch": 2.73, "grad_norm": 0.4686625212413926, "learning_rate": 1.7985387241565343e-06, "loss": 0.559, "step": 3890 }, { "epoch": 2.74, "grad_norm": 0.4804255341689684, "learning_rate": 1.7798181416109966e-06, "loss": 0.544, "step": 3900 }, { "epoch": 2.74, "grad_norm": 0.5090131219052505, "learning_rate": 1.7611622218654e-06, "loss": 0.5565, "step": 3910 }, { "epoch": 2.75, "grad_norm": 0.4823380469403731, "learning_rate": 1.7425716662083936e-06, "loss": 0.5586, "step": 3920 }, { "epoch": 2.76, "grad_norm": 0.5039478306212927, "learning_rate": 1.7240471734715416e-06, "loss": 0.5582, "step": 3930 }, { "epoch": 2.76, "grad_norm": 0.48106143586192984, "learning_rate": 1.7055894400030597e-06, "loss": 0.5527, "step": 3940 }, { "epoch": 2.77, "grad_norm": 0.4948095621947108, "learning_rate": 1.6871991596416367e-06, "loss": 0.5534, "step": 3950 }, { "epoch": 2.78, "grad_norm": 0.47985601211032985, "learning_rate": 1.668877023690356e-06, "loss": 0.5514, "step": 3960 }, { "epoch": 2.79, "grad_norm": 0.5044751224020304, "learning_rate": 1.6506237208907045e-06, "loss": 0.5541, "step": 3970 }, { "epoch": 2.79, "grad_norm": 0.5080452899508979, "learning_rate": 1.6324399373966833e-06, "loss": 0.5506, "step": 3980 }, { "epoch": 2.8, "grad_norm": 0.4931986436565961, "learning_rate": 1.6143263567490192e-06, "loss": 0.5736, "step": 3990 }, { "epoch": 2.81, "grad_norm": 0.4684816221900875, "learning_rate": 1.596283659849464e-06, "loss": 0.556, "step": 4000 }, { "epoch": 2.81, "grad_norm": 0.4785014812413059, "learning_rate": 1.5783125249352016e-06, "loss": 0.5579, "step": 4010 }, { "epoch": 2.82, "grad_norm": 0.5116019647376474, "learning_rate": 1.5604136275533513e-06, "loss": 0.5552, "step": 4020 }, { "epoch": 2.83, "grad_norm": 0.5395436792240803, "learning_rate": 1.5425876405355793e-06, "loss": 0.5384, "step": 4030 }, { "epoch": 2.83, "grad_norm": 0.4900436595350879, "learning_rate": 1.5248352339727968e-06, "loss": 0.5622, "step": 4040 }, { "epoch": 2.84, "grad_norm": 0.47513280378884526, "learning_rate": 1.5071570751899785e-06, "loss": 0.5636, "step": 4050 }, { "epoch": 2.85, "grad_norm": 0.4839906292088417, "learning_rate": 1.4895538287210727e-06, "loss": 0.5527, "step": 4060 }, { "epoch": 2.86, "grad_norm": 0.5376958097507211, "learning_rate": 1.4720261562840272e-06, "loss": 0.5635, "step": 4070 }, { "epoch": 2.86, "grad_norm": 0.48771290149288943, "learning_rate": 1.4545747167559066e-06, "loss": 0.564, "step": 4080 }, { "epoch": 2.87, "grad_norm": 0.4854524808894032, "learning_rate": 1.4372001661481314e-06, "loss": 0.5598, "step": 4090 }, { "epoch": 2.88, "grad_norm": 0.4700143505212195, "learning_rate": 1.4199031575818126e-06, "loss": 0.5375, "step": 4100 }, { "epoch": 2.88, "grad_norm": 0.4915439052479703, "learning_rate": 1.4026843412632083e-06, "loss": 0.5548, "step": 4110 }, { "epoch": 2.89, "grad_norm": 0.4869720592283153, "learning_rate": 1.385544364459273e-06, "loss": 0.5571, "step": 4120 }, { "epoch": 2.9, "grad_norm": 0.4716126280570366, "learning_rate": 1.3684838714733317e-06, "loss": 0.5516, "step": 4130 }, { "epoch": 2.91, "grad_norm": 0.4965381533290548, "learning_rate": 1.3515035036208578e-06, "loss": 0.5578, "step": 4140 }, { "epoch": 2.91, "grad_norm": 0.49674828915458996, "learning_rate": 1.3346038992053705e-06, "loss": 0.5498, "step": 4150 }, { "epoch": 2.92, "grad_norm": 0.47680857026122736, "learning_rate": 1.3177856934944328e-06, "loss": 0.5531, "step": 4160 }, { "epoch": 2.93, "grad_norm": 0.4870948629881832, "learning_rate": 1.3010495186957768e-06, "loss": 0.552, "step": 4170 }, { "epoch": 2.93, "grad_norm": 0.483089196852953, "learning_rate": 1.2843960039335355e-06, "loss": 0.5564, "step": 4180 }, { "epoch": 2.94, "grad_norm": 0.5140997811965615, "learning_rate": 1.2678257752245992e-06, "loss": 0.5504, "step": 4190 }, { "epoch": 2.95, "grad_norm": 0.4779902409617231, "learning_rate": 1.2513394554550753e-06, "loss": 0.5478, "step": 4200 }, { "epoch": 2.96, "grad_norm": 0.47680861915825756, "learning_rate": 1.2349376643568792e-06, "loss": 0.5555, "step": 4210 }, { "epoch": 2.96, "grad_norm": 0.47618772244534097, "learning_rate": 1.218621018484434e-06, "loss": 0.5509, "step": 4220 }, { "epoch": 2.97, "grad_norm": 0.46991117646305874, "learning_rate": 1.202390131191501e-06, "loss": 0.5572, "step": 4230 }, { "epoch": 2.98, "grad_norm": 0.48145576248836425, "learning_rate": 1.1862456126081136e-06, "loss": 0.562, "step": 4240 }, { "epoch": 2.98, "grad_norm": 0.49862994419451123, "learning_rate": 1.170188069617649e-06, "loss": 0.5574, "step": 4250 }, { "epoch": 2.99, "grad_norm": 0.5025682535998525, "learning_rate": 1.1542181058340122e-06, "loss": 0.5569, "step": 4260 }, { "epoch": 3.0, "grad_norm": 0.47850092658350835, "learning_rate": 1.1383363215789488e-06, "loss": 0.5543, "step": 4270 }, { "epoch": 3.01, "grad_norm": 0.5044422425999335, "learning_rate": 1.1225433138594741e-06, "loss": 0.5599, "step": 4280 }, { "epoch": 3.01, "grad_norm": 0.47419325850109234, "learning_rate": 1.1068396763454339e-06, "loss": 0.5586, "step": 4290 }, { "epoch": 3.0, "grad_norm": 0.560597143802205, "learning_rate": 1.0912259993471857e-06, "loss": 0.5524, "step": 4300 }, { "epoch": 3.01, "grad_norm": 0.5148468793364267, "learning_rate": 1.0757028697934152e-06, "loss": 0.5084, "step": 4310 }, { "epoch": 3.01, "grad_norm": 0.5017714203601242, "learning_rate": 1.060270871209064e-06, "loss": 0.5156, "step": 4320 }, { "epoch": 3.02, "grad_norm": 0.49357251631602217, "learning_rate": 1.0449305836934003e-06, "loss": 0.5109, "step": 4330 }, { "epoch": 3.03, "grad_norm": 0.4936913138076729, "learning_rate": 1.02968258389821e-06, "loss": 0.5158, "step": 4340 }, { "epoch": 3.04, "grad_norm": 0.5049259973539401, "learning_rate": 1.0145274450061254e-06, "loss": 0.5217, "step": 4350 }, { "epoch": 3.04, "grad_norm": 0.517079836314341, "learning_rate": 9.994657367090686e-07, "loss": 0.5136, "step": 4360 }, { "epoch": 3.05, "grad_norm": 0.4837364294449262, "learning_rate": 9.844980251868449e-07, "loss": 0.518, "step": 4370 }, { "epoch": 3.06, "grad_norm": 0.4869343961795407, "learning_rate": 9.696248730858605e-07, "loss": 0.5132, "step": 4380 }, { "epoch": 3.06, "grad_norm": 0.5085658265111329, "learning_rate": 9.54846839497964e-07, "loss": 0.5165, "step": 4390 }, { "epoch": 3.07, "grad_norm": 0.47424129042024027, "learning_rate": 9.401644799394382e-07, "loss": 0.5215, "step": 4400 }, { "epoch": 3.08, "grad_norm": 0.4991885159298539, "learning_rate": 9.255783463301111e-07, "loss": 0.5092, "step": 4410 }, { "epoch": 3.09, "grad_norm": 0.47972707851164975, "learning_rate": 9.110889869726167e-07, "loss": 0.5289, "step": 4420 }, { "epoch": 3.09, "grad_norm": 0.48477312158187885, "learning_rate": 8.966969465317753e-07, "loss": 0.5373, "step": 4430 }, { "epoch": 3.1, "grad_norm": 0.5150113149802942, "learning_rate": 8.824027660141253e-07, "loss": 0.5144, "step": 4440 }, { "epoch": 3.11, "grad_norm": 0.5012820847152873, "learning_rate": 8.682069827475828e-07, "loss": 0.5232, "step": 4450 }, { "epoch": 3.11, "grad_norm": 0.536197598669663, "learning_rate": 8.541101303612473e-07, "loss": 0.5312, "step": 4460 }, { "epoch": 3.12, "grad_norm": 0.47456874746453287, "learning_rate": 8.401127387653379e-07, "loss": 0.5021, "step": 4470 }, { "epoch": 3.13, "grad_norm": 0.5022494921077733, "learning_rate": 8.262153341312734e-07, "loss": 0.5039, "step": 4480 }, { "epoch": 3.14, "grad_norm": 0.5128622291867768, "learning_rate": 8.124184388719e-07, "loss": 0.5189, "step": 4490 }, { "epoch": 3.14, "grad_norm": 0.49970434341288505, "learning_rate": 7.987225716218441e-07, "loss": 0.5266, "step": 4500 }, { "epoch": 3.15, "grad_norm": 0.4990813361708124, "learning_rate": 7.851282472180222e-07, "loss": 0.5189, "step": 4510 }, { "epoch": 3.16, "grad_norm": 0.5361324180050252, "learning_rate": 7.716359766802858e-07, "loss": 0.5283, "step": 4520 }, { "epoch": 3.16, "grad_norm": 0.49325303865409753, "learning_rate": 7.582462671922154e-07, "loss": 0.5134, "step": 4530 }, { "epoch": 3.17, "grad_norm": 0.5074499214352016, "learning_rate": 7.449596220820492e-07, "loss": 0.5219, "step": 4540 }, { "epoch": 3.18, "grad_norm": 0.48687866167974014, "learning_rate": 7.317765408037668e-07, "loss": 0.5131, "step": 4550 }, { "epoch": 3.19, "grad_norm": 0.5209017279406115, "learning_rate": 7.186975189183119e-07, "loss": 0.5263, "step": 4560 }, { "epoch": 3.19, "grad_norm": 0.5017929271897994, "learning_rate": 7.057230480749689e-07, "loss": 0.5221, "step": 4570 }, { "epoch": 3.2, "grad_norm": 0.4909543768911595, "learning_rate": 6.928536159928746e-07, "loss": 0.5082, "step": 4580 }, { "epoch": 3.21, "grad_norm": 0.5217040631589964, "learning_rate": 6.800897064426877e-07, "loss": 0.5136, "step": 4590 }, { "epoch": 3.21, "grad_norm": 0.5007485735211247, "learning_rate": 6.674317992284038e-07, "loss": 0.5158, "step": 4600 }, { "epoch": 3.22, "grad_norm": 0.495432605404129, "learning_rate": 6.548803701693218e-07, "loss": 0.5191, "step": 4610 }, { "epoch": 3.23, "grad_norm": 0.5457479536125451, "learning_rate": 6.424358910821511e-07, "loss": 0.5144, "step": 4620 }, { "epoch": 3.24, "grad_norm": 0.5106414076169086, "learning_rate": 6.300988297632804e-07, "loss": 0.5288, "step": 4630 }, { "epoch": 3.24, "grad_norm": 0.5211736668510725, "learning_rate": 6.178696499711915e-07, "loss": 0.5218, "step": 4640 }, { "epoch": 3.25, "grad_norm": 0.4891406143758845, "learning_rate": 6.057488114090288e-07, "loss": 0.5107, "step": 4650 }, { "epoch": 3.26, "grad_norm": 0.5178228254981688, "learning_rate": 5.937367697073139e-07, "loss": 0.5004, "step": 4660 }, { "epoch": 3.26, "grad_norm": 0.49831173988741256, "learning_rate": 5.818339764068217e-07, "loss": 0.5167, "step": 4670 }, { "epoch": 3.27, "grad_norm": 0.5445792027132667, "learning_rate": 5.700408789416051e-07, "loss": 0.5251, "step": 4680 }, { "epoch": 3.28, "grad_norm": 0.5412064520692698, "learning_rate": 5.58357920622179e-07, "loss": 0.5185, "step": 4690 }, { "epoch": 3.28, "grad_norm": 0.5194173017222409, "learning_rate": 5.467855406188503e-07, "loss": 0.5213, "step": 4700 }, { "epoch": 3.29, "grad_norm": 0.530585691377951, "learning_rate": 5.353241739452134e-07, "loss": 0.5213, "step": 4710 }, { "epoch": 3.3, "grad_norm": 0.5334266089134705, "learning_rate": 5.239742514417958e-07, "loss": 0.5213, "step": 4720 }, { "epoch": 3.31, "grad_norm": 0.5323190599173516, "learning_rate": 5.127361997598647e-07, "loss": 0.5173, "step": 4730 }, { "epoch": 3.31, "grad_norm": 0.4977075988891876, "learning_rate": 5.016104413453866e-07, "loss": 0.5163, "step": 4740 }, { "epoch": 3.32, "grad_norm": 0.5072133518376746, "learning_rate": 4.905973944231479e-07, "loss": 0.5147, "step": 4750 }, { "epoch": 3.33, "grad_norm": 0.5089446326634548, "learning_rate": 4.796974729810328e-07, "loss": 0.5206, "step": 4760 }, { "epoch": 3.33, "grad_norm": 0.5173579821056443, "learning_rate": 4.6891108675446453e-07, "loss": 0.5233, "step": 4770 }, { "epoch": 3.34, "grad_norm": 0.49509093398735665, "learning_rate": 4.5823864121099967e-07, "loss": 0.5143, "step": 4780 }, { "epoch": 3.35, "grad_norm": 0.510739525920679, "learning_rate": 4.476805375350865e-07, "loss": 0.5204, "step": 4790 }, { "epoch": 3.36, "grad_norm": 0.5285640385275354, "learning_rate": 4.372371726129854e-07, "loss": 0.5226, "step": 4800 }, { "epoch": 3.36, "grad_norm": 0.49804779846917624, "learning_rate": 4.269089390178512e-07, "loss": 0.5257, "step": 4810 }, { "epoch": 3.37, "grad_norm": 0.4960403523798791, "learning_rate": 4.1669622499497205e-07, "loss": 0.5224, "step": 4820 }, { "epoch": 3.38, "grad_norm": 0.509776799973484, "learning_rate": 4.0659941444717833e-07, "loss": 0.5153, "step": 4830 }, { "epoch": 3.38, "grad_norm": 0.48108044641737857, "learning_rate": 3.966188869204094e-07, "loss": 0.5175, "step": 4840 }, { "epoch": 3.39, "grad_norm": 0.5141883943099625, "learning_rate": 3.8675501758944926e-07, "loss": 0.5147, "step": 4850 }, { "epoch": 3.4, "grad_norm": 0.5086149236998669, "learning_rate": 3.7700817724381983e-07, "loss": 0.5128, "step": 4860 }, { "epoch": 3.41, "grad_norm": 0.5107670739104685, "learning_rate": 3.6737873227384263e-07, "loss": 0.5162, "step": 4870 }, { "epoch": 3.41, "grad_norm": 0.48090817905611477, "learning_rate": 3.578670446568711e-07, "loss": 0.5289, "step": 4880 }, { "epoch": 3.42, "grad_norm": 0.5149098967385166, "learning_rate": 3.484734719436782e-07, "loss": 0.5224, "step": 4890 }, { "epoch": 3.43, "grad_norm": 0.4967090096149114, "learning_rate": 3.3919836724501743e-07, "loss": 0.5064, "step": 4900 }, { "epoch": 3.43, "grad_norm": 0.49198009223776107, "learning_rate": 3.3004207921835004e-07, "loss": 0.526, "step": 4910 }, { "epoch": 3.44, "grad_norm": 0.5260886992405347, "learning_rate": 3.210049520547388e-07, "loss": 0.5278, "step": 4920 }, { "epoch": 3.45, "grad_norm": 0.49827609520509064, "learning_rate": 3.1208732546590843e-07, "loss": 0.5269, "step": 4930 }, { "epoch": 3.46, "grad_norm": 0.5199185251610714, "learning_rate": 3.0328953467147543e-07, "loss": 0.5125, "step": 4940 }, { "epoch": 3.46, "grad_norm": 0.5165139482645277, "learning_rate": 2.946119103863483e-07, "loss": 0.5095, "step": 4950 }, { "epoch": 3.47, "grad_norm": 0.48760733590102007, "learning_rate": 2.86054778808296e-07, "loss": 0.5262, "step": 4960 }, { "epoch": 3.48, "grad_norm": 0.49481920675979196, "learning_rate": 2.7761846160568403e-07, "loss": 0.5209, "step": 4970 }, { "epoch": 3.48, "grad_norm": 0.5017608349952136, "learning_rate": 2.69303275905384e-07, "loss": 0.5137, "step": 4980 }, { "epoch": 3.49, "grad_norm": 0.5222144874040826, "learning_rate": 2.611095342808526e-07, "loss": 0.5162, "step": 4990 }, { "epoch": 3.5, "grad_norm": 0.4928255848647095, "learning_rate": 2.530375447403815e-07, "loss": 0.5176, "step": 5000 }, { "epoch": 3.51, "grad_norm": 0.530457616289496, "learning_rate": 2.4508761071551906e-07, "loss": 0.5181, "step": 5010 }, { "epoch": 3.51, "grad_norm": 0.5147706319208548, "learning_rate": 2.3726003104966393e-07, "loss": 0.5095, "step": 5020 }, { "epoch": 3.52, "grad_norm": 0.523763253857449, "learning_rate": 2.2955509998683214e-07, "loss": 0.5108, "step": 5030 }, { "epoch": 3.53, "grad_norm": 0.5323084690421006, "learning_rate": 2.2197310716059603e-07, "loss": 0.511, "step": 5040 }, { "epoch": 3.53, "grad_norm": 0.5088461348117514, "learning_rate": 2.1451433758319543e-07, "loss": 0.5265, "step": 5050 }, { "epoch": 3.54, "grad_norm": 0.5478220673331649, "learning_rate": 2.0717907163482507e-07, "loss": 0.5112, "step": 5060 }, { "epoch": 3.55, "grad_norm": 0.5414027895276027, "learning_rate": 1.9996758505309593e-07, "loss": 0.5231, "step": 5070 }, { "epoch": 3.56, "grad_norm": 0.4983898932091525, "learning_rate": 1.9288014892266753e-07, "loss": 0.5105, "step": 5080 }, { "epoch": 3.56, "grad_norm": 0.5093531347734784, "learning_rate": 1.8591702966505952e-07, "loss": 0.5127, "step": 5090 }, { "epoch": 3.57, "grad_norm": 0.677948629367298, "learning_rate": 1.790784890286352e-07, "loss": 0.5219, "step": 5100 }, { "epoch": 3.58, "grad_norm": 0.5010683504531009, "learning_rate": 1.7236478407876555e-07, "loss": 0.5054, "step": 5110 }, { "epoch": 3.58, "grad_norm": 0.5179768835662841, "learning_rate": 1.6577616718816123e-07, "loss": 0.5251, "step": 5120 }, { "epoch": 3.59, "grad_norm": 0.5087954420227027, "learning_rate": 1.5931288602738958e-07, "loss": 0.5137, "step": 5130 }, { "epoch": 3.6, "grad_norm": 0.5083448366233918, "learning_rate": 1.5297518355556132e-07, "loss": 0.5059, "step": 5140 }, { "epoch": 3.6, "grad_norm": 0.5170972166302202, "learning_rate": 1.467632980112023e-07, "loss": 0.5214, "step": 5150 }, { "epoch": 3.61, "grad_norm": 0.5145933451855358, "learning_rate": 1.406774629032923e-07, "loss": 0.511, "step": 5160 }, { "epoch": 3.62, "grad_norm": 0.5012480980422283, "learning_rate": 1.347179070024903e-07, "loss": 0.5179, "step": 5170 }, { "epoch": 3.63, "grad_norm": 0.5157422802936725, "learning_rate": 1.2888485433253521e-07, "loss": 0.5193, "step": 5180 }, { "epoch": 3.63, "grad_norm": 0.5104197669978088, "learning_rate": 1.2317852416182378e-07, "loss": 0.5221, "step": 5190 }, { "epoch": 3.64, "grad_norm": 0.48689303934415246, "learning_rate": 1.1759913099516816e-07, "loss": 0.5118, "step": 5200 }, { "epoch": 3.65, "grad_norm": 0.5105879788600957, "learning_rate": 1.1214688456573247e-07, "loss": 0.5178, "step": 5210 }, { "epoch": 3.65, "grad_norm": 0.4742285263986786, "learning_rate": 1.0682198982714814e-07, "loss": 0.534, "step": 5220 }, { "epoch": 3.66, "grad_norm": 0.5096564376650945, "learning_rate": 1.0162464694581235e-07, "loss": 0.5272, "step": 5230 }, { "epoch": 3.67, "grad_norm": 0.5068212494030221, "learning_rate": 9.65550512933605e-08, "loss": 0.5252, "step": 5240 }, { "epoch": 3.68, "grad_norm": 0.5132873703711879, "learning_rate": 9.16133934393224e-08, "loss": 0.5161, "step": 5250 }, { "epoch": 3.68, "grad_norm": 0.496214740845792, "learning_rate": 8.67998591439612e-08, "loss": 0.518, "step": 5260 }, { "epoch": 3.69, "grad_norm": 0.5257117991696062, "learning_rate": 8.21146293512876e-08, "loss": 0.5201, "step": 5270 }, { "epoch": 3.7, "grad_norm": 0.5038613162646833, "learning_rate": 7.755788018225961e-08, "loss": 0.5439, "step": 5280 }, { "epoch": 3.7, "grad_norm": 0.5108263338716986, "learning_rate": 7.31297829281617e-08, "loss": 0.5132, "step": 5290 }, { "epoch": 3.71, "grad_norm": 0.5149498952477054, "learning_rate": 6.883050404416552e-08, "loss": 0.5111, "step": 5300 }, { "epoch": 3.72, "grad_norm": 0.5047749285108949, "learning_rate": 6.46602051430732e-08, "loss": 0.5307, "step": 5310 }, { "epoch": 3.73, "grad_norm": 0.5156795764243357, "learning_rate": 6.061904298924253e-08, "loss": 0.5285, "step": 5320 }, { "epoch": 3.73, "grad_norm": 0.5144201053701509, "learning_rate": 5.670716949269278e-08, "loss": 0.5148, "step": 5330 }, { "epoch": 3.74, "grad_norm": 0.507394331507882, "learning_rate": 5.2924731703395564e-08, "loss": 0.5206, "step": 5340 }, { "epoch": 3.75, "grad_norm": 0.48368946217469994, "learning_rate": 4.927187180574666e-08, "loss": 0.526, "step": 5350 }, { "epoch": 3.75, "grad_norm": 0.5047554925764675, "learning_rate": 4.574872711322103e-08, "loss": 0.5126, "step": 5360 }, { "epoch": 3.76, "grad_norm": 0.4949463708763226, "learning_rate": 4.2355430063211405e-08, "loss": 0.5204, "step": 5370 }, { "epoch": 3.77, "grad_norm": 0.5079311960306774, "learning_rate": 3.909210821205017e-08, "loss": 0.5189, "step": 5380 }, { "epoch": 3.78, "grad_norm": 0.4902741464423996, "learning_rate": 3.595888423021354e-08, "loss": 0.513, "step": 5390 }, { "epoch": 3.78, "grad_norm": 0.5421885848655773, "learning_rate": 3.295587589771071e-08, "loss": 0.5093, "step": 5400 }, { "epoch": 3.79, "grad_norm": 0.49756539244831294, "learning_rate": 3.008319609965676e-08, "loss": 0.5144, "step": 5410 }, { "epoch": 3.8, "grad_norm": 0.5074328229331989, "learning_rate": 2.734095282202942e-08, "loss": 0.5133, "step": 5420 }, { "epoch": 3.8, "grad_norm": 0.49772891591572227, "learning_rate": 2.4729249147608378e-08, "loss": 0.5251, "step": 5430 }, { "epoch": 3.81, "grad_norm": 0.5088034449477752, "learning_rate": 2.224818325210237e-08, "loss": 0.5175, "step": 5440 }, { "epoch": 3.82, "grad_norm": 0.4826584150965653, "learning_rate": 1.9897848400456496e-08, "loss": 0.5141, "step": 5450 }, { "epoch": 3.83, "grad_norm": 0.5172662799041124, "learning_rate": 1.7678332943348807e-08, "loss": 0.5197, "step": 5460 }, { "epoch": 3.83, "grad_norm": 0.48940063691629393, "learning_rate": 1.5589720313866794e-08, "loss": 0.5059, "step": 5470 }, { "epoch": 3.84, "grad_norm": 0.517098264403305, "learning_rate": 1.3632089024371574e-08, "loss": 0.5141, "step": 5480 }, { "epoch": 3.85, "grad_norm": 0.48979313956431636, "learning_rate": 1.1805512663549345e-08, "loss": 0.5136, "step": 5490 }, { "epoch": 3.85, "grad_norm": 0.48660701715860905, "learning_rate": 1.0110059893640055e-08, "loss": 0.5212, "step": 5500 }, { "epoch": 3.86, "grad_norm": 0.4841422308843411, "learning_rate": 8.54579444786152e-09, "loss": 0.5228, "step": 5510 }, { "epoch": 3.87, "grad_norm": 0.4851052180007293, "learning_rate": 7.112775128009174e-09, "loss": 0.5146, "step": 5520 }, { "epoch": 3.88, "grad_norm": 0.49400550323274894, "learning_rate": 5.811055802249721e-09, "loss": 0.5277, "step": 5530 }, { "epoch": 3.88, "grad_norm": 0.512928633478054, "learning_rate": 4.640685403093147e-09, "loss": 0.5216, "step": 5540 }, { "epoch": 3.89, "grad_norm": 0.48929257944769156, "learning_rate": 3.6017079255547534e-09, "loss": 0.5172, "step": 5550 }, { "epoch": 3.9, "grad_norm": 0.5049424795736568, "learning_rate": 2.6941624255001904e-09, "loss": 0.5147, "step": 5560 }, { "epoch": 3.9, "grad_norm": 0.5046094240590331, "learning_rate": 1.9180830181797505e-09, "loss": 0.5222, "step": 5570 }, { "epoch": 3.91, "grad_norm": 0.5035868811303936, "learning_rate": 1.273498876942558e-09, "loss": 0.511, "step": 5580 }, { "epoch": 3.92, "grad_norm": 0.48822795586505313, "learning_rate": 7.604342321435032e-10, "loss": 0.5222, "step": 5590 }, { "epoch": 3.92, "grad_norm": 0.5199444169728372, "learning_rate": 3.789083702293028e-10, "loss": 0.5236, "step": 5600 }, { "epoch": 3.93, "grad_norm": 0.528390397947652, "learning_rate": 1.2893563301535904e-10, "loss": 0.5187, "step": 5610 }, { "epoch": 3.94, "grad_norm": 0.49922315857737276, "learning_rate": 1.0525417146023396e-11, "loss": 0.5179, "step": 5620 } ], "logging_steps": 10, "max_steps": 5624, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 2354981319475200.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }