|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.9420444444444445, |
|
"eval_steps": 500, |
|
"global_step": 5624, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.912928587976149, |
|
"learning_rate": 1.4e-07, |
|
"loss": 1.0098, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.065321089366511, |
|
"learning_rate": 2.8e-07, |
|
"loss": 1.0032, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.529282680442283, |
|
"learning_rate": 4.2e-07, |
|
"loss": 0.9767, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.079773534866118, |
|
"learning_rate": 5.6e-07, |
|
"loss": 0.9341, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7504240808921168, |
|
"learning_rate": 7.000000000000001e-07, |
|
"loss": 0.8727, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.7446189301316752, |
|
"learning_rate": 8.4e-07, |
|
"loss": 0.7997, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5893489037586176, |
|
"learning_rate": 9.800000000000001e-07, |
|
"loss": 0.7828, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5798012459841311, |
|
"learning_rate": 1.12e-06, |
|
"loss": 0.7671, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5143143417454488, |
|
"learning_rate": 1.26e-06, |
|
"loss": 0.777, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5006881361687121, |
|
"learning_rate": 1.4000000000000001e-06, |
|
"loss": 0.7709, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5268772561224019, |
|
"learning_rate": 1.54e-06, |
|
"loss": 0.7751, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.49059329535011015, |
|
"learning_rate": 1.68e-06, |
|
"loss": 0.7588, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.548982179156723, |
|
"learning_rate": 1.82e-06, |
|
"loss": 0.758, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5118740800557817, |
|
"learning_rate": 1.9600000000000003e-06, |
|
"loss": 0.7492, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.47988356348194033, |
|
"learning_rate": 2.1e-06, |
|
"loss": 0.7479, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5324095582498372, |
|
"learning_rate": 2.24e-06, |
|
"loss": 0.7344, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.49578185528674784, |
|
"learning_rate": 2.38e-06, |
|
"loss": 0.7379, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.4751722809020323, |
|
"learning_rate": 2.52e-06, |
|
"loss": 0.7515, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.4898512842949614, |
|
"learning_rate": 2.66e-06, |
|
"loss": 0.7428, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.4938014103724035, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 0.7356, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.4853179196888149, |
|
"learning_rate": 2.94e-06, |
|
"loss": 0.7338, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5006261354893382, |
|
"learning_rate": 3.08e-06, |
|
"loss": 0.7228, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.49494536099466524, |
|
"learning_rate": 3.22e-06, |
|
"loss": 0.7371, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.4745560090617258, |
|
"learning_rate": 3.36e-06, |
|
"loss": 0.7374, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.458424659300056, |
|
"learning_rate": 3.5e-06, |
|
"loss": 0.7284, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.4918105642778609, |
|
"learning_rate": 3.64e-06, |
|
"loss": 0.719, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.45994092727545755, |
|
"learning_rate": 3.7800000000000002e-06, |
|
"loss": 0.7328, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.4888877840053054, |
|
"learning_rate": 3.920000000000001e-06, |
|
"loss": 0.7257, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.4891132357037931, |
|
"learning_rate": 4.059999999999999e-06, |
|
"loss": 0.7146, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.4659780107286472, |
|
"learning_rate": 4.2e-06, |
|
"loss": 0.7207, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.4747662452681582, |
|
"learning_rate": 4.34e-06, |
|
"loss": 0.7196, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.46183058951309874, |
|
"learning_rate": 4.48e-06, |
|
"loss": 0.7166, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.47556837186042844, |
|
"learning_rate": 4.62e-06, |
|
"loss": 0.7138, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.4646419935884572, |
|
"learning_rate": 4.76e-06, |
|
"loss": 0.7166, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.47208612393069765, |
|
"learning_rate": 4.9e-06, |
|
"loss": 0.7071, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.47395551626034477, |
|
"learning_rate": 5.04e-06, |
|
"loss": 0.7081, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.46256038389399284, |
|
"learning_rate": 5.1799999999999995e-06, |
|
"loss": 0.7112, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.44989559880311664, |
|
"learning_rate": 5.32e-06, |
|
"loss": 0.7157, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.4759980664139243, |
|
"learning_rate": 5.46e-06, |
|
"loss": 0.716, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.47761427911509746, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 0.6936, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.4823631066912239, |
|
"learning_rate": 5.739999999999999e-06, |
|
"loss": 0.7096, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.4692563644972781, |
|
"learning_rate": 5.88e-06, |
|
"loss": 0.6955, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.4758216043542266, |
|
"learning_rate": 6.02e-06, |
|
"loss": 0.7046, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.4607724176991764, |
|
"learning_rate": 6.16e-06, |
|
"loss": 0.7071, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.47650098464440593, |
|
"learning_rate": 6.3e-06, |
|
"loss": 0.6948, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.4927763843500283, |
|
"learning_rate": 6.44e-06, |
|
"loss": 0.7138, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.44343044028786904, |
|
"learning_rate": 6.58e-06, |
|
"loss": 0.7033, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.45708129790603597, |
|
"learning_rate": 6.72e-06, |
|
"loss": 0.7038, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.47564264251663835, |
|
"learning_rate": 6.8599999999999995e-06, |
|
"loss": 0.6974, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.4561386006973232, |
|
"learning_rate": 7e-06, |
|
"loss": 0.702, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.4318637464381274, |
|
"learning_rate": 6.999934216315939e-06, |
|
"loss": 0.7054, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.47772094451329594, |
|
"learning_rate": 6.999736867736609e-06, |
|
"loss": 0.6946, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.45891608711087106, |
|
"learning_rate": 6.9994079616804764e-06, |
|
"loss": 0.6952, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.46731862765960264, |
|
"learning_rate": 6.9989475105113426e-06, |
|
"loss": 0.6888, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.4667223098464595, |
|
"learning_rate": 6.998355531537879e-06, |
|
"loss": 0.7017, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.46285196540927176, |
|
"learning_rate": 6.997632047012975e-06, |
|
"loss": 0.7051, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.48044807815149254, |
|
"learning_rate": 6.996777084132904e-06, |
|
"loss": 0.701, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.47600970966063727, |
|
"learning_rate": 6.995790675036298e-06, |
|
"loss": 0.7001, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.4494522317826872, |
|
"learning_rate": 6.994672856802944e-06, |
|
"loss": 0.7042, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.4623294450089233, |
|
"learning_rate": 6.993423671452386e-06, |
|
"loss": 0.69, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.43825456028915594, |
|
"learning_rate": 6.9920431659423436e-06, |
|
"loss": 0.6996, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.4568055452742323, |
|
"learning_rate": 6.990531392166956e-06, |
|
"loss": 0.6939, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.4302767633743081, |
|
"learning_rate": 6.988888406954821e-06, |
|
"loss": 0.6898, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.4762852616798798, |
|
"learning_rate": 6.9871142720668644e-06, |
|
"loss": 0.703, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.4572026337069386, |
|
"learning_rate": 6.985209054194017e-06, |
|
"loss": 0.7004, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.45803902960498666, |
|
"learning_rate": 6.983172824954708e-06, |
|
"loss": 0.6853, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.44353624606381903, |
|
"learning_rate": 6.9810056608921725e-06, |
|
"loss": 0.7074, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.44517458769087626, |
|
"learning_rate": 6.978707643471573e-06, |
|
"loss": 0.6988, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.4616555458392388, |
|
"learning_rate": 6.97627885907694e-06, |
|
"loss": 0.7034, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.4770896081066365, |
|
"learning_rate": 6.973719399007923e-06, |
|
"loss": 0.6935, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.45665921054521347, |
|
"learning_rate": 6.9710293594763545e-06, |
|
"loss": 0.6773, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.48834217157342125, |
|
"learning_rate": 6.968208841602645e-06, |
|
"loss": 0.6974, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.4661409470252182, |
|
"learning_rate": 6.965257951411967e-06, |
|
"loss": 0.6796, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.4249423447942054, |
|
"learning_rate": 6.962176799830279e-06, |
|
"loss": 0.686, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.4517631229399239, |
|
"learning_rate": 6.958965502680155e-06, |
|
"loss": 0.6968, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.4334006789419362, |
|
"learning_rate": 6.955624180676427e-06, |
|
"loss": 0.705, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.44354874837116653, |
|
"learning_rate": 6.9521529594216516e-06, |
|
"loss": 0.6954, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.4606606226964418, |
|
"learning_rate": 6.948551969401381e-06, |
|
"loss": 0.6965, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.46221163538458165, |
|
"learning_rate": 6.94482134597927e-06, |
|
"loss": 0.695, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.4636824720485381, |
|
"learning_rate": 6.940961229391975e-06, |
|
"loss": 0.6919, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.4450527833539268, |
|
"learning_rate": 6.936971764743891e-06, |
|
"loss": 0.6977, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.4358125416971688, |
|
"learning_rate": 6.932853102001694e-06, |
|
"loss": 0.6998, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.45623590289661414, |
|
"learning_rate": 6.928605395988701e-06, |
|
"loss": 0.6954, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.4536975058820564, |
|
"learning_rate": 6.924228806379058e-06, |
|
"loss": 0.6742, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.4563719379438227, |
|
"learning_rate": 6.919723497691728e-06, |
|
"loss": 0.6921, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.45279224746852664, |
|
"learning_rate": 6.915089639284313e-06, |
|
"loss": 0.6861, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.466062080319079, |
|
"learning_rate": 6.910327405346686e-06, |
|
"loss": 0.6895, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.443881137156012, |
|
"learning_rate": 6.905436974894443e-06, |
|
"loss": 0.7008, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.47752762402129206, |
|
"learning_rate": 6.900418531762173e-06, |
|
"loss": 0.6985, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.4542692407893758, |
|
"learning_rate": 6.89527226459655e-06, |
|
"loss": 0.6822, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.4314820719874765, |
|
"learning_rate": 6.889998366849237e-06, |
|
"loss": 0.691, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.4278370127210443, |
|
"learning_rate": 6.884597036769621e-06, |
|
"loss": 0.689, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.45134601911703476, |
|
"learning_rate": 6.879068477397353e-06, |
|
"loss": 0.6898, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.45160503192413054, |
|
"learning_rate": 6.87341289655472e-06, |
|
"loss": 0.6869, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.41025143635863104, |
|
"learning_rate": 6.867630506838833e-06, |
|
"loss": 0.6984, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.46520301654074564, |
|
"learning_rate": 6.861721525613633e-06, |
|
"loss": 0.6843, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.451991102882798, |
|
"learning_rate": 6.8556861750017235e-06, |
|
"loss": 0.6962, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.418111038766468, |
|
"learning_rate": 6.849524681876018e-06, |
|
"loss": 0.6797, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.4403261547939229, |
|
"learning_rate": 6.843237277851211e-06, |
|
"loss": 0.6965, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.426598785059419, |
|
"learning_rate": 6.836824199275074e-06, |
|
"loss": 0.6821, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.42988247771547117, |
|
"learning_rate": 6.830285687219569e-06, |
|
"loss": 0.6911, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.452230475071558, |
|
"learning_rate": 6.823621987471789e-06, |
|
"loss": 0.6851, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.4267205539811686, |
|
"learning_rate": 6.816833350524716e-06, |
|
"loss": 0.6777, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.44148424584394874, |
|
"learning_rate": 6.809920031567808e-06, |
|
"loss": 0.6838, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.43306877795839893, |
|
"learning_rate": 6.802882290477399e-06, |
|
"loss": 0.6864, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.4952482617663558, |
|
"learning_rate": 6.79572039180694e-06, |
|
"loss": 0.6904, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.45382453893592856, |
|
"learning_rate": 6.788434604777048e-06, |
|
"loss": 0.6795, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.452960843334945, |
|
"learning_rate": 6.781025203265388e-06, |
|
"loss": 0.6891, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.4537364245497661, |
|
"learning_rate": 6.773492465796373e-06, |
|
"loss": 0.6907, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.44929090527897886, |
|
"learning_rate": 6.765836675530703e-06, |
|
"loss": 0.6798, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.46381413350008455, |
|
"learning_rate": 6.758058120254715e-06, |
|
"loss": 0.6716, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4309028536458763, |
|
"learning_rate": 6.750157092369563e-06, |
|
"loss": 0.6799, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.43717422966700575, |
|
"learning_rate": 6.742133888880233e-06, |
|
"loss": 0.6883, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.4459700930425581, |
|
"learning_rate": 6.7339888113843696e-06, |
|
"loss": 0.6891, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.44045298948848877, |
|
"learning_rate": 6.725722166060951e-06, |
|
"loss": 0.6817, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.4485899862146157, |
|
"learning_rate": 6.717334263658766e-06, |
|
"loss": 0.6897, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.45682000330961775, |
|
"learning_rate": 6.70882541948474e-06, |
|
"loss": 0.6776, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.48037041295136884, |
|
"learning_rate": 6.700195953392085e-06, |
|
"loss": 0.6872, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.44334741491819346, |
|
"learning_rate": 6.691446189768268e-06, |
|
"loss": 0.6798, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.4674740757760583, |
|
"learning_rate": 6.682576457522825e-06, |
|
"loss": 0.6977, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.4696181980144796, |
|
"learning_rate": 6.673587090074993e-06, |
|
"loss": 0.6896, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.4593954697303246, |
|
"learning_rate": 6.664478425341176e-06, |
|
"loss": 0.6749, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.41647753357217115, |
|
"learning_rate": 6.655250805722244e-06, |
|
"loss": 0.6894, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.4245409839045758, |
|
"learning_rate": 6.645904578090662e-06, |
|
"loss": 0.6693, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.45490183172736, |
|
"learning_rate": 6.636440093777451e-06, |
|
"loss": 0.6881, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.4633877447287089, |
|
"learning_rate": 6.626857708558979e-06, |
|
"loss": 0.6953, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.45069656102358646, |
|
"learning_rate": 6.617157782643591e-06, |
|
"loss": 0.6787, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.44438426822862237, |
|
"learning_rate": 6.6073406806580646e-06, |
|
"loss": 0.6859, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.4335460798475662, |
|
"learning_rate": 6.597406771633906e-06, |
|
"loss": 0.6829, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.4282672786086354, |
|
"learning_rate": 6.587356428993477e-06, |
|
"loss": 0.6831, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.46465171297436636, |
|
"learning_rate": 6.577190030535957e-06, |
|
"loss": 0.6778, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.4590812961346198, |
|
"learning_rate": 6.566907958423142e-06, |
|
"loss": 0.6701, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.4180631333820519, |
|
"learning_rate": 6.5565105991650815e-06, |
|
"loss": 0.6825, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.42684427340923925, |
|
"learning_rate": 6.545998343605544e-06, |
|
"loss": 0.6823, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6515643833482546, |
|
"learning_rate": 6.5353715869073275e-06, |
|
"loss": 0.6748, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.42995190312179654, |
|
"learning_rate": 6.524630728537408e-06, |
|
"loss": 0.6896, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.4307066820527156, |
|
"learning_rate": 6.513776172251919e-06, |
|
"loss": 0.6821, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.4401373902110004, |
|
"learning_rate": 6.5028083260809735e-06, |
|
"loss": 0.6729, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.420372235119902, |
|
"learning_rate": 6.491727602313334e-06, |
|
"loss": 0.6812, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.44387468527179835, |
|
"learning_rate": 6.4805344174808986e-06, |
|
"loss": 0.6713, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4224291568526637, |
|
"learning_rate": 6.4692291923430634e-06, |
|
"loss": 0.6928, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.42342827072921446, |
|
"learning_rate": 6.457812351870889e-06, |
|
"loss": 0.6925, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.4614687139520872, |
|
"learning_rate": 6.446284325231132e-06, |
|
"loss": 0.6804, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.4513094113300999, |
|
"learning_rate": 6.434645545770116e-06, |
|
"loss": 0.649, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.46129242006354043, |
|
"learning_rate": 6.422896450997434e-06, |
|
"loss": 0.6244, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.44352477273420793, |
|
"learning_rate": 6.411037482569509e-06, |
|
"loss": 0.6231, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.43347730975194065, |
|
"learning_rate": 6.399069086272988e-06, |
|
"loss": 0.6163, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.5042235757137699, |
|
"learning_rate": 6.386991712007985e-06, |
|
"loss": 0.6295, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.4635765704926019, |
|
"learning_rate": 6.374805813771171e-06, |
|
"loss": 0.6145, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.4672283056367441, |
|
"learning_rate": 6.362511849638706e-06, |
|
"loss": 0.6248, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.44386378239345664, |
|
"learning_rate": 6.3501102817490184e-06, |
|
"loss": 0.6208, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.45014512458671113, |
|
"learning_rate": 6.337601576285438e-06, |
|
"loss": 0.6241, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.47077991205008496, |
|
"learning_rate": 6.324986203458665e-06, |
|
"loss": 0.637, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.43971957336428713, |
|
"learning_rate": 6.3122646374891014e-06, |
|
"loss": 0.6274, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.45398595356146343, |
|
"learning_rate": 6.299437356589018e-06, |
|
"loss": 0.6172, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.4638039927896387, |
|
"learning_rate": 6.2865048429445835e-06, |
|
"loss": 0.6162, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.456884430778857, |
|
"learning_rate": 6.273467582697736e-06, |
|
"loss": 0.6358, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.4513273711536076, |
|
"learning_rate": 6.260326065927908e-06, |
|
"loss": 0.6256, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.4585546365167011, |
|
"learning_rate": 6.247080786633608e-06, |
|
"loss": 0.6343, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.4837809920582229, |
|
"learning_rate": 6.233732242713847e-06, |
|
"loss": 0.6205, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.45062031874118463, |
|
"learning_rate": 6.220280935949423e-06, |
|
"loss": 0.6181, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.4934582241182996, |
|
"learning_rate": 6.206727371984055e-06, |
|
"loss": 0.6101, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.45848465100131724, |
|
"learning_rate": 6.193072060305386e-06, |
|
"loss": 0.6274, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.49225379713590917, |
|
"learning_rate": 6.17931551422582e-06, |
|
"loss": 0.6287, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.43783738072351636, |
|
"learning_rate": 6.165458250863233e-06, |
|
"loss": 0.6322, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.45111919610212603, |
|
"learning_rate": 6.15150079112153e-06, |
|
"loss": 0.6343, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.7283719867926337, |
|
"learning_rate": 6.137443659671066e-06, |
|
"loss": 0.6245, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.4317614230374671, |
|
"learning_rate": 6.123287384928924e-06, |
|
"loss": 0.6252, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.43630742763076885, |
|
"learning_rate": 6.1090324990390505e-06, |
|
"loss": 0.6281, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.49179102646470696, |
|
"learning_rate": 6.09467953785225e-06, |
|
"loss": 0.6304, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.4269421327683836, |
|
"learning_rate": 6.080229040906045e-06, |
|
"loss": 0.6205, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.44873848635658836, |
|
"learning_rate": 6.065681551404392e-06, |
|
"loss": 0.6203, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.43522811508044484, |
|
"learning_rate": 6.051037616197267e-06, |
|
"loss": 0.6233, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.43363424076560303, |
|
"learning_rate": 6.036297785760099e-06, |
|
"loss": 0.6274, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.4420787259752861, |
|
"learning_rate": 6.0214626141730895e-06, |
|
"loss": 0.6388, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.445119846862499, |
|
"learning_rate": 6.006532659100377e-06, |
|
"loss": 0.6107, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.4380767674114949, |
|
"learning_rate": 5.991508481769071e-06, |
|
"loss": 0.6341, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.44003117819419657, |
|
"learning_rate": 5.976390646948166e-06, |
|
"loss": 0.6344, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.45806509086322245, |
|
"learning_rate": 5.961179722927302e-06, |
|
"loss": 0.6283, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.4545928600817147, |
|
"learning_rate": 5.9458762814954016e-06, |
|
"loss": 0.6254, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.4438181707408447, |
|
"learning_rate": 5.930480897919185e-06, |
|
"loss": 0.631, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.44695115171581695, |
|
"learning_rate": 5.9149941509215366e-06, |
|
"loss": 0.6338, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.4280430227739119, |
|
"learning_rate": 5.899416622659754e-06, |
|
"loss": 0.6182, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.458726186518369, |
|
"learning_rate": 5.883748898703666e-06, |
|
"loss": 0.6162, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.43445566304338457, |
|
"learning_rate": 5.8679915680136155e-06, |
|
"loss": 0.6228, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.44895947980462597, |
|
"learning_rate": 5.852145222918326e-06, |
|
"loss": 0.6373, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.43403817083393664, |
|
"learning_rate": 5.83621045909263e-06, |
|
"loss": 0.6376, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.4673939224968789, |
|
"learning_rate": 5.820187875535083e-06, |
|
"loss": 0.6215, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.46323588428022766, |
|
"learning_rate": 5.804078074545439e-06, |
|
"loss": 0.6187, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.4530033509696719, |
|
"learning_rate": 5.7878816617020204e-06, |
|
"loss": 0.6239, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.4317929663828983, |
|
"learning_rate": 5.771599245838943e-06, |
|
"loss": 0.6168, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.436592310414347, |
|
"learning_rate": 5.7552314390232364e-06, |
|
"loss": 0.6179, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.4702835623046126, |
|
"learning_rate": 5.738778856531832e-06, |
|
"loss": 0.6272, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.4619318889613922, |
|
"learning_rate": 5.72224211682844e-06, |
|
"loss": 0.6256, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.49429029776316813, |
|
"learning_rate": 5.705621841540292e-06, |
|
"loss": 0.6283, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.47054367378052575, |
|
"learning_rate": 5.688918655434783e-06, |
|
"loss": 0.6156, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.45638233691668284, |
|
"learning_rate": 5.67213318639598e-06, |
|
"loss": 0.6257, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.43819489071261747, |
|
"learning_rate": 5.655266065401021e-06, |
|
"loss": 0.6255, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.45603698357049277, |
|
"learning_rate": 5.638317926496398e-06, |
|
"loss": 0.6267, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.45518318702227223, |
|
"learning_rate": 5.6212894067741176e-06, |
|
"loss": 0.6357, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.4402683023420712, |
|
"learning_rate": 5.604181146347758e-06, |
|
"loss": 0.6311, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.4498808898227514, |
|
"learning_rate": 5.5869937883284065e-06, |
|
"loss": 0.6213, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.46040698115780887, |
|
"learning_rate": 5.569727978800478e-06, |
|
"loss": 0.6223, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.44168864627397236, |
|
"learning_rate": 5.552384366797435e-06, |
|
"loss": 0.6268, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.45494321235524204, |
|
"learning_rate": 5.534963604277388e-06, |
|
"loss": 0.6193, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.44543538788588954, |
|
"learning_rate": 5.517466346098587e-06, |
|
"loss": 0.6311, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.45370006917207745, |
|
"learning_rate": 5.4998932499948055e-06, |
|
"loss": 0.6263, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.4457705866746906, |
|
"learning_rate": 5.482244976550616e-06, |
|
"loss": 0.6267, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.44178347775287935, |
|
"learning_rate": 5.464522189176559e-06, |
|
"loss": 0.6168, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.4510685099498634, |
|
"learning_rate": 5.446725554084202e-06, |
|
"loss": 0.6071, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.4463056440103558, |
|
"learning_rate": 5.4288557402611e-06, |
|
"loss": 0.6193, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.4450825773000299, |
|
"learning_rate": 5.410913419445647e-06, |
|
"loss": 0.6114, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.4609214677792106, |
|
"learning_rate": 5.3928992661018194e-06, |
|
"loss": 0.6255, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.48687583594807843, |
|
"learning_rate": 5.374813957393832e-06, |
|
"loss": 0.6286, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.47549284042607015, |
|
"learning_rate": 5.356658173160674e-06, |
|
"loss": 0.6143, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.49532165280916113, |
|
"learning_rate": 5.338432595890562e-06, |
|
"loss": 0.6249, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.45253915740067313, |
|
"learning_rate": 5.320137910695275e-06, |
|
"loss": 0.6257, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.43721435814923637, |
|
"learning_rate": 5.301774805284408e-06, |
|
"loss": 0.6178, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.4683301857922748, |
|
"learning_rate": 5.2833439699395175e-06, |
|
"loss": 0.6173, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.43871464981194036, |
|
"learning_rate": 5.264846097488175e-06, |
|
"loss": 0.6214, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.4524085111628937, |
|
"learning_rate": 5.246281883277922e-06, |
|
"loss": 0.6346, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.4468406698869542, |
|
"learning_rate": 5.227652025150132e-06, |
|
"loss": 0.614, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.468252187542662, |
|
"learning_rate": 5.208957223413776e-06, |
|
"loss": 0.6057, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.46458186348478814, |
|
"learning_rate": 5.1901981808191e-06, |
|
"loss": 0.6192, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.4589397282179608, |
|
"learning_rate": 5.1713756025312095e-06, |
|
"loss": 0.6197, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.4733441471283767, |
|
"learning_rate": 5.1524901961035555e-06, |
|
"loss": 0.6146, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.49573981085967583, |
|
"learning_rate": 5.1335426714513436e-06, |
|
"loss": 0.6205, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.45753588591278177, |
|
"learning_rate": 5.114533740824848e-06, |
|
"loss": 0.6194, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.44981584915327405, |
|
"learning_rate": 5.095464118782631e-06, |
|
"loss": 0.6285, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.44941448245640475, |
|
"learning_rate": 5.076334522164687e-06, |
|
"loss": 0.6183, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.46348841235648264, |
|
"learning_rate": 5.057145670065498e-06, |
|
"loss": 0.6178, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.4819885899523623, |
|
"learning_rate": 5.037898283806995e-06, |
|
"loss": 0.6209, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.45974762343297226, |
|
"learning_rate": 5.018593086911453e-06, |
|
"loss": 0.6144, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.4832719455105882, |
|
"learning_rate": 4.999230805074284e-06, |
|
"loss": 0.6255, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.4580501245903807, |
|
"learning_rate": 4.979812166136764e-06, |
|
"loss": 0.622, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.4869292416366864, |
|
"learning_rate": 4.960337900058668e-06, |
|
"loss": 0.6295, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.44734991176527494, |
|
"learning_rate": 4.940808738890834e-06, |
|
"loss": 0.61, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.4836741219786191, |
|
"learning_rate": 4.921225416747647e-06, |
|
"loss": 0.6131, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.43868937063180397, |
|
"learning_rate": 4.901588669779433e-06, |
|
"loss": 0.6261, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.4549440779907735, |
|
"learning_rate": 4.881899236144797e-06, |
|
"loss": 0.6216, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.4561309327019534, |
|
"learning_rate": 4.862157855982875e-06, |
|
"loss": 0.6262, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.4521274007767562, |
|
"learning_rate": 4.8423652713855e-06, |
|
"loss": 0.6214, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.4876373591113174, |
|
"learning_rate": 4.822522226369323e-06, |
|
"loss": 0.6303, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.4403247558369275, |
|
"learning_rate": 4.802629466847827e-06, |
|
"loss": 0.6236, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.4392883872725244, |
|
"learning_rate": 4.782687740603308e-06, |
|
"loss": 0.6125, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.44359149108855517, |
|
"learning_rate": 4.762697797258742e-06, |
|
"loss": 0.6208, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.45892783125410747, |
|
"learning_rate": 4.742660388249629e-06, |
|
"loss": 0.6146, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.46353318895549067, |
|
"learning_rate": 4.722576266795729e-06, |
|
"loss": 0.6199, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.4642990741363008, |
|
"learning_rate": 4.702446187872758e-06, |
|
"loss": 0.6182, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.44827792507065956, |
|
"learning_rate": 4.682270908184003e-06, |
|
"loss": 0.6246, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.45544933714150454, |
|
"learning_rate": 4.662051186131876e-06, |
|
"loss": 0.6256, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.4485500362120205, |
|
"learning_rate": 4.641787781789412e-06, |
|
"loss": 0.6181, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.42631048877270405, |
|
"learning_rate": 4.6214814568716894e-06, |
|
"loss": 0.6331, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.4714279586473698, |
|
"learning_rate": 4.601132974707202e-06, |
|
"loss": 0.628, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.4228608375782349, |
|
"learning_rate": 4.5807431002091605e-06, |
|
"loss": 0.6054, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.46872660848782277, |
|
"learning_rate": 4.560312599846746e-06, |
|
"loss": 0.6102, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.4379038714391558, |
|
"learning_rate": 4.539842241616287e-06, |
|
"loss": 0.6143, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.4719919574560488, |
|
"learning_rate": 4.519332795012404e-06, |
|
"loss": 0.6197, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.4560470541146194, |
|
"learning_rate": 4.498785030999068e-06, |
|
"loss": 0.6132, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.48502107778992737, |
|
"learning_rate": 4.478199721980633e-06, |
|
"loss": 0.631, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.45288928959662245, |
|
"learning_rate": 4.457577641772792e-06, |
|
"loss": 0.6148, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.45740004712492455, |
|
"learning_rate": 4.436919565573495e-06, |
|
"loss": 0.613, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.4680089016865197, |
|
"learning_rate": 4.416226269933802e-06, |
|
"loss": 0.6109, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.4498754217059588, |
|
"learning_rate": 4.395498532728697e-06, |
|
"loss": 0.627, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.490510820257092, |
|
"learning_rate": 4.374737133127847e-06, |
|
"loss": 0.6287, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.4384793154811805, |
|
"learning_rate": 4.35394285156631e-06, |
|
"loss": 0.6265, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.42053564372682345, |
|
"learning_rate": 4.3331164697151995e-06, |
|
"loss": 0.6123, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.44499220286710817, |
|
"learning_rate": 4.3122587704523015e-06, |
|
"loss": 0.6196, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.4681953108721627, |
|
"learning_rate": 4.291370537832641e-06, |
|
"loss": 0.6301, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.4245150987038812, |
|
"learning_rate": 4.2704525570590185e-06, |
|
"loss": 0.6203, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.4738423212960381, |
|
"learning_rate": 4.2495056144524824e-06, |
|
"loss": 0.6159, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.49926406862961464, |
|
"learning_rate": 4.228530497422779e-06, |
|
"loss": 0.6193, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.4423739374256911, |
|
"learning_rate": 4.207527994438748e-06, |
|
"loss": 0.617, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.44692873617751755, |
|
"learning_rate": 4.186498894998689e-06, |
|
"loss": 0.6135, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.4358994979972626, |
|
"learning_rate": 4.165443989600678e-06, |
|
"loss": 0.6121, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.46452930431844286, |
|
"learning_rate": 4.144364069712854e-06, |
|
"loss": 0.6167, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.4816111574015236, |
|
"learning_rate": 4.123259927743669e-06, |
|
"loss": 0.6203, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.45232518080467465, |
|
"learning_rate": 4.102132357012098e-06, |
|
"loss": 0.6199, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.45515782747165817, |
|
"learning_rate": 4.08098215171782e-06, |
|
"loss": 0.6174, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.44933646029392305, |
|
"learning_rate": 4.059810106911363e-06, |
|
"loss": 0.6188, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.45633219759975596, |
|
"learning_rate": 4.038617018464217e-06, |
|
"loss": 0.6168, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.4663774750339656, |
|
"learning_rate": 4.017403683038914e-06, |
|
"loss": 0.6199, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.4565589400061048, |
|
"learning_rate": 3.996170898059087e-06, |
|
"loss": 0.6187, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.45638098232431645, |
|
"learning_rate": 3.97491946167949e-06, |
|
"loss": 0.6133, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.4330737687010161, |
|
"learning_rate": 3.9536501727559956e-06, |
|
"loss": 0.6179, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.44620897297773393, |
|
"learning_rate": 3.932363830815563e-06, |
|
"loss": 0.606, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.4727298461430969, |
|
"learning_rate": 3.911061236026192e-06, |
|
"loss": 0.5804, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.5332182751767908, |
|
"learning_rate": 3.889743189166831e-06, |
|
"loss": 0.5552, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.471875496548638, |
|
"learning_rate": 3.868410491597286e-06, |
|
"loss": 0.5467, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.4869637805163024, |
|
"learning_rate": 3.847063945228094e-06, |
|
"loss": 0.5691, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.4714418364302173, |
|
"learning_rate": 3.825704352490375e-06, |
|
"loss": 0.5788, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.49636094733662106, |
|
"learning_rate": 3.804332516305672e-06, |
|
"loss": 0.5583, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.48087980189754664, |
|
"learning_rate": 3.782949240055768e-06, |
|
"loss": 0.5632, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.4873147689537464, |
|
"learning_rate": 3.7615553275524852e-06, |
|
"loss": 0.5602, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.4603275098510104, |
|
"learning_rate": 3.74015158300747e-06, |
|
"loss": 0.5641, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.5162191764305892, |
|
"learning_rate": 3.7187388110019604e-06, |
|
"loss": 0.5628, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.49005627074608765, |
|
"learning_rate": 3.697317816456546e-06, |
|
"loss": 0.559, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.4585568665283943, |
|
"learning_rate": 3.6758894046009037e-06, |
|
"loss": 0.547, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 0.4506260874603515, |
|
"learning_rate": 3.6544543809435346e-06, |
|
"loss": 0.5433, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.46595533436834136, |
|
"learning_rate": 3.6330135512414822e-06, |
|
"loss": 0.5666, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.4690150184503375, |
|
"learning_rate": 3.6115677214700397e-06, |
|
"loss": 0.5596, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 0.4683369095498927, |
|
"learning_rate": 3.5901176977924606e-06, |
|
"loss": 0.5458, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.4710288608351933, |
|
"learning_rate": 3.568664286529646e-06, |
|
"loss": 0.5507, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.4928542807361932, |
|
"learning_rate": 3.5472082941298433e-06, |
|
"loss": 0.5665, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.4972921543225756, |
|
"learning_rate": 3.5257505271383217e-06, |
|
"loss": 0.5586, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.4855107426051562, |
|
"learning_rate": 3.504291792167063e-06, |
|
"loss": 0.5615, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.4623236179613674, |
|
"learning_rate": 3.4828328958644326e-06, |
|
"loss": 0.5638, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.46028741167647896, |
|
"learning_rate": 3.4613746448848622e-06, |
|
"loss": 0.5464, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.46156508300115645, |
|
"learning_rate": 3.439917845858524e-06, |
|
"loss": 0.567, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.5669489602625127, |
|
"learning_rate": 3.418463305361013e-06, |
|
"loss": 0.5524, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.49099941076825016, |
|
"learning_rate": 3.3970118298830207e-06, |
|
"loss": 0.5591, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.5207064606888653, |
|
"learning_rate": 3.3755642258000265e-06, |
|
"loss": 0.5538, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.4830219809120518, |
|
"learning_rate": 3.3541212993419773e-06, |
|
"loss": 0.5475, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.4801836621711601, |
|
"learning_rate": 3.3326838565629895e-06, |
|
"loss": 0.5413, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.47387958333244534, |
|
"learning_rate": 3.31125270331104e-06, |
|
"loss": 0.5537, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.5090490511350312, |
|
"learning_rate": 3.289828645197681e-06, |
|
"loss": 0.5567, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.5286353188714713, |
|
"learning_rate": 3.2684124875677518e-06, |
|
"loss": 0.5589, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.4927074981163475, |
|
"learning_rate": 3.247005035469109e-06, |
|
"loss": 0.5697, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.47340856305327644, |
|
"learning_rate": 3.2256070936223603e-06, |
|
"loss": 0.5687, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.5115028667136483, |
|
"learning_rate": 3.2042194663906193e-06, |
|
"loss": 0.5625, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.4723602653535651, |
|
"learning_rate": 3.182842957749263e-06, |
|
"loss": 0.5633, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.4679538952450783, |
|
"learning_rate": 3.1614783712557156e-06, |
|
"loss": 0.5572, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.48051919166640805, |
|
"learning_rate": 3.1401265100192383e-06, |
|
"loss": 0.5648, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.4594423765819446, |
|
"learning_rate": 3.1187881766707425e-06, |
|
"loss": 0.5595, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.49220125939314296, |
|
"learning_rate": 3.0974641733326154e-06, |
|
"loss": 0.5479, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.4944265110257382, |
|
"learning_rate": 3.0761553015885717e-06, |
|
"loss": 0.5502, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.495744161270211, |
|
"learning_rate": 3.0548623624535165e-06, |
|
"loss": 0.5629, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.478561888744776, |
|
"learning_rate": 3.0335861563434403e-06, |
|
"loss": 0.5597, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.4946624980435279, |
|
"learning_rate": 3.012327483045325e-06, |
|
"loss": 0.556, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.4913013156645444, |
|
"learning_rate": 2.9910871416870855e-06, |
|
"loss": 0.5638, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.46629667333688474, |
|
"learning_rate": 2.9698659307075224e-06, |
|
"loss": 0.5508, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.47577400823898375, |
|
"learning_rate": 2.948664647826318e-06, |
|
"loss": 0.5518, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.48528006049817207, |
|
"learning_rate": 2.9274840900140375e-06, |
|
"loss": 0.5582, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.5499143301618472, |
|
"learning_rate": 2.906325053462181e-06, |
|
"loss": 0.548, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.4772816560553211, |
|
"learning_rate": 2.8851883335532496e-06, |
|
"loss": 0.5523, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.49887071761697505, |
|
"learning_rate": 2.8640747248308445e-06, |
|
"loss": 0.5544, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.4853842631362592, |
|
"learning_rate": 2.8429850209698053e-06, |
|
"loss": 0.5558, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.45895465861964546, |
|
"learning_rate": 2.8219200147463677e-06, |
|
"loss": 0.5598, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.4662802877247775, |
|
"learning_rate": 2.8008804980083695e-06, |
|
"loss": 0.5551, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.4881083174435456, |
|
"learning_rate": 2.7798672616454785e-06, |
|
"loss": 0.5511, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.5016617932642891, |
|
"learning_rate": 2.75888109555947e-06, |
|
"loss": 0.5438, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.4831166076149674, |
|
"learning_rate": 2.7379227886345244e-06, |
|
"loss": 0.5598, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.4953933886035155, |
|
"learning_rate": 2.716993128707581e-06, |
|
"loss": 0.5609, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.503170266490847, |
|
"learning_rate": 2.696092902538716e-06, |
|
"loss": 0.5488, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.5098380667106547, |
|
"learning_rate": 2.675222895781574e-06, |
|
"loss": 0.5539, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.49948084086860606, |
|
"learning_rate": 2.6543838929538285e-06, |
|
"loss": 0.5581, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.4872613273522286, |
|
"learning_rate": 2.6335766774076965e-06, |
|
"loss": 0.5562, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.47926716145131487, |
|
"learning_rate": 2.6128020313004875e-06, |
|
"loss": 0.5561, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.49339314189894584, |
|
"learning_rate": 2.592060735565206e-06, |
|
"loss": 0.5633, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.4888816777932096, |
|
"learning_rate": 2.5713535698811926e-06, |
|
"loss": 0.5623, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.47873225411797143, |
|
"learning_rate": 2.550681312644815e-06, |
|
"loss": 0.5629, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.4985498589688127, |
|
"learning_rate": 2.5300447409402104e-06, |
|
"loss": 0.5517, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.4699404709889953, |
|
"learning_rate": 2.509444630510071e-06, |
|
"loss": 0.5542, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.5471742855253533, |
|
"learning_rate": 2.4888817557264883e-06, |
|
"loss": 0.5573, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.4890601716460387, |
|
"learning_rate": 2.468356889561835e-06, |
|
"loss": 0.5496, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.4884550896007432, |
|
"learning_rate": 2.4478708035597206e-06, |
|
"loss": 0.5517, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.53082092791935, |
|
"learning_rate": 2.427424267805977e-06, |
|
"loss": 0.5643, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.4588900957688972, |
|
"learning_rate": 2.407018050899719e-06, |
|
"loss": 0.5588, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.4930240761419014, |
|
"learning_rate": 2.3866529199244454e-06, |
|
"loss": 0.5534, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.4995410840918172, |
|
"learning_rate": 2.36632964041921e-06, |
|
"loss": 0.5526, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.4889682103736911, |
|
"learning_rate": 2.3460489763498393e-06, |
|
"loss": 0.5575, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.47254332660748083, |
|
"learning_rate": 2.3258116900802188e-06, |
|
"loss": 0.5641, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.5271806756431864, |
|
"learning_rate": 2.3056185423436304e-06, |
|
"loss": 0.5515, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.5014716634327129, |
|
"learning_rate": 2.2854702922141627e-06, |
|
"loss": 0.5578, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.48930981901485066, |
|
"learning_rate": 2.265367697078168e-06, |
|
"loss": 0.5648, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.4822043988267899, |
|
"learning_rate": 2.245311512605801e-06, |
|
"loss": 0.5554, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.4978119631671631, |
|
"learning_rate": 2.2253024927226053e-06, |
|
"loss": 0.5586, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.49756480432664524, |
|
"learning_rate": 2.2053413895811764e-06, |
|
"loss": 0.5578, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.4671920108876918, |
|
"learning_rate": 2.1854289535328864e-06, |
|
"loss": 0.5557, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.513655855548841, |
|
"learning_rate": 2.165565933099682e-06, |
|
"loss": 0.5589, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.46274876339767745, |
|
"learning_rate": 2.1457530749459373e-06, |
|
"loss": 0.5588, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.48340392958868733, |
|
"learning_rate": 2.1259911238503988e-06, |
|
"loss": 0.5481, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.5024001177410511, |
|
"learning_rate": 2.1062808226781767e-06, |
|
"loss": 0.5604, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.4794062865649958, |
|
"learning_rate": 2.0866229123528305e-06, |
|
"loss": 0.552, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.49502474291815657, |
|
"learning_rate": 2.0670181318285076e-06, |
|
"loss": 0.5526, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.4912138589836612, |
|
"learning_rate": 2.0474672180621754e-06, |
|
"loss": 0.5433, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.46287983551015915, |
|
"learning_rate": 2.027970905985908e-06, |
|
"loss": 0.5607, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.4818908530273005, |
|
"learning_rate": 2.008529928479269e-06, |
|
"loss": 0.5552, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.49475825963312386, |
|
"learning_rate": 1.9891450163417574e-06, |
|
"loss": 0.5473, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.5090335613659759, |
|
"learning_rate": 1.9698168982653334e-06, |
|
"loss": 0.5469, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.48712846229296525, |
|
"learning_rate": 1.950546300807037e-06, |
|
"loss": 0.5526, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.5087151308068611, |
|
"learning_rate": 1.931333948361664e-06, |
|
"loss": 0.563, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.4770122954574883, |
|
"learning_rate": 1.9121805631345406e-06, |
|
"loss": 0.5588, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.49875337542296333, |
|
"learning_rate": 1.8930868651143776e-06, |
|
"loss": 0.5556, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.46661280379905284, |
|
"learning_rate": 1.8740535720462034e-06, |
|
"loss": 0.5518, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.49444595207088565, |
|
"learning_rate": 1.8550813994043814e-06, |
|
"loss": 0.5679, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.48381227476419236, |
|
"learning_rate": 1.8361710603657162e-06, |
|
"loss": 0.5572, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.5055312948711096, |
|
"learning_rate": 1.8173232657826508e-06, |
|
"loss": 0.5538, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.4686625212413926, |
|
"learning_rate": 1.7985387241565343e-06, |
|
"loss": 0.559, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.4804255341689684, |
|
"learning_rate": 1.7798181416109966e-06, |
|
"loss": 0.544, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.5090131219052505, |
|
"learning_rate": 1.7611622218654e-06, |
|
"loss": 0.5565, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.4823380469403731, |
|
"learning_rate": 1.7425716662083936e-06, |
|
"loss": 0.5586, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.5039478306212927, |
|
"learning_rate": 1.7240471734715416e-06, |
|
"loss": 0.5582, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.48106143586192984, |
|
"learning_rate": 1.7055894400030597e-06, |
|
"loss": 0.5527, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.4948095621947108, |
|
"learning_rate": 1.6871991596416367e-06, |
|
"loss": 0.5534, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.47985601211032985, |
|
"learning_rate": 1.668877023690356e-06, |
|
"loss": 0.5514, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.5044751224020304, |
|
"learning_rate": 1.6506237208907045e-06, |
|
"loss": 0.5541, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.5080452899508979, |
|
"learning_rate": 1.6324399373966833e-06, |
|
"loss": 0.5506, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.4931986436565961, |
|
"learning_rate": 1.6143263567490192e-06, |
|
"loss": 0.5736, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.4684816221900875, |
|
"learning_rate": 1.596283659849464e-06, |
|
"loss": 0.556, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.4785014812413059, |
|
"learning_rate": 1.5783125249352016e-06, |
|
"loss": 0.5579, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.5116019647376474, |
|
"learning_rate": 1.5604136275533513e-06, |
|
"loss": 0.5552, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.5395436792240803, |
|
"learning_rate": 1.5425876405355793e-06, |
|
"loss": 0.5384, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.4900436595350879, |
|
"learning_rate": 1.5248352339727968e-06, |
|
"loss": 0.5622, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.47513280378884526, |
|
"learning_rate": 1.5071570751899785e-06, |
|
"loss": 0.5636, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.4839906292088417, |
|
"learning_rate": 1.4895538287210727e-06, |
|
"loss": 0.5527, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.5376958097507211, |
|
"learning_rate": 1.4720261562840272e-06, |
|
"loss": 0.5635, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.48771290149288943, |
|
"learning_rate": 1.4545747167559066e-06, |
|
"loss": 0.564, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.4854524808894032, |
|
"learning_rate": 1.4372001661481314e-06, |
|
"loss": 0.5598, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.4700143505212195, |
|
"learning_rate": 1.4199031575818126e-06, |
|
"loss": 0.5375, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.4915439052479703, |
|
"learning_rate": 1.4026843412632083e-06, |
|
"loss": 0.5548, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.4869720592283153, |
|
"learning_rate": 1.385544364459273e-06, |
|
"loss": 0.5571, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.4716126280570366, |
|
"learning_rate": 1.3684838714733317e-06, |
|
"loss": 0.5516, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.4965381533290548, |
|
"learning_rate": 1.3515035036208578e-06, |
|
"loss": 0.5578, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.49674828915458996, |
|
"learning_rate": 1.3346038992053705e-06, |
|
"loss": 0.5498, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.47680857026122736, |
|
"learning_rate": 1.3177856934944328e-06, |
|
"loss": 0.5531, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.4870948629881832, |
|
"learning_rate": 1.3010495186957768e-06, |
|
"loss": 0.552, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.483089196852953, |
|
"learning_rate": 1.2843960039335355e-06, |
|
"loss": 0.5564, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.5140997811965615, |
|
"learning_rate": 1.2678257752245992e-06, |
|
"loss": 0.5504, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.4779902409617231, |
|
"learning_rate": 1.2513394554550753e-06, |
|
"loss": 0.5478, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.47680861915825756, |
|
"learning_rate": 1.2349376643568792e-06, |
|
"loss": 0.5555, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.47618772244534097, |
|
"learning_rate": 1.218621018484434e-06, |
|
"loss": 0.5509, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.46991117646305874, |
|
"learning_rate": 1.202390131191501e-06, |
|
"loss": 0.5572, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.48145576248836425, |
|
"learning_rate": 1.1862456126081136e-06, |
|
"loss": 0.562, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.49862994419451123, |
|
"learning_rate": 1.170188069617649e-06, |
|
"loss": 0.5574, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.5025682535998525, |
|
"learning_rate": 1.1542181058340122e-06, |
|
"loss": 0.5569, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.47850092658350835, |
|
"learning_rate": 1.1383363215789488e-06, |
|
"loss": 0.5543, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.5044422425999335, |
|
"learning_rate": 1.1225433138594741e-06, |
|
"loss": 0.5599, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.47419325850109234, |
|
"learning_rate": 1.1068396763454339e-06, |
|
"loss": 0.5586, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.560597143802205, |
|
"learning_rate": 1.0912259993471857e-06, |
|
"loss": 0.5524, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.5148468793364267, |
|
"learning_rate": 1.0757028697934152e-06, |
|
"loss": 0.5084, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.5017714203601242, |
|
"learning_rate": 1.060270871209064e-06, |
|
"loss": 0.5156, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.49357251631602217, |
|
"learning_rate": 1.0449305836934003e-06, |
|
"loss": 0.5109, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.4936913138076729, |
|
"learning_rate": 1.02968258389821e-06, |
|
"loss": 0.5158, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.5049259973539401, |
|
"learning_rate": 1.0145274450061254e-06, |
|
"loss": 0.5217, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.517079836314341, |
|
"learning_rate": 9.994657367090686e-07, |
|
"loss": 0.5136, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.4837364294449262, |
|
"learning_rate": 9.844980251868449e-07, |
|
"loss": 0.518, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.4869343961795407, |
|
"learning_rate": 9.696248730858605e-07, |
|
"loss": 0.5132, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.5085658265111329, |
|
"learning_rate": 9.54846839497964e-07, |
|
"loss": 0.5165, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.47424129042024027, |
|
"learning_rate": 9.401644799394382e-07, |
|
"loss": 0.5215, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.4991885159298539, |
|
"learning_rate": 9.255783463301111e-07, |
|
"loss": 0.5092, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.47972707851164975, |
|
"learning_rate": 9.110889869726167e-07, |
|
"loss": 0.5289, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.48477312158187885, |
|
"learning_rate": 8.966969465317753e-07, |
|
"loss": 0.5373, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.5150113149802942, |
|
"learning_rate": 8.824027660141253e-07, |
|
"loss": 0.5144, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.5012820847152873, |
|
"learning_rate": 8.682069827475828e-07, |
|
"loss": 0.5232, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.536197598669663, |
|
"learning_rate": 8.541101303612473e-07, |
|
"loss": 0.5312, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.47456874746453287, |
|
"learning_rate": 8.401127387653379e-07, |
|
"loss": 0.5021, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.5022494921077733, |
|
"learning_rate": 8.262153341312734e-07, |
|
"loss": 0.5039, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.5128622291867768, |
|
"learning_rate": 8.124184388719e-07, |
|
"loss": 0.5189, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.49970434341288505, |
|
"learning_rate": 7.987225716218441e-07, |
|
"loss": 0.5266, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.4990813361708124, |
|
"learning_rate": 7.851282472180222e-07, |
|
"loss": 0.5189, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.5361324180050252, |
|
"learning_rate": 7.716359766802858e-07, |
|
"loss": 0.5283, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.49325303865409753, |
|
"learning_rate": 7.582462671922154e-07, |
|
"loss": 0.5134, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.5074499214352016, |
|
"learning_rate": 7.449596220820492e-07, |
|
"loss": 0.5219, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.48687866167974014, |
|
"learning_rate": 7.317765408037668e-07, |
|
"loss": 0.5131, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.5209017279406115, |
|
"learning_rate": 7.186975189183119e-07, |
|
"loss": 0.5263, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.5017929271897994, |
|
"learning_rate": 7.057230480749689e-07, |
|
"loss": 0.5221, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.4909543768911595, |
|
"learning_rate": 6.928536159928746e-07, |
|
"loss": 0.5082, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 0.5217040631589964, |
|
"learning_rate": 6.800897064426877e-07, |
|
"loss": 0.5136, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 0.5007485735211247, |
|
"learning_rate": 6.674317992284038e-07, |
|
"loss": 0.5158, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.495432605404129, |
|
"learning_rate": 6.548803701693218e-07, |
|
"loss": 0.5191, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.5457479536125451, |
|
"learning_rate": 6.424358910821511e-07, |
|
"loss": 0.5144, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.5106414076169086, |
|
"learning_rate": 6.300988297632804e-07, |
|
"loss": 0.5288, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.5211736668510725, |
|
"learning_rate": 6.178696499711915e-07, |
|
"loss": 0.5218, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.4891406143758845, |
|
"learning_rate": 6.057488114090288e-07, |
|
"loss": 0.5107, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.5178228254981688, |
|
"learning_rate": 5.937367697073139e-07, |
|
"loss": 0.5004, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.49831173988741256, |
|
"learning_rate": 5.818339764068217e-07, |
|
"loss": 0.5167, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 0.5445792027132667, |
|
"learning_rate": 5.700408789416051e-07, |
|
"loss": 0.5251, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.5412064520692698, |
|
"learning_rate": 5.58357920622179e-07, |
|
"loss": 0.5185, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.5194173017222409, |
|
"learning_rate": 5.467855406188503e-07, |
|
"loss": 0.5213, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.530585691377951, |
|
"learning_rate": 5.353241739452134e-07, |
|
"loss": 0.5213, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.5334266089134705, |
|
"learning_rate": 5.239742514417958e-07, |
|
"loss": 0.5213, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.5323190599173516, |
|
"learning_rate": 5.127361997598647e-07, |
|
"loss": 0.5173, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.4977075988891876, |
|
"learning_rate": 5.016104413453866e-07, |
|
"loss": 0.5163, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.5072133518376746, |
|
"learning_rate": 4.905973944231479e-07, |
|
"loss": 0.5147, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.5089446326634548, |
|
"learning_rate": 4.796974729810328e-07, |
|
"loss": 0.5206, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.5173579821056443, |
|
"learning_rate": 4.6891108675446453e-07, |
|
"loss": 0.5233, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.49509093398735665, |
|
"learning_rate": 4.5823864121099967e-07, |
|
"loss": 0.5143, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.510739525920679, |
|
"learning_rate": 4.476805375350865e-07, |
|
"loss": 0.5204, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.5285640385275354, |
|
"learning_rate": 4.372371726129854e-07, |
|
"loss": 0.5226, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.49804779846917624, |
|
"learning_rate": 4.269089390178512e-07, |
|
"loss": 0.5257, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.4960403523798791, |
|
"learning_rate": 4.1669622499497205e-07, |
|
"loss": 0.5224, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.509776799973484, |
|
"learning_rate": 4.0659941444717833e-07, |
|
"loss": 0.5153, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.48108044641737857, |
|
"learning_rate": 3.966188869204094e-07, |
|
"loss": 0.5175, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.5141883943099625, |
|
"learning_rate": 3.8675501758944926e-07, |
|
"loss": 0.5147, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.5086149236998669, |
|
"learning_rate": 3.7700817724381983e-07, |
|
"loss": 0.5128, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.5107670739104685, |
|
"learning_rate": 3.6737873227384263e-07, |
|
"loss": 0.5162, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.48090817905611477, |
|
"learning_rate": 3.578670446568711e-07, |
|
"loss": 0.5289, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.5149098967385166, |
|
"learning_rate": 3.484734719436782e-07, |
|
"loss": 0.5224, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.4967090096149114, |
|
"learning_rate": 3.3919836724501743e-07, |
|
"loss": 0.5064, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.49198009223776107, |
|
"learning_rate": 3.3004207921835004e-07, |
|
"loss": 0.526, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.5260886992405347, |
|
"learning_rate": 3.210049520547388e-07, |
|
"loss": 0.5278, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.49827609520509064, |
|
"learning_rate": 3.1208732546590843e-07, |
|
"loss": 0.5269, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.5199185251610714, |
|
"learning_rate": 3.0328953467147543e-07, |
|
"loss": 0.5125, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.5165139482645277, |
|
"learning_rate": 2.946119103863483e-07, |
|
"loss": 0.5095, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 0.48760733590102007, |
|
"learning_rate": 2.86054778808296e-07, |
|
"loss": 0.5262, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.49481920675979196, |
|
"learning_rate": 2.7761846160568403e-07, |
|
"loss": 0.5209, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.5017608349952136, |
|
"learning_rate": 2.69303275905384e-07, |
|
"loss": 0.5137, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.5222144874040826, |
|
"learning_rate": 2.611095342808526e-07, |
|
"loss": 0.5162, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.4928255848647095, |
|
"learning_rate": 2.530375447403815e-07, |
|
"loss": 0.5176, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.530457616289496, |
|
"learning_rate": 2.4508761071551906e-07, |
|
"loss": 0.5181, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.5147706319208548, |
|
"learning_rate": 2.3726003104966393e-07, |
|
"loss": 0.5095, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.523763253857449, |
|
"learning_rate": 2.2955509998683214e-07, |
|
"loss": 0.5108, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 0.5323084690421006, |
|
"learning_rate": 2.2197310716059603e-07, |
|
"loss": 0.511, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 0.5088461348117514, |
|
"learning_rate": 2.1451433758319543e-07, |
|
"loss": 0.5265, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.5478220673331649, |
|
"learning_rate": 2.0717907163482507e-07, |
|
"loss": 0.5112, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.5414027895276027, |
|
"learning_rate": 1.9996758505309593e-07, |
|
"loss": 0.5231, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.4983898932091525, |
|
"learning_rate": 1.9288014892266753e-07, |
|
"loss": 0.5105, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.5093531347734784, |
|
"learning_rate": 1.8591702966505952e-07, |
|
"loss": 0.5127, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.677948629367298, |
|
"learning_rate": 1.790784890286352e-07, |
|
"loss": 0.5219, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.5010683504531009, |
|
"learning_rate": 1.7236478407876555e-07, |
|
"loss": 0.5054, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.5179768835662841, |
|
"learning_rate": 1.6577616718816123e-07, |
|
"loss": 0.5251, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.5087954420227027, |
|
"learning_rate": 1.5931288602738958e-07, |
|
"loss": 0.5137, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.5083448366233918, |
|
"learning_rate": 1.5297518355556132e-07, |
|
"loss": 0.5059, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.5170972166302202, |
|
"learning_rate": 1.467632980112023e-07, |
|
"loss": 0.5214, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.5145933451855358, |
|
"learning_rate": 1.406774629032923e-07, |
|
"loss": 0.511, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.5012480980422283, |
|
"learning_rate": 1.347179070024903e-07, |
|
"loss": 0.5179, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.5157422802936725, |
|
"learning_rate": 1.2888485433253521e-07, |
|
"loss": 0.5193, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.5104197669978088, |
|
"learning_rate": 1.2317852416182378e-07, |
|
"loss": 0.5221, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.48689303934415246, |
|
"learning_rate": 1.1759913099516816e-07, |
|
"loss": 0.5118, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.5105879788600957, |
|
"learning_rate": 1.1214688456573247e-07, |
|
"loss": 0.5178, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.4742285263986786, |
|
"learning_rate": 1.0682198982714814e-07, |
|
"loss": 0.534, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.5096564376650945, |
|
"learning_rate": 1.0162464694581235e-07, |
|
"loss": 0.5272, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.5068212494030221, |
|
"learning_rate": 9.65550512933605e-08, |
|
"loss": 0.5252, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.5132873703711879, |
|
"learning_rate": 9.16133934393224e-08, |
|
"loss": 0.5161, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.496214740845792, |
|
"learning_rate": 8.67998591439612e-08, |
|
"loss": 0.518, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.5257117991696062, |
|
"learning_rate": 8.21146293512876e-08, |
|
"loss": 0.5201, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.5038613162646833, |
|
"learning_rate": 7.755788018225961e-08, |
|
"loss": 0.5439, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.5108263338716986, |
|
"learning_rate": 7.31297829281617e-08, |
|
"loss": 0.5132, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.5149498952477054, |
|
"learning_rate": 6.883050404416552e-08, |
|
"loss": 0.5111, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.5047749285108949, |
|
"learning_rate": 6.46602051430732e-08, |
|
"loss": 0.5307, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.5156795764243357, |
|
"learning_rate": 6.061904298924253e-08, |
|
"loss": 0.5285, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.5144201053701509, |
|
"learning_rate": 5.670716949269278e-08, |
|
"loss": 0.5148, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.507394331507882, |
|
"learning_rate": 5.2924731703395564e-08, |
|
"loss": 0.5206, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.48368946217469994, |
|
"learning_rate": 4.927187180574666e-08, |
|
"loss": 0.526, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.5047554925764675, |
|
"learning_rate": 4.574872711322103e-08, |
|
"loss": 0.5126, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.4949463708763226, |
|
"learning_rate": 4.2355430063211405e-08, |
|
"loss": 0.5204, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.5079311960306774, |
|
"learning_rate": 3.909210821205017e-08, |
|
"loss": 0.5189, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.4902741464423996, |
|
"learning_rate": 3.595888423021354e-08, |
|
"loss": 0.513, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.5421885848655773, |
|
"learning_rate": 3.295587589771071e-08, |
|
"loss": 0.5093, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.49756539244831294, |
|
"learning_rate": 3.008319609965676e-08, |
|
"loss": 0.5144, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.5074328229331989, |
|
"learning_rate": 2.734095282202942e-08, |
|
"loss": 0.5133, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.49772891591572227, |
|
"learning_rate": 2.4729249147608378e-08, |
|
"loss": 0.5251, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.5088034449477752, |
|
"learning_rate": 2.224818325210237e-08, |
|
"loss": 0.5175, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.4826584150965653, |
|
"learning_rate": 1.9897848400456496e-08, |
|
"loss": 0.5141, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.5172662799041124, |
|
"learning_rate": 1.7678332943348807e-08, |
|
"loss": 0.5197, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.48940063691629393, |
|
"learning_rate": 1.5589720313866794e-08, |
|
"loss": 0.5059, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 0.517098264403305, |
|
"learning_rate": 1.3632089024371574e-08, |
|
"loss": 0.5141, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.48979313956431636, |
|
"learning_rate": 1.1805512663549345e-08, |
|
"loss": 0.5136, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.48660701715860905, |
|
"learning_rate": 1.0110059893640055e-08, |
|
"loss": 0.5212, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.4841422308843411, |
|
"learning_rate": 8.54579444786152e-09, |
|
"loss": 0.5228, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.4851052180007293, |
|
"learning_rate": 7.112775128009174e-09, |
|
"loss": 0.5146, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.49400550323274894, |
|
"learning_rate": 5.811055802249721e-09, |
|
"loss": 0.5277, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.512928633478054, |
|
"learning_rate": 4.640685403093147e-09, |
|
"loss": 0.5216, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.48929257944769156, |
|
"learning_rate": 3.6017079255547534e-09, |
|
"loss": 0.5172, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.5049424795736568, |
|
"learning_rate": 2.6941624255001904e-09, |
|
"loss": 0.5147, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.5046094240590331, |
|
"learning_rate": 1.9180830181797505e-09, |
|
"loss": 0.5222, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.5035868811303936, |
|
"learning_rate": 1.273498876942558e-09, |
|
"loss": 0.511, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.48822795586505313, |
|
"learning_rate": 7.604342321435032e-10, |
|
"loss": 0.5222, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.5199444169728372, |
|
"learning_rate": 3.789083702293028e-10, |
|
"loss": 0.5236, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.528390397947652, |
|
"learning_rate": 1.2893563301535904e-10, |
|
"loss": 0.5187, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.49922315857737276, |
|
"learning_rate": 1.0525417146023396e-11, |
|
"loss": 0.5179, |
|
"step": 5620 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5624, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"total_flos": 2354981319475200.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|