|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.992429977289932, |
|
"eval_steps": 500, |
|
"global_step": 3300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03028009084027252, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001999954685354173, |
|
"loss": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06056018168054504, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001999818745523526, |
|
"loss": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09084027252081757, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019995921928281894, |
|
"loss": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12112036336109008, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019992750478004738, |
|
"loss": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1514004542013626, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001998867339183008, |
|
"loss": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18168054504163514, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019983691039261357, |
|
"loss": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21196063588190764, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001997780387184565, |
|
"loss": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24224072672218017, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019971012423132775, |
|
"loss": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27252081756245267, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019963317308626914, |
|
"loss": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3028009084027252, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019954719225730847, |
|
"loss": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3330809992429977, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019945218953682734, |
|
"loss": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3633610900832703, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019934817353485501, |
|
"loss": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3936411809235428, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001992351536782881, |
|
"loss": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4239212717638153, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019911314021003613, |
|
"loss": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.45420136260408783, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001989821441880933, |
|
"loss": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.48448145344436033, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019884217748453623, |
|
"loss": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5147615442846328, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019869325278444824, |
|
"loss": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5450416351249053, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019853538358476932, |
|
"loss": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5753217259651779, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019836858419307324, |
|
"loss": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6056018168054504, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019819286972627066, |
|
"loss": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6358819076457229, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019800825610923934, |
|
"loss": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6661619984859954, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019781476007338058, |
|
"loss": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6964420893262679, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019761239915510302, |
|
"loss": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7267221801665406, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019740119169423337, |
|
"loss": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.757002271006813, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019718115683235417, |
|
"loss": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7872823618470856, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019695231451106912, |
|
"loss": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.817562452687358, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019671468547019573, |
|
"loss": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8478425435276306, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001964682912458856, |
|
"loss": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.878122634367903, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019621315416867274, |
|
"loss": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9084027252081757, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019594929736144976, |
|
"loss": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9386828160484482, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019567674473737218, |
|
"loss": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9689629068887207, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019539552099769126, |
|
"loss": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9992429977289932, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0295230885692657, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019480716290349995, |
|
"loss": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0598031794095382, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019450008187146684, |
|
"loss": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0900832702498107, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019418443636395248, |
|
"loss": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1203633610900834, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019386025498768558, |
|
"loss": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1506434519303559, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019352756712299468, |
|
"loss": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1809235427706284, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019318640292114524, |
|
"loss": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2112036336109009, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019283679330160726, |
|
"loss": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2414837244511734, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019247876994925292, |
|
"loss": 0.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2717638152914459, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.000192112365311485, |
|
"loss": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.3020439061317184, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019173761259529633, |
|
"loss": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.3323239969719909, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001913545457642601, |
|
"loss": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3626040878122634, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019096319953545185, |
|
"loss": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.3928841786525359, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001905636093763031, |
|
"loss": 0.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.4231642694928084, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019015581150138693, |
|
"loss": 0.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.4534443603330809, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018973984286913584, |
|
"loss": 0.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.4837244511733534, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001893157411784924, |
|
"loss": 0.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5140045420136259, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018888354486549237, |
|
"loss": 0.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5442846328538986, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018844329309978145, |
|
"loss": 0.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.574564723694171, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018799502578106534, |
|
"loss": 0.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6048448145344436, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018753878353549357, |
|
"loss": 0.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.635124905374716, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018707460771197774, |
|
"loss": 0.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6654049962149886, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 0.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.6956850870552613, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018612262431802007, |
|
"loss": 0.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.7259651778955338, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001856349030251589, |
|
"loss": 0.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.7562452687358063, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001851394207016957, |
|
"loss": 0.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7865253595760788, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018463622225284242, |
|
"loss": 0.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8168054504163513, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018412535328311814, |
|
"loss": 0.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.8470855412566238, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001836068600922156, |
|
"loss": 0.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.8773656320968963, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018308078967080546, |
|
"loss": 0.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.9076457229371688, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001825471896962774, |
|
"loss": 0.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.9379258137774413, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018200610852841913, |
|
"loss": 0.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.9682059046177138, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018145759520503358, |
|
"loss": 0.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.9984859954579863, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.028766086298259, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018033847160624225, |
|
"loss": 0.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.0590461771385313, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017976796275621555, |
|
"loss": 0.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.089326267978804, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017919022459222752, |
|
"loss": 0.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.1196063588190763, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017860530947427875, |
|
"loss": 0.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.149886449659349, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017801327041281207, |
|
"loss": 0.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.1801665404996213, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017741416106390826, |
|
"loss": 0.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.210446631339894, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017680803572442318, |
|
"loss": 0.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.2407267221801668, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001761949493270671, |
|
"loss": 0.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.2710068130204393, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017557495743542585, |
|
"loss": 0.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.3012869038607118, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001749481162389254, |
|
"loss": 0.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.3315669947009843, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017431448254773944, |
|
"loss": 0.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.3618470855412568, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001736741137876405, |
|
"loss": 0.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.3921271763815293, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017302706799479574, |
|
"loss": 0.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.4224072672218018, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017237340381050703, |
|
"loss": 0.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.4526873580620743, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017171318047589637, |
|
"loss": 0.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.4829674489023468, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001710464578265369, |
|
"loss": 0.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.5132475397426193, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017037329628703004, |
|
"loss": 0.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.5435276305828918, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016969375686552937, |
|
"loss": 0.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.5738077214231643, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016900790114821122, |
|
"loss": 0.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.6040878122634368, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016831579129369346, |
|
"loss": 0.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.6343679031037093, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016761749002740193, |
|
"loss": 0.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.6646479939439818, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016691306063588583, |
|
"loss": 0.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.6949280847842543, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016620256696108188, |
|
"loss": 0.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.7252081756245268, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016548607339452853, |
|
"loss": 0.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.7554882664647993, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016476364487153023, |
|
"loss": 0.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.7857683573050718, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016403534686527225, |
|
"loss": 0.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.8160484481453443, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016330124538088705, |
|
"loss": 0.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.8463285389856168, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016256140694947217, |
|
"loss": 0.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.8766086298258893, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016181589862206052, |
|
"loss": 0.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.9068887206661618, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016106478796354382, |
|
"loss": 0.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.9371688115064343, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00016030814304654895, |
|
"loss": 0.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.9674489023467068, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001595460324452688, |
|
"loss": 0.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.9977289931869797, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 0.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.028009084027252, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015800569095711982, |
|
"loss": 0.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.0582891748675247, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015722759967030898, |
|
"loss": 0.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.088569265707797, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015644432188667695, |
|
"loss": 0.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.1188493565480697, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001556559285941344, |
|
"loss": 0.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.149129447388342, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.000154862491244207, |
|
"loss": 0.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.1794095382286147, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015406408174555976, |
|
"loss": 0.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.209689629068887, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015326077245747999, |
|
"loss": 0.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.2399697199091597, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015245263618331945, |
|
"loss": 0.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.270249810749432, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001516397461638962, |
|
"loss": 0.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.3005299015897047, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015082217607085692, |
|
"loss": 0.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.330809992429977, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.3610900832702497, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001491732924645604, |
|
"loss": 0.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.391370174110522, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014834212838845637, |
|
"loss": 0.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.4216502649507947, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001475065830994995, |
|
"loss": 0.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.451930355791067, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014666673232256738, |
|
"loss": 0.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.4822104466313397, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014582265217274104, |
|
"loss": 0.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.5124905374716127, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001449744191484066, |
|
"loss": 0.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.542770628311885, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014412211012432212, |
|
"loss": 0.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.5730507191521577, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014326580234465085, |
|
"loss": 0.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.60333080999243, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014240557341596018, |
|
"loss": 0.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.6336109008327027, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 0.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.663890991672975, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014067366430758004, |
|
"loss": 0.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.6941710825132477, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013980214108958624, |
|
"loss": 0.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.72445117335352, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013892701063173918, |
|
"loss": 0.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.7547312641937927, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001380483522464923, |
|
"loss": 0.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.785011355034065, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013716624556603274, |
|
"loss": 0.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.8152914458743377, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001362807705350641, |
|
"loss": 0.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.84557153671461, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013539200740356118, |
|
"loss": 0.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.8758516275548827, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013450003671949706, |
|
"loss": 0.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.906131718395155, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013360493932154302, |
|
"loss": 0.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 3.9364118092354277, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013270679633174218, |
|
"loss": 0.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.9666919000757, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013180568914815752, |
|
"loss": 0.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 3.9969719909159727, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.027252081756245, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012999490912770107, |
|
"loss": 0.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 4.057532172596518, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001290854004005399, |
|
"loss": 0.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.08781226343679, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012817325568414297, |
|
"loss": 0.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 4.118092354277063, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001272585576455398, |
|
"loss": 0.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 4.148372445117335, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012634138918316568, |
|
"loss": 0.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 4.178652535957608, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012542183341934872, |
|
"loss": 0.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.20893262679788, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001244999736927764, |
|
"loss": 0.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 4.239212717638153, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012357589355094275, |
|
"loss": 0.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.269492808478425, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012264967674257646, |
|
"loss": 0.0, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 4.299772899318698, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012172140721005079, |
|
"loss": 0.0, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.33005299015897, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012079116908177593, |
|
"loss": 0.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 4.360333080999243, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011985904666457455, |
|
"loss": 0.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.390613171839515, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011892512443604102, |
|
"loss": 0.0, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 4.420893262679788, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011798948703688539, |
|
"loss": 0.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.45117335352006, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001170522192632624, |
|
"loss": 0.0, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 4.4814534443603335, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011611340605908642, |
|
"loss": 0.0, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.511733535200605, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011517313250833317, |
|
"loss": 0.0, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 4.5420136260408785, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 0.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.57229371688115, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011328854535702543, |
|
"loss": 0.0, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 4.6025738077214235, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011234440255526948, |
|
"loss": 0.0, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.632853898561696, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011139914098905406, |
|
"loss": 0.0, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 4.6631339894019685, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011045284632676536, |
|
"loss": 0.0, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.693414080242241, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010950560433041826, |
|
"loss": 0.0, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 4.7236941710825135, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010855750084788398, |
|
"loss": 0.0, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.753974261922786, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010760862180510951, |
|
"loss": 0.0, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 4.7842543527630585, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010665905319833041, |
|
"loss": 0.0, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.814534443603331, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010570888108627681, |
|
"loss": 0.0, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 4.8448145344436035, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010475819158237425, |
|
"loss": 0.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.875094625283876, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010380707084693901, |
|
"loss": 0.0, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 4.9053747161241485, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010285560507936961, |
|
"loss": 0.0, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 4.935654806964421, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010190388051033466, |
|
"loss": 0.0, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 4.9659348978046935, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00010095198339395769, |
|
"loss": 0.0, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 4.996214988644966, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 5.0264950794852385, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.904801660604234e-05, |
|
"loss": 0.0, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 5.056775170325511, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.809611948966533e-05, |
|
"loss": 0.0, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 5.0870552611657835, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.71443949206304e-05, |
|
"loss": 0.0, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.117335352006056, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.619292915306101e-05, |
|
"loss": 0.0, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 5.1476154428463285, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.524180841762577e-05, |
|
"loss": 0.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.177895533686601, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.42911189137232e-05, |
|
"loss": 0.0, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 5.2081756245268735, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.334094680166962e-05, |
|
"loss": 0.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 5.238455715367146, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.239137819489047e-05, |
|
"loss": 0.0, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 5.2687358062074185, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.144249915211605e-05, |
|
"loss": 0.0, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.299015897047691, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.049439566958175e-05, |
|
"loss": 0.0, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 5.3292959878879635, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.954715367323468e-05, |
|
"loss": 0.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 5.359576078728236, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.860085901094595e-05, |
|
"loss": 0.0, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 5.3898561695685085, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.765559744473053e-05, |
|
"loss": 0.0, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 5.420136260408781, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.67114546429746e-05, |
|
"loss": 0.0, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 5.4504163512490535, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.57685161726715e-05, |
|
"loss": 0.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.480696442089326, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.482686749166686e-05, |
|
"loss": 0.0, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 5.5109765329295985, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.38865939409136e-05, |
|
"loss": 0.0, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 5.541256623769871, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.294778073673762e-05, |
|
"loss": 0.0, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 5.5715367146101435, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.201051296311462e-05, |
|
"loss": 0.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 5.601816805450416, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.107487556395901e-05, |
|
"loss": 0.0, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 5.6320968962906885, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.014095333542548e-05, |
|
"loss": 0.0, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 5.662376987130961, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.920883091822408e-05, |
|
"loss": 0.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 5.6926570779712335, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.827859278994925e-05, |
|
"loss": 0.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 5.722937168811506, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.735032325742355e-05, |
|
"loss": 0.0, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 5.7532172596517785, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.642410644905726e-05, |
|
"loss": 0.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 5.783497350492052, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.550002630722366e-05, |
|
"loss": 0.0, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 5.8137774413323235, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.457816658065134e-05, |
|
"loss": 0.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 5.844057532172597, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.365861081683433e-05, |
|
"loss": 0.0, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 5.8743376230128685, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.274144235446023e-05, |
|
"loss": 0.0, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 5.904617713853142, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.182674431585704e-05, |
|
"loss": 0.0, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 5.934897804693414, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.09145995994601e-05, |
|
"loss": 0.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 5.965177895533687, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.000509087229895e-05, |
|
"loss": 0.0, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 5.995457986373959, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.0, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 6.025738077214232, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.819431085184251e-05, |
|
"loss": 0.0, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 6.056018168054504, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.729320366825784e-05, |
|
"loss": 0.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.086298258894777, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.639506067845697e-05, |
|
"loss": 0.0, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 6.116578349735049, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.549996328050296e-05, |
|
"loss": 0.0, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 6.146858440575322, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.460799259643884e-05, |
|
"loss": 0.0, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 6.177138531415594, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.371922946493591e-05, |
|
"loss": 0.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 6.207418622255867, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.283375443396726e-05, |
|
"loss": 0.0, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 6.237698713096139, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.19516477535077e-05, |
|
"loss": 0.0, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 6.267978803936412, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.107298936826086e-05, |
|
"loss": 0.0, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 6.298258894776684, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.019785891041381e-05, |
|
"loss": 0.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 6.328538985616957, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.9326335692419995e-05, |
|
"loss": 0.0, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 6.358819076457229, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.845849869981137e-05, |
|
"loss": 0.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.389099167297502, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.759442658403985e-05, |
|
"loss": 0.0, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 6.419379258137774, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.6734197655349156e-05, |
|
"loss": 0.0, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 6.449659348978047, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.5877889875677845e-05, |
|
"loss": 0.0, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 6.479939439818319, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.5025580851593436e-05, |
|
"loss": 0.0, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 6.510219530658592, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.417734782725896e-05, |
|
"loss": 0.0, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 6.540499621498864, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.333326767743263e-05, |
|
"loss": 0.0, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 6.570779712339137, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.249341690050051e-05, |
|
"loss": 0.0, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 6.601059803179409, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.1657871611543605e-05, |
|
"loss": 0.0, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 6.631339894019682, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.082670753543961e-05, |
|
"loss": 0.0, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 6.661619984859954, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 6.691900075700227, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.9177823929143106e-05, |
|
"loss": 0.0, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 6.722180166540499, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.836025383610382e-05, |
|
"loss": 0.0, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 6.752460257380772, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.754736381668057e-05, |
|
"loss": 0.0, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 6.782740348221044, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.673922754252002e-05, |
|
"loss": 0.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 6.813020439061317, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.593591825444028e-05, |
|
"loss": 0.0, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 6.843300529901589, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.513750875579303e-05, |
|
"loss": 0.0, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 6.873580620741862, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.434407140586565e-05, |
|
"loss": 0.0, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 6.903860711582134, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.355567811332311e-05, |
|
"loss": 0.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 6.934140802422407, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.277240032969105e-05, |
|
"loss": 0.0, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 6.964420893262679, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.19943090428802e-05, |
|
"loss": 0.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 6.994700984102952, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 0.0, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 7.024981074943224, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.045396755473121e-05, |
|
"loss": 0.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 7.055261165783497, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.969185695345105e-05, |
|
"loss": 0.0, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 7.085541256623769, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.893521203645618e-05, |
|
"loss": 0.0, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 7.115821347464043, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.8184101377939476e-05, |
|
"loss": 0.0, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 7.146101438304315, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.7438593050527845e-05, |
|
"loss": 0.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 7.176381529144588, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.669875461911297e-05, |
|
"loss": 0.0, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 7.20666161998486, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5964653134727776e-05, |
|
"loss": 0.0, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 7.236941710825133, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.523635512846981e-05, |
|
"loss": 0.0, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 7.267221801665405, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.45139266054715e-05, |
|
"loss": 0.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.297501892505678, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.379743303891815e-05, |
|
"loss": 0.0, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 7.32778198334595, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.308693936411421e-05, |
|
"loss": 0.0, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 7.358062074186223, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.238250997259808e-05, |
|
"loss": 0.0, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 7.388342165026495, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.1684208706306574e-05, |
|
"loss": 0.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 7.418622255866768, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.099209885178882e-05, |
|
"loss": 0.0, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 7.44890234670704, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.030624313447067e-05, |
|
"loss": 0.0, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 7.479182437547313, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.962670371296996e-05, |
|
"loss": 0.0, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 7.509462528387585, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.8953542173463133e-05, |
|
"loss": 0.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 7.539742619227858, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.828681952410366e-05, |
|
"loss": 0.0, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 7.57002271006813, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7626596189492983e-05, |
|
"loss": 0.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 7.600302800908403, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.6972932005204267e-05, |
|
"loss": 0.0, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 7.630582891748675, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.6325886212359498e-05, |
|
"loss": 0.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 7.660862982588948, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5685517452260567e-05, |
|
"loss": 0.0, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 7.69114307342922, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5051883761074614e-05, |
|
"loss": 0.0, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 7.721423164269493, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.4425042564574184e-05, |
|
"loss": 0.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 7.751703255109765, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3805050672932928e-05, |
|
"loss": 0.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 7.781983345950038, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3191964275576805e-05, |
|
"loss": 0.0, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 7.81226343679031, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.2585838936091754e-05, |
|
"loss": 0.0, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 7.842543527630583, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.198672958718796e-05, |
|
"loss": 0.0, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 7.872823618470855, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.139469052572127e-05, |
|
"loss": 0.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 7.903103709311128, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0809775407772503e-05, |
|
"loss": 0.0, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 7.9333838001514, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0232037243784475e-05, |
|
"loss": 0.0, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 7.963663890991673, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9661528393757744e-05, |
|
"loss": 0.0, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 7.993943981831945, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 8.024224072672219, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.854240479496643e-05, |
|
"loss": 0.0, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 8.05450416351249, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.7993891471580893e-05, |
|
"loss": 0.0, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 8.084784254352764, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.74528103037226e-05, |
|
"loss": 0.0, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 8.115064345193035, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.6919210329194533e-05, |
|
"loss": 0.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 8.145344436033309, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.6393139907784404e-05, |
|
"loss": 0.0, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 8.17562452687358, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.587464671688187e-05, |
|
"loss": 0.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.205904617713854, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5363777747157572e-05, |
|
"loss": 0.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 8.236184708554125, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4860579298304312e-05, |
|
"loss": 0.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 8.266464799394399, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4365096974841108e-05, |
|
"loss": 0.0, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 8.29674489023467, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3877375681979943e-05, |
|
"loss": 0.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 8.327024981074944, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 0.0, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 8.357305071915215, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2925392288022298e-05, |
|
"loss": 0.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 8.387585162755489, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2461216464506454e-05, |
|
"loss": 0.0, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 8.41786525359576, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2004974218934695e-05, |
|
"loss": 0.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 8.448145344436034, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1556706900218572e-05, |
|
"loss": 0.0, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 8.478425435276305, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1116455134507664e-05, |
|
"loss": 0.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 8.508705526116579, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.068425882150762e-05, |
|
"loss": 0.0, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 8.53898561695685, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.026015713086418e-05, |
|
"loss": 0.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 8.569265707797124, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.844188498613116e-06, |
|
"loss": 0.0, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 8.599545798637395, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.436390623696911e-06, |
|
"loss": 0.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 8.629825889477669, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.036800464548157e-06, |
|
"loss": 0.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 8.66010598031794, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.645454235739903e-06, |
|
"loss": 0.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 8.690386071158214, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.262387404703653e-06, |
|
"loss": 0.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 8.720666161998485, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.887634688515e-06, |
|
"loss": 0.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 8.750946252838759, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.521230050747086e-06, |
|
"loss": 0.0, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 8.78122634367903, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.163206698392744e-06, |
|
"loss": 0.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 8.811506434519304, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.813597078854772e-06, |
|
"loss": 0.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 8.841786525359575, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.472432877005341e-06, |
|
"loss": 0.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 8.872066616199849, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.139745012314424e-06, |
|
"loss": 0.0, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 8.90234670704012, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.8155636360475385e-06, |
|
"loss": 0.0, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 8.932626797880394, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.499918128533155e-06, |
|
"loss": 0.0, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 8.962906888720667, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.192837096500058e-06, |
|
"loss": 0.0, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 8.993186979560939, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 0.0, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 9.02346707040121, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.604479002308737e-06, |
|
"loss": 0.0, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 9.053747161241484, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.323255262627846e-06, |
|
"loss": 0.0, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 9.084027252081757, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.050702638550275e-06, |
|
"loss": 0.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 9.114307342922029, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.7868458313272904e-06, |
|
"loss": 0.0, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 9.144587433762302, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5317087541144377e-06, |
|
"loss": 0.0, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 9.174867524602574, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.2853145298042953e-06, |
|
"loss": 0.0, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 9.205147615442847, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.047685488930874e-06, |
|
"loss": 0.0, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 9.235427706283119, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.818843167645835e-06, |
|
"loss": 0.0, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 9.265707797123392, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5988083057666533e-06, |
|
"loss": 0.0, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 9.295987887963664, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3876008448969976e-06, |
|
"loss": 0.0, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 9.326267978803937, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1852399266194314e-06, |
|
"loss": 0.0, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 9.356548069644209, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9917438907606556e-06, |
|
"loss": 0.0, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 9.386828160484482, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.8071302737293295e-06, |
|
"loss": 0.0, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 9.417108251324754, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.6314158069267948e-06, |
|
"loss": 0.0, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 9.447388342165027, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4646164152307018e-06, |
|
"loss": 0.0, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 9.477668433005299, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3067472155517735e-06, |
|
"loss": 0.0, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 9.507948523845572, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.157822515463758e-06, |
|
"loss": 0.0, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 9.538228614685844, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0178558119067315e-06, |
|
"loss": 0.0, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 9.568508705526117, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.868597899638898e-07, |
|
"loss": 0.0, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 9.598788796366389, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.648463217118984e-07, |
|
"loss": 0.0, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 9.629068887206662, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.518264651449779e-07, |
|
"loss": 0.0, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 9.659348978046934, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.478104631726711e-07, |
|
"loss": 0.0, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 9.689629068887207, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.5280774269154115e-07, |
|
"loss": 0.0, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 9.719909159727479, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.6682691373086665e-07, |
|
"loss": 0.0, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 9.750189250567752, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.898757686722542e-07, |
|
"loss": 0.0, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 9.780469341408024, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.219612815434924e-07, |
|
"loss": 0.0, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 9.810749432248297, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.630896073864352e-07, |
|
"loss": 0.0, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 9.841029523088569, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1326608169920372e-07, |
|
"loss": 0.0, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 9.871309613928842, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.249521995263964e-08, |
|
"loss": 0.0, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 9.901589704769114, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.078071718107701e-08, |
|
"loss": 0.0, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 9.931869795609387, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.81254476474213e-08, |
|
"loss": 0.0, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 9.962149886449659, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.531464582713252e-09, |
|
"loss": 0.0, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 9.992429977289932, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 9.992429977289932, |
|
"step": 3300, |
|
"total_flos": 1.623229319872512e+17, |
|
"train_loss": 0.0, |
|
"train_runtime": 3221.4406, |
|
"train_samples_per_second": 4.101, |
|
"train_steps_per_second": 1.024 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 1.623229319872512e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|