|
{
  "best_metric": 0.5752944946289062,
  "best_model_checkpoint": "./output/checkpoint-1200",
  "epoch": 4.719101123595506,
  "eval_steps": 150,
  "global_step": 2100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02247191011235955,
      "grad_norm": 40.276283264160156,
      "learning_rate": 1.25e-06,
      "loss": 2.1095,
      "step": 10
    },
    {
      "epoch": 0.0449438202247191,
      "grad_norm": 15.660475730895996,
      "learning_rate": 2.5e-06,
      "loss": 1.7071,
      "step": 20
    },
    {
      "epoch": 0.06741573033707865,
      "grad_norm": 32.55162811279297,
      "learning_rate": 3.75e-06,
      "loss": 1.746,
      "step": 30
    },
    {
      "epoch": 0.0898876404494382,
      "grad_norm": 15.043834686279297,
      "learning_rate": 5e-06,
      "loss": 1.5731,
      "step": 40
    },
    {
      "epoch": 0.11235955056179775,
      "grad_norm": 24.83352279663086,
      "learning_rate": 6.25e-06,
      "loss": 1.3603,
      "step": 50
    },
    {
      "epoch": 0.1348314606741573,
      "grad_norm": 35.10358428955078,
      "learning_rate": 7.5e-06,
      "loss": 1.3892,
      "step": 60
    },
    {
      "epoch": 0.15730337078651685,
      "grad_norm": 35.139347076416016,
      "learning_rate": 8.75e-06,
      "loss": 1.2658,
      "step": 70
    },
    {
      "epoch": 0.1797752808988764,
      "grad_norm": 28.40645980834961,
      "learning_rate": 1e-05,
      "loss": 1.187,
      "step": 80
    },
    {
      "epoch": 0.20224719101123595,
      "grad_norm": 29.196102142333984,
      "learning_rate": 1.125e-05,
      "loss": 1.237,
      "step": 90
    },
    {
      "epoch": 0.2247191011235955,
      "grad_norm": 25.024761199951172,
      "learning_rate": 1.25e-05,
      "loss": 1.2521,
      "step": 100
    },
    {
      "epoch": 0.24719101123595505,
      "grad_norm": 25.139511108398438,
      "learning_rate": 1.2499871543489788e-05,
      "loss": 1.1565,
      "step": 110
    },
    {
      "epoch": 0.2696629213483146,
      "grad_norm": 12.163933753967285,
      "learning_rate": 1.2499486179239496e-05,
      "loss": 1.1179,
      "step": 120
    },
    {
      "epoch": 0.29213483146067415,
      "grad_norm": 7.7505998611450195,
      "learning_rate": 1.2498843923089939e-05,
      "loss": 1.1047,
      "step": 130
    },
    {
      "epoch": 0.3146067415730337,
      "grad_norm": 21.35865592956543,
      "learning_rate": 1.249794480144175e-05,
      "loss": 1.1226,
      "step": 140
    },
    {
      "epoch": 0.33707865168539325,
      "grad_norm": 15.02802562713623,
      "learning_rate": 1.24967888512543e-05,
      "loss": 0.9727,
      "step": 150
    },
    {
      "epoch": 0.33707865168539325,
      "eval_loss": 0.9761117696762085,
      "eval_runtime": 28.1312,
      "eval_samples_per_second": 14.077,
      "eval_steps_per_second": 14.077,
      "step": 150
    },
    {
      "epoch": 0.3595505617977528,
      "grad_norm": 21.93962287902832,
      "learning_rate": 1.2495376120044174e-05,
      "loss": 1.1303,
      "step": 160
    },
    {
      "epoch": 0.38202247191011235,
      "grad_norm": 22.738882064819336,
      "learning_rate": 1.2493706665883217e-05,
      "loss": 1.1115,
      "step": 170
    },
    {
      "epoch": 0.4044943820224719,
      "grad_norm": 19.962158203125,
      "learning_rate": 1.2491780557396153e-05,
      "loss": 1.0215,
      "step": 180
    },
    {
      "epoch": 0.42696629213483145,
      "grad_norm": 22.92249870300293,
      "learning_rate": 1.2489597873757757e-05,
      "loss": 1.0086,
      "step": 190
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 20.59571075439453,
      "learning_rate": 1.2487158704689602e-05,
      "loss": 0.9309,
      "step": 200
    },
    {
      "epoch": 0.47191011235955055,
      "grad_norm": 21.059810638427734,
      "learning_rate": 1.248446315045638e-05,
      "loss": 1.0798,
      "step": 210
    },
    {
      "epoch": 0.4943820224719101,
      "grad_norm": 17.537137985229492,
      "learning_rate": 1.2481511321861762e-05,
      "loss": 0.9138,
      "step": 220
    },
    {
      "epoch": 0.5168539325842697,
      "grad_norm": 16.027517318725586,
      "learning_rate": 1.2478303340243865e-05,
      "loss": 0.9623,
      "step": 230
    },
    {
      "epoch": 0.5393258426966292,
      "grad_norm": 18.933513641357422,
      "learning_rate": 1.2474839337470245e-05,
      "loss": 0.9063,
      "step": 240
    },
    {
      "epoch": 0.5617977528089888,
      "grad_norm": 18.865337371826172,
      "learning_rate": 1.2471119455932489e-05,
      "loss": 0.8277,
      "step": 250
    },
    {
      "epoch": 0.5842696629213483,
      "grad_norm": 17.30205535888672,
      "learning_rate": 1.246714384854036e-05,
      "loss": 0.7923,
      "step": 260
    },
    {
      "epoch": 0.6067415730337079,
      "grad_norm": 14.635787963867188,
      "learning_rate": 1.2462912678715502e-05,
      "loss": 0.8241,
      "step": 270
    },
    {
      "epoch": 0.6292134831460674,
      "grad_norm": 21.17339324951172,
      "learning_rate": 1.245842612038474e-05,
      "loss": 0.8488,
      "step": 280
    },
    {
      "epoch": 0.651685393258427,
      "grad_norm": 19.38275909423828,
      "learning_rate": 1.2453684357972907e-05,
      "loss": 1.0164,
      "step": 290
    },
    {
      "epoch": 0.6741573033707865,
      "grad_norm": 20.8955078125,
      "learning_rate": 1.2448687586395288e-05,
      "loss": 0.9468,
      "step": 300
    },
    {
      "epoch": 0.6741573033707865,
      "eval_loss": 0.7623159885406494,
      "eval_runtime": 28.3099,
      "eval_samples_per_second": 13.988,
      "eval_steps_per_second": 13.988,
      "step": 300
    },
    {
      "epoch": 0.6966292134831461,
      "grad_norm": 14.662369728088379,
      "learning_rate": 1.2443436011049593e-05,
      "loss": 0.8178,
      "step": 310
    },
    {
      "epoch": 0.7191011235955056,
      "grad_norm": 15.885370254516602,
      "learning_rate": 1.2437929847807512e-05,
      "loss": 0.9479,
      "step": 320
    },
    {
      "epoch": 0.7415730337078652,
      "grad_norm": 14.278594970703125,
      "learning_rate": 1.2432169323005851e-05,
      "loss": 0.8635,
      "step": 330
    },
    {
      "epoch": 0.7640449438202247,
      "grad_norm": 14.948090553283691,
      "learning_rate": 1.2426154673437223e-05,
      "loss": 0.7926,
      "step": 340
    },
    {
      "epoch": 0.7865168539325843,
      "grad_norm": 20.756702423095703,
      "learning_rate": 1.2419886146340315e-05,
      "loss": 0.7433,
      "step": 350
    },
    {
      "epoch": 0.8089887640449438,
      "grad_norm": 13.336278915405273,
      "learning_rate": 1.2413363999389718e-05,
      "loss": 0.9501,
      "step": 360
    },
    {
      "epoch": 0.8314606741573034,
      "grad_norm": 18.14997100830078,
      "learning_rate": 1.2406588500685356e-05,
      "loss": 0.7787,
      "step": 370
    },
    {
      "epoch": 0.8539325842696629,
      "grad_norm": 14.070672988891602,
      "learning_rate": 1.2399559928741435e-05,
      "loss": 0.8501,
      "step": 380
    },
    {
      "epoch": 0.8764044943820225,
      "grad_norm": 12.321798324584961,
      "learning_rate": 1.2392278572475025e-05,
      "loss": 0.8531,
      "step": 390
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 13.217580795288086,
      "learning_rate": 1.2384744731194159e-05,
      "loss": 0.7066,
      "step": 400
    },
    {
      "epoch": 0.9213483146067416,
      "grad_norm": 21.007793426513672,
      "learning_rate": 1.2376958714585546e-05,
      "loss": 0.7631,
      "step": 410
    },
    {
      "epoch": 0.9438202247191011,
      "grad_norm": 25.68875503540039,
      "learning_rate": 1.2368920842701831e-05,
      "loss": 0.6831,
      "step": 420
    },
    {
      "epoch": 0.9662921348314607,
      "grad_norm": 18.43454933166504,
      "learning_rate": 1.2360631445948449e-05,
      "loss": 0.7968,
      "step": 430
    },
    {
      "epoch": 0.9887640449438202,
      "grad_norm": 11.963883399963379,
      "learning_rate": 1.2352090865070027e-05,
      "loss": 0.7182,
      "step": 440
    },
    {
      "epoch": 1.0112359550561798,
      "grad_norm": 19.328462600708008,
      "learning_rate": 1.2343299451136397e-05,
      "loss": 0.7027,
      "step": 450
    },
    {
      "epoch": 1.0112359550561798,
      "eval_loss": 0.6809844970703125,
      "eval_runtime": 28.6076,
      "eval_samples_per_second": 13.842,
      "eval_steps_per_second": 13.842,
      "step": 450
    },
    {
      "epoch": 1.0337078651685394,
      "grad_norm": 17.414684295654297,
      "learning_rate": 1.2334257565528155e-05,
      "loss": 0.5859,
      "step": 460
    },
    {
      "epoch": 1.0561797752808988,
      "grad_norm": 15.297569274902344,
      "learning_rate": 1.2324965579921801e-05,
      "loss": 0.4641,
      "step": 470
    },
    {
      "epoch": 1.0786516853932584,
      "grad_norm": 18.19464874267578,
      "learning_rate": 1.2315423876274468e-05,
      "loss": 0.5484,
      "step": 480
    },
    {
      "epoch": 1.101123595505618,
      "grad_norm": 9.914576530456543,
      "learning_rate": 1.2305632846808221e-05,
      "loss": 0.6468,
      "step": 490
    },
    {
      "epoch": 1.1235955056179776,
      "grad_norm": 14.114611625671387,
      "learning_rate": 1.2295592893993934e-05,
      "loss": 0.6038,
      "step": 500
    },
    {
      "epoch": 1.146067415730337,
      "grad_norm": 16.208505630493164,
      "learning_rate": 1.2285304430534745e-05,
      "loss": 0.5434,
      "step": 510
    },
    {
      "epoch": 1.1685393258426966,
      "grad_norm": 9.227933883666992,
      "learning_rate": 1.2274767879349083e-05,
      "loss": 0.5278,
      "step": 520
    },
    {
      "epoch": 1.1910112359550562,
      "grad_norm": 13.076558113098145,
      "learning_rate": 1.2263983673553307e-05,
      "loss": 0.7342,
      "step": 530
    },
    {
      "epoch": 1.2134831460674158,
      "grad_norm": 14.731674194335938,
      "learning_rate": 1.2252952256443871e-05,
      "loss": 0.5825,
      "step": 540
    },
    {
      "epoch": 1.2359550561797752,
      "grad_norm": 12.931262016296387,
      "learning_rate": 1.2241674081479129e-05,
      "loss": 0.6817,
      "step": 550
    },
    {
      "epoch": 1.2584269662921348,
      "grad_norm": 19.48564910888672,
      "learning_rate": 1.223014961226068e-05,
      "loss": 0.6012,
      "step": 560
    },
    {
      "epoch": 1.2808988764044944,
      "grad_norm": 17.863492965698242,
      "learning_rate": 1.2218379322514316e-05,
      "loss": 0.6,
      "step": 570
    },
    {
      "epoch": 1.303370786516854,
      "grad_norm": 16.681968688964844,
      "learning_rate": 1.2206363696070545e-05,
      "loss": 0.5687,
      "step": 580
    },
    {
      "epoch": 1.3258426966292136,
      "grad_norm": 11.682393074035645,
      "learning_rate": 1.219410322684471e-05,
      "loss": 0.6333,
      "step": 590
    },
    {
      "epoch": 1.348314606741573,
      "grad_norm": 10.291793823242188,
      "learning_rate": 1.2181598418816679e-05,
      "loss": 0.5697,
      "step": 600
    },
    {
      "epoch": 1.348314606741573,
      "eval_loss": 0.648504376411438,
      "eval_runtime": 28.6285,
      "eval_samples_per_second": 13.832,
      "eval_steps_per_second": 13.832,
      "step": 600
    },
    {
      "epoch": 1.3707865168539326,
      "grad_norm": 18.982742309570312,
      "learning_rate": 1.2168849786010134e-05,
      "loss": 0.4987,
      "step": 610
    },
    {
      "epoch": 1.3932584269662922,
      "grad_norm": 6.644986629486084,
      "learning_rate": 1.2155857852471433e-05,
      "loss": 0.6424,
      "step": 620
    },
    {
      "epoch": 1.4157303370786516,
      "grad_norm": 10.34019947052002,
      "learning_rate": 1.2142623152248081e-05,
      "loss": 0.674,
      "step": 630
    },
    {
      "epoch": 1.4382022471910112,
      "grad_norm": 20.887239456176758,
      "learning_rate": 1.2129146229366767e-05,
      "loss": 0.5793,
      "step": 640
    },
    {
      "epoch": 1.4606741573033708,
      "grad_norm": 19.285852432250977,
      "learning_rate": 1.2115427637811003e-05,
      "loss": 0.5608,
      "step": 650
    },
    {
      "epoch": 1.4831460674157304,
      "grad_norm": 14.167954444885254,
      "learning_rate": 1.2101467941498358e-05,
      "loss": 0.4507,
      "step": 660
    },
    {
      "epoch": 1.50561797752809,
      "grad_norm": 17.428260803222656,
      "learning_rate": 1.208726771425727e-05,
      "loss": 0.4813,
      "step": 670
    },
    {
      "epoch": 1.5280898876404494,
      "grad_norm": 14.576336860656738,
      "learning_rate": 1.2072827539803463e-05,
      "loss": 0.6145,
      "step": 680
    },
    {
      "epoch": 1.550561797752809,
      "grad_norm": 11.203570365905762,
      "learning_rate": 1.205814801171595e-05,
      "loss": 0.4898,
      "step": 690
    },
    {
      "epoch": 1.5730337078651684,
      "grad_norm": 16.303321838378906,
      "learning_rate": 1.2043229733412637e-05,
      "loss": 0.5359,
      "step": 700
    },
    {
      "epoch": 1.595505617977528,
      "grad_norm": 12.990396499633789,
      "learning_rate": 1.2028073318125511e-05,
      "loss": 0.5608,
      "step": 710
    },
    {
      "epoch": 1.6179775280898876,
      "grad_norm": 15.660808563232422,
      "learning_rate": 1.2012679388875442e-05,
      "loss": 0.5824,
      "step": 720
    },
    {
      "epoch": 1.6404494382022472,
      "grad_norm": 20.047142028808594,
      "learning_rate": 1.1997048578446569e-05,
      "loss": 0.4947,
      "step": 730
    },
    {
      "epoch": 1.6629213483146068,
      "grad_norm": 15.481894493103027,
      "learning_rate": 1.1981181529360284e-05,
      "loss": 0.6934,
      "step": 740
    },
    {
      "epoch": 1.6853932584269664,
      "grad_norm": 13.069948196411133,
      "learning_rate": 1.1965078893848829e-05,
      "loss": 0.5284,
      "step": 750
    },
    {
      "epoch": 1.6853932584269664,
      "eval_loss": 0.5964965224266052,
      "eval_runtime": 28.8612,
      "eval_samples_per_second": 13.721,
      "eval_steps_per_second": 13.721,
      "step": 750
    },
    {
      "epoch": 1.7078651685393258,
      "grad_norm": 15.860553741455078,
      "learning_rate": 1.1948741333828482e-05,
      "loss": 0.5074,
      "step": 760
    },
    {
      "epoch": 1.7303370786516854,
      "grad_norm": 15.73456859588623,
      "learning_rate": 1.1932169520872344e-05,
      "loss": 0.5663,
      "step": 770
    },
    {
      "epoch": 1.7528089887640448,
      "grad_norm": 15.158873558044434,
      "learning_rate": 1.1915364136182738e-05,
      "loss": 0.5084,
      "step": 780
    },
    {
      "epoch": 1.7752808988764044,
      "grad_norm": 18.966686248779297,
      "learning_rate": 1.189832587056321e-05,
      "loss": 0.5863,
      "step": 790
    },
    {
      "epoch": 1.797752808988764,
      "grad_norm": 16.93968391418457,
      "learning_rate": 1.188105542439012e-05,
      "loss": 0.4685,
      "step": 800
    },
    {
      "epoch": 1.8202247191011236,
      "grad_norm": 15.844802856445312,
      "learning_rate": 1.186355350758387e-05,
      "loss": 0.4631,
      "step": 810
    },
    {
      "epoch": 1.8426966292134832,
      "grad_norm": 6.261096000671387,
      "learning_rate": 1.1845820839579707e-05,
      "loss": 0.4791,
      "step": 820
    },
    {
      "epoch": 1.8651685393258428,
      "grad_norm": 21.932397842407227,
      "learning_rate": 1.1827858149298162e-05,
      "loss": 0.5318,
      "step": 830
    },
    {
      "epoch": 1.8876404494382022,
      "grad_norm": 20.80328369140625,
      "learning_rate": 1.1809666175115075e-05,
      "loss": 0.4769,
      "step": 840
    },
    {
      "epoch": 1.9101123595505618,
      "grad_norm": 11.984331130981445,
      "learning_rate": 1.1791245664831252e-05,
      "loss": 0.4934,
      "step": 850
    },
    {
      "epoch": 1.9325842696629212,
      "grad_norm": 15.739178657531738,
      "learning_rate": 1.177259737564172e-05,
      "loss": 0.546,
      "step": 860
    },
    {
      "epoch": 1.9550561797752808,
      "grad_norm": 18.232080459594727,
      "learning_rate": 1.1753722074104613e-05,
      "loss": 0.5,
      "step": 870
    },
    {
      "epoch": 1.9775280898876404,
      "grad_norm": 19.222999572753906,
      "learning_rate": 1.1734620536109645e-05,
      "loss": 0.5939,
      "step": 880
    },
    {
      "epoch": 2.0,
      "grad_norm": 20.398212432861328,
      "learning_rate": 1.1715293546846223e-05,
      "loss": 0.5186,
      "step": 890
    },
    {
      "epoch": 2.0224719101123596,
      "grad_norm": 15.60082721710205,
      "learning_rate": 1.1695741900771185e-05,
      "loss": 0.2977,
      "step": 900
    },
    {
      "epoch": 2.0224719101123596,
      "eval_loss": 0.5883856415748596,
      "eval_runtime": 28.4376,
      "eval_samples_per_second": 13.925,
      "eval_steps_per_second": 13.925,
      "step": 900
    },
    {
      "epoch": 2.044943820224719,
      "grad_norm": 11.865720748901367,
      "learning_rate": 1.1675966401576116e-05,
      "loss": 0.3224,
      "step": 910
    },
    {
      "epoch": 2.067415730337079,
      "grad_norm": 13.449054718017578,
      "learning_rate": 1.1655967862154335e-05,
      "loss": 0.3297,
      "step": 920
    },
    {
      "epoch": 2.0898876404494384,
      "grad_norm": 18.293087005615234,
      "learning_rate": 1.1635747104567469e-05,
      "loss": 0.282,
      "step": 930
    },
    {
      "epoch": 2.1123595505617976,
      "grad_norm": 15.209367752075195,
      "learning_rate": 1.1615304960011663e-05,
      "loss": 0.3504,
      "step": 940
    },
    {
      "epoch": 2.134831460674157,
      "grad_norm": 17.038095474243164,
      "learning_rate": 1.1594642268783414e-05,
      "loss": 0.2422,
      "step": 950
    },
    {
      "epoch": 2.157303370786517,
      "grad_norm": 14.031147956848145,
      "learning_rate": 1.1573759880245028e-05,
      "loss": 0.2884,
      "step": 960
    },
    {
      "epoch": 2.1797752808988764,
      "grad_norm": 16.43686294555664,
      "learning_rate": 1.1552658652789704e-05,
      "loss": 0.2729,
      "step": 970
    },
    {
      "epoch": 2.202247191011236,
      "grad_norm": 18.40167808532715,
      "learning_rate": 1.153133945380626e-05,
      "loss": 0.3775,
      "step": 980
    },
    {
      "epoch": 2.2247191011235956,
      "grad_norm": 11.52310848236084,
      "learning_rate": 1.1509803159643458e-05,
      "loss": 0.4183,
      "step": 990
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 17.900732040405273,
      "learning_rate": 1.1488050655574003e-05,
      "loss": 0.3087,
      "step": 1000
    },
    {
      "epoch": 2.2696629213483144,
      "grad_norm": 12.462130546569824,
      "learning_rate": 1.1466082835758142e-05,
      "loss": 0.371,
      "step": 1010
    },
    {
      "epoch": 2.292134831460674,
      "grad_norm": 15.067594528198242,
      "learning_rate": 1.1443900603206901e-05,
      "loss": 0.2704,
      "step": 1020
    },
    {
      "epoch": 2.3146067415730336,
      "grad_norm": 12.72071647644043,
      "learning_rate": 1.1421504869744979e-05,
      "loss": 0.2859,
      "step": 1030
    },
    {
      "epoch": 2.337078651685393,
      "grad_norm": 19.164932250976562,
      "learning_rate": 1.139889655597326e-05,
      "loss": 0.3761,
      "step": 1040
    },
    {
      "epoch": 2.359550561797753,
      "grad_norm": 12.603534698486328,
      "learning_rate": 1.1376076591230975e-05,
      "loss": 0.3465,
      "step": 1050
    },
    {
      "epoch": 2.359550561797753,
      "eval_loss": 0.5863937735557556,
      "eval_runtime": 28.6419,
      "eval_samples_per_second": 13.826,
      "eval_steps_per_second": 13.826,
      "step": 1050
    },
    {
      "epoch": 2.3820224719101124,
      "grad_norm": 13.150344848632812,
      "learning_rate": 1.1353045913557491e-05,
      "loss": 0.3628,
      "step": 1060
    },
    {
      "epoch": 2.404494382022472,
      "grad_norm": 13.253989219665527,
      "learning_rate": 1.1329805469653767e-05,
      "loss": 0.3521,
      "step": 1070
    },
    {
      "epoch": 2.4269662921348316,
      "grad_norm": 15.47899055480957,
      "learning_rate": 1.1306356214843423e-05,
      "loss": 0.2289,
      "step": 1080
    },
    {
      "epoch": 2.449438202247191,
      "grad_norm": 20.19160270690918,
      "learning_rate": 1.1282699113033476e-05,
      "loss": 0.2723,
      "step": 1090
    },
    {
      "epoch": 2.4719101123595504,
      "grad_norm": 16.03728485107422,
      "learning_rate": 1.125883513667473e-05,
      "loss": 0.3398,
      "step": 1100
    },
    {
      "epoch": 2.49438202247191,
      "grad_norm": 21.989660263061523,
      "learning_rate": 1.123476526672178e-05,
      "loss": 0.3926,
      "step": 1110
    },
    {
      "epoch": 2.5168539325842696,
      "grad_norm": 16.33092498779297,
      "learning_rate": 1.1210490492592705e-05,
      "loss": 0.2243,
      "step": 1120
    },
    {
      "epoch": 2.539325842696629,
      "grad_norm": 17.589994430541992,
      "learning_rate": 1.118601181212839e-05,
      "loss": 0.3433,
      "step": 1130
    },
    {
      "epoch": 2.561797752808989,
      "grad_norm": 10.739886283874512,
      "learning_rate": 1.1161330231551516e-05,
      "loss": 0.2862,
      "step": 1140
    },
    {
      "epoch": 2.5842696629213484,
      "grad_norm": 15.978007316589355,
      "learning_rate": 1.1136446765425187e-05,
      "loss": 0.2902,
      "step": 1150
    },
    {
      "epoch": 2.606741573033708,
      "grad_norm": 12.640671730041504,
      "learning_rate": 1.1111362436611233e-05,
      "loss": 0.3325,
      "step": 1160
    },
    {
      "epoch": 2.629213483146067,
      "grad_norm": 16.383241653442383,
      "learning_rate": 1.1086078276228168e-05,
      "loss": 0.3235,
      "step": 1170
    },
    {
      "epoch": 2.6516853932584272,
      "grad_norm": 14.427544593811035,
      "learning_rate": 1.1060595323608789e-05,
      "loss": 0.3674,
      "step": 1180
    },
    {
      "epoch": 2.6741573033707864,
      "grad_norm": 12.737131118774414,
      "learning_rate": 1.1034914626257467e-05,
      "loss": 0.2976,
      "step": 1190
    },
    {
      "epoch": 2.696629213483146,
      "grad_norm": 13.494490623474121,
      "learning_rate": 1.1009037239807091e-05,
      "loss": 0.3386,
      "step": 1200
    },
    {
      "epoch": 2.696629213483146,
      "eval_loss": 0.5752944946289062,
      "eval_runtime": 28.5405,
      "eval_samples_per_second": 13.875,
      "eval_steps_per_second": 13.875,
      "step": 1200
    },
    {
      "epoch": 2.7191011235955056,
      "grad_norm": 18.343557357788086,
      "learning_rate": 1.098296422797566e-05,
      "loss": 0.4059,
      "step": 1210
    },
    {
      "epoch": 2.741573033707865,
      "grad_norm": 14.710756301879883,
      "learning_rate": 1.095669666252257e-05,
      "loss": 0.3214,
      "step": 1220
    },
    {
      "epoch": 2.764044943820225,
      "grad_norm": 14.75973892211914,
      "learning_rate": 1.0930235623204552e-05,
      "loss": 0.3608,
      "step": 1230
    },
    {
      "epoch": 2.7865168539325844,
      "grad_norm": 9.905698776245117,
      "learning_rate": 1.0903582197731294e-05,
      "loss": 0.299,
      "step": 1240
    },
    {
      "epoch": 2.808988764044944,
      "grad_norm": 12.442407608032227,
      "learning_rate": 1.0876737481720722e-05,
      "loss": 0.4008,
      "step": 1250
    },
    {
      "epoch": 2.831460674157303,
      "grad_norm": 11.765353202819824,
      "learning_rate": 1.0849702578653969e-05,
      "loss": 0.3743,
      "step": 1260
    },
    {
      "epoch": 2.853932584269663,
      "grad_norm": 21.303213119506836,
      "learning_rate": 1.0822478599830009e-05,
      "loss": 0.4142,
      "step": 1270
    },
    {
      "epoch": 2.8764044943820224,
      "grad_norm": 12.464083671569824,
      "learning_rate": 1.0795066664319983e-05,
      "loss": 0.3029,
      "step": 1280
    },
    {
      "epoch": 2.898876404494382,
      "grad_norm": 20.486202239990234,
      "learning_rate": 1.0767467898921198e-05,
      "loss": 0.3827,
      "step": 1290
    },
    {
      "epoch": 2.9213483146067416,
      "grad_norm": 16.888824462890625,
      "learning_rate": 1.0739683438110799e-05,
      "loss": 0.3854,
      "step": 1300
    },
    {
      "epoch": 2.943820224719101,
      "grad_norm": 17.157941818237305,
      "learning_rate": 1.0711714423999145e-05,
      "loss": 0.3273,
      "step": 1310
    },
    {
      "epoch": 2.966292134831461,
      "grad_norm": 16.582439422607422,
      "learning_rate": 1.0683562006282862e-05,
      "loss": 0.3334,
      "step": 1320
    },
    {
      "epoch": 2.98876404494382,
      "grad_norm": 19.243640899658203,
      "learning_rate": 1.0655227342197573e-05,
      "loss": 0.3262,
      "step": 1330
    },
    {
      "epoch": 3.0112359550561796,
      "grad_norm": 10.776522636413574,
      "learning_rate": 1.0626711596470345e-05,
      "loss": 0.2177,
      "step": 1340
    },
    {
      "epoch": 3.033707865168539,
      "grad_norm": 13.728617668151855,
      "learning_rate": 1.0598015941271792e-05,
      "loss": 0.1772,
      "step": 1350
    },
    {
      "epoch": 3.033707865168539,
      "eval_loss": 0.6071353554725647,
      "eval_runtime": 28.1538,
      "eval_samples_per_second": 14.066,
      "eval_steps_per_second": 14.066,
      "step": 1350
    },
    {
      "epoch": 3.056179775280899,
      "grad_norm": 15.800386428833008,
      "learning_rate": 1.0569141556167905e-05,
      "loss": 0.1571,
      "step": 1360
    },
    {
      "epoch": 3.0786516853932584,
      "grad_norm": 11.595304489135742,
      "learning_rate": 1.0540089628071565e-05,
      "loss": 0.1715,
      "step": 1370
    },
    {
      "epoch": 3.101123595505618,
      "grad_norm": 19.881999969482422,
      "learning_rate": 1.0510861351193747e-05,
      "loss": 0.1924,
      "step": 1380
    },
    {
      "epoch": 3.1235955056179776,
      "grad_norm": 18.425518035888672,
      "learning_rate": 1.0481457926994435e-05,
      "loss": 0.1942,
      "step": 1390
    },
    {
      "epoch": 3.146067415730337,
      "grad_norm": 19.82516098022461,
      "learning_rate": 1.045188056413323e-05,
      "loss": 0.1355,
      "step": 1400
    },
    {
      "epoch": 3.168539325842697,
      "grad_norm": 21.702138900756836,
      "learning_rate": 1.0422130478419676e-05,
      "loss": 0.1985,
      "step": 1410
    },
    {
      "epoch": 3.191011235955056,
      "grad_norm": 9.89587116241455,
      "learning_rate": 1.0392208892763269e-05,
      "loss": 0.1726,
      "step": 1420
    },
    {
      "epoch": 3.2134831460674156,
      "grad_norm": 18.09695053100586,
      "learning_rate": 1.0362117037123204e-05,
      "loss": 0.2026,
      "step": 1430
    },
    {
      "epoch": 3.235955056179775,
      "grad_norm": 18.88958168029785,
      "learning_rate": 1.0331856148457804e-05,
      "loss": 0.1631,
      "step": 1440
    },
    {
      "epoch": 3.258426966292135,
      "grad_norm": 17.335662841796875,
      "learning_rate": 1.030142747067368e-05,
      "loss": 0.162,
      "step": 1450
    },
    {
      "epoch": 3.2808988764044944,
      "grad_norm": 24.29629898071289,
      "learning_rate": 1.027083225457459e-05,
      "loss": 0.1695,
      "step": 1460
    },
    {
      "epoch": 3.303370786516854,
      "grad_norm": 11.860310554504395,
      "learning_rate": 1.0240071757810035e-05,
      "loss": 0.1698,
      "step": 1470
    },
    {
      "epoch": 3.3258426966292136,
      "grad_norm": 20.780662536621094,
      "learning_rate": 1.0209147244823564e-05,
      "loss": 0.1495,
      "step": 1480
    },
    {
      "epoch": 3.348314606741573,
      "grad_norm": 21.13768768310547,
      "learning_rate": 1.0178059986800773e-05,
      "loss": 0.1736,
      "step": 1490
    },
    {
      "epoch": 3.370786516853933,
      "grad_norm": 11.82013988494873,
      "learning_rate": 1.0146811261617086e-05,
      "loss": 0.2058,
      "step": 1500
    },
    {
      "epoch": 3.370786516853933,
      "eval_loss": 0.5937667489051819,
      "eval_runtime": 28.4093,
      "eval_samples_per_second": 13.939,
      "eval_steps_per_second": 13.939,
      "step": 1500
    },
    {
      "epoch": 3.393258426966292,
      "grad_norm": 7.896106719970703,
      "learning_rate": 1.0115402353785198e-05,
      "loss": 0.1455,
      "step": 1510
    },
    {
      "epoch": 3.4157303370786516,
      "grad_norm": 15.18211555480957,
      "learning_rate": 1.0083834554402293e-05,
      "loss": 0.1313,
      "step": 1520
    },
    {
      "epoch": 3.438202247191011,
      "grad_norm": 14.026439666748047,
      "learning_rate": 1.0052109161096959e-05,
      "loss": 0.2089,
      "step": 1530
    },
    {
      "epoch": 3.460674157303371,
      "grad_norm": 17.17043113708496,
      "learning_rate": 1.0020227477975852e-05,
      "loss": 0.214,
      "step": 1540
    },
    {
      "epoch": 3.4831460674157304,
      "grad_norm": 16.3658447265625,
      "learning_rate": 9.988190815570101e-06,
      "loss": 0.2524,
      "step": 1550
    },
    {
      "epoch": 3.50561797752809,
      "grad_norm": 13.139276504516602,
      "learning_rate": 9.95600049078141e-06,
      "loss": 0.1869,
      "step": 1560
    },
    {
      "epoch": 3.5280898876404496,
      "grad_norm": 12.490981101989746,
      "learning_rate": 9.923657826827957e-06,
      "loss": 0.1826,
      "step": 1570
    },
    {
      "epoch": 3.550561797752809,
      "grad_norm": 15.54690170288086,
      "learning_rate": 9.891164153189975e-06,
      "loss": 0.1897,
      "step": 1580
    },
    {
      "epoch": 3.5730337078651684,
      "grad_norm": 15.49406623840332,
      "learning_rate": 9.858520805555123e-06,
      "loss": 0.1425,
      "step": 1590
    },
    {
      "epoch": 3.595505617977528,
      "grad_norm": 15.071846008300781,
      "learning_rate": 9.825729125763562e-06,
      "loss": 0.2484,
      "step": 1600
    },
    {
      "epoch": 3.6179775280898876,
      "grad_norm": 17.417387008666992,
      "learning_rate": 9.792790461752813e-06,
      "loss": 0.1701,
      "step": 1610
    },
    {
      "epoch": 3.640449438202247,
      "grad_norm": 17.218463897705078,
      "learning_rate": 9.759706167502343e-06,
      "loss": 0.1796,
      "step": 1620
    },
    {
      "epoch": 3.662921348314607,
      "grad_norm": 15.873698234558105,
      "learning_rate": 9.726477602977906e-06,
      "loss": 0.197,
      "step": 1630
    },
    {
      "epoch": 3.6853932584269664,
      "grad_norm": 19.27431297302246,
      "learning_rate": 9.693106134075641e-06,
      "loss": 0.2239,
      "step": 1640
    },
    {
      "epoch": 3.7078651685393256,
      "grad_norm": 16.935728073120117,
      "learning_rate": 9.659593132565929e-06,
      "loss": 0.1388,
      "step": 1650
    },
    {
      "epoch": 3.7078651685393256,
      "eval_loss": 0.5907432436943054,
      "eval_runtime": 28.0316,
      "eval_samples_per_second": 14.127,
      "eval_steps_per_second": 14.127,
      "step": 1650
    },
    {
      "epoch": 3.7303370786516856,
      "grad_norm": 15.374393463134766,
      "learning_rate": 9.625939976037002e-06,
      "loss": 0.2022,
      "step": 1660
    },
    {
      "epoch": 3.752808988764045,
      "grad_norm": 13.357222557067871,
      "learning_rate": 9.59214804783831e-06,
      "loss": 0.1715,
      "step": 1670
    },
    {
      "epoch": 3.7752808988764044,
      "grad_norm": 15.346492767333984,
      "learning_rate": 9.558218737023673e-06,
      "loss": 0.2355,
      "step": 1680
    },
    {
      "epoch": 3.797752808988764,
      "grad_norm": 8.775595664978027,
      "learning_rate": 9.524153438294159e-06,
      "loss": 0.192,
      "step": 1690
    },
    {
      "epoch": 3.8202247191011236,
      "grad_norm": 18.73467254638672,
      "learning_rate": 9.489953551940784e-06,
      "loss": 0.226,
      "step": 1700
    },
    {
      "epoch": 3.842696629213483,
      "grad_norm": 16.3373966217041,
      "learning_rate": 9.455620483786914e-06,
      "loss": 0.2295,
      "step": 1710
    },
    {
      "epoch": 3.865168539325843,
      "grad_norm": 14.356298446655273,
      "learning_rate": 9.421155645130514e-06,
      "loss": 0.1508,
      "step": 1720
    },
    {
      "epoch": 3.8876404494382024,
      "grad_norm": 20.167428970336914,
      "learning_rate": 9.386560452686111e-06,
      "loss": 0.2378,
      "step": 1730
    },
    {
      "epoch": 3.9101123595505616,
      "grad_norm": 18.679561614990234,
      "learning_rate": 9.351836328526564e-06,
      "loss": 0.2386,
      "step": 1740
    },
    {
      "epoch": 3.932584269662921,
      "grad_norm": 13.456331253051758,
      "learning_rate": 9.316984700024613e-06,
      "loss": 0.1977,
      "step": 1750
    },
    {
      "epoch": 3.955056179775281,
      "grad_norm": 16.58353042602539,
      "learning_rate": 9.282006999794201e-06,
      "loss": 0.2215,
      "step": 1760
    },
    {
      "epoch": 3.9775280898876404,
      "grad_norm": 12.72409439086914,
      "learning_rate": 9.246904665631587e-06,
      "loss": 0.166,
      "step": 1770
    },
    {
      "epoch": 4.0,
      "grad_norm": 15.713534355163574,
      "learning_rate": 9.211679140456241e-06,
      "loss": 0.1595,
      "step": 1780
    },
    {
      "epoch": 4.022471910112359,
      "grad_norm": 17.47097396850586,
      "learning_rate": 9.176331872251538e-06,
      "loss": 0.1038,
      "step": 1790
    },
    {
      "epoch": 4.044943820224719,
      "grad_norm": 15.764228820800781,
      "learning_rate": 9.140864314005223e-06,
      "loss": 0.1084,
      "step": 1800
    },
    {
      "epoch": 4.044943820224719,
      "eval_loss": 0.6473093032836914,
      "eval_runtime": 28.5511,
      "eval_samples_per_second": 13.87,
      "eval_steps_per_second": 13.87,
      "step": 1800
    },
    {
      "epoch": 4.067415730337078,
      "grad_norm": 19.521255493164062,
      "learning_rate": 9.105277923649698e-06,
      "loss": 0.1045,
      "step": 1810
    },
    {
      "epoch": 4.089887640449438,
      "grad_norm": 6.521065711975098,
      "learning_rate": 9.069574164002092e-06,
      "loss": 0.0885,
      "step": 1820
    },
    {
      "epoch": 4.112359550561798,
      "grad_norm": 10.468367576599121,
      "learning_rate": 9.033754502704119e-06,
      "loss": 0.0933,
      "step": 1830
    },
    {
      "epoch": 4.134831460674158,
      "grad_norm": 8.916471481323242,
      "learning_rate": 8.997820412161765e-06,
      "loss": 0.1004,
      "step": 1840
    },
    {
      "epoch": 4.157303370786517,
      "grad_norm": 7.306938171386719,
      "learning_rate": 8.961773369484739e-06,
      "loss": 0.1019,
      "step": 1850
    },
    {
      "epoch": 4.179775280898877,
      "grad_norm": 16.291723251342773,
      "learning_rate": 8.925614856425787e-06,
      "loss": 0.1043,
      "step": 1860
    },
    {
      "epoch": 4.202247191011236,
      "grad_norm": 11.017843246459961,
      "learning_rate": 8.88934635931975e-06,
      "loss": 0.1468,
      "step": 1870
    },
    {
      "epoch": 4.224719101123595,
      "grad_norm": 11.328751564025879,
      "learning_rate": 8.852969369022494e-06,
      "loss": 0.1022,
      "step": 1880
    },
    {
      "epoch": 4.247191011235955,
      "grad_norm": 10.77855396270752,
      "learning_rate": 8.816485380849613e-06,
      "loss": 0.1098,
      "step": 1890
    },
    {
      "epoch": 4.269662921348314,
      "grad_norm": 21.963041305541992,
      "learning_rate": 8.779895894514961e-06,
      "loss": 0.0932,
      "step": 1900
    },
    {
      "epoch": 4.292134831460674,
      "grad_norm": 10.421232223510742,
      "learning_rate": 8.743202414069012e-06,
      "loss": 0.1115,
      "step": 1910
    },
    {
      "epoch": 4.314606741573034,
      "grad_norm": 12.438689231872559,
      "learning_rate": 8.706406447837024e-06,
      "loss": 0.093,
      "step": 1920
    },
    {
      "epoch": 4.337078651685394,
      "grad_norm": 15.851973533630371,
      "learning_rate": 8.669509508357052e-06,
      "loss": 0.1099,
      "step": 1930
    },
    {
      "epoch": 4.359550561797753,
      "grad_norm": 12.311159133911133,
      "learning_rate": 8.632513112317761e-06,
      "loss": 0.1131,
      "step": 1940
    },
    {
      "epoch": 4.382022471910112,
      "grad_norm": 13.830821990966797,
      "learning_rate": 8.59541878049609e-06,
      "loss": 0.1418,
      "step": 1950
    },
    {
      "epoch": 4.382022471910112,
      "eval_loss": 0.6235558390617371,
      "eval_runtime": 28.0922,
      "eval_samples_per_second": 14.096,
      "eval_steps_per_second": 14.096,
      "step": 1950
    },
    {
      "epoch": 4.404494382022472,
      "grad_norm": 20.932544708251953,
      "learning_rate": 8.558228037694728e-06,
      "loss": 0.0978,
      "step": 1960
    },
    {
      "epoch": 4.426966292134831,
      "grad_norm": 17.910070419311523,
      "learning_rate": 8.520942412679448e-06,
      "loss": 0.1239,
      "step": 1970
    },
    {
      "epoch": 4.449438202247191,
      "grad_norm": 13.0515718460083,
      "learning_rate": 8.483563438116257e-06,
      "loss": 0.0958,
      "step": 1980
    },
    {
      "epoch": 4.47191011235955,
      "grad_norm": 14.775212287902832,
      "learning_rate": 8.446092650508393e-06,
      "loss": 0.0913,
      "step": 1990
    },
    {
      "epoch": 4.49438202247191,
      "grad_norm": 14.85338306427002,
      "learning_rate": 8.408531590133173e-06,
      "loss": 0.1077,
      "step": 2000
    },
    {
      "epoch": 4.51685393258427,
      "grad_norm": 10.560493469238281,
      "learning_rate": 8.370881800978673e-06,
      "loss": 0.1092,
      "step": 2010
    },
    {
      "epoch": 4.539325842696629,
      "grad_norm": 11.044146537780762,
      "learning_rate": 8.333144830680262e-06,
      "loss": 0.1359,
      "step": 2020
    },
    {
      "epoch": 4.561797752808989,
      "grad_norm": 10.464937210083008,
      "learning_rate": 8.29532223045698e-06,
      "loss": 0.0996,
      "step": 2030
    },
    {
      "epoch": 4.584269662921348,
      "grad_norm": 11.029562950134277,
      "learning_rate": 8.257415555047786e-06,
      "loss": 0.1169,
      "step": 2040
    },
    {
      "epoch": 4.606741573033708,
      "grad_norm": 7.897212028503418,
      "learning_rate": 8.219426362647631e-06,
      "loss": 0.101,
      "step": 2050
    },
    {
      "epoch": 4.629213483146067,
      "grad_norm": 12.705263137817383,
      "learning_rate": 8.181356214843423e-06,
      "loss": 0.1036,
      "step": 2060
    },
    {
      "epoch": 4.651685393258427,
      "grad_norm": 18.093433380126953,
      "learning_rate": 8.143206676549826e-06,
      "loss": 0.1008,
      "step": 2070
    },
    {
      "epoch": 4.674157303370786,
      "grad_norm": 15.597516059875488,
      "learning_rate": 8.104979315944941e-06,
      "loss": 0.1057,
      "step": 2080
    },
    {
      "epoch": 4.696629213483146,
      "grad_norm": 11.599696159362793,
      "learning_rate": 8.066675704405837e-06,
      "loss": 0.102,
      "step": 2090
    },
    {
      "epoch": 4.719101123595506,
      "grad_norm": 13.57479190826416,
      "learning_rate": 8.028297416443953e-06,
      "loss": 0.1374,
      "step": 2100
    },
    {
      "epoch": 4.719101123595506,
      "eval_loss": 0.6200308203697205,
      "eval_runtime": 28.1044,
      "eval_samples_per_second": 14.09,
      "eval_steps_per_second": 14.09,
      "step": 2100
    }
  ],
  "logging_steps": 10,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 150,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.0762916960406733e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|